agentv 4.17.1 → 4.18.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/artifact-writer-WH3OE42V.js +40 -0
- package/dist/chunk-HBDOJJFY.js +689 -0
- package/dist/chunk-HBDOJJFY.js.map +1 -0
- package/dist/{chunk-ILIM6IIX.js → chunk-MCBERRMC.js} +196 -161
- package/dist/chunk-MCBERRMC.js.map +1 -0
- package/dist/{chunk-IRU2UOWN.js → chunk-RCOAXXHP.js} +194 -197
- package/dist/chunk-RCOAXXHP.js.map +1 -0
- package/dist/{chunk-ZUNYOUFO.js → chunk-VRPCMCLQ.js} +255 -621
- package/dist/chunk-VRPCMCLQ.js.map +1 -0
- package/dist/cli.js +4 -3
- package/dist/cli.js.map +1 -1
- package/dist/{dist-U5EXNMON.js → dist-7W4OI3X2.js} +30 -33
- package/dist/dist-7W4OI3X2.js.map +1 -0
- package/dist/index.js +4 -3
- package/dist/{interactive-LFCOVXPQ.js → interactive-J4QEU5FG.js} +4 -3
- package/dist/{interactive-LFCOVXPQ.js.map → interactive-J4QEU5FG.js.map} +1 -1
- package/dist/studio/assets/{index-Bhv1TEO2.js → index-BTsTcivx.js} +1 -1
- package/dist/studio/assets/{index-vZYHIvCH.js → index-KfPHd-QM.js} +1 -1
- package/dist/studio/index.html +1 -1
- package/package.json +1 -1
- package/dist/chunk-ILIM6IIX.js.map +0 -1
- package/dist/chunk-IRU2UOWN.js.map +0 -1
- package/dist/chunk-ZUNYOUFO.js.map +0 -1
- /package/dist/{dist-U5EXNMON.js.map → artifact-writer-WH3OE42V.js.map} +0 -0
|
@@ -1,4 +1,21 @@
|
|
|
1
1
|
import { createRequire } from 'node:module'; const require = createRequire(import.meta.url);
|
|
2
|
+
import {
|
|
3
|
+
RESULT_INDEX_FILENAME,
|
|
4
|
+
RESULT_RUNS_DIRNAME,
|
|
5
|
+
aggregateRunDir,
|
|
6
|
+
buildDefaultRunDir,
|
|
7
|
+
buildTestTargetKey,
|
|
8
|
+
deduplicateByTestIdTarget,
|
|
9
|
+
isDirectoryPath,
|
|
10
|
+
normalizeExperimentName,
|
|
11
|
+
parseJsonlResults,
|
|
12
|
+
resolveExistingRunPrimaryPath,
|
|
13
|
+
resolveRunIndexPath,
|
|
14
|
+
resolveRunManifestPath,
|
|
15
|
+
resolveWorkspaceOrFilePath,
|
|
16
|
+
toSnakeCaseDeep,
|
|
17
|
+
writeArtifactsFromResults
|
|
18
|
+
} from "./chunk-HBDOJJFY.js";
|
|
2
19
|
import {
|
|
3
20
|
CLI_PLACEHOLDERS,
|
|
4
21
|
COMMON_TARGET_SETTINGS,
|
|
@@ -18,7 +35,7 @@ import {
|
|
|
18
35
|
findGitRoot,
|
|
19
36
|
getResultsRepoStatus,
|
|
20
37
|
interpolateEnv,
|
|
21
|
-
|
|
38
|
+
isGraderKind,
|
|
22
39
|
listTargetNames,
|
|
23
40
|
loadCasesFromFile,
|
|
24
41
|
loadConfig,
|
|
@@ -40,14 +57,13 @@ import {
|
|
|
40
57
|
subscribeToCopilotSdkLogEntries,
|
|
41
58
|
subscribeToPiLogEntries,
|
|
42
59
|
syncResultsRepo,
|
|
43
|
-
toCamelCaseDeep
|
|
44
|
-
|
|
45
|
-
} from "./chunk-IRU2UOWN.js";
|
|
60
|
+
toCamelCaseDeep
|
|
61
|
+
} from "./chunk-RCOAXXHP.js";
|
|
46
62
|
|
|
47
63
|
// package.json
|
|
48
64
|
var package_default = {
|
|
49
65
|
name: "agentv",
|
|
50
|
-
version: "4.
|
|
66
|
+
version: "4.18.0-next.1",
|
|
51
67
|
description: "CLI entry point for AgentV",
|
|
52
68
|
type: "module",
|
|
53
69
|
repository: {
|
|
@@ -261,9 +277,9 @@ async function discoverTargetsFile(options) {
|
|
|
261
277
|
}
|
|
262
278
|
|
|
263
279
|
// src/commands/eval/run-eval.ts
|
|
264
|
-
import { constants as constants4, mkdirSync } from "node:fs";
|
|
265
|
-
import { access as access5 } from "node:fs/promises";
|
|
266
|
-
import
|
|
280
|
+
import { constants as constants4, existsSync as existsSync2, mkdirSync } from "node:fs";
|
|
281
|
+
import { access as access5, readFile as readFile8 } from "node:fs/promises";
|
|
282
|
+
import path15 from "node:path";
|
|
267
283
|
import { pathToFileURL } from "node:url";
|
|
268
284
|
|
|
269
285
|
// src/version-check.ts
|
|
@@ -321,88 +337,15 @@ async function promptContinue() {
|
|
|
321
337
|
}
|
|
322
338
|
|
|
323
339
|
// src/commands/results/remote.ts
|
|
324
|
-
import path6 from "node:path";
|
|
325
|
-
|
|
326
|
-
// src/commands/inspect/utils.ts
|
|
327
|
-
import { readFileSync as readFileSync2, readdirSync, statSync as statSync2 } from "node:fs";
|
|
328
340
|
import path5 from "node:path";
|
|
329
341
|
|
|
330
|
-
// src/commands/
|
|
331
|
-
import {
|
|
332
|
-
import
|
|
333
|
-
var RESULT_INDEX_FILENAME = "index.jsonl";
|
|
334
|
-
var RESULT_RUNS_DIRNAME = "runs";
|
|
335
|
-
var DEFAULT_EXPERIMENT_NAME = "default";
|
|
336
|
-
function normalizeExperimentName(experiment) {
|
|
337
|
-
const trimmed = experiment?.trim();
|
|
338
|
-
if (!trimmed) {
|
|
339
|
-
return DEFAULT_EXPERIMENT_NAME;
|
|
340
|
-
}
|
|
341
|
-
if (!/^[A-Za-z0-9._-]+$/.test(trimmed)) {
|
|
342
|
-
throw new Error(
|
|
343
|
-
`Invalid experiment name "${trimmed}". Use only letters, numbers, ".", "_" and "-".`
|
|
344
|
-
);
|
|
345
|
-
}
|
|
346
|
-
return trimmed;
|
|
347
|
-
}
|
|
348
|
-
function createRunDirName(timestamp = /* @__PURE__ */ new Date()) {
|
|
349
|
-
return timestamp.toISOString().replace(/[:.]/g, "-");
|
|
350
|
-
}
|
|
351
|
-
function buildDefaultRunDir(cwd, experiment, timestamp = /* @__PURE__ */ new Date()) {
|
|
352
|
-
return path3.join(
|
|
353
|
-
cwd,
|
|
354
|
-
".agentv",
|
|
355
|
-
"results",
|
|
356
|
-
RESULT_RUNS_DIRNAME,
|
|
357
|
-
normalizeExperimentName(experiment),
|
|
358
|
-
createRunDirName(timestamp)
|
|
359
|
-
);
|
|
360
|
-
}
|
|
361
|
-
function resolveRunIndexPath(runDir) {
|
|
362
|
-
return path3.join(runDir, RESULT_INDEX_FILENAME);
|
|
363
|
-
}
|
|
364
|
-
function isRunManifestPath(filePath) {
|
|
365
|
-
return path3.basename(filePath) === RESULT_INDEX_FILENAME;
|
|
366
|
-
}
|
|
367
|
-
function resolveExistingRunPrimaryPath(runDir) {
|
|
368
|
-
const indexPath = resolveRunIndexPath(runDir);
|
|
369
|
-
if (existsSync(indexPath)) {
|
|
370
|
-
return indexPath;
|
|
371
|
-
}
|
|
372
|
-
return void 0;
|
|
373
|
-
}
|
|
374
|
-
function isDirectoryPath(filePath) {
|
|
375
|
-
try {
|
|
376
|
-
return statSync(filePath).isDirectory();
|
|
377
|
-
} catch {
|
|
378
|
-
return false;
|
|
379
|
-
}
|
|
380
|
-
}
|
|
381
|
-
function resolveWorkspaceOrFilePath(filePath) {
|
|
382
|
-
if (!isDirectoryPath(filePath)) {
|
|
383
|
-
return filePath;
|
|
384
|
-
}
|
|
385
|
-
const existing = resolveExistingRunPrimaryPath(filePath);
|
|
386
|
-
if (!existing) {
|
|
387
|
-
throw new Error(`Result workspace is missing ${RESULT_INDEX_FILENAME}: ${filePath}`);
|
|
388
|
-
}
|
|
389
|
-
return existing;
|
|
390
|
-
}
|
|
391
|
-
function resolveRunManifestPath(filePath) {
|
|
392
|
-
if (isDirectoryPath(filePath)) {
|
|
393
|
-
return resolveWorkspaceOrFilePath(filePath);
|
|
394
|
-
}
|
|
395
|
-
if (!isRunManifestPath(filePath)) {
|
|
396
|
-
throw new Error(
|
|
397
|
-
`Expected a run workspace directory or ${RESULT_INDEX_FILENAME} manifest: ${filePath}`
|
|
398
|
-
);
|
|
399
|
-
}
|
|
400
|
-
return filePath;
|
|
401
|
-
}
|
|
342
|
+
// src/commands/inspect/utils.ts
|
|
343
|
+
import { readFileSync as readFileSync2, readdirSync, statSync } from "node:fs";
|
|
344
|
+
import path4 from "node:path";
|
|
402
345
|
|
|
403
346
|
// src/commands/results/manifest.ts
|
|
404
|
-
import { existsSync
|
|
405
|
-
import
|
|
347
|
+
import { existsSync, readFileSync } from "node:fs";
|
|
348
|
+
import path3 from "node:path";
|
|
406
349
|
function parseJsonlLines(content) {
|
|
407
350
|
return content.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0).map((line) => JSON.parse(line));
|
|
408
351
|
}
|
|
@@ -421,8 +364,8 @@ function readOptionalText(baseDir, relativePath) {
|
|
|
421
364
|
if (!relativePath) {
|
|
422
365
|
return void 0;
|
|
423
366
|
}
|
|
424
|
-
const absolutePath =
|
|
425
|
-
if (!
|
|
367
|
+
const absolutePath = path3.join(baseDir, relativePath);
|
|
368
|
+
if (!existsSync(absolutePath)) {
|
|
426
369
|
return void 0;
|
|
427
370
|
}
|
|
428
371
|
return readFileSync(absolutePath, "utf8");
|
|
@@ -510,8 +453,8 @@ function parseResultManifest(content) {
|
|
|
510
453
|
return parseJsonlLines(content);
|
|
511
454
|
}
|
|
512
455
|
function resolveResultSourcePath(source, cwd) {
|
|
513
|
-
const resolved =
|
|
514
|
-
if (isDirectoryPath(resolved) ||
|
|
456
|
+
const resolved = path3.isAbsolute(source) ? source : path3.resolve(cwd ?? process.cwd(), source);
|
|
457
|
+
if (isDirectoryPath(resolved) || path3.basename(resolved) === RESULT_INDEX_FILENAME) {
|
|
515
458
|
return resolveRunManifestPath(resolved);
|
|
516
459
|
}
|
|
517
460
|
return resolved;
|
|
@@ -520,7 +463,7 @@ function loadManifestResults(sourceFile) {
|
|
|
520
463
|
const resolvedSourceFile = resolveRunManifestPath(sourceFile);
|
|
521
464
|
const content = readFileSync(resolvedSourceFile, "utf8");
|
|
522
465
|
const records = parseResultManifest(content);
|
|
523
|
-
const baseDir =
|
|
466
|
+
const baseDir = path3.dirname(resolvedSourceFile);
|
|
524
467
|
return records.map((record) => hydrateManifestRecord(baseDir, record));
|
|
525
468
|
}
|
|
526
469
|
function loadLightweightResults(sourceFile) {
|
|
@@ -566,10 +509,10 @@ function padLeft(str, len) {
|
|
|
566
509
|
}
|
|
567
510
|
function loadResultFile(filePath) {
|
|
568
511
|
const resolvedFilePath = resolveTraceResultPath(filePath);
|
|
569
|
-
if (
|
|
512
|
+
if (path4.extname(resolvedFilePath) === ".json") {
|
|
570
513
|
return loadOtlpTraceFile(resolvedFilePath);
|
|
571
514
|
}
|
|
572
|
-
if (
|
|
515
|
+
if (path4.basename(resolvedFilePath) === RESULT_INDEX_FILENAME) {
|
|
573
516
|
return loadManifestAsRawResults(resolvedFilePath);
|
|
574
517
|
}
|
|
575
518
|
return loadJsonlRecords(resolvedFilePath);
|
|
@@ -848,7 +791,7 @@ function toTraceSummary(result) {
|
|
|
848
791
|
return toCamelCaseDeep(rawTrace);
|
|
849
792
|
}
|
|
850
793
|
function buildRunId(relativeRunPath) {
|
|
851
|
-
const normalized = relativeRunPath.split(
|
|
794
|
+
const normalized = relativeRunPath.split(path4.sep).join("/");
|
|
852
795
|
const segments = normalized.split("/").filter(Boolean);
|
|
853
796
|
if (segments.length >= 2) {
|
|
854
797
|
const experiment = segments.slice(0, -1).join("/");
|
|
@@ -863,10 +806,10 @@ function buildRunId(relativeRunPath) {
|
|
|
863
806
|
function collectRunManifestPaths(runsDir, currentDir, files) {
|
|
864
807
|
const primaryPath = resolveExistingRunPrimaryPath(currentDir);
|
|
865
808
|
if (primaryPath) {
|
|
866
|
-
const relativeRunPath =
|
|
809
|
+
const relativeRunPath = path4.relative(runsDir, currentDir);
|
|
867
810
|
files.push({
|
|
868
811
|
filePath: primaryPath,
|
|
869
|
-
displayName:
|
|
812
|
+
displayName: path4.basename(currentDir),
|
|
870
813
|
runId: buildRunId(relativeRunPath)
|
|
871
814
|
});
|
|
872
815
|
return;
|
|
@@ -874,7 +817,7 @@ function collectRunManifestPaths(runsDir, currentDir, files) {
|
|
|
874
817
|
const entries = readdirSync(currentDir, { withFileTypes: true });
|
|
875
818
|
for (const entry of entries) {
|
|
876
819
|
if (entry.isDirectory()) {
|
|
877
|
-
collectRunManifestPaths(runsDir,
|
|
820
|
+
collectRunManifestPaths(runsDir, path4.join(currentDir, entry.name), files);
|
|
878
821
|
}
|
|
879
822
|
}
|
|
880
823
|
}
|
|
@@ -884,7 +827,7 @@ function listResultFilesFromRunsDir(runsDir, limit) {
|
|
|
884
827
|
const entries = readdirSync(runsDir, { withFileTypes: true });
|
|
885
828
|
for (const entry of entries) {
|
|
886
829
|
if (entry.isDirectory()) {
|
|
887
|
-
collectRunManifestPaths(runsDir,
|
|
830
|
+
collectRunManifestPaths(runsDir, path4.join(runsDir, entry.name), files);
|
|
888
831
|
}
|
|
889
832
|
}
|
|
890
833
|
} catch {
|
|
@@ -894,7 +837,7 @@ function listResultFilesFromRunsDir(runsDir, limit) {
|
|
|
894
837
|
const metas = [];
|
|
895
838
|
for (const { filePath, displayName, runId } of limited) {
|
|
896
839
|
try {
|
|
897
|
-
const fileStat =
|
|
840
|
+
const fileStat = statSync(filePath);
|
|
898
841
|
const results = loadResultFile(filePath);
|
|
899
842
|
const testCount = results.length;
|
|
900
843
|
const passCount = results.filter((r) => r.score >= DEFAULT_THRESHOLD).length;
|
|
@@ -919,7 +862,7 @@ function listResultFilesFromRunsDir(runsDir, limit) {
|
|
|
919
862
|
}
|
|
920
863
|
function listResultFiles(cwd, limit) {
|
|
921
864
|
return listResultFilesFromRunsDir(
|
|
922
|
-
|
|
865
|
+
path4.join(cwd, ".agentv", "results", RESULT_RUNS_DIRNAME),
|
|
923
866
|
limit
|
|
924
867
|
);
|
|
925
868
|
}
|
|
@@ -971,17 +914,17 @@ function slugify(value) {
|
|
|
971
914
|
return value.trim().replace(/[^A-Za-z0-9._/-]+/g, "-").replace(/\/+/g, "/").replace(/^-+|-+$/g, "").slice(0, 120);
|
|
972
915
|
}
|
|
973
916
|
function getRelativeRunPath(cwd, runDir) {
|
|
974
|
-
const relative =
|
|
975
|
-
if (!relative.startsWith("..") && !
|
|
917
|
+
const relative = path5.relative(path5.join(cwd, ".agentv", "results", "runs"), runDir);
|
|
918
|
+
if (!relative.startsWith("..") && !path5.isAbsolute(relative)) {
|
|
976
919
|
return relative;
|
|
977
920
|
}
|
|
978
|
-
const experiment =
|
|
979
|
-
const runName =
|
|
980
|
-
return experiment && experiment !== runName ?
|
|
921
|
+
const experiment = path5.basename(path5.dirname(runDir));
|
|
922
|
+
const runName = path5.basename(runDir);
|
|
923
|
+
return experiment && experiment !== runName ? path5.join(experiment, runName) : runName;
|
|
981
924
|
}
|
|
982
925
|
function buildBranchName(config, payload) {
|
|
983
|
-
const timestamp =
|
|
984
|
-
const evalStem = payload.test_files.length === 1 ?
|
|
926
|
+
const timestamp = path5.basename(payload.run_dir);
|
|
927
|
+
const evalStem = payload.test_files.length === 1 ? path5.basename(payload.test_files[0]).replace(/\.eval\.ya?ml$/i, "").replace(/\.[^.]+$/i, "") : `${payload.test_files.length}-evals`;
|
|
985
928
|
const experiment = slugify(payload.experiment ?? "default");
|
|
986
929
|
const branchLeaf = slugify(`${experiment}-${evalStem}-${timestamp}`) || timestamp;
|
|
987
930
|
return `${config.branch_prefix}/${branchLeaf}`;
|
|
@@ -1010,7 +953,7 @@ function buildPrBody(payload) {
|
|
|
1010
953
|
"",
|
|
1011
954
|
sections,
|
|
1012
955
|
"",
|
|
1013
|
-
`Run: ${
|
|
956
|
+
`Run: ${path5.basename(payload.run_dir)}`,
|
|
1014
957
|
`Experiment: ${payload.experiment ?? "default"}`,
|
|
1015
958
|
`Eval Files: ${payload.test_files.join(", ")}`
|
|
1016
959
|
].join("\n");
|
|
@@ -1025,7 +968,7 @@ async function maybeWarnLargeArtifact(runDir) {
|
|
|
1025
968
|
}
|
|
1026
969
|
async function loadNormalizedResultsConfig(cwd) {
|
|
1027
970
|
const repoRoot = await findRepoRoot(cwd) ?? cwd;
|
|
1028
|
-
const config = await loadConfig(
|
|
971
|
+
const config = await loadConfig(path5.join(cwd, "_"), repoRoot);
|
|
1029
972
|
if (!config?.results?.export) {
|
|
1030
973
|
return void 0;
|
|
1031
974
|
}
|
|
@@ -1109,7 +1052,7 @@ async function maybeAutoExportRunArtifacts(payload) {
|
|
|
1109
1052
|
const prepared = await prepareResultsRepoBranch(config, branchName);
|
|
1110
1053
|
try {
|
|
1111
1054
|
const relativeRunPath = getRelativeRunPath(payload.cwd, payload.run_dir);
|
|
1112
|
-
const destinationDir =
|
|
1055
|
+
const destinationDir = path5.join(prepared.repoDir, config.path, relativeRunPath);
|
|
1113
1056
|
await stageResultsArtifacts({
|
|
1114
1057
|
repoDir: prepared.repoDir,
|
|
1115
1058
|
sourceDir: payload.run_dir,
|
|
@@ -1143,36 +1086,8 @@ async function maybeAutoExportRunArtifacts(payload) {
|
|
|
1143
1086
|
}
|
|
1144
1087
|
}
|
|
1145
1088
|
|
|
1146
|
-
// src/commands/eval/
|
|
1147
|
-
import {
|
|
1148
|
-
import path7 from "node:path";
|
|
1149
|
-
|
|
1150
|
-
// src/utils/case-conversion.ts
|
|
1151
|
-
function toSnakeCase(str) {
|
|
1152
|
-
if (/^[A-Z]/.test(str)) {
|
|
1153
|
-
return str;
|
|
1154
|
-
}
|
|
1155
|
-
return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
|
|
1156
|
-
}
|
|
1157
|
-
function toSnakeCaseDeep(obj) {
|
|
1158
|
-
if (obj === null || obj === void 0) {
|
|
1159
|
-
return obj;
|
|
1160
|
-
}
|
|
1161
|
-
if (Array.isArray(obj)) {
|
|
1162
|
-
return obj.map((item) => toSnakeCaseDeep(item));
|
|
1163
|
-
}
|
|
1164
|
-
if (typeof obj === "object") {
|
|
1165
|
-
const result = {};
|
|
1166
|
-
for (const [key, value] of Object.entries(obj)) {
|
|
1167
|
-
const snakeKey = toSnakeCase(key);
|
|
1168
|
-
result[snakeKey] = toSnakeCaseDeep(value);
|
|
1169
|
-
}
|
|
1170
|
-
return result;
|
|
1171
|
-
}
|
|
1172
|
-
return obj;
|
|
1173
|
-
}
|
|
1174
|
-
|
|
1175
|
-
// src/commands/eval/artifact-writer.ts
|
|
1089
|
+
// src/commands/eval/benchmark-writer.ts
|
|
1090
|
+
import { writeFile } from "node:fs/promises";
|
|
1176
1091
|
function computeStats(values) {
|
|
1177
1092
|
if (values.length === 0) {
|
|
1178
1093
|
return { mean: 0, stddev: 0 };
|
|
@@ -1185,361 +1100,6 @@ function computeStats(values) {
|
|
|
1185
1100
|
};
|
|
1186
1101
|
}
|
|
1187
1102
|
function computePassRate(result) {
|
|
1188
|
-
const scores = result.scores;
|
|
1189
|
-
if (scores && scores.length > 0) {
|
|
1190
|
-
const passed = scores.filter((s) => s.score >= DEFAULT_THRESHOLD).length;
|
|
1191
|
-
return passed / scores.length;
|
|
1192
|
-
}
|
|
1193
|
-
return (result.score ?? 0) >= DEFAULT_THRESHOLD ? 1 : 0;
|
|
1194
|
-
}
|
|
1195
|
-
function countToolCalls(result) {
|
|
1196
|
-
const toolCalls = {};
|
|
1197
|
-
let total = 0;
|
|
1198
|
-
const trace = result.trace;
|
|
1199
|
-
if (trace?.steps) {
|
|
1200
|
-
for (const step of trace.steps) {
|
|
1201
|
-
if (step.toolName || step.type === "tool") {
|
|
1202
|
-
const name = step.toolName ?? "unknown";
|
|
1203
|
-
toolCalls[name] = (toolCalls[name] ?? 0) + 1;
|
|
1204
|
-
total += 1;
|
|
1205
|
-
}
|
|
1206
|
-
}
|
|
1207
|
-
}
|
|
1208
|
-
return { toolCalls, total };
|
|
1209
|
-
}
|
|
1210
|
-
function parseWorkspaceChanges(fileChanges) {
|
|
1211
|
-
if (!fileChanges) {
|
|
1212
|
-
return void 0;
|
|
1213
|
-
}
|
|
1214
|
-
let filesModified = 0;
|
|
1215
|
-
let filesCreated = 0;
|
|
1216
|
-
const lines = fileChanges.split("\n");
|
|
1217
|
-
for (const line of lines) {
|
|
1218
|
-
if (line.startsWith("--- /dev/null")) {
|
|
1219
|
-
filesCreated += 1;
|
|
1220
|
-
} else if (line.startsWith("--- a/")) {
|
|
1221
|
-
filesModified += 1;
|
|
1222
|
-
}
|
|
1223
|
-
}
|
|
1224
|
-
const summaryLines = lines.slice(0, 20);
|
|
1225
|
-
const diffSummary = lines.length > 20 ? `${summaryLines.join("\n")}
|
|
1226
|
-
... (${lines.length - 20} more lines)` : fileChanges;
|
|
1227
|
-
return {
|
|
1228
|
-
files_modified: filesModified,
|
|
1229
|
-
files_created: filesCreated,
|
|
1230
|
-
diff_summary: diffSummary
|
|
1231
|
-
};
|
|
1232
|
-
}
|
|
1233
|
-
function buildAssertions(result) {
|
|
1234
|
-
if (!result.assertions) return [];
|
|
1235
|
-
return result.assertions.map((a) => ({
|
|
1236
|
-
text: a.text,
|
|
1237
|
-
passed: a.passed,
|
|
1238
|
-
evidence: a.evidence ?? ""
|
|
1239
|
-
}));
|
|
1240
|
-
}
|
|
1241
|
-
function buildEvaluators(scores) {
|
|
1242
|
-
if (!scores || scores.length === 0) {
|
|
1243
|
-
return void 0;
|
|
1244
|
-
}
|
|
1245
|
-
return scores.map((s) => ({
|
|
1246
|
-
name: s.name,
|
|
1247
|
-
type: s.type,
|
|
1248
|
-
score: s.score,
|
|
1249
|
-
reasoning: "",
|
|
1250
|
-
weight: s.weight,
|
|
1251
|
-
verdict: s.verdict,
|
|
1252
|
-
assertions: s.assertions,
|
|
1253
|
-
details: s.details
|
|
1254
|
-
}));
|
|
1255
|
-
}
|
|
1256
|
-
function buildGradingArtifact(result) {
|
|
1257
|
-
const assertions = buildAssertions(result);
|
|
1258
|
-
const passed = assertions.filter((e) => e.passed).length;
|
|
1259
|
-
const failed = assertions.filter((e) => !e.passed).length;
|
|
1260
|
-
const total = assertions.length;
|
|
1261
|
-
const { toolCalls, total: totalToolCalls } = countToolCalls(result);
|
|
1262
|
-
const errorsEncountered = result.error ? 1 : 0;
|
|
1263
|
-
return {
|
|
1264
|
-
assertions,
|
|
1265
|
-
summary: {
|
|
1266
|
-
passed,
|
|
1267
|
-
failed,
|
|
1268
|
-
total,
|
|
1269
|
-
pass_rate: total > 0 ? Math.round(passed / total * 1e3) / 1e3 : 0
|
|
1270
|
-
},
|
|
1271
|
-
execution_metrics: {
|
|
1272
|
-
tool_calls: toolCalls,
|
|
1273
|
-
total_tool_calls: totalToolCalls,
|
|
1274
|
-
errors_encountered: errorsEncountered
|
|
1275
|
-
},
|
|
1276
|
-
graders: buildEvaluators(result.scores),
|
|
1277
|
-
workspace_changes: parseWorkspaceChanges(result.fileChanges),
|
|
1278
|
-
conversation: result.conversationId ? {
|
|
1279
|
-
turns: result.trace ? result.trace.steps?.length ?? 0 : 0,
|
|
1280
|
-
conversation_id: result.conversationId
|
|
1281
|
-
} : void 0
|
|
1282
|
-
};
|
|
1283
|
-
}
|
|
1284
|
-
function buildTimingArtifact(results) {
|
|
1285
|
-
let totalInput = 0;
|
|
1286
|
-
let totalOutput = 0;
|
|
1287
|
-
let totalReasoning = 0;
|
|
1288
|
-
let totalDurationMs = 0;
|
|
1289
|
-
for (const result of results) {
|
|
1290
|
-
const usage = result.tokenUsage;
|
|
1291
|
-
if (usage) {
|
|
1292
|
-
totalInput += usage.input ?? 0;
|
|
1293
|
-
totalOutput += usage.output ?? 0;
|
|
1294
|
-
totalReasoning += usage.reasoning ?? 0;
|
|
1295
|
-
}
|
|
1296
|
-
if (result.durationMs != null) {
|
|
1297
|
-
totalDurationMs += result.durationMs;
|
|
1298
|
-
}
|
|
1299
|
-
}
|
|
1300
|
-
return {
|
|
1301
|
-
total_tokens: totalInput + totalOutput,
|
|
1302
|
-
duration_ms: totalDurationMs,
|
|
1303
|
-
total_duration_seconds: Math.round(totalDurationMs / 1e3 * 1e3) / 1e3,
|
|
1304
|
-
token_usage: {
|
|
1305
|
-
input: totalInput,
|
|
1306
|
-
output: totalOutput,
|
|
1307
|
-
reasoning: totalReasoning
|
|
1308
|
-
}
|
|
1309
|
-
};
|
|
1310
|
-
}
|
|
1311
|
-
function buildBenchmarkArtifact(results, evalFile = "", experiment) {
|
|
1312
|
-
const targetSet = /* @__PURE__ */ new Set();
|
|
1313
|
-
const testIdSet = /* @__PURE__ */ new Set();
|
|
1314
|
-
for (const result of results) {
|
|
1315
|
-
targetSet.add(result.target ?? "unknown");
|
|
1316
|
-
testIdSet.add(result.testId ?? "unknown");
|
|
1317
|
-
}
|
|
1318
|
-
const targets = [...targetSet].sort();
|
|
1319
|
-
const testIds = [...testIdSet].sort();
|
|
1320
|
-
const runSummary = {};
|
|
1321
|
-
const notes = [];
|
|
1322
|
-
for (const target of targets) {
|
|
1323
|
-
const targetResults = results.filter((r) => r.target === target);
|
|
1324
|
-
const passRates = targetResults.map(computePassRate);
|
|
1325
|
-
const timings = targetResults.filter((r) => r.durationMs != null).map((r) => r.durationMs / 1e3);
|
|
1326
|
-
const tokens = targetResults.filter((r) => r.tokenUsage != null).map((r) => {
|
|
1327
|
-
const usage = r.tokenUsage;
|
|
1328
|
-
return (usage.input ?? 0) + (usage.output ?? 0);
|
|
1329
|
-
});
|
|
1330
|
-
const entry = {
|
|
1331
|
-
pass_rate: computeStats(passRates),
|
|
1332
|
-
time_seconds: computeStats(timings),
|
|
1333
|
-
tokens: computeStats(tokens)
|
|
1334
|
-
};
|
|
1335
|
-
const toolCallCounts = targetResults.map((r) => countToolCalls(r).total);
|
|
1336
|
-
if (toolCallCounts.some((c2) => c2 > 0)) {
|
|
1337
|
-
entry.tool_calls = computeStats(toolCallCounts);
|
|
1338
|
-
}
|
|
1339
|
-
const costs = targetResults.filter((r) => r.costUsd != null).map((r) => r.costUsd);
|
|
1340
|
-
if (costs.length > 0) {
|
|
1341
|
-
entry.cost_usd = computeStats(costs);
|
|
1342
|
-
}
|
|
1343
|
-
runSummary[target] = entry;
|
|
1344
|
-
}
|
|
1345
|
-
const evaluatorScores = /* @__PURE__ */ new Map();
|
|
1346
|
-
for (const result of results) {
|
|
1347
|
-
if (result.scores) {
|
|
1348
|
-
for (const score of result.scores) {
|
|
1349
|
-
const key = `${score.name}:${score.type}`;
|
|
1350
|
-
if (!evaluatorScores.has(key)) {
|
|
1351
|
-
evaluatorScores.set(key, []);
|
|
1352
|
-
}
|
|
1353
|
-
evaluatorScores.get(key)?.push(score.score);
|
|
1354
|
-
}
|
|
1355
|
-
}
|
|
1356
|
-
}
|
|
1357
|
-
let perEvaluatorSummary;
|
|
1358
|
-
if (evaluatorScores.size > 0) {
|
|
1359
|
-
perEvaluatorSummary = {};
|
|
1360
|
-
for (const [key, scores] of evaluatorScores) {
|
|
1361
|
-
perEvaluatorSummary[key] = computeStats(scores);
|
|
1362
|
-
}
|
|
1363
|
-
}
|
|
1364
|
-
const errorCount = results.filter(
|
|
1365
|
-
(r) => r.executionStatus != null && r.executionStatus === "execution_error"
|
|
1366
|
-
).length;
|
|
1367
|
-
if (errorCount > 0) {
|
|
1368
|
-
notes.push(
|
|
1369
|
-
`${errorCount} test(s) had execution errors and are included in pass_rate as failures`
|
|
1370
|
-
);
|
|
1371
|
-
}
|
|
1372
|
-
if (results.length === 0) {
|
|
1373
|
-
notes.push("No results to summarize");
|
|
1374
|
-
}
|
|
1375
|
-
const firstResult = results[0];
|
|
1376
|
-
const timestamp = firstResult?.timestamp ?? (/* @__PURE__ */ new Date()).toISOString();
|
|
1377
|
-
return {
|
|
1378
|
-
metadata: {
|
|
1379
|
-
eval_file: evalFile,
|
|
1380
|
-
timestamp,
|
|
1381
|
-
targets,
|
|
1382
|
-
tests_run: testIds,
|
|
1383
|
-
experiment
|
|
1384
|
-
},
|
|
1385
|
-
run_summary: runSummary,
|
|
1386
|
-
per_grader_summary: perEvaluatorSummary,
|
|
1387
|
-
notes
|
|
1388
|
-
};
|
|
1389
|
-
}
|
|
1390
|
-
function safeArtifactPathSegment(value, fallback) {
|
|
1391
|
-
const trimmed = value?.trim();
|
|
1392
|
-
if (!trimmed) {
|
|
1393
|
-
return fallback;
|
|
1394
|
-
}
|
|
1395
|
-
return trimmed.replace(/[/\\:*?"<>|]/g, "_");
|
|
1396
|
-
}
|
|
1397
|
-
function safeTestId(testId) {
|
|
1398
|
-
return safeArtifactPathSegment(testId, "unknown");
|
|
1399
|
-
}
|
|
1400
|
-
function getSuite(result) {
|
|
1401
|
-
return result.suite;
|
|
1402
|
-
}
|
|
1403
|
-
function buildArtifactSubdir(result) {
|
|
1404
|
-
const segments = [];
|
|
1405
|
-
const evalSet = getSuite(result);
|
|
1406
|
-
if (evalSet) {
|
|
1407
|
-
segments.push(safeArtifactPathSegment(evalSet, "default"));
|
|
1408
|
-
}
|
|
1409
|
-
segments.push(safeTestId(result.testId));
|
|
1410
|
-
return path7.posix.join(...segments);
|
|
1411
|
-
}
|
|
1412
|
-
function formatOutputMarkdown(output) {
|
|
1413
|
-
return output.map((msg) => `@[${msg.role}]:
|
|
1414
|
-
${String(msg.content ?? "")}`).join("\n\n");
|
|
1415
|
-
}
|
|
1416
|
-
function extractInput(result) {
|
|
1417
|
-
const input = result.input;
|
|
1418
|
-
if (!input) return null;
|
|
1419
|
-
if (typeof input === "string") return input;
|
|
1420
|
-
if (Array.isArray(input) && input.length > 0) {
|
|
1421
|
-
return formatOutputMarkdown(input);
|
|
1422
|
-
}
|
|
1423
|
-
return null;
|
|
1424
|
-
}
|
|
1425
|
-
function buildResultIndexArtifact(result) {
|
|
1426
|
-
const artifactSubdir = buildArtifactSubdir(result);
|
|
1427
|
-
const input = extractInput(result);
|
|
1428
|
-
const hasResponse = Array.isArray(result.output) && result.output.length > 0;
|
|
1429
|
-
return {
|
|
1430
|
-
timestamp: result.timestamp,
|
|
1431
|
-
test_id: result.testId ?? "unknown",
|
|
1432
|
-
suite: getSuite(result),
|
|
1433
|
-
category: result.category,
|
|
1434
|
-
conversation_id: result.conversationId,
|
|
1435
|
-
score: result.score,
|
|
1436
|
-
target: result.target ?? "unknown",
|
|
1437
|
-
scores: result.scores ? toSnakeCaseDeep(result.scores) : void 0,
|
|
1438
|
-
execution_status: result.executionStatus,
|
|
1439
|
-
error: result.error,
|
|
1440
|
-
failure_stage: result.failureStage,
|
|
1441
|
-
failure_reason_code: result.failureReasonCode,
|
|
1442
|
-
workspace_path: result.workspacePath,
|
|
1443
|
-
grading_path: path7.posix.join(artifactSubdir, "grading.json"),
|
|
1444
|
-
timing_path: path7.posix.join(artifactSubdir, "timing.json"),
|
|
1445
|
-
input_path: input ? path7.posix.join(artifactSubdir, "input.md") : void 0,
|
|
1446
|
-
output_path: hasResponse ? path7.posix.join(artifactSubdir, "outputs", "response.md") : void 0,
|
|
1447
|
-
response_path: hasResponse ? path7.posix.join(artifactSubdir, "outputs", "response.md") : void 0
|
|
1448
|
-
};
|
|
1449
|
-
}
|
|
1450
|
-
async function writeJsonlFile(filePath, records) {
|
|
1451
|
-
const content = records.length === 0 ? "" : `${records.map((record) => JSON.stringify(toSnakeCaseDeep(record))).join("\n")}
|
|
1452
|
-
`;
|
|
1453
|
-
await writeFile(filePath, content, "utf8");
|
|
1454
|
-
}
|
|
1455
|
-
function buildTranscriptMessageLines(results) {
|
|
1456
|
-
const lines = [];
|
|
1457
|
-
for (const result of results) {
|
|
1458
|
-
const transcriptLines = toTranscriptJsonLines(
|
|
1459
|
-
{
|
|
1460
|
-
messages: [...result.input ?? [], ...result.output],
|
|
1461
|
-
source: {
|
|
1462
|
-
provider: result.target,
|
|
1463
|
-
sessionId: result.conversationId ?? result.testId,
|
|
1464
|
-
startedAt: result.timestamp
|
|
1465
|
-
},
|
|
1466
|
-
tokenUsage: result.tokenUsage,
|
|
1467
|
-
durationMs: result.durationMs,
|
|
1468
|
-
costUsd: result.costUsd
|
|
1469
|
-
},
|
|
1470
|
-
{
|
|
1471
|
-
testId: result.testId,
|
|
1472
|
-
target: result.target
|
|
1473
|
-
}
|
|
1474
|
-
);
|
|
1475
|
-
lines.push(...transcriptLines.map((line) => JSON.stringify(line)));
|
|
1476
|
-
}
|
|
1477
|
-
return lines.length > 0 ? `${lines.join("\n")}
|
|
1478
|
-
` : "";
|
|
1479
|
-
}
|
|
1480
|
-
async function writeArtifactsFromResults(results, outputDir, options) {
|
|
1481
|
-
const testArtifactDir = outputDir;
|
|
1482
|
-
const timingPath = path7.join(outputDir, "timing.json");
|
|
1483
|
-
const benchmarkPath = path7.join(outputDir, "benchmark.json");
|
|
1484
|
-
const indexPath = path7.join(outputDir, RESULT_INDEX_FILENAME);
|
|
1485
|
-
await mkdir(outputDir, { recursive: true });
|
|
1486
|
-
const indexRecords = [];
|
|
1487
|
-
for (const result of results) {
|
|
1488
|
-
const grading = buildGradingArtifact(result);
|
|
1489
|
-
const timing2 = buildTimingArtifact([result]);
|
|
1490
|
-
const artifactSubdir = buildArtifactSubdir(result);
|
|
1491
|
-
const testDir = path7.join(outputDir, artifactSubdir);
|
|
1492
|
-
const gradingPath = path7.join(testDir, "grading.json");
|
|
1493
|
-
const perTestTimingPath = path7.join(testDir, "timing.json");
|
|
1494
|
-
await mkdir(testDir, { recursive: true });
|
|
1495
|
-
await writeFile(gradingPath, `${JSON.stringify(grading, null, 2)}
|
|
1496
|
-
`, "utf8");
|
|
1497
|
-
await writeFile(perTestTimingPath, `${JSON.stringify(timing2, null, 2)}
|
|
1498
|
-
`, "utf8");
|
|
1499
|
-
const input = extractInput(result);
|
|
1500
|
-
if (input) {
|
|
1501
|
-
await writeFile(path7.join(testDir, "input.md"), input, "utf8");
|
|
1502
|
-
}
|
|
1503
|
-
if (result.output && result.output.length > 0) {
|
|
1504
|
-
const outputsDir = path7.join(testDir, "outputs");
|
|
1505
|
-
await mkdir(outputsDir, { recursive: true });
|
|
1506
|
-
await writeFile(
|
|
1507
|
-
path7.join(outputsDir, "response.md"),
|
|
1508
|
-
formatOutputMarkdown(result.output),
|
|
1509
|
-
"utf8"
|
|
1510
|
-
);
|
|
1511
|
-
}
|
|
1512
|
-
indexRecords.push({
|
|
1513
|
-
...buildResultIndexArtifact(result),
|
|
1514
|
-
experiment: options?.experiment
|
|
1515
|
-
});
|
|
1516
|
-
}
|
|
1517
|
-
const timing = buildTimingArtifact(results);
|
|
1518
|
-
await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}
|
|
1519
|
-
`, "utf8");
|
|
1520
|
-
const benchmark = buildBenchmarkArtifact(results, options?.evalFile, options?.experiment);
|
|
1521
|
-
await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}
|
|
1522
|
-
`, "utf8");
|
|
1523
|
-
await writeJsonlFile(indexPath, indexRecords);
|
|
1524
|
-
const transcriptPath = path7.join(outputDir, "transcript.jsonl");
|
|
1525
|
-
await writeFile(transcriptPath, buildTranscriptMessageLines(results), "utf8");
|
|
1526
|
-
return { testArtifactDir, timingPath, benchmarkPath, indexPath };
|
|
1527
|
-
}
|
|
1528
|
-
|
|
1529
|
-
// src/commands/eval/benchmark-writer.ts
|
|
1530
|
-
import { writeFile as writeFile2 } from "node:fs/promises";
|
|
1531
|
-
function computeStats2(values) {
|
|
1532
|
-
if (values.length === 0) {
|
|
1533
|
-
return { mean: 0, stddev: 0 };
|
|
1534
|
-
}
|
|
1535
|
-
const mean = values.reduce((sum, v) => sum + v, 0) / values.length;
|
|
1536
|
-
const variance = values.reduce((sum, v) => sum + (v - mean) ** 2, 0) / values.length;
|
|
1537
|
-
return {
|
|
1538
|
-
mean: Math.round(mean * 1e3) / 1e3,
|
|
1539
|
-
stddev: Math.round(Math.sqrt(variance) * 1e3) / 1e3
|
|
1540
|
-
};
|
|
1541
|
-
}
|
|
1542
|
-
function computePassRate2(result) {
|
|
1543
1103
|
const scores = result.scores;
|
|
1544
1104
|
if (scores && scores.length > 0) {
|
|
1545
1105
|
const passed = scores.filter((s) => s.score >= DEFAULT_THRESHOLD).length;
|
|
@@ -1548,7 +1108,7 @@ function computePassRate2(result) {
|
|
|
1548
1108
|
return result.score >= DEFAULT_THRESHOLD ? 1 : 0;
|
|
1549
1109
|
}
|
|
1550
1110
|
function buildBenchmarkJson(results) {
|
|
1551
|
-
const passRates = results.map(
|
|
1111
|
+
const passRates = results.map(computePassRate);
|
|
1552
1112
|
const timings = results.filter((r) => r.durationMs != null).map((r) => r.durationMs / 1e3);
|
|
1553
1113
|
const tokens = results.filter((r) => r.tokenUsage != null).map((r) => {
|
|
1554
1114
|
const usage = r.tokenUsage;
|
|
@@ -1557,29 +1117,29 @@ function buildBenchmarkJson(results) {
|
|
|
1557
1117
|
return {
|
|
1558
1118
|
run_summary: {
|
|
1559
1119
|
with_skill: {
|
|
1560
|
-
pass_rate:
|
|
1561
|
-
time_seconds:
|
|
1562
|
-
tokens:
|
|
1120
|
+
pass_rate: computeStats(passRates),
|
|
1121
|
+
time_seconds: computeStats(timings),
|
|
1122
|
+
tokens: computeStats(tokens)
|
|
1563
1123
|
}
|
|
1564
1124
|
}
|
|
1565
1125
|
};
|
|
1566
1126
|
}
|
|
1567
1127
|
async function writeBenchmarkJson(outputPath, results) {
|
|
1568
1128
|
const benchmark = buildBenchmarkJson(results);
|
|
1569
|
-
await
|
|
1129
|
+
await writeFile(outputPath, `${JSON.stringify(benchmark, null, 2)}
|
|
1570
1130
|
`, "utf8");
|
|
1571
1131
|
}
|
|
1572
1132
|
|
|
1573
1133
|
// src/commands/eval/env.ts
|
|
1574
1134
|
import { constants as constants3 } from "node:fs";
|
|
1575
1135
|
import { access as access3 } from "node:fs/promises";
|
|
1576
|
-
import
|
|
1136
|
+
import path6 from "node:path";
|
|
1577
1137
|
import { config as loadDotenv } from "dotenv";
|
|
1578
1138
|
function uniqueDirs(directories) {
|
|
1579
1139
|
const seen = /* @__PURE__ */ new Set();
|
|
1580
1140
|
const result = [];
|
|
1581
1141
|
for (const dir of directories) {
|
|
1582
|
-
const absolute =
|
|
1142
|
+
const absolute = path6.resolve(dir);
|
|
1583
1143
|
if (seen.has(absolute)) {
|
|
1584
1144
|
continue;
|
|
1585
1145
|
}
|
|
@@ -1598,14 +1158,14 @@ async function fileExists2(filePath) {
|
|
|
1598
1158
|
}
|
|
1599
1159
|
function collectAncestorDirectories(start, boundary) {
|
|
1600
1160
|
const directories = [];
|
|
1601
|
-
const boundaryDir =
|
|
1602
|
-
let current =
|
|
1161
|
+
const boundaryDir = path6.resolve(boundary);
|
|
1162
|
+
let current = path6.resolve(start);
|
|
1603
1163
|
while (current !== void 0) {
|
|
1604
1164
|
directories.push(current);
|
|
1605
1165
|
if (current === boundaryDir) {
|
|
1606
1166
|
break;
|
|
1607
1167
|
}
|
|
1608
|
-
const parent =
|
|
1168
|
+
const parent = path6.dirname(current);
|
|
1609
1169
|
if (parent === current) {
|
|
1610
1170
|
break;
|
|
1611
1171
|
}
|
|
@@ -1615,12 +1175,12 @@ function collectAncestorDirectories(start, boundary) {
|
|
|
1615
1175
|
}
|
|
1616
1176
|
async function loadEnvFromHierarchy(options) {
|
|
1617
1177
|
const { testFilePath, repoRoot, verbose } = options;
|
|
1618
|
-
const testDir =
|
|
1178
|
+
const testDir = path6.dirname(path6.resolve(testFilePath));
|
|
1619
1179
|
const cwd = process.cwd();
|
|
1620
1180
|
const searchDirs = uniqueDirs([...collectAncestorDirectories(testDir, repoRoot), repoRoot, cwd]);
|
|
1621
1181
|
const envFiles = [];
|
|
1622
1182
|
for (const dir of searchDirs) {
|
|
1623
|
-
const candidate =
|
|
1183
|
+
const candidate = path6.join(dir, ".env");
|
|
1624
1184
|
if (await fileExists2(candidate)) {
|
|
1625
1185
|
envFiles.push(candidate);
|
|
1626
1186
|
}
|
|
@@ -1642,11 +1202,11 @@ async function loadEnvFromHierarchy(options) {
|
|
|
1642
1202
|
}
|
|
1643
1203
|
|
|
1644
1204
|
// src/commands/eval/output-writer.ts
|
|
1645
|
-
import
|
|
1205
|
+
import path12 from "node:path";
|
|
1646
1206
|
|
|
1647
1207
|
// src/commands/eval/html-writer.ts
|
|
1648
|
-
import { mkdir
|
|
1649
|
-
import
|
|
1208
|
+
import { mkdir, writeFile as writeFile2 } from "node:fs/promises";
|
|
1209
|
+
import path7 from "node:path";
|
|
1650
1210
|
|
|
1651
1211
|
// ../../node_modules/.bun/async-mutex@0.5.0/node_modules/async-mutex/index.mjs
|
|
1652
1212
|
var E_TIMEOUT = new Error("timeout while waiting for mutex to become available");
|
|
@@ -1865,7 +1425,7 @@ var HtmlWriter = class _HtmlWriter {
|
|
|
1865
1425
|
this.filePath = filePath;
|
|
1866
1426
|
}
|
|
1867
1427
|
static async open(filePath) {
|
|
1868
|
-
await
|
|
1428
|
+
await mkdir(path7.dirname(filePath), { recursive: true });
|
|
1869
1429
|
const writer = new _HtmlWriter(filePath);
|
|
1870
1430
|
await writer.writeHtml();
|
|
1871
1431
|
return writer;
|
|
@@ -1891,7 +1451,7 @@ var HtmlWriter = class _HtmlWriter {
|
|
|
1891
1451
|
}
|
|
1892
1452
|
async writeHtml() {
|
|
1893
1453
|
const html = generateHtml(this.results, this.isLive);
|
|
1894
|
-
await
|
|
1454
|
+
await writeFile2(this.filePath, html, "utf8");
|
|
1895
1455
|
}
|
|
1896
1456
|
};
|
|
1897
1457
|
function generateHtml(results, isLive) {
|
|
@@ -2326,10 +1886,10 @@ var SCRIPT = `
|
|
|
2326
1886
|
h+='<div class="detail-block"><h4>Output</h4><pre class="detail-pre">'+esc(r.output?JSON.stringify(r.output,null,2):"")+"</pre></div>";
|
|
2327
1887
|
h+="</div>";
|
|
2328
1888
|
|
|
2329
|
-
/*
|
|
1889
|
+
/* grader results */
|
|
2330
1890
|
if(r.scores&&r.scores.length>0){
|
|
2331
|
-
h+="<h4>
|
|
2332
|
-
h+='<table class="eval-table"><thead><tr><th>
|
|
1891
|
+
h+="<h4>Grader Results</h4>";
|
|
1892
|
+
h+='<table class="eval-table"><thead><tr><th>Grader</th><th>Score</th><th>Status</th><th>Assertions</th></tr></thead><tbody>';
|
|
2333
1893
|
for(var i=0;i<r.scores.length;i++){
|
|
2334
1894
|
var ev=r.scores[i],evS=ev.score>=0.5?"pass":"fail";
|
|
2335
1895
|
var evAssertions=ev.assertions||[];
|
|
@@ -2375,8 +1935,8 @@ var SCRIPT = `
|
|
|
2375
1935
|
`;
|
|
2376
1936
|
|
|
2377
1937
|
// src/commands/eval/json-writer.ts
|
|
2378
|
-
import { mkdir as
|
|
2379
|
-
import
|
|
1938
|
+
import { mkdir as mkdir2, writeFile as writeFile3 } from "node:fs/promises";
|
|
1939
|
+
import path8 from "node:path";
|
|
2380
1940
|
var JsonWriter = class _JsonWriter {
|
|
2381
1941
|
filePath;
|
|
2382
1942
|
results = [];
|
|
@@ -2385,7 +1945,7 @@ var JsonWriter = class _JsonWriter {
|
|
|
2385
1945
|
this.filePath = filePath;
|
|
2386
1946
|
}
|
|
2387
1947
|
static async open(filePath) {
|
|
2388
|
-
await
|
|
1948
|
+
await mkdir2(path8.dirname(filePath), { recursive: true });
|
|
2389
1949
|
return new _JsonWriter(filePath);
|
|
2390
1950
|
}
|
|
2391
1951
|
async append(result) {
|
|
@@ -2412,15 +1972,15 @@ var JsonWriter = class _JsonWriter {
|
|
|
2412
1972
|
results: this.results
|
|
2413
1973
|
};
|
|
2414
1974
|
const snakeCaseOutput = toSnakeCaseDeep(output);
|
|
2415
|
-
await
|
|
1975
|
+
await writeFile3(this.filePath, `${JSON.stringify(snakeCaseOutput, null, 2)}
|
|
2416
1976
|
`, "utf8");
|
|
2417
1977
|
}
|
|
2418
1978
|
};
|
|
2419
1979
|
|
|
2420
1980
|
// src/commands/eval/jsonl-writer.ts
|
|
2421
1981
|
import { createWriteStream } from "node:fs";
|
|
2422
|
-
import { mkdir as
|
|
2423
|
-
import
|
|
1982
|
+
import { mkdir as mkdir3 } from "node:fs/promises";
|
|
1983
|
+
import path9 from "node:path";
|
|
2424
1984
|
import { finished } from "node:stream/promises";
|
|
2425
1985
|
var JsonlWriter = class _JsonlWriter {
|
|
2426
1986
|
stream;
|
|
@@ -2429,9 +1989,10 @@ var JsonlWriter = class _JsonlWriter {
|
|
|
2429
1989
|
constructor(stream) {
|
|
2430
1990
|
this.stream = stream;
|
|
2431
1991
|
}
|
|
2432
|
-
static async open(filePath) {
|
|
2433
|
-
await
|
|
2434
|
-
const
|
|
1992
|
+
static async open(filePath, options) {
|
|
1993
|
+
await mkdir3(path9.dirname(filePath), { recursive: true });
|
|
1994
|
+
const flags = options?.append ? "a" : "w";
|
|
1995
|
+
const stream = createWriteStream(filePath, { flags, encoding: "utf8" });
|
|
2435
1996
|
return new _JsonlWriter(stream);
|
|
2436
1997
|
}
|
|
2437
1998
|
async append(record) {
|
|
@@ -2461,8 +2022,8 @@ var JsonlWriter = class _JsonlWriter {
|
|
|
2461
2022
|
};
|
|
2462
2023
|
|
|
2463
2024
|
// src/commands/eval/junit-writer.ts
|
|
2464
|
-
import { mkdir as
|
|
2465
|
-
import
|
|
2025
|
+
import { mkdir as mkdir4, writeFile as writeFile4 } from "node:fs/promises";
|
|
2026
|
+
import path10 from "node:path";
|
|
2466
2027
|
function escapeXml(str) {
|
|
2467
2028
|
return str.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&apos;");
|
|
2468
2029
|
}
|
|
@@ -2476,7 +2037,7 @@ var JunitWriter = class _JunitWriter {
|
|
|
2476
2037
|
this.threshold = options?.threshold ?? 0.5;
|
|
2477
2038
|
}
|
|
2478
2039
|
static async open(filePath, options) {
|
|
2479
|
-
await
|
|
2040
|
+
await mkdir4(path10.dirname(filePath), { recursive: true });
|
|
2480
2041
|
return new _JunitWriter(filePath, options);
|
|
2481
2042
|
}
|
|
2482
2043
|
async append(result) {
|
|
@@ -2545,14 +2106,14 @@ ${testCases.join("\n")}
|
|
|
2545
2106
|
${suiteXmls.join("\n")}
|
|
2546
2107
|
</testsuites>
|
|
2547
2108
|
`;
|
|
2548
|
-
await
|
|
2109
|
+
await writeFile4(this.filePath, xml, "utf8");
|
|
2549
2110
|
}
|
|
2550
2111
|
};
|
|
2551
2112
|
|
|
2552
2113
|
// src/commands/eval/yaml-writer.ts
|
|
2553
2114
|
import { createWriteStream as createWriteStream2 } from "node:fs";
|
|
2554
|
-
import { mkdir as
|
|
2555
|
-
import
|
|
2115
|
+
import { mkdir as mkdir5 } from "node:fs/promises";
|
|
2116
|
+
import path11 from "node:path";
|
|
2556
2117
|
import { finished as finished2 } from "node:stream/promises";
|
|
2557
2118
|
import { stringify as stringifyYaml } from "yaml";
|
|
2558
2119
|
var YamlWriter = class _YamlWriter {
|
|
@@ -2564,7 +2125,7 @@ var YamlWriter = class _YamlWriter {
|
|
|
2564
2125
|
this.stream = stream;
|
|
2565
2126
|
}
|
|
2566
2127
|
static async open(filePath) {
|
|
2567
|
-
await
|
|
2128
|
+
await mkdir5(path11.dirname(filePath), { recursive: true });
|
|
2568
2129
|
const stream = createWriteStream2(filePath, { flags: "w", encoding: "utf8" });
|
|
2569
2130
|
return new _YamlWriter(stream);
|
|
2570
2131
|
}
|
|
@@ -2604,10 +2165,10 @@ var YamlWriter = class _YamlWriter {
|
|
|
2604
2165
|
};
|
|
2605
2166
|
|
|
2606
2167
|
// src/commands/eval/output-writer.ts
|
|
2607
|
-
async function createOutputWriter(filePath, format) {
|
|
2168
|
+
async function createOutputWriter(filePath, format, options) {
|
|
2608
2169
|
switch (format) {
|
|
2609
2170
|
case "jsonl":
|
|
2610
|
-
return JsonlWriter.open(filePath);
|
|
2171
|
+
return JsonlWriter.open(filePath, { append: options?.append });
|
|
2611
2172
|
case "yaml":
|
|
2612
2173
|
return YamlWriter.open(filePath);
|
|
2613
2174
|
case "html":
|
|
@@ -2620,7 +2181,7 @@ async function createOutputWriter(filePath, format) {
|
|
|
2620
2181
|
}
|
|
2621
2182
|
var SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([".jsonl", ".json", ".xml", ".yaml", ".yml", ".html", ".htm"]);
|
|
2622
2183
|
function createWriterFromPath(filePath, options) {
|
|
2623
|
-
const ext =
|
|
2184
|
+
const ext = path12.extname(filePath).toLowerCase();
|
|
2624
2185
|
switch (ext) {
|
|
2625
2186
|
case ".jsonl":
|
|
2626
2187
|
return JsonlWriter.open(filePath);
|
|
@@ -2719,12 +2280,12 @@ var ProgressDisplay = class {
|
|
|
2719
2280
|
}
|
|
2720
2281
|
addLogPaths(paths) {
|
|
2721
2282
|
const newPaths = [];
|
|
2722
|
-
for (const
|
|
2723
|
-
if (this.logPathSet.has(
|
|
2283
|
+
for (const path17 of paths) {
|
|
2284
|
+
if (this.logPathSet.has(path17)) {
|
|
2724
2285
|
continue;
|
|
2725
2286
|
}
|
|
2726
|
-
this.logPathSet.add(
|
|
2727
|
-
newPaths.push(
|
|
2287
|
+
this.logPathSet.add(path17);
|
|
2288
|
+
newPaths.push(path17);
|
|
2728
2289
|
}
|
|
2729
2290
|
if (newPaths.length === 0) {
|
|
2730
2291
|
return;
|
|
@@ -2777,8 +2338,8 @@ async function loadNonErrorResults(jsonlPath) {
|
|
|
2777
2338
|
}
|
|
2778
2339
|
|
|
2779
2340
|
// src/commands/eval/run-cache.ts
|
|
2780
|
-
import { mkdir as
|
|
2781
|
-
import
|
|
2341
|
+
import { mkdir as mkdir6, readFile, writeFile as writeFile5 } from "node:fs/promises";
|
|
2342
|
+
import path13 from "node:path";
|
|
2782
2343
|
var CACHE_FILENAME = "cache.json";
|
|
2783
2344
|
function resolveRunCacheFile(cache) {
|
|
2784
2345
|
if (cache.lastRunDir) {
|
|
@@ -2787,27 +2348,27 @@ function resolveRunCacheFile(cache) {
|
|
|
2787
2348
|
return "";
|
|
2788
2349
|
}
|
|
2789
2350
|
function cachePath(cwd) {
|
|
2790
|
-
return
|
|
2351
|
+
return path13.join(cwd, ".agentv", CACHE_FILENAME);
|
|
2791
2352
|
}
|
|
2792
2353
|
async function loadRunCache(cwd) {
|
|
2793
2354
|
try {
|
|
2794
|
-
const content = await
|
|
2355
|
+
const content = await readFile(cachePath(cwd), "utf-8");
|
|
2795
2356
|
return JSON.parse(content);
|
|
2796
2357
|
} catch {
|
|
2797
2358
|
return void 0;
|
|
2798
2359
|
}
|
|
2799
2360
|
}
|
|
2800
2361
|
async function saveRunCache(cwd, resultPath) {
|
|
2801
|
-
if (
|
|
2362
|
+
if (path13.basename(resultPath) !== RESULT_INDEX_FILENAME) {
|
|
2802
2363
|
return;
|
|
2803
2364
|
}
|
|
2804
|
-
const dir =
|
|
2805
|
-
await
|
|
2365
|
+
const dir = path13.join(cwd, ".agentv");
|
|
2366
|
+
await mkdir6(dir, { recursive: true });
|
|
2806
2367
|
const cache = {
|
|
2807
|
-
lastRunDir:
|
|
2368
|
+
lastRunDir: path13.dirname(resultPath),
|
|
2808
2369
|
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
2809
2370
|
};
|
|
2810
|
-
await
|
|
2371
|
+
await writeFile5(cachePath(cwd), `${JSON.stringify(cache, null, 2)}
|
|
2811
2372
|
`, "utf-8");
|
|
2812
2373
|
}
|
|
2813
2374
|
|
|
@@ -3072,13 +2633,13 @@ function formatMatrixSummary(results) {
|
|
|
3072
2633
|
}
|
|
3073
2634
|
|
|
3074
2635
|
// ../../packages/core/dist/evaluation/validation/index.js
|
|
3075
|
-
import { readFile as
|
|
3076
|
-
import
|
|
2636
|
+
import { readFile as readFile2 } from "node:fs/promises";
|
|
2637
|
+
import path14 from "node:path";
|
|
3077
2638
|
import { parse } from "yaml";
|
|
3078
2639
|
import { readFile as readFile22, readdir } from "node:fs/promises";
|
|
3079
2640
|
import path22 from "node:path";
|
|
3080
2641
|
import { parse as parse2 } from "yaml";
|
|
3081
|
-
import { readFile as
|
|
2642
|
+
import { readFile as readFile3 } from "node:fs/promises";
|
|
3082
2643
|
import path32 from "node:path";
|
|
3083
2644
|
import { parse as parse3 } from "yaml";
|
|
3084
2645
|
import { readFile as readFile4 } from "node:fs/promises";
|
|
@@ -3097,7 +2658,7 @@ var SCHEMA_TARGETS_V2 = "agentv-targets-v2.2";
|
|
|
3097
2658
|
var SCHEMA_CONFIG_V2 = "agentv-config-v2";
|
|
3098
2659
|
async function detectFileType(filePath) {
|
|
3099
2660
|
try {
|
|
3100
|
-
const content = await
|
|
2661
|
+
const content = await readFile2(filePath, "utf8");
|
|
3101
2662
|
const parsed = parse(content);
|
|
3102
2663
|
if (Array.isArray(parsed)) {
|
|
3103
2664
|
return "cases";
|
|
@@ -3125,8 +2686,8 @@ async function detectFileType(filePath) {
|
|
|
3125
2686
|
}
|
|
3126
2687
|
}
|
|
3127
2688
|
function inferFileTypeFromPath(filePath) {
|
|
3128
|
-
const normalized =
|
|
3129
|
-
const basename =
|
|
2689
|
+
const normalized = path14.normalize(filePath).replace(/\\/g, "/");
|
|
2690
|
+
const basename = path14.basename(filePath);
|
|
3130
2691
|
if (normalized.includes("/.agentv/")) {
|
|
3131
2692
|
if (basename === "config.yaml" || basename === "config.yml") {
|
|
3132
2693
|
return "config";
|
|
@@ -3723,7 +3284,7 @@ function validateAssertArray(assertField, parentLocation, filePath, errors, cust
|
|
|
3723
3284
|
continue;
|
|
3724
3285
|
}
|
|
3725
3286
|
const typeValue = rawTypeValue.replace(/_/g, "-");
|
|
3726
|
-
if (!
|
|
3287
|
+
if (!isGraderKind(typeValue) && !customAssertionTypes.has(typeValue)) {
|
|
3727
3288
|
errors.push({
|
|
3728
3289
|
severity: "warning",
|
|
3729
3290
|
filePath,
|
|
@@ -3897,7 +3458,7 @@ async function validateCasesFile(filePath) {
|
|
|
3897
3458
|
const absolutePath = path32.resolve(filePath);
|
|
3898
3459
|
let parsed;
|
|
3899
3460
|
try {
|
|
3900
|
-
const content = await
|
|
3461
|
+
const content = await readFile3(absolutePath, "utf8");
|
|
3901
3462
|
parsed = parse3(content);
|
|
3902
3463
|
} catch (error) {
|
|
3903
3464
|
errors.push({
|
|
@@ -5005,6 +4566,12 @@ Errors in ${targetsFilePath}:`);
|
|
|
5005
4566
|
|
|
5006
4567
|
// src/commands/eval/run-eval.ts
|
|
5007
4568
|
var DEFAULT_WORKERS = 3;
|
|
4569
|
+
function shouldSkipExistingResultForResume(result, rerunFailed) {
|
|
4570
|
+
if (rerunFailed) {
|
|
4571
|
+
return result.executionStatus === "ok";
|
|
4572
|
+
}
|
|
4573
|
+
return result.executionStatus !== "execution_error";
|
|
4574
|
+
}
|
|
5008
4575
|
function normalizeBoolean(value) {
|
|
5009
4576
|
return value === true;
|
|
5010
4577
|
}
|
|
@@ -5175,6 +4742,8 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
|
|
|
5175
4742
|
otelCaptureContent: normalizeBoolean(rawOptions.otelCaptureContent) || yamlExecution?.otel_capture_content === true,
|
|
5176
4743
|
otelGroupTurns: normalizeBoolean(rawOptions.otelGroupTurns) || yamlExecution?.otel_group_turns === true,
|
|
5177
4744
|
retryErrors: normalizeString(rawOptions.retryErrors),
|
|
4745
|
+
resume: normalizeBoolean(rawOptions.resume) || normalizeBoolean(rawOptions.rerunFailed),
|
|
4746
|
+
rerunFailed: normalizeBoolean(rawOptions.rerunFailed),
|
|
5178
4747
|
workspaceMode,
|
|
5179
4748
|
workspacePath,
|
|
5180
4749
|
// Precedence: CLI > YAML config > TS config
|
|
@@ -5201,7 +4770,7 @@ async function ensureFileExists(filePath, description) {
|
|
|
5201
4770
|
function buildDefaultOutputPathForExperiment(cwd, experiment) {
|
|
5202
4771
|
const runDir = buildDefaultRunDir(cwd, experiment);
|
|
5203
4772
|
mkdirSync(runDir, { recursive: true });
|
|
5204
|
-
return
|
|
4773
|
+
return path15.join(runDir, "index.jsonl");
|
|
5205
4774
|
}
|
|
5206
4775
|
function createProgressReporter(maxWorkers, options) {
|
|
5207
4776
|
const display = new ProgressDisplay(maxWorkers, options);
|
|
@@ -5215,7 +4784,7 @@ function createProgressReporter(maxWorkers, options) {
|
|
|
5215
4784
|
};
|
|
5216
4785
|
}
|
|
5217
4786
|
function makeTestCaseKey(testFilePath, testId) {
|
|
5218
|
-
return `${
|
|
4787
|
+
return `${path15.resolve(testFilePath)}::${testId}`;
|
|
5219
4788
|
}
|
|
5220
4789
|
function resolveTargetLabel(requestedName, resolvedName) {
|
|
5221
4790
|
if (resolvedName !== requestedName) {
|
|
@@ -5277,7 +4846,7 @@ async function prepareFileMetadata(params) {
|
|
|
5277
4846
|
repoRoot,
|
|
5278
4847
|
verbose: options.verbose
|
|
5279
4848
|
});
|
|
5280
|
-
const relativePath =
|
|
4849
|
+
const relativePath = path15.relative(cwd, testFilePath);
|
|
5281
4850
|
const category = deriveCategory(relativePath);
|
|
5282
4851
|
const suite = await loadTestSuite(testFilePath, repoRoot, {
|
|
5283
4852
|
verbose: options.verbose,
|
|
@@ -5302,7 +4871,7 @@ async function prepareFileMetadata(params) {
|
|
|
5302
4871
|
selections = [
|
|
5303
4872
|
{
|
|
5304
4873
|
selection: transcriptSelection,
|
|
5305
|
-
inlineTargetLabel: `transcript (${
|
|
4874
|
+
inlineTargetLabel: `transcript (${path15.basename(options.transcript)})`
|
|
5306
4875
|
}
|
|
5307
4876
|
];
|
|
5308
4877
|
} else {
|
|
@@ -5372,7 +4941,7 @@ async function prepareFileMetadata(params) {
|
|
|
5372
4941
|
yamlWorkers: suite.workers,
|
|
5373
4942
|
yamlCache: suite.cacheConfig?.enabled,
|
|
5374
4943
|
yamlCachePath: suite.cacheConfig?.cachePath,
|
|
5375
|
-
|
|
4944
|
+
budgetUsd: suite.budgetUsd,
|
|
5376
4945
|
failOnError: suite.failOnError,
|
|
5377
4946
|
threshold: suite.threshold,
|
|
5378
4947
|
tags: suite.metadata?.tags
|
|
@@ -5398,7 +4967,7 @@ async function runSingleEvalFile(params) {
|
|
|
5398
4967
|
testCases,
|
|
5399
4968
|
trialsConfig,
|
|
5400
4969
|
matrixMode,
|
|
5401
|
-
|
|
4970
|
+
budgetUsd,
|
|
5402
4971
|
failOnError,
|
|
5403
4972
|
providerFactory
|
|
5404
4973
|
} = params;
|
|
@@ -5464,7 +5033,7 @@ async function runSingleEvalFile(params) {
|
|
|
5464
5033
|
workspacePath: options.workspacePath,
|
|
5465
5034
|
keepWorkspaces: options.keepWorkspaces,
|
|
5466
5035
|
trials: trialsConfig,
|
|
5467
|
-
|
|
5036
|
+
budgetUsd,
|
|
5468
5037
|
failOnError,
|
|
5469
5038
|
graderTarget: options.graderTarget,
|
|
5470
5039
|
model: options.model,
|
|
@@ -5537,7 +5106,7 @@ async function runEvalCommand(input) {
|
|
|
5537
5106
|
);
|
|
5538
5107
|
}
|
|
5539
5108
|
const repoRoot = await findRepoRoot(cwd);
|
|
5540
|
-
const yamlConfig = await loadConfig(
|
|
5109
|
+
const yamlConfig = await loadConfig(path15.join(cwd, "_"), repoRoot);
|
|
5541
5110
|
if (yamlConfig?.required_version) {
|
|
5542
5111
|
await enforceRequiredVersion(yamlConfig.required_version, {
|
|
5543
5112
|
strict: normalizeBoolean(input.rawOptions.strict)
|
|
@@ -5552,7 +5121,7 @@ async function runEvalCommand(input) {
|
|
|
5552
5121
|
}
|
|
5553
5122
|
let retryNonErrorResults;
|
|
5554
5123
|
if (options.retryErrors) {
|
|
5555
|
-
const retryPath =
|
|
5124
|
+
const retryPath = path15.resolve(options.retryErrors);
|
|
5556
5125
|
await ensureFileExists(retryPath, "Retry-errors JSONL file");
|
|
5557
5126
|
const completedIds = await loadFullyCompletedTestIds(retryPath);
|
|
5558
5127
|
const errorIds = await loadErrorTestIds(retryPath);
|
|
@@ -5565,8 +5134,37 @@ async function runEvalCommand(input) {
|
|
|
5565
5134
|
console.log(`Skipping ${completedIds.length} already-completed test(s).`);
|
|
5566
5135
|
}
|
|
5567
5136
|
}
|
|
5137
|
+
let resumeSkipKeys;
|
|
5138
|
+
let isResumeAppend = false;
|
|
5139
|
+
if (options.resume && !options.retryErrors) {
|
|
5140
|
+
const explicitResumeDir = options.outputDir ?? options.artifacts;
|
|
5141
|
+
if (explicitResumeDir) {
|
|
5142
|
+
const resumeIndexPath = path15.join(path15.resolve(explicitResumeDir), "index.jsonl");
|
|
5143
|
+
if (existsSync2(resumeIndexPath)) {
|
|
5144
|
+
const content = await readFile8(resumeIndexPath, "utf8");
|
|
5145
|
+
const existingResults = parseJsonlResults(content);
|
|
5146
|
+
resumeSkipKeys = /* @__PURE__ */ new Set();
|
|
5147
|
+
for (const r of existingResults) {
|
|
5148
|
+
if (shouldSkipExistingResultForResume(r, options.rerunFailed)) {
|
|
5149
|
+
resumeSkipKeys.add(buildTestTargetKey(r.testId, r.target));
|
|
5150
|
+
}
|
|
5151
|
+
}
|
|
5152
|
+
isResumeAppend = true;
|
|
5153
|
+
const modeLabel = options.rerunFailed ? "Rerun-failed" : "Resume";
|
|
5154
|
+
console.log(
|
|
5155
|
+
`${modeLabel}: found ${existingResults.length} existing result(s), skipping ${resumeSkipKeys.size} completed.`
|
|
5156
|
+
);
|
|
5157
|
+
} else {
|
|
5158
|
+
console.log("Resume: no existing index.jsonl found, starting fresh run.");
|
|
5159
|
+
}
|
|
5160
|
+
} else {
|
|
5161
|
+
console.warn(
|
|
5162
|
+
"Warning: --resume requires --output <dir> to identify the run directory. Ignoring --resume."
|
|
5163
|
+
);
|
|
5164
|
+
}
|
|
5165
|
+
}
|
|
5568
5166
|
if (options.workspacePath) {
|
|
5569
|
-
const resolvedWorkspace =
|
|
5167
|
+
const resolvedWorkspace = path15.resolve(options.workspacePath);
|
|
5570
5168
|
try {
|
|
5571
5169
|
const { stat: stat2 } = await import("node:fs/promises");
|
|
5572
5170
|
const stats = await stat2(resolvedWorkspace);
|
|
@@ -5607,25 +5205,25 @@ async function runEvalCommand(input) {
|
|
|
5607
5205
|
let outputPath;
|
|
5608
5206
|
let usesDefaultArtifactWorkspace;
|
|
5609
5207
|
if (explicitDir) {
|
|
5610
|
-
runDir =
|
|
5208
|
+
runDir = path15.resolve(explicitDir);
|
|
5611
5209
|
mkdirSync(runDir, { recursive: true });
|
|
5612
|
-
outputPath =
|
|
5210
|
+
outputPath = path15.join(runDir, "index.jsonl");
|
|
5613
5211
|
usesDefaultArtifactWorkspace = true;
|
|
5614
5212
|
} else if (options.outPath) {
|
|
5615
|
-
outputPath =
|
|
5616
|
-
runDir =
|
|
5213
|
+
outputPath = path15.resolve(options.outPath);
|
|
5214
|
+
runDir = path15.dirname(outputPath);
|
|
5617
5215
|
mkdirSync(runDir, { recursive: true });
|
|
5618
5216
|
usesDefaultArtifactWorkspace = false;
|
|
5619
5217
|
} else {
|
|
5620
5218
|
outputPath = buildDefaultOutputPathForExperiment(cwd, options.experiment);
|
|
5621
|
-
runDir =
|
|
5219
|
+
runDir = path15.dirname(outputPath);
|
|
5622
5220
|
usesDefaultArtifactWorkspace = true;
|
|
5623
5221
|
}
|
|
5624
5222
|
let otelExporter = null;
|
|
5625
5223
|
const useFileExport = !!options.otelFile;
|
|
5626
5224
|
if (options.exportOtel || useFileExport) {
|
|
5627
5225
|
try {
|
|
5628
|
-
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-
|
|
5226
|
+
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-7W4OI3X2.js");
|
|
5629
5227
|
let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
|
|
5630
5228
|
let headers = {};
|
|
5631
5229
|
if (options.otelBackend) {
|
|
@@ -5649,7 +5247,7 @@ async function runEvalCommand(input) {
|
|
|
5649
5247
|
headers,
|
|
5650
5248
|
captureContent,
|
|
5651
5249
|
groupTurns: options.otelGroupTurns,
|
|
5652
|
-
otlpFilePath: options.otelFile ?
|
|
5250
|
+
otlpFilePath: options.otelFile ? path15.resolve(options.otelFile) : void 0
|
|
5653
5251
|
});
|
|
5654
5252
|
const initialized = await otelExporter.init();
|
|
5655
5253
|
if (!initialized) {
|
|
@@ -5666,7 +5264,7 @@ async function runEvalCommand(input) {
|
|
|
5666
5264
|
}
|
|
5667
5265
|
}
|
|
5668
5266
|
const primaryWritePath = outputPath;
|
|
5669
|
-
const resolvedExportPaths = options.exportPaths.map((p) =>
|
|
5267
|
+
const resolvedExportPaths = options.exportPaths.map((p) => path15.resolve(p));
|
|
5670
5268
|
console.log(`Artifact directory: ${runDir}`);
|
|
5671
5269
|
if (resolvedExportPaths.length > 0) {
|
|
5672
5270
|
console.log("Export files:");
|
|
@@ -5674,9 +5272,9 @@ async function runEvalCommand(input) {
|
|
|
5674
5272
|
console.log(` ${p}`);
|
|
5675
5273
|
}
|
|
5676
5274
|
}
|
|
5677
|
-
const resolvedTestFiles = input.testFiles.map((file) =>
|
|
5275
|
+
const resolvedTestFiles = input.testFiles.map((file) => path15.resolve(file));
|
|
5678
5276
|
if (options.otelFile) {
|
|
5679
|
-
console.log(`OTLP JSON file: ${
|
|
5277
|
+
console.log(`OTLP JSON file: ${path15.resolve(options.otelFile)}`);
|
|
5680
5278
|
}
|
|
5681
5279
|
const evaluationRunner = await resolveEvaluationRunner();
|
|
5682
5280
|
const allResults = [];
|
|
@@ -5716,7 +5314,7 @@ async function runEvalCommand(input) {
|
|
|
5716
5314
|
for (const [testFilePath, meta] of fileMetadata.entries()) {
|
|
5717
5315
|
if (!matchesTagFilters(meta.tags, options.tags, options.excludeTags)) {
|
|
5718
5316
|
fileMetadata.delete(testFilePath);
|
|
5719
|
-
skippedFiles.push(
|
|
5317
|
+
skippedFiles.push(path15.relative(cwd, testFilePath));
|
|
5720
5318
|
}
|
|
5721
5319
|
}
|
|
5722
5320
|
if (skippedFiles.length > 0 && options.verbose) {
|
|
@@ -5737,7 +5335,7 @@ async function runEvalCommand(input) {
|
|
|
5737
5335
|
cliNoCache: options.noCache,
|
|
5738
5336
|
yamlCache: yamlCacheEnabled
|
|
5739
5337
|
});
|
|
5740
|
-
const cache = cacheEnabled ? new ResponseCache(yamlCachePath ?
|
|
5338
|
+
const cache = cacheEnabled ? new ResponseCache(yamlCachePath ? path15.resolve(yamlCachePath) : void 0) : void 0;
|
|
5741
5339
|
if (cacheEnabled) {
|
|
5742
5340
|
console.log(`Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ""}`);
|
|
5743
5341
|
}
|
|
@@ -5747,14 +5345,25 @@ async function runEvalCommand(input) {
|
|
|
5747
5345
|
throw new Error("--threshold must be between 0 and 1");
|
|
5748
5346
|
}
|
|
5749
5347
|
const writerOptions = resolvedThreshold !== void 0 ? { threshold: resolvedThreshold } : void 0;
|
|
5750
|
-
const outputWriter = await createOutputWriter(primaryWritePath, "jsonl"
|
|
5348
|
+
const outputWriter = await createOutputWriter(primaryWritePath, "jsonl", {
|
|
5349
|
+
append: isResumeAppend
|
|
5350
|
+
});
|
|
5751
5351
|
const isMatrixMode = Array.from(fileMetadata.values()).some((meta) => meta.selections.length > 1);
|
|
5752
5352
|
let totalEvalCount = 0;
|
|
5353
|
+
let resumeSkippedCount = 0;
|
|
5753
5354
|
for (const meta of fileMetadata.values()) {
|
|
5754
5355
|
const suiteTargetNames = meta.selections.map((s) => s.selection.targetName);
|
|
5755
5356
|
for (const test of meta.testCases) {
|
|
5756
5357
|
const testTargetNames = test.targets && test.targets.length > 0 ? test.targets.filter((t) => suiteTargetNames.includes(t)) : suiteTargetNames;
|
|
5757
|
-
|
|
5358
|
+
const effectiveTargets = testTargetNames.length > 0 ? testTargetNames : ["unknown"];
|
|
5359
|
+
for (const tn of effectiveTargets) {
|
|
5360
|
+
const key = `${test.id}::${tn}`;
|
|
5361
|
+
if (resumeSkipKeys?.has(key)) {
|
|
5362
|
+
resumeSkippedCount++;
|
|
5363
|
+
} else {
|
|
5364
|
+
totalEvalCount++;
|
|
5365
|
+
}
|
|
5366
|
+
}
|
|
5758
5367
|
}
|
|
5759
5368
|
}
|
|
5760
5369
|
if (totalEvalCount === 0) {
|
|
@@ -5762,6 +5371,10 @@ async function runEvalCommand(input) {
|
|
|
5762
5371
|
console.log("No execution errors or missing cases in the previous run. Nothing to retry.");
|
|
5763
5372
|
return;
|
|
5764
5373
|
}
|
|
5374
|
+
if (resumeSkipKeys && resumeSkippedCount > 0) {
|
|
5375
|
+
console.log(`Nothing to resume \u2014 all ${resumeSkippedCount} test(s) already completed.`);
|
|
5376
|
+
return;
|
|
5377
|
+
}
|
|
5765
5378
|
throw new Error("No tests matched the provided filters.");
|
|
5766
5379
|
}
|
|
5767
5380
|
const progressReporter = createProgressReporter(options.workers ?? DEFAULT_WORKERS, {
|
|
@@ -5821,7 +5434,7 @@ async function runEvalCommand(input) {
|
|
|
5821
5434
|
const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f));
|
|
5822
5435
|
let transcriptProviderFactory;
|
|
5823
5436
|
if (options.transcript) {
|
|
5824
|
-
const { TranscriptProvider } = await import("./dist-
|
|
5437
|
+
const { TranscriptProvider } = await import("./dist-7W4OI3X2.js");
|
|
5825
5438
|
const transcriptProvider = await TranscriptProvider.fromFile(options.transcript);
|
|
5826
5439
|
const totalTests = [...fileMetadata.values()].reduce(
|
|
5827
5440
|
(sum, meta) => sum + meta.testCases.length,
|
|
@@ -5852,7 +5465,10 @@ async function runEvalCommand(input) {
|
|
|
5852
5465
|
}
|
|
5853
5466
|
return true;
|
|
5854
5467
|
}) : targetPrep.testCases;
|
|
5855
|
-
|
|
5468
|
+
const filteredTestCases = resumeSkipKeys ? applicableTestCases.filter(
|
|
5469
|
+
(test) => !resumeSkipKeys.has(buildTestTargetKey(test.id, targetName))
|
|
5470
|
+
) : applicableTestCases;
|
|
5471
|
+
if (filteredTestCases.length === 0) {
|
|
5856
5472
|
return [];
|
|
5857
5473
|
}
|
|
5858
5474
|
try {
|
|
@@ -5872,15 +5488,15 @@ async function runEvalCommand(input) {
|
|
|
5872
5488
|
displayIdTracker,
|
|
5873
5489
|
selection,
|
|
5874
5490
|
inlineTargetLabel,
|
|
5875
|
-
testCases:
|
|
5491
|
+
testCases: filteredTestCases,
|
|
5876
5492
|
trialsConfig: options.transcript ? void 0 : targetPrep.trialsConfig,
|
|
5877
5493
|
matrixMode: targetPrep.selections.length > 1,
|
|
5878
|
-
|
|
5494
|
+
budgetUsd: targetPrep.budgetUsd,
|
|
5879
5495
|
failOnError: targetPrep.failOnError,
|
|
5880
5496
|
threshold: resolvedThreshold,
|
|
5881
5497
|
providerFactory: transcriptProviderFactory
|
|
5882
5498
|
});
|
|
5883
|
-
const evalFile =
|
|
5499
|
+
const evalFile = path15.relative(cwd, testFilePath);
|
|
5884
5500
|
const existingSummary = remoteEvalSummaries.find(
|
|
5885
5501
|
(summary2) => summary2.evalFile === evalFile
|
|
5886
5502
|
);
|
|
@@ -5897,10 +5513,10 @@ async function runEvalCommand(input) {
|
|
|
5897
5513
|
const message = fileError instanceof Error ? fileError.message : String(fileError);
|
|
5898
5514
|
console.error(
|
|
5899
5515
|
`
|
|
5900
|
-
[ERROR] \u26A0 Eval file failed: ${
|
|
5516
|
+
[ERROR] \u26A0 Eval file failed: ${path15.basename(testFilePath)} \u2014 ${message}
|
|
5901
5517
|
`
|
|
5902
5518
|
);
|
|
5903
|
-
const errorResults =
|
|
5519
|
+
const errorResults = filteredTestCases.map((testCase) => ({
|
|
5904
5520
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
5905
5521
|
testId: testCase.id,
|
|
5906
5522
|
score: 0,
|
|
@@ -5936,37 +5552,60 @@ async function runEvalCommand(input) {
|
|
|
5936
5552
|
`Merged ${retryNonErrorResults.length} non-error result(s) from previous output.`
|
|
5937
5553
|
);
|
|
5938
5554
|
}
|
|
5555
|
+
await outputWriter.close().catch(() => void 0);
|
|
5556
|
+
let summaryResults = allResults;
|
|
5557
|
+
if (isResumeAppend && usesDefaultArtifactWorkspace) {
|
|
5558
|
+
const content = await readFile8(outputPath, "utf8");
|
|
5559
|
+
summaryResults = deduplicateByTestIdTarget(parseJsonlResults(content));
|
|
5560
|
+
}
|
|
5939
5561
|
const thresholdOpts = resolvedThreshold !== void 0 ? { threshold: resolvedThreshold } : void 0;
|
|
5940
|
-
const summary = calculateEvaluationSummary(
|
|
5562
|
+
const summary = calculateEvaluationSummary(summaryResults, thresholdOpts);
|
|
5941
5563
|
console.log(formatEvaluationSummary(summary, thresholdOpts));
|
|
5942
5564
|
const allExecutionErrors = summary.total > 0 && summary.executionErrorCount === summary.total;
|
|
5943
5565
|
const thresholdFailed = resolvedThreshold !== void 0 && summary.qualityFailureCount > 0;
|
|
5944
|
-
if (isMatrixMode &&
|
|
5945
|
-
console.log(formatMatrixSummary(
|
|
5566
|
+
if (isMatrixMode && summaryResults.length > 0) {
|
|
5567
|
+
console.log(formatMatrixSummary(summaryResults));
|
|
5946
5568
|
}
|
|
5947
5569
|
if (options.benchmarkJson && allResults.length > 0) {
|
|
5948
|
-
const benchmarkPath =
|
|
5570
|
+
const benchmarkPath = path15.resolve(options.benchmarkJson);
|
|
5949
5571
|
await writeBenchmarkJson(benchmarkPath, allResults);
|
|
5950
5572
|
console.log(`Benchmark written to: ${benchmarkPath}`);
|
|
5951
5573
|
}
|
|
5952
5574
|
if (usesDefaultArtifactWorkspace && allResults.length > 0) {
|
|
5953
5575
|
const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : "";
|
|
5954
|
-
|
|
5955
|
-
|
|
5956
|
-
|
|
5957
|
-
|
|
5958
|
-
|
|
5959
|
-
|
|
5960
|
-
|
|
5961
|
-
|
|
5962
|
-
|
|
5963
|
-
|
|
5964
|
-
|
|
5965
|
-
|
|
5966
|
-
` Per-test artifacts: ${
|
|
5967
|
-
|
|
5968
|
-
|
|
5969
|
-
|
|
5576
|
+
if (isResumeAppend) {
|
|
5577
|
+
const { writePerTestArtifacts } = await import("./artifact-writer-WH3OE42V.js");
|
|
5578
|
+
await writePerTestArtifacts(allResults, runDir, {
|
|
5579
|
+
experiment: normalizeExperimentName(options.experiment)
|
|
5580
|
+
});
|
|
5581
|
+
const { benchmarkPath: workspaceBenchmarkPath, timingPath } = await aggregateRunDir(
|
|
5582
|
+
runDir,
|
|
5583
|
+
{ evalFile, experiment: normalizeExperimentName(options.experiment) }
|
|
5584
|
+
);
|
|
5585
|
+
const indexPath = path15.join(runDir, "index.jsonl");
|
|
5586
|
+
console.log(`Artifact workspace updated: ${runDir}`);
|
|
5587
|
+
console.log(` Index: ${indexPath}`);
|
|
5588
|
+
console.log(` Per-test artifacts: ${runDir} (${allResults.length} new test directories)`);
|
|
5589
|
+
console.log(` Timing: ${timingPath}`);
|
|
5590
|
+
console.log(` Benchmark: ${workspaceBenchmarkPath}`);
|
|
5591
|
+
} else {
|
|
5592
|
+
const {
|
|
5593
|
+
testArtifactDir,
|
|
5594
|
+
timingPath,
|
|
5595
|
+
benchmarkPath: workspaceBenchmarkPath,
|
|
5596
|
+
indexPath
|
|
5597
|
+
} = await writeArtifactsFromResults(allResults, runDir, {
|
|
5598
|
+
evalFile,
|
|
5599
|
+
experiment: normalizeExperimentName(options.experiment)
|
|
5600
|
+
});
|
|
5601
|
+
console.log(`Artifact workspace written to: ${runDir}`);
|
|
5602
|
+
console.log(` Index: ${indexPath}`);
|
|
5603
|
+
console.log(
|
|
5604
|
+
` Per-test artifacts: ${testArtifactDir} (${allResults.length} test directories)`
|
|
5605
|
+
);
|
|
5606
|
+
console.log(` Timing: ${timingPath}`);
|
|
5607
|
+
console.log(` Benchmark: ${workspaceBenchmarkPath}`);
|
|
5608
|
+
}
|
|
5970
5609
|
}
|
|
5971
5610
|
if (resolvedExportPaths.length > 0 && allResults.length > 0) {
|
|
5972
5611
|
for (const exportPath of resolvedExportPaths) {
|
|
@@ -5977,7 +5616,7 @@ async function runEvalCommand(input) {
|
|
|
5977
5616
|
await writer.close();
|
|
5978
5617
|
}
|
|
5979
5618
|
console.log(
|
|
5980
|
-
`Export file(s) written: ${resolvedExportPaths.map((p) =>
|
|
5619
|
+
`Export file(s) written: ${resolvedExportPaths.map((p) => path15.relative(cwd, p)).join(", ")}`
|
|
5981
5620
|
);
|
|
5982
5621
|
}
|
|
5983
5622
|
const resultsWithWorkspaces = allResults.filter((r) => r.workspacePath);
|
|
@@ -6015,14 +5654,14 @@ Results written to: ${outputPath}`);
|
|
|
6015
5654
|
experiment: normalizeExperimentName(options.experiment)
|
|
6016
5655
|
});
|
|
6017
5656
|
}
|
|
6018
|
-
if (summary.executionErrorCount > 0 && !options.retryErrors) {
|
|
6019
|
-
const evalFileArgs = activeTestFiles.map((f) =>
|
|
5657
|
+
if (summary.executionErrorCount > 0 && !options.retryErrors && !options.resume) {
|
|
5658
|
+
const evalFileArgs = activeTestFiles.map((f) => path15.relative(cwd, f)).join(" ");
|
|
6020
5659
|
const targetFlag = options.target ? ` --target ${options.target}` : "";
|
|
6021
|
-
const
|
|
5660
|
+
const relativeRunDir = path15.relative(cwd, runDir);
|
|
6022
5661
|
console.log(
|
|
6023
5662
|
`
|
|
6024
5663
|
Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed tests with:
|
|
6025
|
-
agentv eval run ${evalFileArgs}${targetFlag} --
|
|
5664
|
+
agentv eval run ${evalFileArgs}${targetFlag} --output ${relativeRunDir} --rerun-failed`
|
|
6026
5665
|
);
|
|
6027
5666
|
}
|
|
6028
5667
|
return {
|
|
@@ -6052,7 +5691,7 @@ async function resolveEvaluationRunner() {
|
|
|
6052
5691
|
if (!overridePath) {
|
|
6053
5692
|
return runEvaluation;
|
|
6054
5693
|
}
|
|
6055
|
-
const resolved =
|
|
5694
|
+
const resolved = path15.isAbsolute(overridePath) ? overridePath : path15.resolve(process.cwd(), overridePath);
|
|
6056
5695
|
const moduleUrl = pathToFileURL(resolved).href;
|
|
6057
5696
|
const mod = await import(moduleUrl);
|
|
6058
5697
|
const candidate = mod.runEvaluation;
|
|
@@ -6065,11 +5704,11 @@ async function resolveEvaluationRunner() {
|
|
|
6065
5704
|
}
|
|
6066
5705
|
|
|
6067
5706
|
// src/commands/eval/discover.ts
|
|
6068
|
-
import
|
|
5707
|
+
import path16 from "node:path";
|
|
6069
5708
|
import fg2 from "fast-glob";
|
|
6070
5709
|
async function discoverEvalFiles(cwd) {
|
|
6071
5710
|
const repoRoot = await findRepoRoot(cwd);
|
|
6072
|
-
const config = await loadConfig(
|
|
5711
|
+
const config = await loadConfig(path16.join(cwd, "_"), repoRoot);
|
|
6073
5712
|
const patterns = config?.eval_patterns && config.eval_patterns.length > 0 ? config.eval_patterns : DEFAULT_EVAL_PATTERNS;
|
|
6074
5713
|
const ignore = ["**/node_modules/**", "**/dist/**"];
|
|
6075
5714
|
const matches = await fg2(patterns, {
|
|
@@ -6081,7 +5720,7 @@ async function discoverEvalFiles(cwd) {
|
|
|
6081
5720
|
caseSensitiveMatch: false
|
|
6082
5721
|
});
|
|
6083
5722
|
const evalFiles = matches.map((absPath) => {
|
|
6084
|
-
const relativePath =
|
|
5723
|
+
const relativePath = path16.relative(cwd, absPath);
|
|
6085
5724
|
const category = deriveCategory(relativePath);
|
|
6086
5725
|
return { path: absPath, relativePath, category };
|
|
6087
5726
|
});
|
|
@@ -6103,10 +5742,6 @@ function filterByCategory(files, category) {
|
|
|
6103
5742
|
|
|
6104
5743
|
export {
|
|
6105
5744
|
package_default,
|
|
6106
|
-
toSnakeCaseDeep,
|
|
6107
|
-
RESULT_INDEX_FILENAME,
|
|
6108
|
-
buildDefaultRunDir,
|
|
6109
|
-
resolveRunManifestPath,
|
|
6110
5745
|
parseResultManifest,
|
|
6111
5746
|
resolveResultSourcePath,
|
|
6112
5747
|
loadManifestResults,
|
|
@@ -6132,7 +5767,6 @@ export {
|
|
|
6132
5767
|
listMergedResultFiles,
|
|
6133
5768
|
findRunById,
|
|
6134
5769
|
maybeAutoExportRunArtifacts,
|
|
6135
|
-
writeArtifactsFromResults,
|
|
6136
5770
|
resolveRunCacheFile,
|
|
6137
5771
|
loadRunCache,
|
|
6138
5772
|
detectFileType,
|
|
@@ -6150,4 +5784,4 @@ export {
|
|
|
6150
5784
|
getCategories,
|
|
6151
5785
|
filterByCategory
|
|
6152
5786
|
};
|
|
6153
|
-
//# sourceMappingURL=chunk-
|
|
5787
|
+
//# sourceMappingURL=chunk-VRPCMCLQ.js.map
|