agentv 4.17.1 → 4.19.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/artifact-writer-YATMDPWI.js +41 -0
- package/dist/{chunk-ILIM6IIX.js → chunk-L7WOR7IR.js} +213 -176
- package/dist/chunk-L7WOR7IR.js.map +1 -0
- package/dist/chunk-NL6P5MUH.js +691 -0
- package/dist/chunk-NL6P5MUH.js.map +1 -0
- package/dist/{chunk-IRU2UOWN.js → chunk-PTYQS37Y.js} +29097 -31078
- package/dist/chunk-PTYQS37Y.js.map +1 -0
- package/dist/chunk-R2QDYORI.js +2178 -0
- package/dist/chunk-R2QDYORI.js.map +1 -0
- package/dist/{chunk-ZUNYOUFO.js → chunk-YGNBTBXW.js} +328 -656
- package/dist/chunk-YGNBTBXW.js.map +1 -0
- package/dist/cli.js +7 -5
- package/dist/cli.js.map +1 -1
- package/dist/{dist-U5EXNMON.js → dist-RTIUSC6L.js} +91 -90
- package/dist/dist-RTIUSC6L.js.map +1 -0
- package/dist/index.js +7 -5
- package/dist/{interactive-LFCOVXPQ.js → interactive-UZSFYMCF.js} +8 -6
- package/dist/{interactive-LFCOVXPQ.js.map → interactive-UZSFYMCF.js.map} +1 -1
- package/dist/studio/assets/{index-Bhv1TEO2.js → index-BTsTcivx.js} +1 -1
- package/dist/studio/assets/{index-vZYHIvCH.js → index-KfPHd-QM.js} +1 -1
- package/dist/studio/index.html +1 -1
- package/dist/ts-eval-loader-XFQ6S4DT-S7P2UUBX.js +15 -0
- package/dist/ts-eval-loader-XFQ6S4DT-S7P2UUBX.js.map +1 -0
- package/package.json +1 -1
- package/dist/chunk-ILIM6IIX.js.map +0 -1
- package/dist/chunk-IRU2UOWN.js.map +0 -1
- package/dist/chunk-ZUNYOUFO.js.map +0 -1
- /package/dist/{dist-U5EXNMON.js.map → artifact-writer-YATMDPWI.js.map} +0 -0
|
@@ -1,4 +1,36 @@
|
|
|
1
1
|
import { createRequire } from 'node:module'; const require = createRequire(import.meta.url);
|
|
2
|
+
import {
|
|
3
|
+
RESULT_INDEX_FILENAME,
|
|
4
|
+
RESULT_RUNS_DIRNAME,
|
|
5
|
+
aggregateRunDir,
|
|
6
|
+
buildDefaultRunDir,
|
|
7
|
+
buildTestTargetKey,
|
|
8
|
+
deduplicateByTestIdTarget,
|
|
9
|
+
isDirectoryPath,
|
|
10
|
+
normalizeExperimentName,
|
|
11
|
+
parseJsonlResults,
|
|
12
|
+
resolveExistingRunPrimaryPath,
|
|
13
|
+
resolveRunIndexPath,
|
|
14
|
+
resolveRunManifestPath,
|
|
15
|
+
resolveWorkspaceOrFilePath,
|
|
16
|
+
toSnakeCaseDeep,
|
|
17
|
+
writeArtifactsFromResults
|
|
18
|
+
} from "./chunk-NL6P5MUH.js";
|
|
19
|
+
import {
|
|
20
|
+
ResponseCache,
|
|
21
|
+
commitAndPushResultsBranch,
|
|
22
|
+
createDraftResultsPr,
|
|
23
|
+
deriveCategory,
|
|
24
|
+
directorySizeBytes,
|
|
25
|
+
getResultsRepoStatus,
|
|
26
|
+
loadTsConfig,
|
|
27
|
+
prepareResultsRepoBranch,
|
|
28
|
+
resolveResultsRepoRunsDir,
|
|
29
|
+
shouldEnableCache,
|
|
30
|
+
shouldSkipCacheForTemperature,
|
|
31
|
+
stageResultsArtifacts,
|
|
32
|
+
syncResultsRepo
|
|
33
|
+
} from "./chunk-R2QDYORI.js";
|
|
2
34
|
import {
|
|
3
35
|
CLI_PLACEHOLDERS,
|
|
4
36
|
COMMON_TARGET_SETTINGS,
|
|
@@ -6,48 +38,34 @@ import {
|
|
|
6
38
|
DEFAULT_THRESHOLD,
|
|
7
39
|
KNOWN_PROVIDERS,
|
|
8
40
|
PROVIDER_ALIASES,
|
|
9
|
-
ResponseCache,
|
|
10
41
|
buildDirectoryChain,
|
|
11
42
|
buildSearchRoots,
|
|
12
|
-
commitAndPushResultsBranch,
|
|
13
|
-
createDraftResultsPr,
|
|
14
|
-
deriveCategory,
|
|
15
|
-
directorySizeBytes,
|
|
16
43
|
ensureVSCodeSubagents,
|
|
17
44
|
findDeprecatedCamelCaseTargetWarnings,
|
|
18
45
|
findGitRoot,
|
|
19
|
-
getResultsRepoStatus,
|
|
20
46
|
interpolateEnv,
|
|
21
|
-
|
|
47
|
+
isGraderKind,
|
|
22
48
|
listTargetNames,
|
|
23
49
|
loadCasesFromFile,
|
|
24
50
|
loadConfig,
|
|
25
51
|
loadTestSuite,
|
|
26
|
-
loadTsConfig,
|
|
27
52
|
normalizeLineEndings,
|
|
28
|
-
prepareResultsRepoBranch,
|
|
29
53
|
readTargetDefinitions,
|
|
30
54
|
readTestSuiteMetadata,
|
|
31
55
|
resolveFileReference,
|
|
32
|
-
resolveResultsRepoRunsDir,
|
|
33
56
|
resolveTargetDefinition,
|
|
34
57
|
runEvaluation,
|
|
35
|
-
shouldEnableCache,
|
|
36
|
-
shouldSkipCacheForTemperature,
|
|
37
|
-
stageResultsArtifacts,
|
|
38
58
|
subscribeToCodexLogEntries,
|
|
39
59
|
subscribeToCopilotCliLogEntries,
|
|
40
60
|
subscribeToCopilotSdkLogEntries,
|
|
41
61
|
subscribeToPiLogEntries,
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
toTranscriptJsonLines
|
|
45
|
-
} from "./chunk-IRU2UOWN.js";
|
|
62
|
+
toCamelCaseDeep
|
|
63
|
+
} from "./chunk-PTYQS37Y.js";
|
|
46
64
|
|
|
47
65
|
// package.json
|
|
48
66
|
var package_default = {
|
|
49
67
|
name: "agentv",
|
|
50
|
-
version: "4.
|
|
68
|
+
version: "4.19.0-next.1",
|
|
51
69
|
description: "CLI entry point for AgentV",
|
|
52
70
|
type: "module",
|
|
53
71
|
repository: {
|
|
@@ -132,12 +150,15 @@ async function resolveEvalPaths(evalPaths, cwd) {
|
|
|
132
150
|
const candidatePath = path.isAbsolute(pattern) ? path.normalize(pattern) : path.resolve(cwd, pattern);
|
|
133
151
|
try {
|
|
134
152
|
const stats = await stat(candidatePath);
|
|
135
|
-
if (stats.isFile() && /\.(ya?ml|jsonl|json)$/i.test(candidatePath)) {
|
|
153
|
+
if (stats.isFile() && /\.(ya?ml|jsonl|json|[cm]?ts)$/i.test(candidatePath)) {
|
|
136
154
|
results.add(candidatePath);
|
|
137
155
|
continue;
|
|
138
156
|
}
|
|
139
157
|
if (stats.isDirectory()) {
|
|
140
|
-
const dirGlob = path.posix.join(
|
|
158
|
+
const dirGlob = path.posix.join(
|
|
159
|
+
candidatePath.replace(/\\/g, "/"),
|
|
160
|
+
"**/{*.eval.yaml,*.eval.yml,eval.yaml,eval.yml,*.eval.ts,*.eval.mts}"
|
|
161
|
+
);
|
|
141
162
|
const dirMatches = await fg(dirGlob, {
|
|
142
163
|
absolute: true,
|
|
143
164
|
onlyFiles: true,
|
|
@@ -163,7 +184,9 @@ async function resolveEvalPaths(evalPaths, cwd) {
|
|
|
163
184
|
followSymbolicLinks: true,
|
|
164
185
|
ignore: ignorePatterns
|
|
165
186
|
});
|
|
166
|
-
const yamlMatches = matches.filter(
|
|
187
|
+
const yamlMatches = matches.filter(
|
|
188
|
+
(filePath) => /\.(ya?ml|jsonl|json|[cm]?ts)$/i.test(filePath)
|
|
189
|
+
);
|
|
167
190
|
for (const filePath of yamlMatches) {
|
|
168
191
|
results.add(path.normalize(filePath));
|
|
169
192
|
}
|
|
@@ -185,7 +208,7 @@ async function resolveEvalPaths(evalPaths, cwd) {
|
|
|
185
208
|
throw new Error(
|
|
186
209
|
`No eval files matched any provided paths or globs: ${includePatterns.join(
|
|
187
210
|
", "
|
|
188
|
-
)}. Provide YAML, JSONL, or
|
|
211
|
+
)}. Provide YAML, JSONL, JSON, or TypeScript paths or globs (e.g., "evals/**/eval.yaml", "evals/**/*.eval.ts").`
|
|
189
212
|
);
|
|
190
213
|
}
|
|
191
214
|
const sorted = Array.from(results);
|
|
@@ -261,9 +284,9 @@ async function discoverTargetsFile(options) {
|
|
|
261
284
|
}
|
|
262
285
|
|
|
263
286
|
// src/commands/eval/run-eval.ts
|
|
264
|
-
import { constants as constants4, mkdirSync } from "node:fs";
|
|
265
|
-
import { access as access5 } from "node:fs/promises";
|
|
266
|
-
import
|
|
287
|
+
import { constants as constants4, existsSync as existsSync2, mkdirSync } from "node:fs";
|
|
288
|
+
import { access as access5, readFile as readFile8 } from "node:fs/promises";
|
|
289
|
+
import path15 from "node:path";
|
|
267
290
|
import { pathToFileURL } from "node:url";
|
|
268
291
|
|
|
269
292
|
// src/version-check.ts
|
|
@@ -321,88 +344,15 @@ async function promptContinue() {
|
|
|
321
344
|
}
|
|
322
345
|
|
|
323
346
|
// src/commands/results/remote.ts
|
|
324
|
-
import path6 from "node:path";
|
|
325
|
-
|
|
326
|
-
// src/commands/inspect/utils.ts
|
|
327
|
-
import { readFileSync as readFileSync2, readdirSync, statSync as statSync2 } from "node:fs";
|
|
328
347
|
import path5 from "node:path";
|
|
329
348
|
|
|
330
|
-
// src/commands/
|
|
331
|
-
import {
|
|
332
|
-
import
|
|
333
|
-
var RESULT_INDEX_FILENAME = "index.jsonl";
|
|
334
|
-
var RESULT_RUNS_DIRNAME = "runs";
|
|
335
|
-
var DEFAULT_EXPERIMENT_NAME = "default";
|
|
336
|
-
function normalizeExperimentName(experiment) {
|
|
337
|
-
const trimmed = experiment?.trim();
|
|
338
|
-
if (!trimmed) {
|
|
339
|
-
return DEFAULT_EXPERIMENT_NAME;
|
|
340
|
-
}
|
|
341
|
-
if (!/^[A-Za-z0-9._-]+$/.test(trimmed)) {
|
|
342
|
-
throw new Error(
|
|
343
|
-
`Invalid experiment name "${trimmed}". Use only letters, numbers, ".", "_" and "-".`
|
|
344
|
-
);
|
|
345
|
-
}
|
|
346
|
-
return trimmed;
|
|
347
|
-
}
|
|
348
|
-
function createRunDirName(timestamp = /* @__PURE__ */ new Date()) {
|
|
349
|
-
return timestamp.toISOString().replace(/[:.]/g, "-");
|
|
350
|
-
}
|
|
351
|
-
function buildDefaultRunDir(cwd, experiment, timestamp = /* @__PURE__ */ new Date()) {
|
|
352
|
-
return path3.join(
|
|
353
|
-
cwd,
|
|
354
|
-
".agentv",
|
|
355
|
-
"results",
|
|
356
|
-
RESULT_RUNS_DIRNAME,
|
|
357
|
-
normalizeExperimentName(experiment),
|
|
358
|
-
createRunDirName(timestamp)
|
|
359
|
-
);
|
|
360
|
-
}
|
|
361
|
-
function resolveRunIndexPath(runDir) {
|
|
362
|
-
return path3.join(runDir, RESULT_INDEX_FILENAME);
|
|
363
|
-
}
|
|
364
|
-
function isRunManifestPath(filePath) {
|
|
365
|
-
return path3.basename(filePath) === RESULT_INDEX_FILENAME;
|
|
366
|
-
}
|
|
367
|
-
function resolveExistingRunPrimaryPath(runDir) {
|
|
368
|
-
const indexPath = resolveRunIndexPath(runDir);
|
|
369
|
-
if (existsSync(indexPath)) {
|
|
370
|
-
return indexPath;
|
|
371
|
-
}
|
|
372
|
-
return void 0;
|
|
373
|
-
}
|
|
374
|
-
function isDirectoryPath(filePath) {
|
|
375
|
-
try {
|
|
376
|
-
return statSync(filePath).isDirectory();
|
|
377
|
-
} catch {
|
|
378
|
-
return false;
|
|
379
|
-
}
|
|
380
|
-
}
|
|
381
|
-
function resolveWorkspaceOrFilePath(filePath) {
|
|
382
|
-
if (!isDirectoryPath(filePath)) {
|
|
383
|
-
return filePath;
|
|
384
|
-
}
|
|
385
|
-
const existing = resolveExistingRunPrimaryPath(filePath);
|
|
386
|
-
if (!existing) {
|
|
387
|
-
throw new Error(`Result workspace is missing ${RESULT_INDEX_FILENAME}: ${filePath}`);
|
|
388
|
-
}
|
|
389
|
-
return existing;
|
|
390
|
-
}
|
|
391
|
-
function resolveRunManifestPath(filePath) {
|
|
392
|
-
if (isDirectoryPath(filePath)) {
|
|
393
|
-
return resolveWorkspaceOrFilePath(filePath);
|
|
394
|
-
}
|
|
395
|
-
if (!isRunManifestPath(filePath)) {
|
|
396
|
-
throw new Error(
|
|
397
|
-
`Expected a run workspace directory or ${RESULT_INDEX_FILENAME} manifest: ${filePath}`
|
|
398
|
-
);
|
|
399
|
-
}
|
|
400
|
-
return filePath;
|
|
401
|
-
}
|
|
349
|
+
// src/commands/inspect/utils.ts
|
|
350
|
+
import { readFileSync as readFileSync2, readdirSync, statSync } from "node:fs";
|
|
351
|
+
import path4 from "node:path";
|
|
402
352
|
|
|
403
353
|
// src/commands/results/manifest.ts
|
|
404
|
-
import { existsSync
|
|
405
|
-
import
|
|
354
|
+
import { existsSync, readFileSync } from "node:fs";
|
|
355
|
+
import path3 from "node:path";
|
|
406
356
|
function parseJsonlLines(content) {
|
|
407
357
|
return content.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0).map((line) => JSON.parse(line));
|
|
408
358
|
}
|
|
@@ -421,8 +371,8 @@ function readOptionalText(baseDir, relativePath) {
|
|
|
421
371
|
if (!relativePath) {
|
|
422
372
|
return void 0;
|
|
423
373
|
}
|
|
424
|
-
const absolutePath =
|
|
425
|
-
if (!
|
|
374
|
+
const absolutePath = path3.join(baseDir, relativePath);
|
|
375
|
+
if (!existsSync(absolutePath)) {
|
|
426
376
|
return void 0;
|
|
427
377
|
}
|
|
428
378
|
return readFileSync(absolutePath, "utf8");
|
|
@@ -510,8 +460,8 @@ function parseResultManifest(content) {
|
|
|
510
460
|
return parseJsonlLines(content);
|
|
511
461
|
}
|
|
512
462
|
function resolveResultSourcePath(source, cwd) {
|
|
513
|
-
const resolved =
|
|
514
|
-
if (isDirectoryPath(resolved) ||
|
|
463
|
+
const resolved = path3.isAbsolute(source) ? source : path3.resolve(cwd ?? process.cwd(), source);
|
|
464
|
+
if (isDirectoryPath(resolved) || path3.basename(resolved) === RESULT_INDEX_FILENAME) {
|
|
515
465
|
return resolveRunManifestPath(resolved);
|
|
516
466
|
}
|
|
517
467
|
return resolved;
|
|
@@ -520,7 +470,7 @@ function loadManifestResults(sourceFile) {
|
|
|
520
470
|
const resolvedSourceFile = resolveRunManifestPath(sourceFile);
|
|
521
471
|
const content = readFileSync(resolvedSourceFile, "utf8");
|
|
522
472
|
const records = parseResultManifest(content);
|
|
523
|
-
const baseDir =
|
|
473
|
+
const baseDir = path3.dirname(resolvedSourceFile);
|
|
524
474
|
return records.map((record) => hydrateManifestRecord(baseDir, record));
|
|
525
475
|
}
|
|
526
476
|
function loadLightweightResults(sourceFile) {
|
|
@@ -566,10 +516,10 @@ function padLeft(str, len) {
|
|
|
566
516
|
}
|
|
567
517
|
function loadResultFile(filePath) {
|
|
568
518
|
const resolvedFilePath = resolveTraceResultPath(filePath);
|
|
569
|
-
if (
|
|
519
|
+
if (path4.extname(resolvedFilePath) === ".json") {
|
|
570
520
|
return loadOtlpTraceFile(resolvedFilePath);
|
|
571
521
|
}
|
|
572
|
-
if (
|
|
522
|
+
if (path4.basename(resolvedFilePath) === RESULT_INDEX_FILENAME) {
|
|
573
523
|
return loadManifestAsRawResults(resolvedFilePath);
|
|
574
524
|
}
|
|
575
525
|
return loadJsonlRecords(resolvedFilePath);
|
|
@@ -848,7 +798,7 @@ function toTraceSummary(result) {
|
|
|
848
798
|
return toCamelCaseDeep(rawTrace);
|
|
849
799
|
}
|
|
850
800
|
function buildRunId(relativeRunPath) {
|
|
851
|
-
const normalized = relativeRunPath.split(
|
|
801
|
+
const normalized = relativeRunPath.split(path4.sep).join("/");
|
|
852
802
|
const segments = normalized.split("/").filter(Boolean);
|
|
853
803
|
if (segments.length >= 2) {
|
|
854
804
|
const experiment = segments.slice(0, -1).join("/");
|
|
@@ -863,10 +813,10 @@ function buildRunId(relativeRunPath) {
|
|
|
863
813
|
function collectRunManifestPaths(runsDir, currentDir, files) {
|
|
864
814
|
const primaryPath = resolveExistingRunPrimaryPath(currentDir);
|
|
865
815
|
if (primaryPath) {
|
|
866
|
-
const relativeRunPath =
|
|
816
|
+
const relativeRunPath = path4.relative(runsDir, currentDir);
|
|
867
817
|
files.push({
|
|
868
818
|
filePath: primaryPath,
|
|
869
|
-
displayName:
|
|
819
|
+
displayName: path4.basename(currentDir),
|
|
870
820
|
runId: buildRunId(relativeRunPath)
|
|
871
821
|
});
|
|
872
822
|
return;
|
|
@@ -874,7 +824,7 @@ function collectRunManifestPaths(runsDir, currentDir, files) {
|
|
|
874
824
|
const entries = readdirSync(currentDir, { withFileTypes: true });
|
|
875
825
|
for (const entry of entries) {
|
|
876
826
|
if (entry.isDirectory()) {
|
|
877
|
-
collectRunManifestPaths(runsDir,
|
|
827
|
+
collectRunManifestPaths(runsDir, path4.join(currentDir, entry.name), files);
|
|
878
828
|
}
|
|
879
829
|
}
|
|
880
830
|
}
|
|
@@ -884,7 +834,7 @@ function listResultFilesFromRunsDir(runsDir, limit) {
|
|
|
884
834
|
const entries = readdirSync(runsDir, { withFileTypes: true });
|
|
885
835
|
for (const entry of entries) {
|
|
886
836
|
if (entry.isDirectory()) {
|
|
887
|
-
collectRunManifestPaths(runsDir,
|
|
837
|
+
collectRunManifestPaths(runsDir, path4.join(runsDir, entry.name), files);
|
|
888
838
|
}
|
|
889
839
|
}
|
|
890
840
|
} catch {
|
|
@@ -894,7 +844,7 @@ function listResultFilesFromRunsDir(runsDir, limit) {
|
|
|
894
844
|
const metas = [];
|
|
895
845
|
for (const { filePath, displayName, runId } of limited) {
|
|
896
846
|
try {
|
|
897
|
-
const fileStat =
|
|
847
|
+
const fileStat = statSync(filePath);
|
|
898
848
|
const results = loadResultFile(filePath);
|
|
899
849
|
const testCount = results.length;
|
|
900
850
|
const passCount = results.filter((r) => r.score >= DEFAULT_THRESHOLD).length;
|
|
@@ -919,7 +869,7 @@ function listResultFilesFromRunsDir(runsDir, limit) {
|
|
|
919
869
|
}
|
|
920
870
|
function listResultFiles(cwd, limit) {
|
|
921
871
|
return listResultFilesFromRunsDir(
|
|
922
|
-
|
|
872
|
+
path4.join(cwd, ".agentv", "results", RESULT_RUNS_DIRNAME),
|
|
923
873
|
limit
|
|
924
874
|
);
|
|
925
875
|
}
|
|
@@ -971,17 +921,17 @@ function slugify(value) {
|
|
|
971
921
|
return value.trim().replace(/[^A-Za-z0-9._/-]+/g, "-").replace(/\/+/g, "/").replace(/^-+|-+$/g, "").slice(0, 120);
|
|
972
922
|
}
|
|
973
923
|
function getRelativeRunPath(cwd, runDir) {
|
|
974
|
-
const relative =
|
|
975
|
-
if (!relative.startsWith("..") && !
|
|
924
|
+
const relative = path5.relative(path5.join(cwd, ".agentv", "results", "runs"), runDir);
|
|
925
|
+
if (!relative.startsWith("..") && !path5.isAbsolute(relative)) {
|
|
976
926
|
return relative;
|
|
977
927
|
}
|
|
978
|
-
const experiment =
|
|
979
|
-
const runName =
|
|
980
|
-
return experiment && experiment !== runName ?
|
|
928
|
+
const experiment = path5.basename(path5.dirname(runDir));
|
|
929
|
+
const runName = path5.basename(runDir);
|
|
930
|
+
return experiment && experiment !== runName ? path5.join(experiment, runName) : runName;
|
|
981
931
|
}
|
|
982
932
|
function buildBranchName(config, payload) {
|
|
983
|
-
const timestamp =
|
|
984
|
-
const evalStem = payload.test_files.length === 1 ?
|
|
933
|
+
const timestamp = path5.basename(payload.run_dir);
|
|
934
|
+
const evalStem = payload.test_files.length === 1 ? path5.basename(payload.test_files[0]).replace(/\.eval\.ya?ml$/i, "").replace(/\.[^.]+$/i, "") : `${payload.test_files.length}-evals`;
|
|
985
935
|
const experiment = slugify(payload.experiment ?? "default");
|
|
986
936
|
const branchLeaf = slugify(`${experiment}-${evalStem}-${timestamp}`) || timestamp;
|
|
987
937
|
return `${config.branch_prefix}/${branchLeaf}`;
|
|
@@ -1010,7 +960,7 @@ function buildPrBody(payload) {
|
|
|
1010
960
|
"",
|
|
1011
961
|
sections,
|
|
1012
962
|
"",
|
|
1013
|
-
`Run: ${
|
|
963
|
+
`Run: ${path5.basename(payload.run_dir)}`,
|
|
1014
964
|
`Experiment: ${payload.experiment ?? "default"}`,
|
|
1015
965
|
`Eval Files: ${payload.test_files.join(", ")}`
|
|
1016
966
|
].join("\n");
|
|
@@ -1025,7 +975,7 @@ async function maybeWarnLargeArtifact(runDir) {
|
|
|
1025
975
|
}
|
|
1026
976
|
async function loadNormalizedResultsConfig(cwd) {
|
|
1027
977
|
const repoRoot = await findRepoRoot(cwd) ?? cwd;
|
|
1028
|
-
const config = await loadConfig(
|
|
978
|
+
const config = await loadConfig(path5.join(cwd, "_"), repoRoot);
|
|
1029
979
|
if (!config?.results?.export) {
|
|
1030
980
|
return void 0;
|
|
1031
981
|
}
|
|
@@ -1109,7 +1059,7 @@ async function maybeAutoExportRunArtifacts(payload) {
|
|
|
1109
1059
|
const prepared = await prepareResultsRepoBranch(config, branchName);
|
|
1110
1060
|
try {
|
|
1111
1061
|
const relativeRunPath = getRelativeRunPath(payload.cwd, payload.run_dir);
|
|
1112
|
-
const destinationDir =
|
|
1062
|
+
const destinationDir = path5.join(prepared.repoDir, config.path, relativeRunPath);
|
|
1113
1063
|
await stageResultsArtifacts({
|
|
1114
1064
|
repoDir: prepared.repoDir,
|
|
1115
1065
|
sourceDir: payload.run_dir,
|
|
@@ -1143,36 +1093,8 @@ async function maybeAutoExportRunArtifacts(payload) {
|
|
|
1143
1093
|
}
|
|
1144
1094
|
}
|
|
1145
1095
|
|
|
1146
|
-
// src/commands/eval/
|
|
1147
|
-
import {
|
|
1148
|
-
import path7 from "node:path";
|
|
1149
|
-
|
|
1150
|
-
// src/utils/case-conversion.ts
|
|
1151
|
-
function toSnakeCase(str) {
|
|
1152
|
-
if (/^[A-Z]/.test(str)) {
|
|
1153
|
-
return str;
|
|
1154
|
-
}
|
|
1155
|
-
return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
|
|
1156
|
-
}
|
|
1157
|
-
function toSnakeCaseDeep(obj) {
|
|
1158
|
-
if (obj === null || obj === void 0) {
|
|
1159
|
-
return obj;
|
|
1160
|
-
}
|
|
1161
|
-
if (Array.isArray(obj)) {
|
|
1162
|
-
return obj.map((item) => toSnakeCaseDeep(item));
|
|
1163
|
-
}
|
|
1164
|
-
if (typeof obj === "object") {
|
|
1165
|
-
const result = {};
|
|
1166
|
-
for (const [key, value] of Object.entries(obj)) {
|
|
1167
|
-
const snakeKey = toSnakeCase(key);
|
|
1168
|
-
result[snakeKey] = toSnakeCaseDeep(value);
|
|
1169
|
-
}
|
|
1170
|
-
return result;
|
|
1171
|
-
}
|
|
1172
|
-
return obj;
|
|
1173
|
-
}
|
|
1174
|
-
|
|
1175
|
-
// src/commands/eval/artifact-writer.ts
|
|
1096
|
+
// src/commands/eval/benchmark-writer.ts
|
|
1097
|
+
import { writeFile } from "node:fs/promises";
|
|
1176
1098
|
function computeStats(values) {
|
|
1177
1099
|
if (values.length === 0) {
|
|
1178
1100
|
return { mean: 0, stddev: 0 };
|
|
@@ -1185,361 +1107,6 @@ function computeStats(values) {
|
|
|
1185
1107
|
};
|
|
1186
1108
|
}
|
|
1187
1109
|
function computePassRate(result) {
|
|
1188
|
-
const scores = result.scores;
|
|
1189
|
-
if (scores && scores.length > 0) {
|
|
1190
|
-
const passed = scores.filter((s) => s.score >= DEFAULT_THRESHOLD).length;
|
|
1191
|
-
return passed / scores.length;
|
|
1192
|
-
}
|
|
1193
|
-
return (result.score ?? 0) >= DEFAULT_THRESHOLD ? 1 : 0;
|
|
1194
|
-
}
|
|
1195
|
-
function countToolCalls(result) {
|
|
1196
|
-
const toolCalls = {};
|
|
1197
|
-
let total = 0;
|
|
1198
|
-
const trace = result.trace;
|
|
1199
|
-
if (trace?.steps) {
|
|
1200
|
-
for (const step of trace.steps) {
|
|
1201
|
-
if (step.toolName || step.type === "tool") {
|
|
1202
|
-
const name = step.toolName ?? "unknown";
|
|
1203
|
-
toolCalls[name] = (toolCalls[name] ?? 0) + 1;
|
|
1204
|
-
total += 1;
|
|
1205
|
-
}
|
|
1206
|
-
}
|
|
1207
|
-
}
|
|
1208
|
-
return { toolCalls, total };
|
|
1209
|
-
}
|
|
1210
|
-
function parseWorkspaceChanges(fileChanges) {
|
|
1211
|
-
if (!fileChanges) {
|
|
1212
|
-
return void 0;
|
|
1213
|
-
}
|
|
1214
|
-
let filesModified = 0;
|
|
1215
|
-
let filesCreated = 0;
|
|
1216
|
-
const lines = fileChanges.split("\n");
|
|
1217
|
-
for (const line of lines) {
|
|
1218
|
-
if (line.startsWith("--- /dev/null")) {
|
|
1219
|
-
filesCreated += 1;
|
|
1220
|
-
} else if (line.startsWith("--- a/")) {
|
|
1221
|
-
filesModified += 1;
|
|
1222
|
-
}
|
|
1223
|
-
}
|
|
1224
|
-
const summaryLines = lines.slice(0, 20);
|
|
1225
|
-
const diffSummary = lines.length > 20 ? `${summaryLines.join("\n")}
|
|
1226
|
-
... (${lines.length - 20} more lines)` : fileChanges;
|
|
1227
|
-
return {
|
|
1228
|
-
files_modified: filesModified,
|
|
1229
|
-
files_created: filesCreated,
|
|
1230
|
-
diff_summary: diffSummary
|
|
1231
|
-
};
|
|
1232
|
-
}
|
|
1233
|
-
function buildAssertions(result) {
|
|
1234
|
-
if (!result.assertions) return [];
|
|
1235
|
-
return result.assertions.map((a) => ({
|
|
1236
|
-
text: a.text,
|
|
1237
|
-
passed: a.passed,
|
|
1238
|
-
evidence: a.evidence ?? ""
|
|
1239
|
-
}));
|
|
1240
|
-
}
|
|
1241
|
-
function buildEvaluators(scores) {
|
|
1242
|
-
if (!scores || scores.length === 0) {
|
|
1243
|
-
return void 0;
|
|
1244
|
-
}
|
|
1245
|
-
return scores.map((s) => ({
|
|
1246
|
-
name: s.name,
|
|
1247
|
-
type: s.type,
|
|
1248
|
-
score: s.score,
|
|
1249
|
-
reasoning: "",
|
|
1250
|
-
weight: s.weight,
|
|
1251
|
-
verdict: s.verdict,
|
|
1252
|
-
assertions: s.assertions,
|
|
1253
|
-
details: s.details
|
|
1254
|
-
}));
|
|
1255
|
-
}
|
|
1256
|
-
function buildGradingArtifact(result) {
|
|
1257
|
-
const assertions = buildAssertions(result);
|
|
1258
|
-
const passed = assertions.filter((e) => e.passed).length;
|
|
1259
|
-
const failed = assertions.filter((e) => !e.passed).length;
|
|
1260
|
-
const total = assertions.length;
|
|
1261
|
-
const { toolCalls, total: totalToolCalls } = countToolCalls(result);
|
|
1262
|
-
const errorsEncountered = result.error ? 1 : 0;
|
|
1263
|
-
return {
|
|
1264
|
-
assertions,
|
|
1265
|
-
summary: {
|
|
1266
|
-
passed,
|
|
1267
|
-
failed,
|
|
1268
|
-
total,
|
|
1269
|
-
pass_rate: total > 0 ? Math.round(passed / total * 1e3) / 1e3 : 0
|
|
1270
|
-
},
|
|
1271
|
-
execution_metrics: {
|
|
1272
|
-
tool_calls: toolCalls,
|
|
1273
|
-
total_tool_calls: totalToolCalls,
|
|
1274
|
-
errors_encountered: errorsEncountered
|
|
1275
|
-
},
|
|
1276
|
-
graders: buildEvaluators(result.scores),
|
|
1277
|
-
workspace_changes: parseWorkspaceChanges(result.fileChanges),
|
|
1278
|
-
conversation: result.conversationId ? {
|
|
1279
|
-
turns: result.trace ? result.trace.steps?.length ?? 0 : 0,
|
|
1280
|
-
conversation_id: result.conversationId
|
|
1281
|
-
} : void 0
|
|
1282
|
-
};
|
|
1283
|
-
}
|
|
1284
|
-
function buildTimingArtifact(results) {
|
|
1285
|
-
let totalInput = 0;
|
|
1286
|
-
let totalOutput = 0;
|
|
1287
|
-
let totalReasoning = 0;
|
|
1288
|
-
let totalDurationMs = 0;
|
|
1289
|
-
for (const result of results) {
|
|
1290
|
-
const usage = result.tokenUsage;
|
|
1291
|
-
if (usage) {
|
|
1292
|
-
totalInput += usage.input ?? 0;
|
|
1293
|
-
totalOutput += usage.output ?? 0;
|
|
1294
|
-
totalReasoning += usage.reasoning ?? 0;
|
|
1295
|
-
}
|
|
1296
|
-
if (result.durationMs != null) {
|
|
1297
|
-
totalDurationMs += result.durationMs;
|
|
1298
|
-
}
|
|
1299
|
-
}
|
|
1300
|
-
return {
|
|
1301
|
-
total_tokens: totalInput + totalOutput,
|
|
1302
|
-
duration_ms: totalDurationMs,
|
|
1303
|
-
total_duration_seconds: Math.round(totalDurationMs / 1e3 * 1e3) / 1e3,
|
|
1304
|
-
token_usage: {
|
|
1305
|
-
input: totalInput,
|
|
1306
|
-
output: totalOutput,
|
|
1307
|
-
reasoning: totalReasoning
|
|
1308
|
-
}
|
|
1309
|
-
};
|
|
1310
|
-
}
|
|
1311
|
-
function buildBenchmarkArtifact(results, evalFile = "", experiment) {
|
|
1312
|
-
const targetSet = /* @__PURE__ */ new Set();
|
|
1313
|
-
const testIdSet = /* @__PURE__ */ new Set();
|
|
1314
|
-
for (const result of results) {
|
|
1315
|
-
targetSet.add(result.target ?? "unknown");
|
|
1316
|
-
testIdSet.add(result.testId ?? "unknown");
|
|
1317
|
-
}
|
|
1318
|
-
const targets = [...targetSet].sort();
|
|
1319
|
-
const testIds = [...testIdSet].sort();
|
|
1320
|
-
const runSummary = {};
|
|
1321
|
-
const notes = [];
|
|
1322
|
-
for (const target of targets) {
|
|
1323
|
-
const targetResults = results.filter((r) => r.target === target);
|
|
1324
|
-
const passRates = targetResults.map(computePassRate);
|
|
1325
|
-
const timings = targetResults.filter((r) => r.durationMs != null).map((r) => r.durationMs / 1e3);
|
|
1326
|
-
const tokens = targetResults.filter((r) => r.tokenUsage != null).map((r) => {
|
|
1327
|
-
const usage = r.tokenUsage;
|
|
1328
|
-
return (usage.input ?? 0) + (usage.output ?? 0);
|
|
1329
|
-
});
|
|
1330
|
-
const entry = {
|
|
1331
|
-
pass_rate: computeStats(passRates),
|
|
1332
|
-
time_seconds: computeStats(timings),
|
|
1333
|
-
tokens: computeStats(tokens)
|
|
1334
|
-
};
|
|
1335
|
-
const toolCallCounts = targetResults.map((r) => countToolCalls(r).total);
|
|
1336
|
-
if (toolCallCounts.some((c2) => c2 > 0)) {
|
|
1337
|
-
entry.tool_calls = computeStats(toolCallCounts);
|
|
1338
|
-
}
|
|
1339
|
-
const costs = targetResults.filter((r) => r.costUsd != null).map((r) => r.costUsd);
|
|
1340
|
-
if (costs.length > 0) {
|
|
1341
|
-
entry.cost_usd = computeStats(costs);
|
|
1342
|
-
}
|
|
1343
|
-
runSummary[target] = entry;
|
|
1344
|
-
}
|
|
1345
|
-
const evaluatorScores = /* @__PURE__ */ new Map();
|
|
1346
|
-
for (const result of results) {
|
|
1347
|
-
if (result.scores) {
|
|
1348
|
-
for (const score of result.scores) {
|
|
1349
|
-
const key = `${score.name}:${score.type}`;
|
|
1350
|
-
if (!evaluatorScores.has(key)) {
|
|
1351
|
-
evaluatorScores.set(key, []);
|
|
1352
|
-
}
|
|
1353
|
-
evaluatorScores.get(key)?.push(score.score);
|
|
1354
|
-
}
|
|
1355
|
-
}
|
|
1356
|
-
}
|
|
1357
|
-
let perEvaluatorSummary;
|
|
1358
|
-
if (evaluatorScores.size > 0) {
|
|
1359
|
-
perEvaluatorSummary = {};
|
|
1360
|
-
for (const [key, scores] of evaluatorScores) {
|
|
1361
|
-
perEvaluatorSummary[key] = computeStats(scores);
|
|
1362
|
-
}
|
|
1363
|
-
}
|
|
1364
|
-
const errorCount = results.filter(
|
|
1365
|
-
(r) => r.executionStatus != null && r.executionStatus === "execution_error"
|
|
1366
|
-
).length;
|
|
1367
|
-
if (errorCount > 0) {
|
|
1368
|
-
notes.push(
|
|
1369
|
-
`${errorCount} test(s) had execution errors and are included in pass_rate as failures`
|
|
1370
|
-
);
|
|
1371
|
-
}
|
|
1372
|
-
if (results.length === 0) {
|
|
1373
|
-
notes.push("No results to summarize");
|
|
1374
|
-
}
|
|
1375
|
-
const firstResult = results[0];
|
|
1376
|
-
const timestamp = firstResult?.timestamp ?? (/* @__PURE__ */ new Date()).toISOString();
|
|
1377
|
-
return {
|
|
1378
|
-
metadata: {
|
|
1379
|
-
eval_file: evalFile,
|
|
1380
|
-
timestamp,
|
|
1381
|
-
targets,
|
|
1382
|
-
tests_run: testIds,
|
|
1383
|
-
experiment
|
|
1384
|
-
},
|
|
1385
|
-
run_summary: runSummary,
|
|
1386
|
-
per_grader_summary: perEvaluatorSummary,
|
|
1387
|
-
notes
|
|
1388
|
-
};
|
|
1389
|
-
}
|
|
1390
|
-
function safeArtifactPathSegment(value, fallback) {
|
|
1391
|
-
const trimmed = value?.trim();
|
|
1392
|
-
if (!trimmed) {
|
|
1393
|
-
return fallback;
|
|
1394
|
-
}
|
|
1395
|
-
return trimmed.replace(/[/\\:*?"<>|]/g, "_");
|
|
1396
|
-
}
|
|
1397
|
-
function safeTestId(testId) {
|
|
1398
|
-
return safeArtifactPathSegment(testId, "unknown");
|
|
1399
|
-
}
|
|
1400
|
-
function getSuite(result) {
|
|
1401
|
-
return result.suite;
|
|
1402
|
-
}
|
|
1403
|
-
function buildArtifactSubdir(result) {
|
|
1404
|
-
const segments = [];
|
|
1405
|
-
const evalSet = getSuite(result);
|
|
1406
|
-
if (evalSet) {
|
|
1407
|
-
segments.push(safeArtifactPathSegment(evalSet, "default"));
|
|
1408
|
-
}
|
|
1409
|
-
segments.push(safeTestId(result.testId));
|
|
1410
|
-
return path7.posix.join(...segments);
|
|
1411
|
-
}
|
|
1412
|
-
function formatOutputMarkdown(output) {
|
|
1413
|
-
return output.map((msg) => `@[${msg.role}]:
|
|
1414
|
-
${String(msg.content ?? "")}`).join("\n\n");
|
|
1415
|
-
}
|
|
1416
|
-
function extractInput(result) {
|
|
1417
|
-
const input = result.input;
|
|
1418
|
-
if (!input) return null;
|
|
1419
|
-
if (typeof input === "string") return input;
|
|
1420
|
-
if (Array.isArray(input) && input.length > 0) {
|
|
1421
|
-
return formatOutputMarkdown(input);
|
|
1422
|
-
}
|
|
1423
|
-
return null;
|
|
1424
|
-
}
|
|
1425
|
-
function buildResultIndexArtifact(result) {
|
|
1426
|
-
const artifactSubdir = buildArtifactSubdir(result);
|
|
1427
|
-
const input = extractInput(result);
|
|
1428
|
-
const hasResponse = Array.isArray(result.output) && result.output.length > 0;
|
|
1429
|
-
return {
|
|
1430
|
-
timestamp: result.timestamp,
|
|
1431
|
-
test_id: result.testId ?? "unknown",
|
|
1432
|
-
suite: getSuite(result),
|
|
1433
|
-
category: result.category,
|
|
1434
|
-
conversation_id: result.conversationId,
|
|
1435
|
-
score: result.score,
|
|
1436
|
-
target: result.target ?? "unknown",
|
|
1437
|
-
scores: result.scores ? toSnakeCaseDeep(result.scores) : void 0,
|
|
1438
|
-
execution_status: result.executionStatus,
|
|
1439
|
-
error: result.error,
|
|
1440
|
-
failure_stage: result.failureStage,
|
|
1441
|
-
failure_reason_code: result.failureReasonCode,
|
|
1442
|
-
workspace_path: result.workspacePath,
|
|
1443
|
-
grading_path: path7.posix.join(artifactSubdir, "grading.json"),
|
|
1444
|
-
timing_path: path7.posix.join(artifactSubdir, "timing.json"),
|
|
1445
|
-
input_path: input ? path7.posix.join(artifactSubdir, "input.md") : void 0,
|
|
1446
|
-
output_path: hasResponse ? path7.posix.join(artifactSubdir, "outputs", "response.md") : void 0,
|
|
1447
|
-
response_path: hasResponse ? path7.posix.join(artifactSubdir, "outputs", "response.md") : void 0
|
|
1448
|
-
};
|
|
1449
|
-
}
|
|
1450
|
-
async function writeJsonlFile(filePath, records) {
|
|
1451
|
-
const content = records.length === 0 ? "" : `${records.map((record) => JSON.stringify(toSnakeCaseDeep(record))).join("\n")}
|
|
1452
|
-
`;
|
|
1453
|
-
await writeFile(filePath, content, "utf8");
|
|
1454
|
-
}
|
|
1455
|
-
function buildTranscriptMessageLines(results) {
|
|
1456
|
-
const lines = [];
|
|
1457
|
-
for (const result of results) {
|
|
1458
|
-
const transcriptLines = toTranscriptJsonLines(
|
|
1459
|
-
{
|
|
1460
|
-
messages: [...result.input ?? [], ...result.output],
|
|
1461
|
-
source: {
|
|
1462
|
-
provider: result.target,
|
|
1463
|
-
sessionId: result.conversationId ?? result.testId,
|
|
1464
|
-
startedAt: result.timestamp
|
|
1465
|
-
},
|
|
1466
|
-
tokenUsage: result.tokenUsage,
|
|
1467
|
-
durationMs: result.durationMs,
|
|
1468
|
-
costUsd: result.costUsd
|
|
1469
|
-
},
|
|
1470
|
-
{
|
|
1471
|
-
testId: result.testId,
|
|
1472
|
-
target: result.target
|
|
1473
|
-
}
|
|
1474
|
-
);
|
|
1475
|
-
lines.push(...transcriptLines.map((line) => JSON.stringify(line)));
|
|
1476
|
-
}
|
|
1477
|
-
return lines.length > 0 ? `${lines.join("\n")}
|
|
1478
|
-
` : "";
|
|
1479
|
-
}
|
|
1480
|
-
async function writeArtifactsFromResults(results, outputDir, options) {
|
|
1481
|
-
const testArtifactDir = outputDir;
|
|
1482
|
-
const timingPath = path7.join(outputDir, "timing.json");
|
|
1483
|
-
const benchmarkPath = path7.join(outputDir, "benchmark.json");
|
|
1484
|
-
const indexPath = path7.join(outputDir, RESULT_INDEX_FILENAME);
|
|
1485
|
-
await mkdir(outputDir, { recursive: true });
|
|
1486
|
-
const indexRecords = [];
|
|
1487
|
-
for (const result of results) {
|
|
1488
|
-
const grading = buildGradingArtifact(result);
|
|
1489
|
-
const timing2 = buildTimingArtifact([result]);
|
|
1490
|
-
const artifactSubdir = buildArtifactSubdir(result);
|
|
1491
|
-
const testDir = path7.join(outputDir, artifactSubdir);
|
|
1492
|
-
const gradingPath = path7.join(testDir, "grading.json");
|
|
1493
|
-
const perTestTimingPath = path7.join(testDir, "timing.json");
|
|
1494
|
-
await mkdir(testDir, { recursive: true });
|
|
1495
|
-
await writeFile(gradingPath, `${JSON.stringify(grading, null, 2)}
|
|
1496
|
-
`, "utf8");
|
|
1497
|
-
await writeFile(perTestTimingPath, `${JSON.stringify(timing2, null, 2)}
|
|
1498
|
-
`, "utf8");
|
|
1499
|
-
const input = extractInput(result);
|
|
1500
|
-
if (input) {
|
|
1501
|
-
await writeFile(path7.join(testDir, "input.md"), input, "utf8");
|
|
1502
|
-
}
|
|
1503
|
-
if (result.output && result.output.length > 0) {
|
|
1504
|
-
const outputsDir = path7.join(testDir, "outputs");
|
|
1505
|
-
await mkdir(outputsDir, { recursive: true });
|
|
1506
|
-
await writeFile(
|
|
1507
|
-
path7.join(outputsDir, "response.md"),
|
|
1508
|
-
formatOutputMarkdown(result.output),
|
|
1509
|
-
"utf8"
|
|
1510
|
-
);
|
|
1511
|
-
}
|
|
1512
|
-
indexRecords.push({
|
|
1513
|
-
...buildResultIndexArtifact(result),
|
|
1514
|
-
experiment: options?.experiment
|
|
1515
|
-
});
|
|
1516
|
-
}
|
|
1517
|
-
const timing = buildTimingArtifact(results);
|
|
1518
|
-
await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}
|
|
1519
|
-
`, "utf8");
|
|
1520
|
-
const benchmark = buildBenchmarkArtifact(results, options?.evalFile, options?.experiment);
|
|
1521
|
-
await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}
|
|
1522
|
-
`, "utf8");
|
|
1523
|
-
await writeJsonlFile(indexPath, indexRecords);
|
|
1524
|
-
const transcriptPath = path7.join(outputDir, "transcript.jsonl");
|
|
1525
|
-
await writeFile(transcriptPath, buildTranscriptMessageLines(results), "utf8");
|
|
1526
|
-
return { testArtifactDir, timingPath, benchmarkPath, indexPath };
|
|
1527
|
-
}
|
|
1528
|
-
|
|
1529
|
-
// src/commands/eval/benchmark-writer.ts
|
|
1530
|
-
import { writeFile as writeFile2 } from "node:fs/promises";
|
|
1531
|
-
function computeStats2(values) {
|
|
1532
|
-
if (values.length === 0) {
|
|
1533
|
-
return { mean: 0, stddev: 0 };
|
|
1534
|
-
}
|
|
1535
|
-
const mean = values.reduce((sum, v) => sum + v, 0) / values.length;
|
|
1536
|
-
const variance = values.reduce((sum, v) => sum + (v - mean) ** 2, 0) / values.length;
|
|
1537
|
-
return {
|
|
1538
|
-
mean: Math.round(mean * 1e3) / 1e3,
|
|
1539
|
-
stddev: Math.round(Math.sqrt(variance) * 1e3) / 1e3
|
|
1540
|
-
};
|
|
1541
|
-
}
|
|
1542
|
-
function computePassRate2(result) {
|
|
1543
1110
|
const scores = result.scores;
|
|
1544
1111
|
if (scores && scores.length > 0) {
|
|
1545
1112
|
const passed = scores.filter((s) => s.score >= DEFAULT_THRESHOLD).length;
|
|
@@ -1548,7 +1115,7 @@ function computePassRate2(result) {
|
|
|
1548
1115
|
return result.score >= DEFAULT_THRESHOLD ? 1 : 0;
|
|
1549
1116
|
}
|
|
1550
1117
|
function buildBenchmarkJson(results) {
|
|
1551
|
-
const passRates = results.map(
|
|
1118
|
+
const passRates = results.map(computePassRate);
|
|
1552
1119
|
const timings = results.filter((r) => r.durationMs != null).map((r) => r.durationMs / 1e3);
|
|
1553
1120
|
const tokens = results.filter((r) => r.tokenUsage != null).map((r) => {
|
|
1554
1121
|
const usage = r.tokenUsage;
|
|
@@ -1557,29 +1124,29 @@ function buildBenchmarkJson(results) {
|
|
|
1557
1124
|
return {
|
|
1558
1125
|
run_summary: {
|
|
1559
1126
|
with_skill: {
|
|
1560
|
-
pass_rate:
|
|
1561
|
-
time_seconds:
|
|
1562
|
-
tokens:
|
|
1127
|
+
pass_rate: computeStats(passRates),
|
|
1128
|
+
time_seconds: computeStats(timings),
|
|
1129
|
+
tokens: computeStats(tokens)
|
|
1563
1130
|
}
|
|
1564
1131
|
}
|
|
1565
1132
|
};
|
|
1566
1133
|
}
|
|
1567
1134
|
async function writeBenchmarkJson(outputPath, results) {
|
|
1568
1135
|
const benchmark = buildBenchmarkJson(results);
|
|
1569
|
-
await
|
|
1136
|
+
await writeFile(outputPath, `${JSON.stringify(benchmark, null, 2)}
|
|
1570
1137
|
`, "utf8");
|
|
1571
1138
|
}
|
|
1572
1139
|
|
|
1573
1140
|
// src/commands/eval/env.ts
|
|
1574
1141
|
import { constants as constants3 } from "node:fs";
|
|
1575
1142
|
import { access as access3 } from "node:fs/promises";
|
|
1576
|
-
import
|
|
1143
|
+
import path6 from "node:path";
|
|
1577
1144
|
import { config as loadDotenv } from "dotenv";
|
|
1578
1145
|
function uniqueDirs(directories) {
|
|
1579
1146
|
const seen = /* @__PURE__ */ new Set();
|
|
1580
1147
|
const result = [];
|
|
1581
1148
|
for (const dir of directories) {
|
|
1582
|
-
const absolute =
|
|
1149
|
+
const absolute = path6.resolve(dir);
|
|
1583
1150
|
if (seen.has(absolute)) {
|
|
1584
1151
|
continue;
|
|
1585
1152
|
}
|
|
@@ -1598,14 +1165,14 @@ async function fileExists2(filePath) {
|
|
|
1598
1165
|
}
|
|
1599
1166
|
function collectAncestorDirectories(start, boundary) {
|
|
1600
1167
|
const directories = [];
|
|
1601
|
-
const boundaryDir =
|
|
1602
|
-
let current =
|
|
1168
|
+
const boundaryDir = path6.resolve(boundary);
|
|
1169
|
+
let current = path6.resolve(start);
|
|
1603
1170
|
while (current !== void 0) {
|
|
1604
1171
|
directories.push(current);
|
|
1605
1172
|
if (current === boundaryDir) {
|
|
1606
1173
|
break;
|
|
1607
1174
|
}
|
|
1608
|
-
const parent =
|
|
1175
|
+
const parent = path6.dirname(current);
|
|
1609
1176
|
if (parent === current) {
|
|
1610
1177
|
break;
|
|
1611
1178
|
}
|
|
@@ -1615,12 +1182,12 @@ function collectAncestorDirectories(start, boundary) {
|
|
|
1615
1182
|
}
|
|
1616
1183
|
async function loadEnvFromHierarchy(options) {
|
|
1617
1184
|
const { testFilePath, repoRoot, verbose } = options;
|
|
1618
|
-
const testDir =
|
|
1185
|
+
const testDir = path6.dirname(path6.resolve(testFilePath));
|
|
1619
1186
|
const cwd = process.cwd();
|
|
1620
1187
|
const searchDirs = uniqueDirs([...collectAncestorDirectories(testDir, repoRoot), repoRoot, cwd]);
|
|
1621
1188
|
const envFiles = [];
|
|
1622
1189
|
for (const dir of searchDirs) {
|
|
1623
|
-
const candidate =
|
|
1190
|
+
const candidate = path6.join(dir, ".env");
|
|
1624
1191
|
if (await fileExists2(candidate)) {
|
|
1625
1192
|
envFiles.push(candidate);
|
|
1626
1193
|
}
|
|
@@ -1642,11 +1209,11 @@ async function loadEnvFromHierarchy(options) {
|
|
|
1642
1209
|
}
|
|
1643
1210
|
|
|
1644
1211
|
// src/commands/eval/output-writer.ts
|
|
1645
|
-
import
|
|
1212
|
+
import path12 from "node:path";
|
|
1646
1213
|
|
|
1647
1214
|
// src/commands/eval/html-writer.ts
|
|
1648
|
-
import { mkdir
|
|
1649
|
-
import
|
|
1215
|
+
import { mkdir, writeFile as writeFile2 } from "node:fs/promises";
|
|
1216
|
+
import path7 from "node:path";
|
|
1650
1217
|
|
|
1651
1218
|
// ../../node_modules/.bun/async-mutex@0.5.0/node_modules/async-mutex/index.mjs
|
|
1652
1219
|
var E_TIMEOUT = new Error("timeout while waiting for mutex to become available");
|
|
@@ -1865,7 +1432,7 @@ var HtmlWriter = class _HtmlWriter {
|
|
|
1865
1432
|
this.filePath = filePath;
|
|
1866
1433
|
}
|
|
1867
1434
|
static async open(filePath) {
|
|
1868
|
-
await
|
|
1435
|
+
await mkdir(path7.dirname(filePath), { recursive: true });
|
|
1869
1436
|
const writer = new _HtmlWriter(filePath);
|
|
1870
1437
|
await writer.writeHtml();
|
|
1871
1438
|
return writer;
|
|
@@ -1891,7 +1458,7 @@ var HtmlWriter = class _HtmlWriter {
|
|
|
1891
1458
|
}
|
|
1892
1459
|
async writeHtml() {
|
|
1893
1460
|
const html = generateHtml(this.results, this.isLive);
|
|
1894
|
-
await
|
|
1461
|
+
await writeFile2(this.filePath, html, "utf8");
|
|
1895
1462
|
}
|
|
1896
1463
|
};
|
|
1897
1464
|
function generateHtml(results, isLive) {
|
|
@@ -2326,10 +1893,10 @@ var SCRIPT = `
|
|
|
2326
1893
|
h+='<div class="detail-block"><h4>Output</h4><pre class="detail-pre">'+esc(r.output?JSON.stringify(r.output,null,2):"")+"</pre></div>";
|
|
2327
1894
|
h+="</div>";
|
|
2328
1895
|
|
|
2329
|
-
/*
|
|
1896
|
+
/* grader results */
|
|
2330
1897
|
if(r.scores&&r.scores.length>0){
|
|
2331
|
-
h+="<h4>
|
|
2332
|
-
h+='<table class="eval-table"><thead><tr><th>
|
|
1898
|
+
h+="<h4>Grader Results</h4>";
|
|
1899
|
+
h+='<table class="eval-table"><thead><tr><th>Grader</th><th>Score</th><th>Status</th><th>Assertions</th></tr></thead><tbody>';
|
|
2333
1900
|
for(var i=0;i<r.scores.length;i++){
|
|
2334
1901
|
var ev=r.scores[i],evS=ev.score>=0.5?"pass":"fail";
|
|
2335
1902
|
var evAssertions=ev.assertions||[];
|
|
@@ -2375,8 +1942,8 @@ var SCRIPT = `
|
|
|
2375
1942
|
`;
|
|
2376
1943
|
|
|
2377
1944
|
// src/commands/eval/json-writer.ts
|
|
2378
|
-
import { mkdir as
|
|
2379
|
-
import
|
|
1945
|
+
import { mkdir as mkdir2, writeFile as writeFile3 } from "node:fs/promises";
|
|
1946
|
+
import path8 from "node:path";
|
|
2380
1947
|
var JsonWriter = class _JsonWriter {
|
|
2381
1948
|
filePath;
|
|
2382
1949
|
results = [];
|
|
@@ -2385,7 +1952,7 @@ var JsonWriter = class _JsonWriter {
|
|
|
2385
1952
|
this.filePath = filePath;
|
|
2386
1953
|
}
|
|
2387
1954
|
static async open(filePath) {
|
|
2388
|
-
await
|
|
1955
|
+
await mkdir2(path8.dirname(filePath), { recursive: true });
|
|
2389
1956
|
return new _JsonWriter(filePath);
|
|
2390
1957
|
}
|
|
2391
1958
|
async append(result) {
|
|
@@ -2412,15 +1979,15 @@ var JsonWriter = class _JsonWriter {
|
|
|
2412
1979
|
results: this.results
|
|
2413
1980
|
};
|
|
2414
1981
|
const snakeCaseOutput = toSnakeCaseDeep(output);
|
|
2415
|
-
await
|
|
1982
|
+
await writeFile3(this.filePath, `${JSON.stringify(snakeCaseOutput, null, 2)}
|
|
2416
1983
|
`, "utf8");
|
|
2417
1984
|
}
|
|
2418
1985
|
};
|
|
2419
1986
|
|
|
2420
1987
|
// src/commands/eval/jsonl-writer.ts
|
|
2421
1988
|
import { createWriteStream } from "node:fs";
|
|
2422
|
-
import { mkdir as
|
|
2423
|
-
import
|
|
1989
|
+
import { mkdir as mkdir3 } from "node:fs/promises";
|
|
1990
|
+
import path9 from "node:path";
|
|
2424
1991
|
import { finished } from "node:stream/promises";
|
|
2425
1992
|
var JsonlWriter = class _JsonlWriter {
|
|
2426
1993
|
stream;
|
|
@@ -2429,9 +1996,10 @@ var JsonlWriter = class _JsonlWriter {
|
|
|
2429
1996
|
constructor(stream) {
|
|
2430
1997
|
this.stream = stream;
|
|
2431
1998
|
}
|
|
2432
|
-
static async open(filePath) {
|
|
2433
|
-
await
|
|
2434
|
-
const
|
|
1999
|
+
static async open(filePath, options) {
|
|
2000
|
+
await mkdir3(path9.dirname(filePath), { recursive: true });
|
|
2001
|
+
const flags = options?.append ? "a" : "w";
|
|
2002
|
+
const stream = createWriteStream(filePath, { flags, encoding: "utf8" });
|
|
2435
2003
|
return new _JsonlWriter(stream);
|
|
2436
2004
|
}
|
|
2437
2005
|
async append(record) {
|
|
@@ -2461,8 +2029,8 @@ var JsonlWriter = class _JsonlWriter {
|
|
|
2461
2029
|
};
|
|
2462
2030
|
|
|
2463
2031
|
// src/commands/eval/junit-writer.ts
|
|
2464
|
-
import { mkdir as
|
|
2465
|
-
import
|
|
2032
|
+
import { mkdir as mkdir4, writeFile as writeFile4 } from "node:fs/promises";
|
|
2033
|
+
import path10 from "node:path";
|
|
2466
2034
|
function escapeXml(str) {
|
|
2467
2035
|
return str.replace(/&/g, "&").replace(/</g, "<").replace(/>/g, ">").replace(/"/g, """).replace(/'/g, "'");
|
|
2468
2036
|
}
|
|
@@ -2476,7 +2044,7 @@ var JunitWriter = class _JunitWriter {
|
|
|
2476
2044
|
this.threshold = options?.threshold ?? 0.5;
|
|
2477
2045
|
}
|
|
2478
2046
|
static async open(filePath, options) {
|
|
2479
|
-
await
|
|
2047
|
+
await mkdir4(path10.dirname(filePath), { recursive: true });
|
|
2480
2048
|
return new _JunitWriter(filePath, options);
|
|
2481
2049
|
}
|
|
2482
2050
|
async append(result) {
|
|
@@ -2545,14 +2113,14 @@ ${testCases.join("\n")}
|
|
|
2545
2113
|
${suiteXmls.join("\n")}
|
|
2546
2114
|
</testsuites>
|
|
2547
2115
|
`;
|
|
2548
|
-
await
|
|
2116
|
+
await writeFile4(this.filePath, xml, "utf8");
|
|
2549
2117
|
}
|
|
2550
2118
|
};
|
|
2551
2119
|
|
|
2552
2120
|
// src/commands/eval/yaml-writer.ts
|
|
2553
2121
|
import { createWriteStream as createWriteStream2 } from "node:fs";
|
|
2554
|
-
import { mkdir as
|
|
2555
|
-
import
|
|
2122
|
+
import { mkdir as mkdir5 } from "node:fs/promises";
|
|
2123
|
+
import path11 from "node:path";
|
|
2556
2124
|
import { finished as finished2 } from "node:stream/promises";
|
|
2557
2125
|
import { stringify as stringifyYaml } from "yaml";
|
|
2558
2126
|
var YamlWriter = class _YamlWriter {
|
|
@@ -2564,7 +2132,7 @@ var YamlWriter = class _YamlWriter {
|
|
|
2564
2132
|
this.stream = stream;
|
|
2565
2133
|
}
|
|
2566
2134
|
static async open(filePath) {
|
|
2567
|
-
await
|
|
2135
|
+
await mkdir5(path11.dirname(filePath), { recursive: true });
|
|
2568
2136
|
const stream = createWriteStream2(filePath, { flags: "w", encoding: "utf8" });
|
|
2569
2137
|
return new _YamlWriter(stream);
|
|
2570
2138
|
}
|
|
@@ -2604,10 +2172,10 @@ var YamlWriter = class _YamlWriter {
|
|
|
2604
2172
|
};
|
|
2605
2173
|
|
|
2606
2174
|
// src/commands/eval/output-writer.ts
|
|
2607
|
-
async function createOutputWriter(filePath, format) {
|
|
2175
|
+
async function createOutputWriter(filePath, format, options) {
|
|
2608
2176
|
switch (format) {
|
|
2609
2177
|
case "jsonl":
|
|
2610
|
-
return JsonlWriter.open(filePath);
|
|
2178
|
+
return JsonlWriter.open(filePath, { append: options?.append });
|
|
2611
2179
|
case "yaml":
|
|
2612
2180
|
return YamlWriter.open(filePath);
|
|
2613
2181
|
case "html":
|
|
@@ -2620,7 +2188,7 @@ async function createOutputWriter(filePath, format) {
|
|
|
2620
2188
|
}
|
|
2621
2189
|
var SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([".jsonl", ".json", ".xml", ".yaml", ".yml", ".html", ".htm"]);
|
|
2622
2190
|
function createWriterFromPath(filePath, options) {
|
|
2623
|
-
const ext =
|
|
2191
|
+
const ext = path12.extname(filePath).toLowerCase();
|
|
2624
2192
|
switch (ext) {
|
|
2625
2193
|
case ".jsonl":
|
|
2626
2194
|
return JsonlWriter.open(filePath);
|
|
@@ -2719,12 +2287,12 @@ var ProgressDisplay = class {
|
|
|
2719
2287
|
}
|
|
2720
2288
|
addLogPaths(paths) {
|
|
2721
2289
|
const newPaths = [];
|
|
2722
|
-
for (const
|
|
2723
|
-
if (this.logPathSet.has(
|
|
2290
|
+
for (const path17 of paths) {
|
|
2291
|
+
if (this.logPathSet.has(path17)) {
|
|
2724
2292
|
continue;
|
|
2725
2293
|
}
|
|
2726
|
-
this.logPathSet.add(
|
|
2727
|
-
newPaths.push(
|
|
2294
|
+
this.logPathSet.add(path17);
|
|
2295
|
+
newPaths.push(path17);
|
|
2728
2296
|
}
|
|
2729
2297
|
if (newPaths.length === 0) {
|
|
2730
2298
|
return;
|
|
@@ -2777,8 +2345,8 @@ async function loadNonErrorResults(jsonlPath) {
|
|
|
2777
2345
|
}
|
|
2778
2346
|
|
|
2779
2347
|
// src/commands/eval/run-cache.ts
|
|
2780
|
-
import { mkdir as
|
|
2781
|
-
import
|
|
2348
|
+
import { mkdir as mkdir6, readFile, writeFile as writeFile5 } from "node:fs/promises";
|
|
2349
|
+
import path13 from "node:path";
|
|
2782
2350
|
var CACHE_FILENAME = "cache.json";
|
|
2783
2351
|
function resolveRunCacheFile(cache) {
|
|
2784
2352
|
if (cache.lastRunDir) {
|
|
@@ -2787,27 +2355,27 @@ function resolveRunCacheFile(cache) {
|
|
|
2787
2355
|
return "";
|
|
2788
2356
|
}
|
|
2789
2357
|
function cachePath(cwd) {
|
|
2790
|
-
return
|
|
2358
|
+
return path13.join(cwd, ".agentv", CACHE_FILENAME);
|
|
2791
2359
|
}
|
|
2792
2360
|
async function loadRunCache(cwd) {
|
|
2793
2361
|
try {
|
|
2794
|
-
const content = await
|
|
2362
|
+
const content = await readFile(cachePath(cwd), "utf-8");
|
|
2795
2363
|
return JSON.parse(content);
|
|
2796
2364
|
} catch {
|
|
2797
2365
|
return void 0;
|
|
2798
2366
|
}
|
|
2799
2367
|
}
|
|
2800
2368
|
async function saveRunCache(cwd, resultPath) {
|
|
2801
|
-
if (
|
|
2369
|
+
if (path13.basename(resultPath) !== RESULT_INDEX_FILENAME) {
|
|
2802
2370
|
return;
|
|
2803
2371
|
}
|
|
2804
|
-
const dir =
|
|
2805
|
-
await
|
|
2372
|
+
const dir = path13.join(cwd, ".agentv");
|
|
2373
|
+
await mkdir6(dir, { recursive: true });
|
|
2806
2374
|
const cache = {
|
|
2807
|
-
lastRunDir:
|
|
2375
|
+
lastRunDir: path13.dirname(resultPath),
|
|
2808
2376
|
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
2809
2377
|
};
|
|
2810
|
-
await
|
|
2378
|
+
await writeFile5(cachePath(cwd), `${JSON.stringify(cache, null, 2)}
|
|
2811
2379
|
`, "utf-8");
|
|
2812
2380
|
}
|
|
2813
2381
|
|
|
@@ -3072,13 +2640,13 @@ function formatMatrixSummary(results) {
|
|
|
3072
2640
|
}
|
|
3073
2641
|
|
|
3074
2642
|
// ../../packages/core/dist/evaluation/validation/index.js
|
|
3075
|
-
import { readFile as
|
|
3076
|
-
import
|
|
2643
|
+
import { readFile as readFile2 } from "node:fs/promises";
|
|
2644
|
+
import path14 from "node:path";
|
|
3077
2645
|
import { parse } from "yaml";
|
|
3078
2646
|
import { readFile as readFile22, readdir } from "node:fs/promises";
|
|
3079
2647
|
import path22 from "node:path";
|
|
3080
2648
|
import { parse as parse2 } from "yaml";
|
|
3081
|
-
import { readFile as
|
|
2649
|
+
import { readFile as readFile3 } from "node:fs/promises";
|
|
3082
2650
|
import path32 from "node:path";
|
|
3083
2651
|
import { parse as parse3 } from "yaml";
|
|
3084
2652
|
import { readFile as readFile4 } from "node:fs/promises";
|
|
@@ -3097,7 +2665,7 @@ var SCHEMA_TARGETS_V2 = "agentv-targets-v2.2";
|
|
|
3097
2665
|
var SCHEMA_CONFIG_V2 = "agentv-config-v2";
|
|
3098
2666
|
async function detectFileType(filePath) {
|
|
3099
2667
|
try {
|
|
3100
|
-
const content = await
|
|
2668
|
+
const content = await readFile2(filePath, "utf8");
|
|
3101
2669
|
const parsed = parse(content);
|
|
3102
2670
|
if (Array.isArray(parsed)) {
|
|
3103
2671
|
return "cases";
|
|
@@ -3125,8 +2693,8 @@ async function detectFileType(filePath) {
|
|
|
3125
2693
|
}
|
|
3126
2694
|
}
|
|
3127
2695
|
function inferFileTypeFromPath(filePath) {
|
|
3128
|
-
const normalized =
|
|
3129
|
-
const basename =
|
|
2696
|
+
const normalized = path14.normalize(filePath).replace(/\\/g, "/");
|
|
2697
|
+
const basename = path14.basename(filePath);
|
|
3130
2698
|
if (normalized.includes("/.agentv/")) {
|
|
3131
2699
|
if (basename === "config.yaml" || basename === "config.yml") {
|
|
3132
2700
|
return "config";
|
|
@@ -3723,7 +3291,7 @@ function validateAssertArray(assertField, parentLocation, filePath, errors, cust
|
|
|
3723
3291
|
continue;
|
|
3724
3292
|
}
|
|
3725
3293
|
const typeValue = rawTypeValue.replace(/_/g, "-");
|
|
3726
|
-
if (!
|
|
3294
|
+
if (!isGraderKind(typeValue) && !customAssertionTypes.has(typeValue)) {
|
|
3727
3295
|
errors.push({
|
|
3728
3296
|
severity: "warning",
|
|
3729
3297
|
filePath,
|
|
@@ -3897,7 +3465,7 @@ async function validateCasesFile(filePath) {
|
|
|
3897
3465
|
const absolutePath = path32.resolve(filePath);
|
|
3898
3466
|
let parsed;
|
|
3899
3467
|
try {
|
|
3900
|
-
const content = await
|
|
3468
|
+
const content = await readFile3(absolutePath, "utf8");
|
|
3901
3469
|
parsed = parse3(content);
|
|
3902
3470
|
} catch (error) {
|
|
3903
3471
|
errors.push({
|
|
@@ -5005,6 +4573,12 @@ Errors in ${targetsFilePath}:`);
|
|
|
5005
4573
|
|
|
5006
4574
|
// src/commands/eval/run-eval.ts
|
|
5007
4575
|
var DEFAULT_WORKERS = 3;
|
|
4576
|
+
function shouldSkipExistingResultForResume(result, rerunFailed) {
|
|
4577
|
+
if (rerunFailed) {
|
|
4578
|
+
return result.executionStatus === "ok";
|
|
4579
|
+
}
|
|
4580
|
+
return result.executionStatus !== "execution_error";
|
|
4581
|
+
}
|
|
5008
4582
|
function normalizeBoolean(value) {
|
|
5009
4583
|
return value === true;
|
|
5010
4584
|
}
|
|
@@ -5175,6 +4749,8 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
|
|
|
5175
4749
|
otelCaptureContent: normalizeBoolean(rawOptions.otelCaptureContent) || yamlExecution?.otel_capture_content === true,
|
|
5176
4750
|
otelGroupTurns: normalizeBoolean(rawOptions.otelGroupTurns) || yamlExecution?.otel_group_turns === true,
|
|
5177
4751
|
retryErrors: normalizeString(rawOptions.retryErrors),
|
|
4752
|
+
resume: normalizeBoolean(rawOptions.resume) || normalizeBoolean(rawOptions.rerunFailed),
|
|
4753
|
+
rerunFailed: normalizeBoolean(rawOptions.rerunFailed),
|
|
5178
4754
|
workspaceMode,
|
|
5179
4755
|
workspacePath,
|
|
5180
4756
|
// Precedence: CLI > YAML config > TS config
|
|
@@ -5201,7 +4777,7 @@ async function ensureFileExists(filePath, description) {
|
|
|
5201
4777
|
function buildDefaultOutputPathForExperiment(cwd, experiment) {
|
|
5202
4778
|
const runDir = buildDefaultRunDir(cwd, experiment);
|
|
5203
4779
|
mkdirSync(runDir, { recursive: true });
|
|
5204
|
-
return
|
|
4780
|
+
return path15.join(runDir, "index.jsonl");
|
|
5205
4781
|
}
|
|
5206
4782
|
function createProgressReporter(maxWorkers, options) {
|
|
5207
4783
|
const display = new ProgressDisplay(maxWorkers, options);
|
|
@@ -5215,7 +4791,7 @@ function createProgressReporter(maxWorkers, options) {
|
|
|
5215
4791
|
};
|
|
5216
4792
|
}
|
|
5217
4793
|
function makeTestCaseKey(testFilePath, testId) {
|
|
5218
|
-
return `${
|
|
4794
|
+
return `${path15.resolve(testFilePath)}::${testId}`;
|
|
5219
4795
|
}
|
|
5220
4796
|
function resolveTargetLabel(requestedName, resolvedName) {
|
|
5221
4797
|
if (resolvedName !== requestedName) {
|
|
@@ -5277,7 +4853,7 @@ async function prepareFileMetadata(params) {
|
|
|
5277
4853
|
repoRoot,
|
|
5278
4854
|
verbose: options.verbose
|
|
5279
4855
|
});
|
|
5280
|
-
const relativePath =
|
|
4856
|
+
const relativePath = path15.relative(cwd, testFilePath);
|
|
5281
4857
|
const category = deriveCategory(relativePath);
|
|
5282
4858
|
const suite = await loadTestSuite(testFilePath, repoRoot, {
|
|
5283
4859
|
verbose: options.verbose,
|
|
@@ -5302,7 +4878,53 @@ async function prepareFileMetadata(params) {
|
|
|
5302
4878
|
selections = [
|
|
5303
4879
|
{
|
|
5304
4880
|
selection: transcriptSelection,
|
|
5305
|
-
inlineTargetLabel: `transcript (${
|
|
4881
|
+
inlineTargetLabel: `transcript (${path15.basename(options.transcript)})`
|
|
4882
|
+
}
|
|
4883
|
+
];
|
|
4884
|
+
} else if (suite.inlineTarget && options.cliTargets.length === 0) {
|
|
4885
|
+
const targetDefinition = suite.inlineTarget;
|
|
4886
|
+
const resolvedTarget = options.dryRun ? {
|
|
4887
|
+
kind: "mock",
|
|
4888
|
+
name: `${targetDefinition.name}-dry-run`,
|
|
4889
|
+
graderTarget: void 0,
|
|
4890
|
+
config: {
|
|
4891
|
+
response: '{"answer":"Mock dry-run response"}',
|
|
4892
|
+
delayMs: options.dryRunDelay,
|
|
4893
|
+
delayMinMs: options.dryRunDelayMin,
|
|
4894
|
+
delayMaxMs: options.dryRunDelayMax
|
|
4895
|
+
}
|
|
4896
|
+
} : resolveTargetDefinition(targetDefinition, process.env, testFilePath, {
|
|
4897
|
+
emitDeprecationWarnings: false
|
|
4898
|
+
});
|
|
4899
|
+
selections = [
|
|
4900
|
+
{
|
|
4901
|
+
selection: {
|
|
4902
|
+
definitions: [targetDefinition],
|
|
4903
|
+
resolvedTarget,
|
|
4904
|
+
targetName: targetDefinition.name,
|
|
4905
|
+
targetSource: "test-file",
|
|
4906
|
+
targetsFilePath: testFilePath
|
|
4907
|
+
},
|
|
4908
|
+
inlineTargetLabel: resolveTargetLabel(targetDefinition.name, resolvedTarget.name)
|
|
4909
|
+
}
|
|
4910
|
+
];
|
|
4911
|
+
} else if (suite.providerFactory && options.cliTargets.length === 0) {
|
|
4912
|
+
const taskTarget = {
|
|
4913
|
+
kind: "mock",
|
|
4914
|
+
name: "custom-task",
|
|
4915
|
+
graderTarget: void 0,
|
|
4916
|
+
config: {}
|
|
4917
|
+
};
|
|
4918
|
+
selections = [
|
|
4919
|
+
{
|
|
4920
|
+
selection: {
|
|
4921
|
+
definitions: [],
|
|
4922
|
+
resolvedTarget: taskTarget,
|
|
4923
|
+
targetName: "custom-task",
|
|
4924
|
+
targetSource: "test-file",
|
|
4925
|
+
targetsFilePath: testFilePath
|
|
4926
|
+
},
|
|
4927
|
+
inlineTargetLabel: "custom-task"
|
|
5306
4928
|
}
|
|
5307
4929
|
];
|
|
5308
4930
|
} else {
|
|
@@ -5372,10 +4994,11 @@ async function prepareFileMetadata(params) {
|
|
|
5372
4994
|
yamlWorkers: suite.workers,
|
|
5373
4995
|
yamlCache: suite.cacheConfig?.enabled,
|
|
5374
4996
|
yamlCachePath: suite.cacheConfig?.cachePath,
|
|
5375
|
-
|
|
4997
|
+
budgetUsd: suite.budgetUsd,
|
|
5376
4998
|
failOnError: suite.failOnError,
|
|
5377
4999
|
threshold: suite.threshold,
|
|
5378
|
-
tags: suite.metadata?.tags
|
|
5000
|
+
tags: suite.metadata?.tags,
|
|
5001
|
+
providerFactory: suite.providerFactory
|
|
5379
5002
|
};
|
|
5380
5003
|
}
|
|
5381
5004
|
async function runSingleEvalFile(params) {
|
|
@@ -5398,7 +5021,7 @@ async function runSingleEvalFile(params) {
|
|
|
5398
5021
|
testCases,
|
|
5399
5022
|
trialsConfig,
|
|
5400
5023
|
matrixMode,
|
|
5401
|
-
|
|
5024
|
+
budgetUsd,
|
|
5402
5025
|
failOnError,
|
|
5403
5026
|
providerFactory
|
|
5404
5027
|
} = params;
|
|
@@ -5464,7 +5087,7 @@ async function runSingleEvalFile(params) {
|
|
|
5464
5087
|
workspacePath: options.workspacePath,
|
|
5465
5088
|
keepWorkspaces: options.keepWorkspaces,
|
|
5466
5089
|
trials: trialsConfig,
|
|
5467
|
-
|
|
5090
|
+
budgetUsd,
|
|
5468
5091
|
failOnError,
|
|
5469
5092
|
graderTarget: options.graderTarget,
|
|
5470
5093
|
model: options.model,
|
|
@@ -5537,7 +5160,7 @@ async function runEvalCommand(input) {
|
|
|
5537
5160
|
);
|
|
5538
5161
|
}
|
|
5539
5162
|
const repoRoot = await findRepoRoot(cwd);
|
|
5540
|
-
const yamlConfig = await loadConfig(
|
|
5163
|
+
const yamlConfig = await loadConfig(path15.join(cwd, "_"), repoRoot);
|
|
5541
5164
|
if (yamlConfig?.required_version) {
|
|
5542
5165
|
await enforceRequiredVersion(yamlConfig.required_version, {
|
|
5543
5166
|
strict: normalizeBoolean(input.rawOptions.strict)
|
|
@@ -5552,7 +5175,7 @@ async function runEvalCommand(input) {
|
|
|
5552
5175
|
}
|
|
5553
5176
|
let retryNonErrorResults;
|
|
5554
5177
|
if (options.retryErrors) {
|
|
5555
|
-
const retryPath =
|
|
5178
|
+
const retryPath = path15.resolve(options.retryErrors);
|
|
5556
5179
|
await ensureFileExists(retryPath, "Retry-errors JSONL file");
|
|
5557
5180
|
const completedIds = await loadFullyCompletedTestIds(retryPath);
|
|
5558
5181
|
const errorIds = await loadErrorTestIds(retryPath);
|
|
@@ -5565,8 +5188,37 @@ async function runEvalCommand(input) {
|
|
|
5565
5188
|
console.log(`Skipping ${completedIds.length} already-completed test(s).`);
|
|
5566
5189
|
}
|
|
5567
5190
|
}
|
|
5191
|
+
let resumeSkipKeys;
|
|
5192
|
+
let isResumeAppend = false;
|
|
5193
|
+
if (options.resume && !options.retryErrors) {
|
|
5194
|
+
const explicitResumeDir = options.outputDir ?? options.artifacts;
|
|
5195
|
+
if (explicitResumeDir) {
|
|
5196
|
+
const resumeIndexPath = path15.join(path15.resolve(explicitResumeDir), "index.jsonl");
|
|
5197
|
+
if (existsSync2(resumeIndexPath)) {
|
|
5198
|
+
const content = await readFile8(resumeIndexPath, "utf8");
|
|
5199
|
+
const existingResults = parseJsonlResults(content);
|
|
5200
|
+
resumeSkipKeys = /* @__PURE__ */ new Set();
|
|
5201
|
+
for (const r of existingResults) {
|
|
5202
|
+
if (shouldSkipExistingResultForResume(r, options.rerunFailed)) {
|
|
5203
|
+
resumeSkipKeys.add(buildTestTargetKey(r.testId, r.target));
|
|
5204
|
+
}
|
|
5205
|
+
}
|
|
5206
|
+
isResumeAppend = true;
|
|
5207
|
+
const modeLabel = options.rerunFailed ? "Rerun-failed" : "Resume";
|
|
5208
|
+
console.log(
|
|
5209
|
+
`${modeLabel}: found ${existingResults.length} existing result(s), skipping ${resumeSkipKeys.size} completed.`
|
|
5210
|
+
);
|
|
5211
|
+
} else {
|
|
5212
|
+
console.log("Resume: no existing index.jsonl found, starting fresh run.");
|
|
5213
|
+
}
|
|
5214
|
+
} else {
|
|
5215
|
+
console.warn(
|
|
5216
|
+
"Warning: --resume requires --output <dir> to identify the run directory. Ignoring --resume."
|
|
5217
|
+
);
|
|
5218
|
+
}
|
|
5219
|
+
}
|
|
5568
5220
|
if (options.workspacePath) {
|
|
5569
|
-
const resolvedWorkspace =
|
|
5221
|
+
const resolvedWorkspace = path15.resolve(options.workspacePath);
|
|
5570
5222
|
try {
|
|
5571
5223
|
const { stat: stat2 } = await import("node:fs/promises");
|
|
5572
5224
|
const stats = await stat2(resolvedWorkspace);
|
|
@@ -5607,25 +5259,25 @@ async function runEvalCommand(input) {
|
|
|
5607
5259
|
let outputPath;
|
|
5608
5260
|
let usesDefaultArtifactWorkspace;
|
|
5609
5261
|
if (explicitDir) {
|
|
5610
|
-
runDir =
|
|
5262
|
+
runDir = path15.resolve(explicitDir);
|
|
5611
5263
|
mkdirSync(runDir, { recursive: true });
|
|
5612
|
-
outputPath =
|
|
5264
|
+
outputPath = path15.join(runDir, "index.jsonl");
|
|
5613
5265
|
usesDefaultArtifactWorkspace = true;
|
|
5614
5266
|
} else if (options.outPath) {
|
|
5615
|
-
outputPath =
|
|
5616
|
-
runDir =
|
|
5267
|
+
outputPath = path15.resolve(options.outPath);
|
|
5268
|
+
runDir = path15.dirname(outputPath);
|
|
5617
5269
|
mkdirSync(runDir, { recursive: true });
|
|
5618
5270
|
usesDefaultArtifactWorkspace = false;
|
|
5619
5271
|
} else {
|
|
5620
5272
|
outputPath = buildDefaultOutputPathForExperiment(cwd, options.experiment);
|
|
5621
|
-
runDir =
|
|
5273
|
+
runDir = path15.dirname(outputPath);
|
|
5622
5274
|
usesDefaultArtifactWorkspace = true;
|
|
5623
5275
|
}
|
|
5624
5276
|
let otelExporter = null;
|
|
5625
5277
|
const useFileExport = !!options.otelFile;
|
|
5626
5278
|
if (options.exportOtel || useFileExport) {
|
|
5627
5279
|
try {
|
|
5628
|
-
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-
|
|
5280
|
+
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-RTIUSC6L.js");
|
|
5629
5281
|
let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
|
|
5630
5282
|
let headers = {};
|
|
5631
5283
|
if (options.otelBackend) {
|
|
@@ -5649,7 +5301,7 @@ async function runEvalCommand(input) {
|
|
|
5649
5301
|
headers,
|
|
5650
5302
|
captureContent,
|
|
5651
5303
|
groupTurns: options.otelGroupTurns,
|
|
5652
|
-
otlpFilePath: options.otelFile ?
|
|
5304
|
+
otlpFilePath: options.otelFile ? path15.resolve(options.otelFile) : void 0
|
|
5653
5305
|
});
|
|
5654
5306
|
const initialized = await otelExporter.init();
|
|
5655
5307
|
if (!initialized) {
|
|
@@ -5666,7 +5318,7 @@ async function runEvalCommand(input) {
|
|
|
5666
5318
|
}
|
|
5667
5319
|
}
|
|
5668
5320
|
const primaryWritePath = outputPath;
|
|
5669
|
-
const resolvedExportPaths = options.exportPaths.map((p) =>
|
|
5321
|
+
const resolvedExportPaths = options.exportPaths.map((p) => path15.resolve(p));
|
|
5670
5322
|
console.log(`Artifact directory: ${runDir}`);
|
|
5671
5323
|
if (resolvedExportPaths.length > 0) {
|
|
5672
5324
|
console.log("Export files:");
|
|
@@ -5674,9 +5326,9 @@ async function runEvalCommand(input) {
|
|
|
5674
5326
|
console.log(` ${p}`);
|
|
5675
5327
|
}
|
|
5676
5328
|
}
|
|
5677
|
-
const resolvedTestFiles = input.testFiles.map((file) =>
|
|
5329
|
+
const resolvedTestFiles = input.testFiles.map((file) => path15.resolve(file));
|
|
5678
5330
|
if (options.otelFile) {
|
|
5679
|
-
console.log(`OTLP JSON file: ${
|
|
5331
|
+
console.log(`OTLP JSON file: ${path15.resolve(options.otelFile)}`);
|
|
5680
5332
|
}
|
|
5681
5333
|
const evaluationRunner = await resolveEvaluationRunner();
|
|
5682
5334
|
const allResults = [];
|
|
@@ -5685,23 +5337,7 @@ async function runEvalCommand(input) {
|
|
|
5685
5337
|
const displayIdTracker = createDisplayIdTracker();
|
|
5686
5338
|
const perFileWorkers = options.workers;
|
|
5687
5339
|
const fileMetadata = /* @__PURE__ */ new Map();
|
|
5688
|
-
const tsFiles = [];
|
|
5689
|
-
const yamlFiles = [];
|
|
5690
5340
|
for (const testFilePath of resolvedTestFiles) {
|
|
5691
|
-
if (/\.(ts|js|mts|mjs)$/.test(testFilePath)) {
|
|
5692
|
-
tsFiles.push(testFilePath);
|
|
5693
|
-
} else {
|
|
5694
|
-
yamlFiles.push(testFilePath);
|
|
5695
|
-
}
|
|
5696
|
-
}
|
|
5697
|
-
for (const tsFile of tsFiles) {
|
|
5698
|
-
await ensureFileExists(tsFile, "TypeScript eval file");
|
|
5699
|
-
await import(pathToFileURL(tsFile).href);
|
|
5700
|
-
}
|
|
5701
|
-
if (yamlFiles.length === 0 && tsFiles.length > 0) {
|
|
5702
|
-
return;
|
|
5703
|
-
}
|
|
5704
|
-
for (const testFilePath of yamlFiles) {
|
|
5705
5341
|
const meta = await prepareFileMetadata({
|
|
5706
5342
|
testFilePath,
|
|
5707
5343
|
repoRoot,
|
|
@@ -5716,7 +5352,7 @@ async function runEvalCommand(input) {
|
|
|
5716
5352
|
for (const [testFilePath, meta] of fileMetadata.entries()) {
|
|
5717
5353
|
if (!matchesTagFilters(meta.tags, options.tags, options.excludeTags)) {
|
|
5718
5354
|
fileMetadata.delete(testFilePath);
|
|
5719
|
-
skippedFiles.push(
|
|
5355
|
+
skippedFiles.push(path15.relative(cwd, testFilePath));
|
|
5720
5356
|
}
|
|
5721
5357
|
}
|
|
5722
5358
|
if (skippedFiles.length > 0 && options.verbose) {
|
|
@@ -5737,7 +5373,7 @@ async function runEvalCommand(input) {
|
|
|
5737
5373
|
cliNoCache: options.noCache,
|
|
5738
5374
|
yamlCache: yamlCacheEnabled
|
|
5739
5375
|
});
|
|
5740
|
-
const cache = cacheEnabled ? new ResponseCache(yamlCachePath ?
|
|
5376
|
+
const cache = cacheEnabled ? new ResponseCache(yamlCachePath ? path15.resolve(yamlCachePath) : void 0) : void 0;
|
|
5741
5377
|
if (cacheEnabled) {
|
|
5742
5378
|
console.log(`Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ""}`);
|
|
5743
5379
|
}
|
|
@@ -5747,14 +5383,25 @@ async function runEvalCommand(input) {
|
|
|
5747
5383
|
throw new Error("--threshold must be between 0 and 1");
|
|
5748
5384
|
}
|
|
5749
5385
|
const writerOptions = resolvedThreshold !== void 0 ? { threshold: resolvedThreshold } : void 0;
|
|
5750
|
-
const outputWriter = await createOutputWriter(primaryWritePath, "jsonl"
|
|
5386
|
+
const outputWriter = await createOutputWriter(primaryWritePath, "jsonl", {
|
|
5387
|
+
append: isResumeAppend
|
|
5388
|
+
});
|
|
5751
5389
|
const isMatrixMode = Array.from(fileMetadata.values()).some((meta) => meta.selections.length > 1);
|
|
5752
5390
|
let totalEvalCount = 0;
|
|
5391
|
+
let resumeSkippedCount = 0;
|
|
5753
5392
|
for (const meta of fileMetadata.values()) {
|
|
5754
5393
|
const suiteTargetNames = meta.selections.map((s) => s.selection.targetName);
|
|
5755
5394
|
for (const test of meta.testCases) {
|
|
5756
5395
|
const testTargetNames = test.targets && test.targets.length > 0 ? test.targets.filter((t) => suiteTargetNames.includes(t)) : suiteTargetNames;
|
|
5757
|
-
|
|
5396
|
+
const effectiveTargets = testTargetNames.length > 0 ? testTargetNames : ["unknown"];
|
|
5397
|
+
for (const tn of effectiveTargets) {
|
|
5398
|
+
const key = `${test.id}::${tn}`;
|
|
5399
|
+
if (resumeSkipKeys?.has(key)) {
|
|
5400
|
+
resumeSkippedCount++;
|
|
5401
|
+
} else {
|
|
5402
|
+
totalEvalCount++;
|
|
5403
|
+
}
|
|
5404
|
+
}
|
|
5758
5405
|
}
|
|
5759
5406
|
}
|
|
5760
5407
|
if (totalEvalCount === 0) {
|
|
@@ -5762,6 +5409,10 @@ async function runEvalCommand(input) {
|
|
|
5762
5409
|
console.log("No execution errors or missing cases in the previous run. Nothing to retry.");
|
|
5763
5410
|
return;
|
|
5764
5411
|
}
|
|
5412
|
+
if (resumeSkipKeys && resumeSkippedCount > 0) {
|
|
5413
|
+
console.log(`Nothing to resume \u2014 all ${resumeSkippedCount} test(s) already completed.`);
|
|
5414
|
+
return;
|
|
5415
|
+
}
|
|
5765
5416
|
throw new Error("No tests matched the provided filters.");
|
|
5766
5417
|
}
|
|
5767
5418
|
const progressReporter = createProgressReporter(options.workers ?? DEFAULT_WORKERS, {
|
|
@@ -5821,7 +5472,7 @@ async function runEvalCommand(input) {
|
|
|
5821
5472
|
const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f));
|
|
5822
5473
|
let transcriptProviderFactory;
|
|
5823
5474
|
if (options.transcript) {
|
|
5824
|
-
const { TranscriptProvider } = await import("./dist-
|
|
5475
|
+
const { TranscriptProvider } = await import("./dist-RTIUSC6L.js");
|
|
5825
5476
|
const transcriptProvider = await TranscriptProvider.fromFile(options.transcript);
|
|
5826
5477
|
const totalTests = [...fileMetadata.values()].reduce(
|
|
5827
5478
|
(sum, meta) => sum + meta.testCases.length,
|
|
@@ -5852,7 +5503,10 @@ async function runEvalCommand(input) {
|
|
|
5852
5503
|
}
|
|
5853
5504
|
return true;
|
|
5854
5505
|
}) : targetPrep.testCases;
|
|
5855
|
-
|
|
5506
|
+
const filteredTestCases = resumeSkipKeys ? applicableTestCases.filter(
|
|
5507
|
+
(test) => !resumeSkipKeys.has(buildTestTargetKey(test.id, targetName))
|
|
5508
|
+
) : applicableTestCases;
|
|
5509
|
+
if (filteredTestCases.length === 0) {
|
|
5856
5510
|
return [];
|
|
5857
5511
|
}
|
|
5858
5512
|
try {
|
|
@@ -5872,15 +5526,15 @@ async function runEvalCommand(input) {
|
|
|
5872
5526
|
displayIdTracker,
|
|
5873
5527
|
selection,
|
|
5874
5528
|
inlineTargetLabel,
|
|
5875
|
-
testCases:
|
|
5529
|
+
testCases: filteredTestCases,
|
|
5876
5530
|
trialsConfig: options.transcript ? void 0 : targetPrep.trialsConfig,
|
|
5877
5531
|
matrixMode: targetPrep.selections.length > 1,
|
|
5878
|
-
|
|
5532
|
+
budgetUsd: targetPrep.budgetUsd,
|
|
5879
5533
|
failOnError: targetPrep.failOnError,
|
|
5880
5534
|
threshold: resolvedThreshold,
|
|
5881
|
-
providerFactory: transcriptProviderFactory
|
|
5535
|
+
providerFactory: transcriptProviderFactory ?? targetPrep.providerFactory
|
|
5882
5536
|
});
|
|
5883
|
-
const evalFile =
|
|
5537
|
+
const evalFile = path15.relative(cwd, testFilePath);
|
|
5884
5538
|
const existingSummary = remoteEvalSummaries.find(
|
|
5885
5539
|
(summary2) => summary2.evalFile === evalFile
|
|
5886
5540
|
);
|
|
@@ -5897,10 +5551,10 @@ async function runEvalCommand(input) {
|
|
|
5897
5551
|
const message = fileError instanceof Error ? fileError.message : String(fileError);
|
|
5898
5552
|
console.error(
|
|
5899
5553
|
`
|
|
5900
|
-
[ERROR] \u26A0 Eval file failed: ${
|
|
5554
|
+
[ERROR] \u26A0 Eval file failed: ${path15.basename(testFilePath)} \u2014 ${message}
|
|
5901
5555
|
`
|
|
5902
5556
|
);
|
|
5903
|
-
const errorResults =
|
|
5557
|
+
const errorResults = filteredTestCases.map((testCase) => ({
|
|
5904
5558
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
5905
5559
|
testId: testCase.id,
|
|
5906
5560
|
score: 0,
|
|
@@ -5936,37 +5590,60 @@ async function runEvalCommand(input) {
|
|
|
5936
5590
|
`Merged ${retryNonErrorResults.length} non-error result(s) from previous output.`
|
|
5937
5591
|
);
|
|
5938
5592
|
}
|
|
5593
|
+
await outputWriter.close().catch(() => void 0);
|
|
5594
|
+
let summaryResults = allResults;
|
|
5595
|
+
if (isResumeAppend && usesDefaultArtifactWorkspace) {
|
|
5596
|
+
const content = await readFile8(outputPath, "utf8");
|
|
5597
|
+
summaryResults = deduplicateByTestIdTarget(parseJsonlResults(content));
|
|
5598
|
+
}
|
|
5939
5599
|
const thresholdOpts = resolvedThreshold !== void 0 ? { threshold: resolvedThreshold } : void 0;
|
|
5940
|
-
const summary = calculateEvaluationSummary(
|
|
5600
|
+
const summary = calculateEvaluationSummary(summaryResults, thresholdOpts);
|
|
5941
5601
|
console.log(formatEvaluationSummary(summary, thresholdOpts));
|
|
5942
5602
|
const allExecutionErrors = summary.total > 0 && summary.executionErrorCount === summary.total;
|
|
5943
5603
|
const thresholdFailed = resolvedThreshold !== void 0 && summary.qualityFailureCount > 0;
|
|
5944
|
-
if (isMatrixMode &&
|
|
5945
|
-
console.log(formatMatrixSummary(
|
|
5604
|
+
if (isMatrixMode && summaryResults.length > 0) {
|
|
5605
|
+
console.log(formatMatrixSummary(summaryResults));
|
|
5946
5606
|
}
|
|
5947
5607
|
if (options.benchmarkJson && allResults.length > 0) {
|
|
5948
|
-
const benchmarkPath =
|
|
5608
|
+
const benchmarkPath = path15.resolve(options.benchmarkJson);
|
|
5949
5609
|
await writeBenchmarkJson(benchmarkPath, allResults);
|
|
5950
5610
|
console.log(`Benchmark written to: ${benchmarkPath}`);
|
|
5951
5611
|
}
|
|
5952
5612
|
if (usesDefaultArtifactWorkspace && allResults.length > 0) {
|
|
5953
5613
|
const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : "";
|
|
5954
|
-
|
|
5955
|
-
|
|
5956
|
-
|
|
5957
|
-
|
|
5958
|
-
|
|
5959
|
-
|
|
5960
|
-
|
|
5961
|
-
|
|
5962
|
-
|
|
5963
|
-
|
|
5964
|
-
|
|
5965
|
-
|
|
5966
|
-
` Per-test artifacts: ${
|
|
5967
|
-
|
|
5968
|
-
|
|
5969
|
-
|
|
5614
|
+
if (isResumeAppend) {
|
|
5615
|
+
const { writePerTestArtifacts } = await import("./artifact-writer-YATMDPWI.js");
|
|
5616
|
+
await writePerTestArtifacts(allResults, runDir, {
|
|
5617
|
+
experiment: normalizeExperimentName(options.experiment)
|
|
5618
|
+
});
|
|
5619
|
+
const { benchmarkPath: workspaceBenchmarkPath, timingPath } = await aggregateRunDir(
|
|
5620
|
+
runDir,
|
|
5621
|
+
{ evalFile, experiment: normalizeExperimentName(options.experiment) }
|
|
5622
|
+
);
|
|
5623
|
+
const indexPath = path15.join(runDir, "index.jsonl");
|
|
5624
|
+
console.log(`Artifact workspace updated: ${runDir}`);
|
|
5625
|
+
console.log(` Index: ${indexPath}`);
|
|
5626
|
+
console.log(` Per-test artifacts: ${runDir} (${allResults.length} new test directories)`);
|
|
5627
|
+
console.log(` Timing: ${timingPath}`);
|
|
5628
|
+
console.log(` Benchmark: ${workspaceBenchmarkPath}`);
|
|
5629
|
+
} else {
|
|
5630
|
+
const {
|
|
5631
|
+
testArtifactDir,
|
|
5632
|
+
timingPath,
|
|
5633
|
+
benchmarkPath: workspaceBenchmarkPath,
|
|
5634
|
+
indexPath
|
|
5635
|
+
} = await writeArtifactsFromResults(allResults, runDir, {
|
|
5636
|
+
evalFile,
|
|
5637
|
+
experiment: normalizeExperimentName(options.experiment)
|
|
5638
|
+
});
|
|
5639
|
+
console.log(`Artifact workspace written to: ${runDir}`);
|
|
5640
|
+
console.log(` Index: ${indexPath}`);
|
|
5641
|
+
console.log(
|
|
5642
|
+
` Per-test artifacts: ${testArtifactDir} (${allResults.length} test directories)`
|
|
5643
|
+
);
|
|
5644
|
+
console.log(` Timing: ${timingPath}`);
|
|
5645
|
+
console.log(` Benchmark: ${workspaceBenchmarkPath}`);
|
|
5646
|
+
}
|
|
5970
5647
|
}
|
|
5971
5648
|
if (resolvedExportPaths.length > 0 && allResults.length > 0) {
|
|
5972
5649
|
for (const exportPath of resolvedExportPaths) {
|
|
@@ -5977,7 +5654,7 @@ async function runEvalCommand(input) {
|
|
|
5977
5654
|
await writer.close();
|
|
5978
5655
|
}
|
|
5979
5656
|
console.log(
|
|
5980
|
-
`Export file(s) written: ${resolvedExportPaths.map((p) =>
|
|
5657
|
+
`Export file(s) written: ${resolvedExportPaths.map((p) => path15.relative(cwd, p)).join(", ")}`
|
|
5981
5658
|
);
|
|
5982
5659
|
}
|
|
5983
5660
|
const resultsWithWorkspaces = allResults.filter((r) => r.workspacePath);
|
|
@@ -6015,14 +5692,14 @@ Results written to: ${outputPath}`);
|
|
|
6015
5692
|
experiment: normalizeExperimentName(options.experiment)
|
|
6016
5693
|
});
|
|
6017
5694
|
}
|
|
6018
|
-
if (summary.executionErrorCount > 0 && !options.retryErrors) {
|
|
6019
|
-
const evalFileArgs = activeTestFiles.map((f) =>
|
|
5695
|
+
if (summary.executionErrorCount > 0 && !options.retryErrors && !options.resume) {
|
|
5696
|
+
const evalFileArgs = activeTestFiles.map((f) => path15.relative(cwd, f)).join(" ");
|
|
6020
5697
|
const targetFlag = options.target ? ` --target ${options.target}` : "";
|
|
6021
|
-
const
|
|
5698
|
+
const relativeRunDir = path15.relative(cwd, runDir);
|
|
6022
5699
|
console.log(
|
|
6023
5700
|
`
|
|
6024
5701
|
Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed tests with:
|
|
6025
|
-
agentv eval run ${evalFileArgs}${targetFlag} --
|
|
5702
|
+
agentv eval run ${evalFileArgs}${targetFlag} --output ${relativeRunDir} --rerun-failed`
|
|
6026
5703
|
);
|
|
6027
5704
|
}
|
|
6028
5705
|
return {
|
|
@@ -6052,7 +5729,7 @@ async function resolveEvaluationRunner() {
|
|
|
6052
5729
|
if (!overridePath) {
|
|
6053
5730
|
return runEvaluation;
|
|
6054
5731
|
}
|
|
6055
|
-
const resolved =
|
|
5732
|
+
const resolved = path15.isAbsolute(overridePath) ? overridePath : path15.resolve(process.cwd(), overridePath);
|
|
6056
5733
|
const moduleUrl = pathToFileURL(resolved).href;
|
|
6057
5734
|
const mod = await import(moduleUrl);
|
|
6058
5735
|
const candidate = mod.runEvaluation;
|
|
@@ -6065,11 +5742,11 @@ async function resolveEvaluationRunner() {
|
|
|
6065
5742
|
}
|
|
6066
5743
|
|
|
6067
5744
|
// src/commands/eval/discover.ts
|
|
6068
|
-
import
|
|
5745
|
+
import path16 from "node:path";
|
|
6069
5746
|
import fg2 from "fast-glob";
|
|
6070
5747
|
async function discoverEvalFiles(cwd) {
|
|
6071
5748
|
const repoRoot = await findRepoRoot(cwd);
|
|
6072
|
-
const config = await loadConfig(
|
|
5749
|
+
const config = await loadConfig(path16.join(cwd, "_"), repoRoot);
|
|
6073
5750
|
const patterns = config?.eval_patterns && config.eval_patterns.length > 0 ? config.eval_patterns : DEFAULT_EVAL_PATTERNS;
|
|
6074
5751
|
const ignore = ["**/node_modules/**", "**/dist/**"];
|
|
6075
5752
|
const matches = await fg2(patterns, {
|
|
@@ -6081,7 +5758,7 @@ async function discoverEvalFiles(cwd) {
|
|
|
6081
5758
|
caseSensitiveMatch: false
|
|
6082
5759
|
});
|
|
6083
5760
|
const evalFiles = matches.map((absPath) => {
|
|
6084
|
-
const relativePath =
|
|
5761
|
+
const relativePath = path16.relative(cwd, absPath);
|
|
6085
5762
|
const category = deriveCategory(relativePath);
|
|
6086
5763
|
return { path: absPath, relativePath, category };
|
|
6087
5764
|
});
|
|
@@ -6103,10 +5780,6 @@ function filterByCategory(files, category) {
|
|
|
6103
5780
|
|
|
6104
5781
|
export {
|
|
6105
5782
|
package_default,
|
|
6106
|
-
toSnakeCaseDeep,
|
|
6107
|
-
RESULT_INDEX_FILENAME,
|
|
6108
|
-
buildDefaultRunDir,
|
|
6109
|
-
resolveRunManifestPath,
|
|
6110
5783
|
parseResultManifest,
|
|
6111
5784
|
resolveResultSourcePath,
|
|
6112
5785
|
loadManifestResults,
|
|
@@ -6132,7 +5805,6 @@ export {
|
|
|
6132
5805
|
listMergedResultFiles,
|
|
6133
5806
|
findRunById,
|
|
6134
5807
|
maybeAutoExportRunArtifacts,
|
|
6135
|
-
writeArtifactsFromResults,
|
|
6136
5808
|
resolveRunCacheFile,
|
|
6137
5809
|
loadRunCache,
|
|
6138
5810
|
detectFileType,
|
|
@@ -6150,4 +5822,4 @@ export {
|
|
|
6150
5822
|
getCategories,
|
|
6151
5823
|
filterByCategory
|
|
6152
5824
|
};
|
|
6153
|
-
//# sourceMappingURL=chunk-
|
|
5825
|
+
//# sourceMappingURL=chunk-YGNBTBXW.js.map
|