agentv 4.10.0 → 4.11.2-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-XOSNETAV.js → chunk-BAUNAXHT.js} +1 -1
- package/dist/chunk-BPGJ4HBU.js +183 -0
- package/dist/chunk-BPGJ4HBU.js.map +1 -0
- package/dist/{chunk-KF6BABQ5.js → chunk-DHWFLK6T.js} +1090 -303
- package/dist/chunk-DHWFLK6T.js.map +1 -0
- package/dist/{chunk-SE73HJZG.js → chunk-FQGY6QXQ.js} +780 -346
- package/dist/chunk-FQGY6QXQ.js.map +1 -0
- package/dist/chunk-NPVGBFF6.js +151 -0
- package/dist/chunk-NPVGBFF6.js.map +1 -0
- package/dist/{chunk-VA64NETD.js → chunk-YLVQNF23.js} +1120 -731
- package/dist/chunk-YLVQNF23.js.map +1 -0
- package/dist/cli.js +6 -4
- package/dist/cli.js.map +1 -1
- package/dist/{dist-XDNB4WDT.js → dist-HNSXNRVK.js} +36 -3
- package/dist/docker-workspace-RPPXBT27-B4AQHVWA.js +11 -0
- package/dist/{esm-CZAWIY6F.js → esm-UYZ3HJBU.js} +2 -2
- package/dist/esm-UYZ3HJBU.js.map +1 -0
- package/dist/exec-AR6JUUN5-6MBPURPR.js +11 -0
- package/dist/exec-AR6JUUN5-6MBPURPR.js.map +1 -0
- package/dist/index.js +6 -4
- package/dist/{interactive-SNKK6VCV.js → interactive-OPHUF2UP.js} +6 -4
- package/dist/{interactive-SNKK6VCV.js.map → interactive-OPHUF2UP.js.map} +1 -1
- package/dist/{src-ML4D2MC2.js → src-PXDA7QIS.js} +2 -2
- package/dist/studio/assets/index-Bi-KHfNm.js +65 -0
- package/dist/studio/assets/index-D_j-w4UO.css +1 -0
- package/dist/studio/assets/{index-DcwjOyrk.js → index-VyDFrnoK.js} +1 -1
- package/dist/studio/index.html +2 -2
- package/package.json +1 -1
- package/dist/chunk-KF6BABQ5.js.map +0 -1
- package/dist/chunk-SE73HJZG.js.map +0 -1
- package/dist/chunk-VA64NETD.js.map +0 -1
- package/dist/studio/assets/index-DHxVz6M9.css +0 -1
- package/dist/studio/assets/index-Y5InSvcS.js +0 -65
- /package/dist/{chunk-XOSNETAV.js.map → chunk-BAUNAXHT.js.map} +0 -0
- /package/dist/{dist-XDNB4WDT.js.map → dist-HNSXNRVK.js.map} +0 -0
- /package/dist/{esm-CZAWIY6F.js.map → docker-workspace-RPPXBT27-B4AQHVWA.js.map} +0 -0
- /package/dist/{src-ML4D2MC2.js.map → src-PXDA7QIS.js.map} +0 -0
|
@@ -2,32 +2,47 @@ import { createRequire } from 'node:module'; const require = createRequire(impor
|
|
|
2
2
|
import {
|
|
3
3
|
HtmlWriter,
|
|
4
4
|
RESULT_INDEX_FILENAME,
|
|
5
|
-
RESULT_RUNS_DIRNAME,
|
|
6
5
|
TARGET_FILE_CANDIDATES,
|
|
7
6
|
buildDefaultRunDir,
|
|
7
|
+
c,
|
|
8
8
|
detectFileType,
|
|
9
9
|
discoverEvalFiles,
|
|
10
10
|
findRepoRoot,
|
|
11
|
+
findRunById,
|
|
12
|
+
formatCost,
|
|
13
|
+
formatDuration,
|
|
14
|
+
formatNumber,
|
|
15
|
+
formatScore,
|
|
16
|
+
formatSize,
|
|
17
|
+
getRemoteResultsStatus,
|
|
18
|
+
getTraceSpans,
|
|
19
|
+
getTraceSummary,
|
|
20
|
+
listMergedResultFiles,
|
|
21
|
+
listResultFiles,
|
|
11
22
|
loadLightweightResults,
|
|
12
23
|
loadManifestResults,
|
|
24
|
+
loadResultFile,
|
|
13
25
|
loadRunCache,
|
|
26
|
+
maybeAutoExportRunArtifacts,
|
|
14
27
|
package_default,
|
|
28
|
+
padLeft,
|
|
29
|
+
padRight,
|
|
15
30
|
parseResultManifest,
|
|
16
31
|
resolveEvalPaths,
|
|
17
|
-
resolveExistingRunPrimaryPath,
|
|
18
32
|
resolveResultSourcePath,
|
|
19
33
|
resolveRunCacheFile,
|
|
20
34
|
resolveRunManifestPath,
|
|
21
|
-
resolveWorkspaceOrFilePath,
|
|
22
35
|
runEvalCommand,
|
|
23
36
|
selectTarget,
|
|
37
|
+
syncRemoteResults,
|
|
24
38
|
toSnakeCaseDeep,
|
|
39
|
+
toTraceSummary,
|
|
25
40
|
validateConfigFile,
|
|
26
41
|
validateEvalFile,
|
|
27
42
|
validateFileReferences,
|
|
28
43
|
validateTargetsFile,
|
|
29
44
|
writeArtifactsFromResults
|
|
30
|
-
} from "./chunk-
|
|
45
|
+
} from "./chunk-DHWFLK6T.js";
|
|
31
46
|
import {
|
|
32
47
|
DEFAULT_CATEGORY,
|
|
33
48
|
DEFAULT_THRESHOLD,
|
|
@@ -61,7 +76,7 @@ import {
|
|
|
61
76
|
toTranscriptJsonLine,
|
|
62
77
|
transpileEvalYamlFile,
|
|
63
78
|
trimBaselineResult
|
|
64
|
-
} from "./chunk-
|
|
79
|
+
} from "./chunk-FQGY6QXQ.js";
|
|
65
80
|
import {
|
|
66
81
|
__commonJS,
|
|
67
82
|
__require,
|
|
@@ -2770,7 +2785,7 @@ var colors = {
|
|
|
2770
2785
|
gray: "\x1B[90m"
|
|
2771
2786
|
};
|
|
2772
2787
|
var noColor = process.env.NO_COLOR !== void 0 || !process.stdout.isTTY;
|
|
2773
|
-
var
|
|
2788
|
+
var c2 = noColor ? Object.fromEntries(Object.keys(colors).map((k) => [k, ""])) : colors;
|
|
2774
2789
|
function loadCompareResults(filePath) {
|
|
2775
2790
|
return loadLightweightResults(resolveResultSourcePath(filePath)).map((record) => {
|
|
2776
2791
|
if (!record.testId || record.testId === "unknown") {
|
|
@@ -2904,29 +2919,29 @@ function determineMatrixExitCode(matrixOutput, baselineTarget) {
|
|
|
2904
2919
|
function formatDelta(delta) {
|
|
2905
2920
|
const sign = delta >= 0 ? "+" : "";
|
|
2906
2921
|
const formatted = `${sign}${delta.toFixed(2)}`;
|
|
2907
|
-
if (delta > 0) return `${
|
|
2908
|
-
if (delta < 0) return `${
|
|
2909
|
-
return `${
|
|
2922
|
+
if (delta > 0) return `${c2.green}${formatted}${c2.reset}`;
|
|
2923
|
+
if (delta < 0) return `${c2.red}${formatted}${c2.reset}`;
|
|
2924
|
+
return `${c2.gray}${formatted}${c2.reset}`;
|
|
2910
2925
|
}
|
|
2911
2926
|
function formatOutcome(outcome) {
|
|
2912
2927
|
switch (outcome) {
|
|
2913
2928
|
case "win":
|
|
2914
|
-
return `${
|
|
2929
|
+
return `${c2.green}\u2713 win${c2.reset}`;
|
|
2915
2930
|
case "loss":
|
|
2916
|
-
return `${
|
|
2931
|
+
return `${c2.red}\u2717 loss${c2.reset}`;
|
|
2917
2932
|
case "tie":
|
|
2918
|
-
return `${
|
|
2933
|
+
return `${c2.gray}= tie${c2.reset}`;
|
|
2919
2934
|
}
|
|
2920
2935
|
}
|
|
2921
2936
|
var ansiPattern = new RegExp(`${String.fromCharCode(27)}\\[[0-9;]*m`, "g");
|
|
2922
2937
|
function stripAnsi2(str) {
|
|
2923
2938
|
return str.replace(ansiPattern, "");
|
|
2924
2939
|
}
|
|
2925
|
-
function
|
|
2940
|
+
function padRight2(str, len) {
|
|
2926
2941
|
const plainLen = stripAnsi2(str).length;
|
|
2927
2942
|
return str + " ".repeat(Math.max(0, len - plainLen));
|
|
2928
2943
|
}
|
|
2929
|
-
function
|
|
2944
|
+
function padLeft2(str, len) {
|
|
2930
2945
|
const plainLen = stripAnsi2(str).length;
|
|
2931
2946
|
return " ".repeat(Math.max(0, len - plainLen)) + str;
|
|
2932
2947
|
}
|
|
@@ -2934,42 +2949,42 @@ function formatTable(comparison, file1, file2) {
|
|
|
2934
2949
|
const lines = [];
|
|
2935
2950
|
lines.push("");
|
|
2936
2951
|
lines.push(
|
|
2937
|
-
`${
|
|
2952
|
+
`${c2.bold}Comparing:${c2.reset} ${c2.cyan}${file1}${c2.reset} \u2192 ${c2.cyan}${file2}${c2.reset}`
|
|
2938
2953
|
);
|
|
2939
2954
|
lines.push("");
|
|
2940
2955
|
if (comparison.matched.length === 0) {
|
|
2941
|
-
lines.push(`${
|
|
2956
|
+
lines.push(`${c2.yellow}No matching test IDs found between files.${c2.reset}`);
|
|
2942
2957
|
} else {
|
|
2943
2958
|
const maxIdLen = Math.max(
|
|
2944
2959
|
7,
|
|
2945
2960
|
...comparison.matched.map((m) => m.testId.length)
|
|
2946
2961
|
);
|
|
2947
|
-
const header = ` ${
|
|
2948
|
-
lines.push(`${
|
|
2962
|
+
const header = ` ${padRight2("Test ID", maxIdLen)} ${padLeft2("Baseline", 8)} ${padLeft2("Candidate", 9)} ${padLeft2("Delta", 8)} Result`;
|
|
2963
|
+
lines.push(`${c2.dim}${header}${c2.reset}`);
|
|
2949
2964
|
lines.push(
|
|
2950
|
-
`${
|
|
2965
|
+
`${c2.dim} ${"\u2500".repeat(maxIdLen)} ${"\u2500".repeat(8)} ${"\u2500".repeat(9)} ${"\u2500".repeat(8)} ${"\u2500".repeat(8)}${c2.reset}`
|
|
2951
2966
|
);
|
|
2952
2967
|
for (const m of comparison.matched) {
|
|
2953
|
-
const row = ` ${
|
|
2968
|
+
const row = ` ${padRight2(m.testId, maxIdLen)} ${padLeft2(m.score1.toFixed(2), 8)} ${padLeft2(m.score2.toFixed(2), 9)} ${padLeft2(formatDelta(m.delta), 8)} ${formatOutcome(m.outcome)}`;
|
|
2954
2969
|
lines.push(row);
|
|
2955
2970
|
}
|
|
2956
2971
|
}
|
|
2957
2972
|
if (comparison.unmatched.file1 > 0 || comparison.unmatched.file2 > 0) {
|
|
2958
2973
|
lines.push("");
|
|
2959
2974
|
lines.push(
|
|
2960
|
-
`${
|
|
2975
|
+
`${c2.yellow}\u26A0 Unmatched:${c2.reset} ${comparison.unmatched.file1} in baseline, ${comparison.unmatched.file2} in candidate`
|
|
2961
2976
|
);
|
|
2962
2977
|
}
|
|
2963
2978
|
lines.push("");
|
|
2964
2979
|
const { wins, losses, ties, meanDelta } = comparison.summary;
|
|
2965
|
-
const winStr = wins > 0 ? `${
|
|
2966
|
-
const lossStr = losses > 0 ? `${
|
|
2980
|
+
const winStr = wins > 0 ? `${c2.green}${wins} win${wins !== 1 ? "s" : ""}${c2.reset}` : `${wins} wins`;
|
|
2981
|
+
const lossStr = losses > 0 ? `${c2.red}${losses} loss${losses !== 1 ? "es" : ""}${c2.reset}` : `${losses} losses`;
|
|
2967
2982
|
const tieStr = `${ties} tie${ties !== 1 ? "s" : ""}`;
|
|
2968
|
-
const deltaColor = meanDelta > 0 ?
|
|
2983
|
+
const deltaColor = meanDelta > 0 ? c2.green : meanDelta < 0 ? c2.red : c2.gray;
|
|
2969
2984
|
const deltaSign = meanDelta >= 0 ? "+" : "";
|
|
2970
|
-
const status = meanDelta > 0 ? `${
|
|
2985
|
+
const status = meanDelta > 0 ? `${c2.green}improved${c2.reset}` : meanDelta < 0 ? `${c2.red}regressed${c2.reset}` : `${c2.gray}neutral${c2.reset}`;
|
|
2971
2986
|
lines.push(
|
|
2972
|
-
`${
|
|
2987
|
+
`${c2.bold}Summary:${c2.reset} ${winStr}, ${lossStr}, ${tieStr} | Mean \u0394: ${deltaColor}${deltaSign}${meanDelta.toFixed(3)}${c2.reset} | Status: ${status}`
|
|
2973
2988
|
);
|
|
2974
2989
|
lines.push("");
|
|
2975
2990
|
return lines.join("\n");
|
|
@@ -2978,10 +2993,10 @@ function formatMatrix(matrixOutput, baselineTarget) {
|
|
|
2978
2993
|
const { matrix, pairwise, targets } = matrixOutput;
|
|
2979
2994
|
const lines = [];
|
|
2980
2995
|
lines.push("");
|
|
2981
|
-
lines.push(`${
|
|
2996
|
+
lines.push(`${c2.bold}Score Matrix${c2.reset}`);
|
|
2982
2997
|
lines.push("");
|
|
2983
2998
|
if (matrix.length === 0) {
|
|
2984
|
-
lines.push(`${
|
|
2999
|
+
lines.push(`${c2.yellow}No results found.${c2.reset}`);
|
|
2985
3000
|
return lines.join("\n");
|
|
2986
3001
|
}
|
|
2987
3002
|
const testIdWidth = Math.max(
|
|
@@ -2989,49 +3004,49 @@ function formatMatrix(matrixOutput, baselineTarget) {
|
|
|
2989
3004
|
...matrix.map((r) => r.testId.length)
|
|
2990
3005
|
);
|
|
2991
3006
|
const targetWidths = targets.map((t) => Math.max(t.length, 6));
|
|
2992
|
-
let header = ` ${
|
|
3007
|
+
let header = ` ${padRight2("Test ID", testIdWidth)}`;
|
|
2993
3008
|
for (let i = 0; i < targets.length; i++) {
|
|
2994
|
-
header += ` ${
|
|
3009
|
+
header += ` ${padLeft2(targets[i], targetWidths[i])}`;
|
|
2995
3010
|
}
|
|
2996
|
-
lines.push(`${
|
|
3011
|
+
lines.push(`${c2.dim}${header}${c2.reset}`);
|
|
2997
3012
|
let sep = ` ${"\u2500".repeat(testIdWidth)}`;
|
|
2998
3013
|
for (let i = 0; i < targets.length; i++) {
|
|
2999
3014
|
sep += ` ${"\u2500".repeat(targetWidths[i])}`;
|
|
3000
3015
|
}
|
|
3001
|
-
lines.push(`${
|
|
3016
|
+
lines.push(`${c2.dim}${sep}${c2.reset}`);
|
|
3002
3017
|
for (const row of matrix) {
|
|
3003
|
-
let line = ` ${
|
|
3018
|
+
let line = ` ${padRight2(row.testId, testIdWidth)}`;
|
|
3004
3019
|
for (let i = 0; i < targets.length; i++) {
|
|
3005
3020
|
const score = row.scores[targets[i]];
|
|
3006
3021
|
const scoreStr = score !== void 0 ? score.toFixed(2) : " --";
|
|
3007
3022
|
if (baselineTarget && targets[i] !== baselineTarget && score !== void 0) {
|
|
3008
3023
|
const baselineScore = row.scores[baselineTarget];
|
|
3009
3024
|
if (baselineScore !== void 0 && score < baselineScore) {
|
|
3010
|
-
line += ` ${
|
|
3025
|
+
line += ` ${padLeft2(`${c2.red}${scoreStr}${c2.reset}`, targetWidths[i])}`;
|
|
3011
3026
|
} else if (baselineScore !== void 0 && score > baselineScore) {
|
|
3012
|
-
line += ` ${
|
|
3027
|
+
line += ` ${padLeft2(`${c2.green}${scoreStr}${c2.reset}`, targetWidths[i])}`;
|
|
3013
3028
|
} else {
|
|
3014
|
-
line += ` ${
|
|
3029
|
+
line += ` ${padLeft2(scoreStr, targetWidths[i])}`;
|
|
3015
3030
|
}
|
|
3016
3031
|
} else {
|
|
3017
|
-
line += ` ${
|
|
3032
|
+
line += ` ${padLeft2(scoreStr, targetWidths[i])}`;
|
|
3018
3033
|
}
|
|
3019
3034
|
}
|
|
3020
3035
|
lines.push(line);
|
|
3021
3036
|
}
|
|
3022
3037
|
if (pairwise.length > 0) {
|
|
3023
3038
|
lines.push("");
|
|
3024
|
-
lines.push(`${
|
|
3039
|
+
lines.push(`${c2.bold}Pairwise Summary:${c2.reset}`);
|
|
3025
3040
|
const maxLabelLen = Math.max(
|
|
3026
3041
|
...pairwise.map((pw) => ` ${pw.baseline} \u2192 ${pw.candidate}:`.length)
|
|
3027
3042
|
);
|
|
3028
3043
|
for (const p of pairwise) {
|
|
3029
3044
|
const { wins, losses, ties, meanDelta } = p.summary;
|
|
3030
3045
|
const sign = meanDelta >= 0 ? "+" : "";
|
|
3031
|
-
const deltaColor = meanDelta > 0 ?
|
|
3046
|
+
const deltaColor = meanDelta > 0 ? c2.green : meanDelta < 0 ? c2.red : c2.gray;
|
|
3032
3047
|
const label = ` ${p.baseline} \u2192 ${p.candidate}:`;
|
|
3033
3048
|
lines.push(
|
|
3034
|
-
`${
|
|
3049
|
+
`${padRight2(label, maxLabelLen)} ${wins} win${wins !== 1 ? "s" : ""}, ${losses} loss${losses !== 1 ? "es" : ""}, ${ties} tie${ties !== 1 ? "s" : ""} (${c2.bold}\u0394${c2.reset} ${deltaColor}${sign}${meanDelta.toFixed(3)}${c2.reset})`
|
|
3035
3050
|
);
|
|
3036
3051
|
}
|
|
3037
3052
|
}
|
|
@@ -3754,6 +3769,11 @@ var evalRunCommand = command({
|
|
|
3754
3769
|
long: "output-format",
|
|
3755
3770
|
description: "[Deprecated] Output format: 'jsonl', 'yaml', or 'html' (default: jsonl)"
|
|
3756
3771
|
}),
|
|
3772
|
+
experiment: option({
|
|
3773
|
+
type: optional(string),
|
|
3774
|
+
long: "experiment",
|
|
3775
|
+
description: "Experiment label for canonical run output (default: default)"
|
|
3776
|
+
}),
|
|
3757
3777
|
export: multioption({
|
|
3758
3778
|
type: array(string),
|
|
3759
3779
|
long: "export",
|
|
@@ -3892,7 +3912,7 @@ var evalRunCommand = command({
|
|
|
3892
3912
|
},
|
|
3893
3913
|
handler: async (args) => {
|
|
3894
3914
|
if (args.evalPaths.length === 0 && process.stdin.isTTY) {
|
|
3895
|
-
const { launchInteractiveWizard } = await import("./interactive-
|
|
3915
|
+
const { launchInteractiveWizard } = await import("./interactive-OPHUF2UP.js");
|
|
3896
3916
|
await launchInteractiveWizard();
|
|
3897
3917
|
return;
|
|
3898
3918
|
}
|
|
@@ -3905,6 +3925,7 @@ var evalRunCommand = command({
|
|
|
3905
3925
|
out: args.out,
|
|
3906
3926
|
output: args.output,
|
|
3907
3927
|
outputFormat: args.outputFormat,
|
|
3928
|
+
experiment: args.experiment,
|
|
3908
3929
|
export: args.export,
|
|
3909
3930
|
dryRun: args.dryRun,
|
|
3910
3931
|
dryRunDelay: args.dryRunDelay,
|
|
@@ -4299,42 +4320,149 @@ function formatDurationMs3(ms) {
|
|
|
4299
4320
|
return `${minutes}m ${remainingSeconds}s`;
|
|
4300
4321
|
}
|
|
4301
4322
|
|
|
4323
|
+
// src/commands/import/huggingface.ts
|
|
4324
|
+
import { execFile } from "node:child_process";
|
|
4325
|
+
import { existsSync } from "node:fs";
|
|
4326
|
+
import path7 from "node:path";
|
|
4327
|
+
function findScript() {
|
|
4328
|
+
const candidates = [
|
|
4329
|
+
path7.resolve(__dirname, "..", "..", "..", "..", "..", "scripts", "import-huggingface.py"),
|
|
4330
|
+
path7.resolve(__dirname, "..", "..", "..", "..", "scripts", "import-huggingface.py"),
|
|
4331
|
+
path7.resolve(process.cwd(), "scripts", "import-huggingface.py")
|
|
4332
|
+
];
|
|
4333
|
+
for (const candidate of candidates) {
|
|
4334
|
+
if (existsSync(candidate)) return candidate;
|
|
4335
|
+
}
|
|
4336
|
+
return candidates[candidates.length - 1];
|
|
4337
|
+
}
|
|
4338
|
+
var importHuggingFaceCommand = command({
|
|
4339
|
+
name: "huggingface",
|
|
4340
|
+
description: "Import a HuggingFace dataset into AgentV EVAL.yaml format",
|
|
4341
|
+
args: {
|
|
4342
|
+
repo: option({
|
|
4343
|
+
type: string,
|
|
4344
|
+
long: "repo",
|
|
4345
|
+
description: "HuggingFace dataset repository (e.g. SWE-bench/SWE-bench_Verified)"
|
|
4346
|
+
}),
|
|
4347
|
+
split: option({
|
|
4348
|
+
type: optional(string),
|
|
4349
|
+
long: "split",
|
|
4350
|
+
description: "Dataset split to load (default: test)"
|
|
4351
|
+
}),
|
|
4352
|
+
limit: option({
|
|
4353
|
+
type: optional(number),
|
|
4354
|
+
long: "limit",
|
|
4355
|
+
description: "Maximum number of instances to import"
|
|
4356
|
+
}),
|
|
4357
|
+
output: option({
|
|
4358
|
+
type: optional(string),
|
|
4359
|
+
long: "output",
|
|
4360
|
+
short: "o",
|
|
4361
|
+
description: "Output directory for EVAL.yaml files (default: evals/)"
|
|
4362
|
+
})
|
|
4363
|
+
},
|
|
4364
|
+
handler: async ({ repo, split, limit, output }) => {
|
|
4365
|
+
const scriptPath = findScript();
|
|
4366
|
+
if (!existsSync(scriptPath)) {
|
|
4367
|
+
console.error(`Error: Python script not found at ${scriptPath}`);
|
|
4368
|
+
console.error(
|
|
4369
|
+
"Make sure you are running from the agentv repository root, or install agentv from source."
|
|
4370
|
+
);
|
|
4371
|
+
process.exit(1);
|
|
4372
|
+
}
|
|
4373
|
+
const args = [scriptPath, "--repo", repo];
|
|
4374
|
+
if (split) args.push("--split", split);
|
|
4375
|
+
if (limit !== void 0) args.push("--limit", String(limit));
|
|
4376
|
+
if (output) args.push("--output", output);
|
|
4377
|
+
console.log(`Importing from HuggingFace: ${repo} (split=${split ?? "test"})...`);
|
|
4378
|
+
try {
|
|
4379
|
+
await new Promise((resolve3, reject) => {
|
|
4380
|
+
const child = execFile("uv", ["run", ...args], { maxBuffer: 50 * 1024 * 1024 }, (error) => {
|
|
4381
|
+
if (error) {
|
|
4382
|
+
reject(error);
|
|
4383
|
+
} else {
|
|
4384
|
+
resolve3();
|
|
4385
|
+
}
|
|
4386
|
+
});
|
|
4387
|
+
let stderrBuf = "";
|
|
4388
|
+
child.stderr?.on("data", (data) => {
|
|
4389
|
+
const chunk = data.toString();
|
|
4390
|
+
stderrBuf += chunk;
|
|
4391
|
+
process.stderr.write(data);
|
|
4392
|
+
});
|
|
4393
|
+
let stdout = "";
|
|
4394
|
+
child.stdout?.on("data", (data) => {
|
|
4395
|
+
stdout += data.toString();
|
|
4396
|
+
});
|
|
4397
|
+
child.on("close", (code) => {
|
|
4398
|
+
if (code === 0 && stdout.trim()) {
|
|
4399
|
+
try {
|
|
4400
|
+
const summary = JSON.parse(stdout.trim());
|
|
4401
|
+
console.log(
|
|
4402
|
+
`
|
|
4403
|
+
Imported ${summary.files_created} eval(s) from ${summary.dataset} \u2192 ${summary.output_dir}/`
|
|
4404
|
+
);
|
|
4405
|
+
} catch {
|
|
4406
|
+
if (stdout.trim()) console.log(stdout.trim());
|
|
4407
|
+
}
|
|
4408
|
+
} else if (code !== 0) {
|
|
4409
|
+
const tail = stderrBuf.trim().slice(-2e3);
|
|
4410
|
+
if (tail) {
|
|
4411
|
+
console.error("\n--- import-huggingface.py stderr (last 2 000 chars) ---");
|
|
4412
|
+
console.error(tail);
|
|
4413
|
+
}
|
|
4414
|
+
}
|
|
4415
|
+
});
|
|
4416
|
+
});
|
|
4417
|
+
} catch (err2) {
|
|
4418
|
+
if (err2 instanceof Error && err2.code === "ENOENT") {
|
|
4419
|
+
console.error(
|
|
4420
|
+
"Error: `uv` is not installed or not found on PATH.\nInstall it with: curl -LsSf https://astral.sh/uv/install.sh | sh\nSee https://docs.astral.sh/uv/ for details."
|
|
4421
|
+
);
|
|
4422
|
+
process.exit(1);
|
|
4423
|
+
}
|
|
4424
|
+
throw err2;
|
|
4425
|
+
}
|
|
4426
|
+
}
|
|
4427
|
+
});
|
|
4428
|
+
|
|
4302
4429
|
// src/commands/import/index.ts
|
|
4303
4430
|
var importCommand = subcommands({
|
|
4304
4431
|
name: "import",
|
|
4305
|
-
description: "Import agent session transcripts for offline grading",
|
|
4432
|
+
description: "Import agent session transcripts and datasets for offline grading",
|
|
4306
4433
|
cmds: {
|
|
4307
4434
|
claude: importClaudeCommand,
|
|
4308
4435
|
codex: importCodexCommand,
|
|
4309
|
-
copilot: importCopilotCommand
|
|
4436
|
+
copilot: importCopilotCommand,
|
|
4437
|
+
huggingface: importHuggingFaceCommand
|
|
4310
4438
|
}
|
|
4311
4439
|
});
|
|
4312
4440
|
|
|
4313
4441
|
// src/commands/init/index.ts
|
|
4314
|
-
import { existsSync, mkdirSync, writeFileSync as writeFileSync2 } from "node:fs";
|
|
4315
|
-
import
|
|
4442
|
+
import { existsSync as existsSync2, mkdirSync, writeFileSync as writeFileSync2 } from "node:fs";
|
|
4443
|
+
import path9 from "node:path";
|
|
4316
4444
|
import * as readline from "node:readline/promises";
|
|
4317
4445
|
|
|
4318
4446
|
// src/templates/index.ts
|
|
4319
4447
|
import { readFileSync as readFileSync3, readdirSync, statSync } from "node:fs";
|
|
4320
|
-
import
|
|
4448
|
+
import path8 from "node:path";
|
|
4321
4449
|
import { fileURLToPath } from "node:url";
|
|
4322
4450
|
function getAgentvTemplates() {
|
|
4323
4451
|
return getTemplatesFromDir(".agentv");
|
|
4324
4452
|
}
|
|
4325
4453
|
function getEnvExampleTemplate() {
|
|
4326
|
-
const currentDir =
|
|
4327
|
-
const templatesBase = currentDir.includes(`${
|
|
4328
|
-
const content = readFileSync3(
|
|
4454
|
+
const currentDir = path8.dirname(fileURLToPath(import.meta.url));
|
|
4455
|
+
const templatesBase = currentDir.includes(`${path8.sep}dist`) ? path8.join(currentDir, "templates") : currentDir;
|
|
4456
|
+
const content = readFileSync3(path8.join(templatesBase, ".env.example"), "utf-8");
|
|
4329
4457
|
return { path: ".env.example", content };
|
|
4330
4458
|
}
|
|
4331
4459
|
function getTemplatesFromDir(subdir) {
|
|
4332
|
-
const currentDir =
|
|
4460
|
+
const currentDir = path8.dirname(fileURLToPath(import.meta.url));
|
|
4333
4461
|
let templatesDir;
|
|
4334
|
-
if (currentDir.includes(`${
|
|
4335
|
-
templatesDir =
|
|
4462
|
+
if (currentDir.includes(`${path8.sep}dist`)) {
|
|
4463
|
+
templatesDir = path8.join(currentDir, "templates", subdir);
|
|
4336
4464
|
} else {
|
|
4337
|
-
templatesDir =
|
|
4465
|
+
templatesDir = path8.join(currentDir, subdir);
|
|
4338
4466
|
}
|
|
4339
4467
|
return readTemplatesRecursively(templatesDir, "");
|
|
4340
4468
|
}
|
|
@@ -4342,15 +4470,15 @@ function readTemplatesRecursively(dir, relativePath) {
|
|
|
4342
4470
|
const templates = [];
|
|
4343
4471
|
const entries2 = readdirSync(dir);
|
|
4344
4472
|
for (const entry of entries2) {
|
|
4345
|
-
const fullPath =
|
|
4473
|
+
const fullPath = path8.join(dir, entry);
|
|
4346
4474
|
const stat3 = statSync(fullPath);
|
|
4347
|
-
const entryRelativePath = relativePath ?
|
|
4475
|
+
const entryRelativePath = relativePath ? path8.join(relativePath, entry) : entry;
|
|
4348
4476
|
if (stat3.isDirectory()) {
|
|
4349
4477
|
templates.push(...readTemplatesRecursively(fullPath, entryRelativePath));
|
|
4350
4478
|
} else {
|
|
4351
4479
|
const content = readFileSync3(fullPath, "utf-8");
|
|
4352
4480
|
templates.push({
|
|
4353
|
-
path: entryRelativePath.split(
|
|
4481
|
+
path: entryRelativePath.split(path8.sep).join("/"),
|
|
4354
4482
|
// Normalize to forward slashes
|
|
4355
4483
|
content
|
|
4356
4484
|
});
|
|
@@ -4379,22 +4507,22 @@ async function promptYesNo(message) {
|
|
|
4379
4507
|
}
|
|
4380
4508
|
}
|
|
4381
4509
|
async function initCommand(options = {}) {
|
|
4382
|
-
const targetPath =
|
|
4383
|
-
const agentvDir =
|
|
4510
|
+
const targetPath = path9.resolve(options.targetPath ?? ".");
|
|
4511
|
+
const agentvDir = path9.join(targetPath, ".agentv");
|
|
4384
4512
|
const otherAgentvTemplates = getAgentvTemplates();
|
|
4385
4513
|
const envTemplate = getEnvExampleTemplate();
|
|
4386
4514
|
const existingFiles = [];
|
|
4387
4515
|
if (envTemplate) {
|
|
4388
|
-
const envFilePath =
|
|
4389
|
-
if (
|
|
4516
|
+
const envFilePath = path9.join(targetPath, ".env.example");
|
|
4517
|
+
if (existsSync2(envFilePath)) {
|
|
4390
4518
|
existingFiles.push(".env.example");
|
|
4391
4519
|
}
|
|
4392
4520
|
}
|
|
4393
|
-
if (
|
|
4521
|
+
if (existsSync2(agentvDir)) {
|
|
4394
4522
|
for (const template of otherAgentvTemplates) {
|
|
4395
|
-
const targetFilePath =
|
|
4396
|
-
if (
|
|
4397
|
-
existingFiles.push(
|
|
4523
|
+
const targetFilePath = path9.join(agentvDir, template.path);
|
|
4524
|
+
if (existsSync2(targetFilePath)) {
|
|
4525
|
+
existingFiles.push(path9.relative(targetPath, targetFilePath));
|
|
4398
4526
|
}
|
|
4399
4527
|
}
|
|
4400
4528
|
}
|
|
@@ -4412,22 +4540,22 @@ async function initCommand(options = {}) {
|
|
|
4412
4540
|
}
|
|
4413
4541
|
console.log();
|
|
4414
4542
|
}
|
|
4415
|
-
if (!
|
|
4543
|
+
if (!existsSync2(agentvDir)) {
|
|
4416
4544
|
mkdirSync(agentvDir, { recursive: true });
|
|
4417
4545
|
}
|
|
4418
4546
|
if (envTemplate) {
|
|
4419
|
-
const envFilePath =
|
|
4547
|
+
const envFilePath = path9.join(targetPath, ".env.example");
|
|
4420
4548
|
writeFileSync2(envFilePath, envTemplate.content, "utf-8");
|
|
4421
4549
|
console.log("Created .env.example");
|
|
4422
4550
|
}
|
|
4423
4551
|
for (const template of otherAgentvTemplates) {
|
|
4424
|
-
const targetFilePath =
|
|
4425
|
-
const targetDirPath =
|
|
4426
|
-
if (!
|
|
4552
|
+
const targetFilePath = path9.join(agentvDir, template.path);
|
|
4553
|
+
const targetDirPath = path9.dirname(targetFilePath);
|
|
4554
|
+
if (!existsSync2(targetDirPath)) {
|
|
4427
4555
|
mkdirSync(targetDirPath, { recursive: true });
|
|
4428
4556
|
}
|
|
4429
4557
|
writeFileSync2(targetFilePath, template.content, "utf-8");
|
|
4430
|
-
console.log(`Created ${
|
|
4558
|
+
console.log(`Created ${path9.relative(targetPath, targetFilePath)}`);
|
|
4431
4559
|
}
|
|
4432
4560
|
console.log("\nAgentV initialized successfully!");
|
|
4433
4561
|
console.log("\nFiles installed to root:");
|
|
@@ -4435,7 +4563,7 @@ async function initCommand(options = {}) {
|
|
|
4435
4563
|
console.log(" - .env.example");
|
|
4436
4564
|
}
|
|
4437
4565
|
console.log(`
|
|
4438
|
-
Files installed to ${
|
|
4566
|
+
Files installed to ${path9.relative(targetPath, agentvDir)}:`);
|
|
4439
4567
|
for (const t of otherAgentvTemplates) {
|
|
4440
4568
|
console.log(` - ${t.path}`);
|
|
4441
4569
|
}
|
|
@@ -4465,415 +4593,291 @@ var initCmdTsCommand = command({
|
|
|
4465
4593
|
}
|
|
4466
4594
|
});
|
|
4467
4595
|
|
|
4468
|
-
// src/commands/inspect/
|
|
4469
|
-
import { readFileSync as readFileSync4, readdirSync as readdirSync2, statSync as statSync2 } from "node:fs";
|
|
4470
|
-
import
|
|
4471
|
-
|
|
4472
|
-
|
|
4473
|
-
|
|
4474
|
-
|
|
4475
|
-
|
|
4476
|
-
|
|
4477
|
-
|
|
4478
|
-
|
|
4479
|
-
|
|
4480
|
-
|
|
4481
|
-
|
|
4482
|
-
|
|
4483
|
-
|
|
4484
|
-
function stripAnsi3(str) {
|
|
4485
|
-
return str.replace(ansiPattern2, "");
|
|
4486
|
-
}
|
|
4487
|
-
function padRight2(str, len) {
|
|
4488
|
-
const plainLen = stripAnsi3(str).length;
|
|
4489
|
-
return str + " ".repeat(Math.max(0, len - plainLen));
|
|
4490
|
-
}
|
|
4491
|
-
function padLeft2(str, len) {
|
|
4492
|
-
const plainLen = stripAnsi3(str).length;
|
|
4493
|
-
return " ".repeat(Math.max(0, len - plainLen)) + str;
|
|
4494
|
-
}
|
|
4495
|
-
function loadResultFile(filePath) {
|
|
4496
|
-
const resolvedFilePath = resolveTraceResultPath(filePath);
|
|
4497
|
-
if (path9.extname(resolvedFilePath) === ".json") {
|
|
4498
|
-
return loadOtlpTraceFile(resolvedFilePath);
|
|
4499
|
-
}
|
|
4500
|
-
if (path9.basename(resolvedFilePath) === RESULT_INDEX_FILENAME) {
|
|
4501
|
-
return loadManifestAsRawResults(resolvedFilePath);
|
|
4596
|
+
// src/commands/inspect/filter.ts
|
|
4597
|
+
import { existsSync as existsSync3, readFileSync as readFileSync4, readdirSync as readdirSync2, statSync as statSync2 } from "node:fs";
|
|
4598
|
+
import path10 from "node:path";
|
|
4599
|
+
function collectIndexFiles(dir) {
|
|
4600
|
+
const files = [];
|
|
4601
|
+
try {
|
|
4602
|
+
const entries2 = readdirSync2(dir, { withFileTypes: true });
|
|
4603
|
+
for (const entry of entries2) {
|
|
4604
|
+
const fullPath = path10.join(dir, entry.name);
|
|
4605
|
+
if (entry.isDirectory()) {
|
|
4606
|
+
files.push(...collectIndexFiles(fullPath));
|
|
4607
|
+
} else if (entry.name === "index.jsonl") {
|
|
4608
|
+
files.push(fullPath);
|
|
4609
|
+
}
|
|
4610
|
+
}
|
|
4611
|
+
} catch {
|
|
4502
4612
|
}
|
|
4503
|
-
return
|
|
4504
|
-
}
|
|
4505
|
-
function resolveTraceResultPath(filePath) {
|
|
4506
|
-
return resolveWorkspaceOrFilePath(filePath);
|
|
4613
|
+
return files;
|
|
4507
4614
|
}
|
|
4508
|
-
function
|
|
4509
|
-
const
|
|
4510
|
-
const
|
|
4511
|
-
|
|
4512
|
-
const
|
|
4513
|
-
|
|
4514
|
-
throw new Error(`Missing or invalid score in result at line ${i + 1}: ${line.slice(0, 100)}`);
|
|
4615
|
+
function extractToolNames(record) {
|
|
4616
|
+
const tools = /* @__PURE__ */ new Set();
|
|
4617
|
+
const trace = record.trace;
|
|
4618
|
+
if (trace?.tool_calls && typeof trace.tool_calls === "object") {
|
|
4619
|
+
for (const name of Object.keys(trace.tool_calls)) {
|
|
4620
|
+
tools.add(name);
|
|
4515
4621
|
}
|
|
4516
|
-
return record;
|
|
4517
|
-
});
|
|
4518
|
-
}
|
|
4519
|
-
function loadManifestAsRawResults(filePath) {
|
|
4520
|
-
return loadManifestResults(filePath).map(toRawResult);
|
|
4521
|
-
}
|
|
4522
|
-
function toRawResult(result) {
|
|
4523
|
-
return {
|
|
4524
|
-
timestamp: result.timestamp,
|
|
4525
|
-
test_id: result.testId,
|
|
4526
|
-
suite: result.suite,
|
|
4527
|
-
conversation_id: result.conversationId,
|
|
4528
|
-
score: result.score,
|
|
4529
|
-
assertions: result.assertions?.map((assertion) => ({
|
|
4530
|
-
text: assertion.text,
|
|
4531
|
-
passed: assertion.passed,
|
|
4532
|
-
evidence: assertion.evidence
|
|
4533
|
-
})),
|
|
4534
|
-
target: result.target,
|
|
4535
|
-
error: result.error,
|
|
4536
|
-
scores: result.scores?.map((score) => ({
|
|
4537
|
-
name: score.name,
|
|
4538
|
-
type: score.type,
|
|
4539
|
-
score: score.score,
|
|
4540
|
-
assertions: score.assertions?.map((assertion) => ({
|
|
4541
|
-
text: assertion.text,
|
|
4542
|
-
passed: assertion.passed,
|
|
4543
|
-
evidence: assertion.evidence
|
|
4544
|
-
})),
|
|
4545
|
-
weight: score.weight
|
|
4546
|
-
})),
|
|
4547
|
-
token_usage: result.tokenUsage ? {
|
|
4548
|
-
input: result.tokenUsage.input,
|
|
4549
|
-
output: result.tokenUsage.output,
|
|
4550
|
-
cached: result.tokenUsage.cached
|
|
4551
|
-
} : void 0,
|
|
4552
|
-
cost_usd: result.costUsd,
|
|
4553
|
-
duration_ms: result.durationMs,
|
|
4554
|
-
start_time: result.startTime,
|
|
4555
|
-
end_time: result.endTime,
|
|
4556
|
-
input: result.input,
|
|
4557
|
-
output: result.output,
|
|
4558
|
-
file_changes: result.fileChanges
|
|
4559
|
-
};
|
|
4560
|
-
}
|
|
4561
|
-
function loadOtlpTraceFile(filePath) {
|
|
4562
|
-
const parsed = JSON.parse(readFileSync4(filePath, "utf8"));
|
|
4563
|
-
const spans = parsed.resourceSpans?.flatMap((resource) => resource.scopeSpans ?? []).flatMap((scope) => scope.spans ?? []);
|
|
4564
|
-
if (!spans || spans.length === 0) {
|
|
4565
|
-
return [];
|
|
4566
4622
|
}
|
|
4567
|
-
const
|
|
4568
|
-
|
|
4569
|
-
|
|
4570
|
-
|
|
4571
|
-
|
|
4572
|
-
|
|
4573
|
-
|
|
4574
|
-
|
|
4575
|
-
childMap.set(span.parentSpanId, siblings);
|
|
4576
|
-
}
|
|
4577
|
-
}
|
|
4578
|
-
const roots = spans.filter((span) => !span.parentSpanId || !spanMap.has(span.parentSpanId));
|
|
4579
|
-
const supportedRoots = roots.filter(isAgentvEvalRoot);
|
|
4580
|
-
const candidateRoots = supportedRoots.length > 0 ? supportedRoots : roots;
|
|
4581
|
-
return candidateRoots.map((root, index) => {
|
|
4582
|
-
const descendants = collectChildSpans(root.spanId, childMap);
|
|
4583
|
-
const rootAttrs = parseOtlpAttributes(root.attributes);
|
|
4584
|
-
const parsedDescendants = descendants.map((span) => ({
|
|
4585
|
-
...span,
|
|
4586
|
-
parsedAttributes: parseOtlpAttributes(span.attributes)
|
|
4587
|
-
}));
|
|
4588
|
-
const toolSpans = parsedDescendants.filter(
|
|
4589
|
-
(span) => typeof span.parsedAttributes.gen_ai_tool_name === "string"
|
|
4590
|
-
);
|
|
4591
|
-
const llmSpans = parsedDescendants.filter(
|
|
4592
|
-
(span) => span.parsedAttributes.gen_ai_operation_name === "chat" || typeof span.name === "string" && span.name.startsWith("chat ")
|
|
4593
|
-
);
|
|
4594
|
-
const tokenUsage = descendants.reduce(
|
|
4595
|
-
(acc, span) => {
|
|
4596
|
-
const attrs = parseOtlpAttributes(span.attributes);
|
|
4597
|
-
acc.input += numberAttr(attrs.gen_ai_usage_input_tokens) ?? 0;
|
|
4598
|
-
acc.output += numberAttr(attrs.gen_ai_usage_output_tokens) ?? 0;
|
|
4599
|
-
const cached = numberAttr(attrs.gen_ai_usage_cache_read_input_tokens);
|
|
4600
|
-
if (cached !== void 0 && cached > 0) {
|
|
4601
|
-
acc.cached = (acc.cached ?? 0) + cached;
|
|
4623
|
+
const output = record.output;
|
|
4624
|
+
if (Array.isArray(output)) {
|
|
4625
|
+
for (const msg of output) {
|
|
4626
|
+
if (typeof msg === "object" && msg !== null && Array.isArray(msg.tool_calls)) {
|
|
4627
|
+
for (const tc of msg.tool_calls) {
|
|
4628
|
+
if (typeof tc.tool === "string") {
|
|
4629
|
+
tools.add(tc.tool);
|
|
4630
|
+
}
|
|
4602
4631
|
}
|
|
4603
|
-
|
|
4604
|
-
},
|
|
4605
|
-
{ input: 0, output: 0, cached: void 0 }
|
|
4606
|
-
);
|
|
4607
|
-
const traceSummary = buildDerivedTraceSummary({
|
|
4608
|
-
trace: {
|
|
4609
|
-
event_count: numberAttr(rootAttrs.agentv_trace_event_count) ?? (toolSpans.length > 0 ? toolSpans.length : void 0),
|
|
4610
|
-
tool_calls: countRawSpanNames(
|
|
4611
|
-
toolSpans.map((span) => ({
|
|
4612
|
-
type: "tool",
|
|
4613
|
-
name: String(span.parsedAttributes.gen_ai_tool_name)
|
|
4614
|
-
}))
|
|
4615
|
-
),
|
|
4616
|
-
error_count: descendants.filter((span) => span.status?.code === 2).length || void 0,
|
|
4617
|
-
llm_call_count: numberAttr(rootAttrs.agentv_trace_llm_call_count) ?? (llmSpans.length > 0 ? llmSpans.length : void 0)
|
|
4618
|
-
},
|
|
4619
|
-
spans: [
|
|
4620
|
-
...llmSpans.map((span) => ({
|
|
4621
|
-
type: "llm",
|
|
4622
|
-
name: span.name ?? "chat",
|
|
4623
|
-
duration_ms: durationFromSpan(span)
|
|
4624
|
-
})),
|
|
4625
|
-
...toolSpans.map((span) => ({
|
|
4626
|
-
type: "tool",
|
|
4627
|
-
name: String(span.parsedAttributes.gen_ai_tool_name),
|
|
4628
|
-
duration_ms: durationFromSpan(span)
|
|
4629
|
-
}))
|
|
4630
|
-
],
|
|
4631
|
-
duration_ms: numberAttr(rootAttrs.agentv_trace_duration_ms) ?? durationFromSpan(root),
|
|
4632
|
-
cost_usd: numberAttr(rootAttrs.agentv_trace_cost_usd),
|
|
4633
|
-
token_usage: tokenUsage.input || tokenUsage.output || tokenUsage.cached || numberAttr(rootAttrs.agentv_trace_token_input) || numberAttr(rootAttrs.agentv_trace_token_output) || numberAttr(rootAttrs.agentv_trace_token_cached) ? {
|
|
4634
|
-
input: tokenUsage.input || numberAttr(rootAttrs.agentv_trace_token_input) || 0,
|
|
4635
|
-
output: tokenUsage.output || numberAttr(rootAttrs.agentv_trace_token_output) || 0,
|
|
4636
|
-
...tokenUsage.cached || numberAttr(rootAttrs.agentv_trace_token_cached) ? {
|
|
4637
|
-
cached: tokenUsage.cached || numberAttr(rootAttrs.agentv_trace_token_cached) || 0
|
|
4638
|
-
} : {}
|
|
4639
|
-
} : void 0
|
|
4640
|
-
});
|
|
4641
|
-
const score = numberAttr(rootAttrs.agentv_score);
|
|
4642
|
-
if (score === void 0) {
|
|
4643
|
-
throw new Error(
|
|
4644
|
-
`Unsupported OTLP trace root span at index ${index + 1}: missing agentv.score attribute`
|
|
4645
|
-
);
|
|
4632
|
+
}
|
|
4646
4633
|
}
|
|
4647
|
-
return {
|
|
4648
|
-
test_id: stringAttr(rootAttrs.agentv_test_id) ?? stringAttr(rootAttrs.agentv_eval_id) ?? `trace-${index + 1}`,
|
|
4649
|
-
suite: stringAttr(rootAttrs.agentv_suite),
|
|
4650
|
-
target: stringAttr(rootAttrs.agentv_target),
|
|
4651
|
-
score,
|
|
4652
|
-
error: root.status?.code === 2 ? root.status.message : void 0,
|
|
4653
|
-
cost_usd: traceSummary?.cost_usd,
|
|
4654
|
-
duration_ms: traceSummary?.duration_ms,
|
|
4655
|
-
token_usage: traceSummary?.token_usage,
|
|
4656
|
-
trace: traceSummary ? {
|
|
4657
|
-
event_count: traceSummary.event_count,
|
|
4658
|
-
tool_calls: traceSummary.tool_calls,
|
|
4659
|
-
error_count: traceSummary.error_count,
|
|
4660
|
-
tool_durations: traceSummary.tool_durations,
|
|
4661
|
-
llm_call_count: traceSummary.llm_call_count,
|
|
4662
|
-
token_usage: traceSummary.token_usage,
|
|
4663
|
-
cost_usd: traceSummary.cost_usd,
|
|
4664
|
-
duration_ms: traceSummary.duration_ms
|
|
4665
|
-
} : void 0,
|
|
4666
|
-
spans: traceSummary?.spans,
|
|
4667
|
-
output: stringAttr(rootAttrs.agentv_output_text),
|
|
4668
|
-
scores: root.events?.filter(
|
|
4669
|
-
(event) => event.name?.startsWith("agentv.grader.") || event.name?.startsWith("agentv.evaluator.")
|
|
4670
|
-
).map((event) => {
|
|
4671
|
-
const attrs = parseOtlpAttributes(event.attributes);
|
|
4672
|
-
const name = event.name?.replace(/^agentv\.grader\./, "").replace(/^agentv\.evaluator\./, "") ?? "unknown";
|
|
4673
|
-
return {
|
|
4674
|
-
name,
|
|
4675
|
-
type: stringAttr(attrs.agentv_grader_type) ?? stringAttr(attrs.agentv_evaluator_type) ?? "unknown",
|
|
4676
|
-
score: numberAttr(attrs.agentv_grader_score) ?? numberAttr(attrs.agentv_evaluator_score) ?? 0
|
|
4677
|
-
};
|
|
4678
|
-
})
|
|
4679
|
-
};
|
|
4680
|
-
});
|
|
4681
|
-
}
|
|
4682
|
-
function isAgentvEvalRoot(span) {
|
|
4683
|
-
const attrs = parseOtlpAttributes(span.attributes);
|
|
4684
|
-
return span.name === "agentv.eval" || numberAttr(attrs.agentv_score) !== void 0 || typeof stringAttr(attrs.agentv_test_id) === "string";
|
|
4685
|
-
}
|
|
4686
|
-
function collectChildSpans(spanId, childMap) {
|
|
4687
|
-
if (!spanId) return [];
|
|
4688
|
-
const direct = childMap.get(spanId) ?? [];
|
|
4689
|
-
const all = [...direct];
|
|
4690
|
-
for (const child of direct) {
|
|
4691
|
-
all.push(...collectChildSpans(child.spanId, childMap));
|
|
4692
|
-
}
|
|
4693
|
-
return all;
|
|
4694
|
-
}
|
|
4695
|
-
function parseOtlpAttributes(attributes) {
|
|
4696
|
-
const parsed = {};
|
|
4697
|
-
for (const attribute of attributes ?? []) {
|
|
4698
|
-
parsed[attribute.key.replace(/\./g, "_")] = parseOtlpValue(attribute.value);
|
|
4699
|
-
}
|
|
4700
|
-
return parsed;
|
|
4701
|
-
}
|
|
4702
|
-
function parseOtlpValue(value) {
|
|
4703
|
-
if (!value) return void 0;
|
|
4704
|
-
if ("stringValue" in value && value.stringValue !== void 0) return value.stringValue;
|
|
4705
|
-
if ("intValue" in value && value.intValue !== void 0) return Number(value.intValue);
|
|
4706
|
-
if ("doubleValue" in value && value.doubleValue !== void 0) return value.doubleValue;
|
|
4707
|
-
if ("boolValue" in value && value.boolValue !== void 0) return value.boolValue;
|
|
4708
|
-
if ("arrayValue" in value)
|
|
4709
|
-
return (value.arrayValue?.values ?? []).map((entry) => parseOtlpValue(entry));
|
|
4710
|
-
return void 0;
|
|
4711
|
-
}
|
|
4712
|
-
function durationFromSpan(span) {
|
|
4713
|
-
const start = Number(span.startTimeUnixNano);
|
|
4714
|
-
const end = Number(span.endTimeUnixNano);
|
|
4715
|
-
if (!Number.isFinite(start) || !Number.isFinite(end)) return void 0;
|
|
4716
|
-
return Math.round((end - start) / 1e6);
|
|
4717
|
-
}
|
|
4718
|
-
function stringAttr(value) {
|
|
4719
|
-
return typeof value === "string" ? value : void 0;
|
|
4720
|
-
}
|
|
4721
|
-
function numberAttr(value) {
|
|
4722
|
-
return typeof value === "number" && Number.isFinite(value) ? value : void 0;
|
|
4723
|
-
}
|
|
4724
|
-
function buildDerivedTraceSummary(result) {
|
|
4725
|
-
const toolSpans = (result.spans ?? []).filter((span) => span.type === "tool");
|
|
4726
|
-
const llmSpans = (result.spans ?? []).filter((span) => span.type === "llm");
|
|
4727
|
-
const toolCalls = result.trace?.tool_calls ?? countRawSpanNames(toolSpans);
|
|
4728
|
-
const toolDurations = result.trace?.tool_durations ?? groupRawSpanDurations(toolSpans);
|
|
4729
|
-
const hasSpanData = (result.spans?.length ?? 0) > 0;
|
|
4730
|
-
const eventCount = result.trace?.event_count ?? (hasSpanData ? toolSpans.length : void 0);
|
|
4731
|
-
const llmCallCount = result.trace?.llm_call_count ?? (hasSpanData ? llmSpans.length : void 0);
|
|
4732
|
-
if (!result.trace && !result.spans?.length && result.token_usage === void 0 && result.cost_usd === void 0 && result.duration_ms === void 0) {
|
|
4733
|
-
return void 0;
|
|
4734
|
-
}
|
|
4735
|
-
return {
|
|
4736
|
-
event_count: eventCount,
|
|
4737
|
-
tool_calls: toolCalls,
|
|
4738
|
-
error_count: result.trace?.error_count,
|
|
4739
|
-
tool_durations: toolDurations,
|
|
4740
|
-
llm_call_count: llmCallCount,
|
|
4741
|
-
token_usage: result.trace?.token_usage ?? result.token_usage,
|
|
4742
|
-
cost_usd: result.trace?.cost_usd ?? result.cost_usd,
|
|
4743
|
-
duration_ms: result.trace?.duration_ms ?? result.duration_ms,
|
|
4744
|
-
spans: result.spans
|
|
4745
|
-
};
|
|
4746
|
-
}
|
|
4747
|
-
function countRawSpanNames(spans) {
|
|
4748
|
-
const counts = {};
|
|
4749
|
-
for (const span of spans) {
|
|
4750
|
-
counts[span.name] = (counts[span.name] ?? 0) + 1;
|
|
4751
4634
|
}
|
|
4752
|
-
return
|
|
4753
|
-
}
|
|
4754
|
-
function groupRawSpanDurations(spans) {
|
|
4755
|
-
const grouped = {};
|
|
4756
|
-
for (const span of spans) {
|
|
4757
|
-
if (span.duration_ms === void 0) continue;
|
|
4758
|
-
const existing = grouped[span.name] ?? [];
|
|
4759
|
-
existing.push(span.duration_ms);
|
|
4760
|
-
grouped[span.name] = existing;
|
|
4761
|
-
}
|
|
4762
|
-
return Object.keys(grouped).length > 0 ? grouped : void 0;
|
|
4763
|
-
}
|
|
4764
|
-
function getTraceSummary(result) {
|
|
4765
|
-
const derived = buildDerivedTraceSummary(result);
|
|
4766
|
-
if (!derived) return void 0;
|
|
4767
|
-
const { spans: _spans, ...trace } = derived;
|
|
4768
|
-
return trace;
|
|
4769
|
-
}
|
|
4770
|
-
function getTraceSpans(result) {
|
|
4771
|
-
return buildDerivedTraceSummary(result)?.spans ?? [];
|
|
4635
|
+
return [...tools];
|
|
4772
4636
|
}
|
|
4773
|
-
function
|
|
4774
|
-
|
|
4775
|
-
if (!rawTrace) return void 0;
|
|
4776
|
-
return toCamelCaseDeep(rawTrace);
|
|
4777
|
-
}
|
|
4778
|
-
function listResultFiles(cwd, limit) {
|
|
4779
|
-
const runsDir = path9.join(cwd, ".agentv", "results", RESULT_RUNS_DIRNAME);
|
|
4780
|
-
const files = [];
|
|
4637
|
+
function parseFilterableRecords(filePath) {
|
|
4638
|
+
let content;
|
|
4781
4639
|
try {
|
|
4782
|
-
|
|
4783
|
-
|
|
4784
|
-
|
|
4785
|
-
|
|
4786
|
-
|
|
4787
|
-
|
|
4788
|
-
|
|
4789
|
-
|
|
4640
|
+
content = readFileSync4(filePath, "utf8");
|
|
4641
|
+
} catch {
|
|
4642
|
+
return [];
|
|
4643
|
+
}
|
|
4644
|
+
const lines = content.split("\n").filter((line) => line.trim());
|
|
4645
|
+
const records = [];
|
|
4646
|
+
for (const line of lines) {
|
|
4647
|
+
let raw;
|
|
4648
|
+
try {
|
|
4649
|
+
raw = JSON.parse(line);
|
|
4650
|
+
} catch {
|
|
4651
|
+
continue;
|
|
4652
|
+
}
|
|
4653
|
+
let experiment = typeof raw.experiment === "string" ? raw.experiment : void 0;
|
|
4654
|
+
if (!experiment) {
|
|
4655
|
+
const parts = filePath.split(path10.sep);
|
|
4656
|
+
const runsIdx = parts.indexOf("runs");
|
|
4657
|
+
if (runsIdx !== -1 && parts.length - runsIdx >= 3) {
|
|
4658
|
+
const candidate = parts[runsIdx + 1];
|
|
4659
|
+
if (candidate && !/^\d{4}-\d{2}-\d{2}T/.test(candidate)) {
|
|
4660
|
+
experiment = candidate;
|
|
4661
|
+
}
|
|
4790
4662
|
}
|
|
4791
4663
|
}
|
|
4792
|
-
|
|
4664
|
+
records.push({
|
|
4665
|
+
file: filePath,
|
|
4666
|
+
test_id: typeof raw.test_id === "string" ? raw.test_id : "unknown",
|
|
4667
|
+
suite: typeof raw.suite === "string" ? raw.suite : void 0,
|
|
4668
|
+
target: typeof raw.target === "string" ? raw.target : void 0,
|
|
4669
|
+
experiment,
|
|
4670
|
+
score: typeof raw.score === "number" ? raw.score : 0,
|
|
4671
|
+
execution_status: typeof raw.execution_status === "string" ? raw.execution_status : void 0,
|
|
4672
|
+
error: typeof raw.error === "string" ? raw.error : void 0,
|
|
4673
|
+
timestamp: typeof raw.timestamp === "string" ? raw.timestamp : void 0,
|
|
4674
|
+
tool_names: extractToolNames(raw)
|
|
4675
|
+
});
|
|
4793
4676
|
}
|
|
4794
|
-
|
|
4795
|
-
|
|
4796
|
-
|
|
4797
|
-
|
|
4677
|
+
return records;
|
|
4678
|
+
}
|
|
4679
|
+
function buildFilterPredicate(opts) {
|
|
4680
|
+
return (record) => {
|
|
4681
|
+
if (opts.target && record.target !== opts.target) return false;
|
|
4682
|
+
if (opts.experiment && record.experiment !== opts.experiment) return false;
|
|
4683
|
+
if (opts.scoreBelow !== void 0 && record.score >= opts.scoreBelow) return false;
|
|
4684
|
+
if (opts.scoreAbove !== void 0 && record.score <= opts.scoreAbove) return false;
|
|
4685
|
+
if (opts.status) {
|
|
4686
|
+
const statusMap = {
|
|
4687
|
+
pass: ["ok"],
|
|
4688
|
+
fail: ["quality_failure"],
|
|
4689
|
+
error: ["error", "timeout", "provider_error"]
|
|
4690
|
+
};
|
|
4691
|
+
const allowedStatuses = statusMap[opts.status] ?? [opts.status];
|
|
4692
|
+
if (record.execution_status && !allowedStatuses.includes(record.execution_status))
|
|
4693
|
+
return false;
|
|
4694
|
+
if (!record.execution_status) {
|
|
4695
|
+
if (opts.status === "pass" && record.score < 1) return false;
|
|
4696
|
+
if (opts.status === "fail" && record.score >= 1) return false;
|
|
4697
|
+
if (opts.status === "error" && !record.error) return false;
|
|
4698
|
+
}
|
|
4699
|
+
}
|
|
4700
|
+
if (opts.hasTool) {
|
|
4701
|
+
const toolPattern = opts.hasTool.toLowerCase();
|
|
4702
|
+
const hasMatch = record.tool_names.some((t) => t.toLowerCase().includes(toolPattern));
|
|
4703
|
+
if (!hasMatch) return false;
|
|
4704
|
+
}
|
|
4705
|
+
return true;
|
|
4706
|
+
};
|
|
4707
|
+
}
|
|
4708
|
+
function discoverFilterSources(searchPath, cwd) {
|
|
4709
|
+
if (searchPath) {
|
|
4710
|
+
const resolved = path10.isAbsolute(searchPath) ? searchPath : path10.resolve(cwd, searchPath);
|
|
4711
|
+
if (!existsSync3(resolved)) {
|
|
4712
|
+
console.error(`${c.red}Error:${c.reset} Path does not exist: ${resolved}`);
|
|
4713
|
+
process.exit(1);
|
|
4714
|
+
}
|
|
4798
4715
|
try {
|
|
4799
|
-
|
|
4800
|
-
|
|
4801
|
-
|
|
4802
|
-
const passCount = results.filter((r) => r.score >= DEFAULT_THRESHOLD).length;
|
|
4803
|
-
const passRate = testCount > 0 ? passCount / testCount : 0;
|
|
4804
|
-
const avgScore = testCount > 0 ? results.reduce((sum, r) => sum + r.score, 0) / testCount : 0;
|
|
4805
|
-
const filenameTimestamp = extractTimestampFromFilename(displayName);
|
|
4806
|
-
const timestamp = filenameTimestamp ?? results[0]?.timestamp ?? "unknown";
|
|
4807
|
-
metas.push({
|
|
4808
|
-
path: filePath,
|
|
4809
|
-
filename: displayName,
|
|
4810
|
-
timestamp,
|
|
4811
|
-
testCount,
|
|
4812
|
-
passRate,
|
|
4813
|
-
avgScore,
|
|
4814
|
-
sizeBytes: fileStat.size
|
|
4815
|
-
});
|
|
4716
|
+
if (statSync2(resolved).isDirectory()) {
|
|
4717
|
+
return collectIndexFiles(resolved);
|
|
4718
|
+
}
|
|
4816
4719
|
} catch {
|
|
4817
4720
|
}
|
|
4721
|
+
return [resolved];
|
|
4818
4722
|
}
|
|
4819
|
-
return
|
|
4723
|
+
return collectIndexFiles(path10.join(cwd, ".agentv", "results", "runs"));
|
|
4820
4724
|
}
|
|
4821
|
-
function
|
|
4822
|
-
const
|
|
4823
|
-
|
|
4725
|
+
function formatFilterTable(records) {
|
|
4726
|
+
const lines = [];
|
|
4727
|
+
if (records.length === 0) {
|
|
4728
|
+
lines.push(`${c.yellow}No matching results found.${c.reset}`);
|
|
4729
|
+
return lines.join("\n");
|
|
4730
|
+
}
|
|
4731
|
+
lines.push("");
|
|
4732
|
+
lines.push(
|
|
4733
|
+
`${c.bold}Filtered Results${c.reset} ${c.dim}(${records.length} match${records.length !== 1 ? "es" : ""})${c.reset}`
|
|
4824
4734
|
);
|
|
4825
|
-
|
|
4826
|
-
|
|
4827
|
-
|
|
4828
|
-
|
|
4829
|
-
|
|
4830
|
-
}
|
|
4831
|
-
|
|
4832
|
-
|
|
4833
|
-
|
|
4834
|
-
const
|
|
4835
|
-
|
|
4836
|
-
|
|
4837
|
-
|
|
4838
|
-
|
|
4839
|
-
|
|
4840
|
-
|
|
4841
|
-
|
|
4842
|
-
|
|
4843
|
-
|
|
4844
|
-
|
|
4845
|
-
|
|
4846
|
-
|
|
4847
|
-
|
|
4848
|
-
return
|
|
4735
|
+
lines.push("");
|
|
4736
|
+
const maxIdLen = Math.min(32, Math.max(7, ...records.map((r) => r.test_id.length)));
|
|
4737
|
+
const maxTargetLen = Math.min(16, Math.max(6, ...records.map((r) => (r.target ?? "").length)));
|
|
4738
|
+
const maxExpLen = Math.min(20, Math.max(10, ...records.map((r) => (r.experiment ?? "").length)));
|
|
4739
|
+
const header = ` ${padRight("Test ID", maxIdLen)} ${padRight("Target", maxTargetLen)} ${padRight("Experiment", maxExpLen)} ${padLeft("Score", 6)} Status`;
|
|
4740
|
+
lines.push(`${c.dim}${header}${c.reset}`);
|
|
4741
|
+
lines.push(
|
|
4742
|
+
`${c.dim} ${"\u2500".repeat(maxIdLen)} ${"\u2500".repeat(maxTargetLen)} ${"\u2500".repeat(maxExpLen)} ${"\u2500".repeat(6)} ${"\u2500".repeat(16)}${c.reset}`
|
|
4743
|
+
);
|
|
4744
|
+
for (const record of records) {
|
|
4745
|
+
const scoreColor = record.score >= 1 ? c.green : record.score >= 0.5 ? c.yellow : c.red;
|
|
4746
|
+
const status = record.execution_status ?? (record.error ? "error" : record.score >= 1 ? "ok" : "quality_failure");
|
|
4747
|
+
const statusColor = status === "ok" ? c.green : status === "error" ? c.red : c.yellow;
|
|
4748
|
+
const row = ` ${padRight(record.test_id.slice(0, maxIdLen), maxIdLen)} ${padRight((record.target ?? "-").slice(0, maxTargetLen), maxTargetLen)} ${padRight((record.experiment ?? "-").slice(0, maxExpLen), maxExpLen)} ${padLeft(`${scoreColor}${formatScore(record.score)}${c.reset}`, 6)} ${statusColor}${status}${c.reset}`;
|
|
4749
|
+
lines.push(row);
|
|
4750
|
+
}
|
|
4751
|
+
lines.push("");
|
|
4752
|
+
const passCount = records.filter((r) => r.score >= 1).length;
|
|
4753
|
+
const avgScore = records.length > 0 ? records.reduce((sum, r) => sum + r.score, 0) / records.length : 0;
|
|
4754
|
+
lines.push(
|
|
4755
|
+
`${c.dim}${records.length} result${records.length !== 1 ? "s" : ""} | ${passCount} passed | avg score: ${formatScore(avgScore)}${c.reset}`
|
|
4756
|
+
);
|
|
4757
|
+
lines.push("");
|
|
4758
|
+
return lines.join("\n");
|
|
4849
4759
|
}
|
|
4760
|
+
var inspectFilterCommand = command({
|
|
4761
|
+
name: "filter",
|
|
4762
|
+
description: "Filter evaluation results by target, experiment, score, status, or tool usage",
|
|
4763
|
+
args: {
|
|
4764
|
+
path: positional({
|
|
4765
|
+
type: optional(string),
|
|
4766
|
+
displayName: "path",
|
|
4767
|
+
description: "Directory or file to filter (default: .agentv/results/runs/)"
|
|
4768
|
+
}),
|
|
4769
|
+
target: option({
|
|
4770
|
+
type: optional(string),
|
|
4771
|
+
long: "target",
|
|
4772
|
+
description: "Filter by target name"
|
|
4773
|
+
}),
|
|
4774
|
+
experiment: option({
|
|
4775
|
+
type: optional(string),
|
|
4776
|
+
long: "experiment",
|
|
4777
|
+
description: "Filter by experiment name"
|
|
4778
|
+
}),
|
|
4779
|
+
scoreBelow: option({
|
|
4780
|
+
type: optional(number),
|
|
4781
|
+
long: "score-below",
|
|
4782
|
+
description: "Filter to results with score below this value"
|
|
4783
|
+
}),
|
|
4784
|
+
scoreAbove: option({
|
|
4785
|
+
type: optional(number),
|
|
4786
|
+
long: "score-above",
|
|
4787
|
+
description: "Filter to results with score above this value"
|
|
4788
|
+
}),
|
|
4789
|
+
status: option({
|
|
4790
|
+
type: optional(string),
|
|
4791
|
+
long: "status",
|
|
4792
|
+
description: "Filter by execution status: pass, fail, error (or raw value like ok, quality_failure)"
|
|
4793
|
+
}),
|
|
4794
|
+
hasTool: option({
|
|
4795
|
+
type: optional(string),
|
|
4796
|
+
long: "has-tool",
|
|
4797
|
+
description: "Filter to results that used a specific tool (substring match)"
|
|
4798
|
+
}),
|
|
4799
|
+
dir: option({
|
|
4800
|
+
type: optional(string),
|
|
4801
|
+
long: "dir",
|
|
4802
|
+
short: "d",
|
|
4803
|
+
description: "Working directory (default: current directory)"
|
|
4804
|
+
}),
|
|
4805
|
+
format: option({
|
|
4806
|
+
type: optional(oneOf(["table", "json"])),
|
|
4807
|
+
long: "format",
|
|
4808
|
+
short: "f",
|
|
4809
|
+
description: "Output format: table (default) or json"
|
|
4810
|
+
})
|
|
4811
|
+
},
|
|
4812
|
+
handler: async ({
|
|
4813
|
+
path: searchPath,
|
|
4814
|
+
target,
|
|
4815
|
+
experiment,
|
|
4816
|
+
scoreBelow,
|
|
4817
|
+
scoreAbove,
|
|
4818
|
+
status,
|
|
4819
|
+
hasTool,
|
|
4820
|
+
dir,
|
|
4821
|
+
format
|
|
4822
|
+
}) => {
|
|
4823
|
+
const cwd = dir ?? process.cwd();
|
|
4824
|
+
const sources = discoverFilterSources(searchPath, cwd);
|
|
4825
|
+
if (sources.length === 0) {
|
|
4826
|
+
console.error(`${c.yellow}No result files found.${c.reset}`);
|
|
4827
|
+
console.error(`${c.dim}Run an evaluation first, or specify a path.${c.reset}`);
|
|
4828
|
+
process.exit(0);
|
|
4829
|
+
}
|
|
4830
|
+
const allRecords = [];
|
|
4831
|
+
for (const source of sources) {
|
|
4832
|
+
allRecords.push(...parseFilterableRecords(source));
|
|
4833
|
+
}
|
|
4834
|
+
if (allRecords.length === 0) {
|
|
4835
|
+
console.error(`${c.yellow}No results found in the specified path.${c.reset}`);
|
|
4836
|
+
process.exit(0);
|
|
4837
|
+
}
|
|
4838
|
+
const predicate = buildFilterPredicate({
|
|
4839
|
+
target,
|
|
4840
|
+
experiment,
|
|
4841
|
+
scoreBelow,
|
|
4842
|
+
scoreAbove,
|
|
4843
|
+
status,
|
|
4844
|
+
hasTool
|
|
4845
|
+
});
|
|
4846
|
+
const filtered = allRecords.filter(predicate);
|
|
4847
|
+
if (format === "json") {
|
|
4848
|
+
console.log(JSON.stringify(filtered, null, 2));
|
|
4849
|
+
} else {
|
|
4850
|
+
console.log(formatFilterTable(filtered));
|
|
4851
|
+
}
|
|
4852
|
+
}
|
|
4853
|
+
});
|
|
4850
4854
|
|
|
4851
4855
|
// src/commands/inspect/list.ts
|
|
4852
4856
|
function formatListTable(metas) {
|
|
4853
4857
|
const lines = [];
|
|
4854
4858
|
if (metas.length === 0) {
|
|
4855
|
-
lines.push(`${
|
|
4856
|
-
lines.push(`${
|
|
4859
|
+
lines.push(`${c.yellow}No run workspaces found in .agentv/results/runs/${c.reset}`);
|
|
4860
|
+
lines.push(`${c.dim}Run an evaluation first: agentv run <eval-file>${c.reset}`);
|
|
4857
4861
|
return lines.join("\n");
|
|
4858
4862
|
}
|
|
4859
4863
|
lines.push("");
|
|
4860
|
-
lines.push(`${
|
|
4864
|
+
lines.push(`${c.bold}Evaluation Runs${c.reset} ${c.dim}(.agentv/results/runs/)${c.reset}`);
|
|
4861
4865
|
lines.push("");
|
|
4862
4866
|
const maxFileLen = Math.max(4, ...metas.map((m) => m.filename.length));
|
|
4863
|
-
const header = ` ${
|
|
4864
|
-
lines.push(`${
|
|
4867
|
+
const header = ` ${padRight("File", maxFileLen)} ${padLeft("Tests", 5)} ${padLeft("Pass", 5)} ${padLeft("Score", 6)} ${padLeft("Size", 7)} Timestamp`;
|
|
4868
|
+
lines.push(`${c.dim}${header}${c.reset}`);
|
|
4865
4869
|
lines.push(
|
|
4866
|
-
`${
|
|
4870
|
+
`${c.dim} ${"\u2500".repeat(maxFileLen)} ${"\u2500".repeat(5)} ${"\u2500".repeat(5)} ${"\u2500".repeat(6)} ${"\u2500".repeat(7)} ${"\u2500".repeat(24)}${c.reset}`
|
|
4867
4871
|
);
|
|
4868
4872
|
for (const meta of metas) {
|
|
4869
|
-
const passColor = meta.passRate >= 1 ?
|
|
4870
|
-
const scoreColor = meta.avgScore >= 0.9 ?
|
|
4871
|
-
const row = ` ${
|
|
4873
|
+
const passColor = meta.passRate >= 1 ? c.green : meta.passRate >= 0.5 ? c.yellow : c.red;
|
|
4874
|
+
const scoreColor = meta.avgScore >= 0.9 ? c.green : meta.avgScore >= 0.5 ? c.yellow : c.red;
|
|
4875
|
+
const row = ` ${padRight(meta.filename, maxFileLen)} ${padLeft(String(meta.testCount), 5)} ${padLeft(`${passColor}${formatScore(meta.passRate)}${c.reset}`, 5)} ${padLeft(`${scoreColor}${formatScore(meta.avgScore)}${c.reset}`, 6)} ${padLeft(formatSize(meta.sizeBytes), 7)} ${c.dim}${meta.timestamp}${c.reset}`;
|
|
4872
4876
|
lines.push(row);
|
|
4873
4877
|
}
|
|
4874
4878
|
lines.push("");
|
|
4875
4879
|
lines.push(
|
|
4876
|
-
`${
|
|
4880
|
+
`${c.dim}${metas.length} run workspace${metas.length !== 1 ? "s" : ""} found${c.reset}`
|
|
4877
4881
|
);
|
|
4878
4882
|
lines.push("");
|
|
4879
4883
|
return lines.join("\n");
|
|
@@ -5073,19 +5077,19 @@ function renderTable(scored, assertSpec) {
|
|
|
5073
5077
|
{ header: "Verdict", width: 8 },
|
|
5074
5078
|
{ header: "Detail", width: 50 }
|
|
5075
5079
|
];
|
|
5076
|
-
const headerLine = cols.map((col) =>
|
|
5080
|
+
const headerLine = cols.map((col) => padRight(`${c.bold}${col.header}${c.reset}`, col.width)).join(" ");
|
|
5077
5081
|
lines.push(headerLine);
|
|
5078
5082
|
lines.push(cols.map((col) => "\u2500".repeat(col.width)).join("\u2500\u2500"));
|
|
5079
5083
|
for (const r of scored) {
|
|
5080
|
-
const verdictColor = r.verdict === "pass" ?
|
|
5084
|
+
const verdictColor = r.verdict === "pass" ? c.green : c.red;
|
|
5081
5085
|
const failed = r.assertions.filter((a) => !a.passed);
|
|
5082
5086
|
const passed = r.assertions.filter((a) => a.passed);
|
|
5083
5087
|
const detail = failed.length > 0 ? failed[0].text.slice(0, 48) : passed.length > 0 ? passed[0].text.slice(0, 48) : "";
|
|
5084
5088
|
const row = [
|
|
5085
|
-
|
|
5086
|
-
|
|
5087
|
-
|
|
5088
|
-
|
|
5089
|
+
padRight(r.testId.slice(0, 24), cols[0].width),
|
|
5090
|
+
padLeft(formatScore(r.originalScore), cols[1].width),
|
|
5091
|
+
padLeft(`${verdictColor}${formatScore(r.newScore)}${c.reset}`, cols[2].width),
|
|
5092
|
+
padRight(`${verdictColor}${r.verdict.toUpperCase()}${c.reset}`, cols[3].width),
|
|
5089
5093
|
detail.slice(0, cols[4].width)
|
|
5090
5094
|
].join(" ");
|
|
5091
5095
|
lines.push(row);
|
|
@@ -5095,7 +5099,7 @@ function renderTable(scored, assertSpec) {
|
|
|
5095
5099
|
const meanScore = total > 0 ? scored.reduce((sum, r) => sum + r.newScore, 0) / total : 0;
|
|
5096
5100
|
lines.push("");
|
|
5097
5101
|
lines.push(
|
|
5098
|
-
`${
|
|
5102
|
+
`${c.bold}Assert:${c.reset} ${assertSpec} ${c.bold}Results:${c.reset} ${passCount}/${total} passed (${formatScore(passCount / (total || 1))}) ${c.bold}Mean:${c.reset} ${formatScore(meanScore)}`
|
|
5099
5103
|
);
|
|
5100
5104
|
return lines.join("\n");
|
|
5101
5105
|
}
|
|
@@ -5132,7 +5136,7 @@ var traceScoreCommand = command({
|
|
|
5132
5136
|
evaluatorConfig = parseAssertSpec(assertSpec);
|
|
5133
5137
|
} catch (err2) {
|
|
5134
5138
|
const msg = err2 instanceof Error ? err2.message : String(err2);
|
|
5135
|
-
console.error(`${
|
|
5139
|
+
console.error(`${c.red}Error:${c.reset} ${msg}`);
|
|
5136
5140
|
process.exit(1);
|
|
5137
5141
|
}
|
|
5138
5142
|
let results;
|
|
@@ -5140,11 +5144,11 @@ var traceScoreCommand = command({
|
|
|
5140
5144
|
results = loadResultFile(file);
|
|
5141
5145
|
} catch (err2) {
|
|
5142
5146
|
const msg = err2 instanceof Error ? err2.message : String(err2);
|
|
5143
|
-
console.error(`${
|
|
5147
|
+
console.error(`${c.red}Error:${c.reset} Could not load result file: ${msg}`);
|
|
5144
5148
|
process.exit(1);
|
|
5145
5149
|
}
|
|
5146
5150
|
if (results.length === 0) {
|
|
5147
|
-
console.error(`${
|
|
5151
|
+
console.error(`${c.yellow}Warning:${c.reset} No results found in ${file}`);
|
|
5148
5152
|
process.exit(0);
|
|
5149
5153
|
}
|
|
5150
5154
|
const traceRequired = ["latency", "cost", "token-usage", "execution-metrics"].includes(
|
|
@@ -5156,7 +5160,7 @@ var traceScoreCommand = command({
|
|
|
5156
5160
|
);
|
|
5157
5161
|
if (!hasTrace) {
|
|
5158
5162
|
console.error(
|
|
5159
|
-
`${
|
|
5163
|
+
`${c.red}Error:${c.reset} Source lacks trace metrics. Use an OTLP trace export via ${c.bold}--otel-file${c.reset} or a run manifest with summary metrics in ${c.bold}index.jsonl${c.reset}.`
|
|
5160
5164
|
);
|
|
5161
5165
|
process.exit(1);
|
|
5162
5166
|
}
|
|
@@ -5166,12 +5170,12 @@ var traceScoreCommand = command({
|
|
|
5166
5170
|
scored = await runScore(results, evaluatorConfig, testId);
|
|
5167
5171
|
} catch (err2) {
|
|
5168
5172
|
const msg = err2 instanceof Error ? err2.message : String(err2);
|
|
5169
|
-
console.error(`${
|
|
5173
|
+
console.error(`${c.red}Error:${c.reset} Scoring failed: ${msg}`);
|
|
5170
5174
|
process.exit(1);
|
|
5171
5175
|
}
|
|
5172
5176
|
if (scored.length === 0) {
|
|
5173
5177
|
console.error(
|
|
5174
|
-
`${
|
|
5178
|
+
`${c.yellow}Warning:${c.reset} No results matched${testId ? ` test ID "${testId}"` : ""}`
|
|
5175
5179
|
);
|
|
5176
5180
|
process.exit(0);
|
|
5177
5181
|
}
|
|
@@ -5187,6 +5191,192 @@ var traceScoreCommand = command({
|
|
|
5187
5191
|
}
|
|
5188
5192
|
});
|
|
5189
5193
|
|
|
5194
|
+
// src/commands/inspect/search.ts
|
|
5195
|
+
import { existsSync as existsSync4, readFileSync as readFileSync5, readdirSync as readdirSync3, statSync as statSync3 } from "node:fs";
|
|
5196
|
+
import path11 from "node:path";
|
|
5197
|
+
function collectJsonlFiles(dir) {
|
|
5198
|
+
const files = [];
|
|
5199
|
+
try {
|
|
5200
|
+
const entries2 = readdirSync3(dir, { withFileTypes: true });
|
|
5201
|
+
for (const entry of entries2) {
|
|
5202
|
+
const fullPath = path11.join(dir, entry.name);
|
|
5203
|
+
if (entry.isDirectory()) {
|
|
5204
|
+
files.push(...collectJsonlFiles(fullPath));
|
|
5205
|
+
} else if (entry.name.endsWith(".jsonl")) {
|
|
5206
|
+
files.push(fullPath);
|
|
5207
|
+
}
|
|
5208
|
+
}
|
|
5209
|
+
} catch {
|
|
5210
|
+
}
|
|
5211
|
+
return files;
|
|
5212
|
+
}
|
|
5213
|
+
function extractSnippet(text, matchIndex, matchLength, contextChars = 60) {
|
|
5214
|
+
const start = Math.max(0, matchIndex - contextChars);
|
|
5215
|
+
const end = Math.min(text.length, matchIndex + matchLength + contextChars);
|
|
5216
|
+
let snippet = text.slice(start, end);
|
|
5217
|
+
if (start > 0) snippet = `...${snippet}`;
|
|
5218
|
+
if (end < text.length) snippet = `${snippet}...`;
|
|
5219
|
+
return snippet.replace(/\n/g, "\\n").replace(/\r/g, "");
|
|
5220
|
+
}
|
|
5221
|
+
function searchJsonlFile(filePath, regex2, targetFilter, experimentFilter) {
|
|
5222
|
+
const matches = [];
|
|
5223
|
+
let content;
|
|
5224
|
+
try {
|
|
5225
|
+
content = readFileSync5(filePath, "utf8");
|
|
5226
|
+
} catch {
|
|
5227
|
+
return matches;
|
|
5228
|
+
}
|
|
5229
|
+
const lines = content.split("\n").filter((line) => line.trim());
|
|
5230
|
+
for (let i = 0; i < lines.length; i++) {
|
|
5231
|
+
const line = lines[i];
|
|
5232
|
+
let record;
|
|
5233
|
+
try {
|
|
5234
|
+
record = JSON.parse(line);
|
|
5235
|
+
} catch {
|
|
5236
|
+
continue;
|
|
5237
|
+
}
|
|
5238
|
+
const target = typeof record.target === "string" ? record.target : void 0;
|
|
5239
|
+
const experiment = typeof record.experiment === "string" ? record.experiment : void 0;
|
|
5240
|
+
const score = typeof record.score === "number" ? record.score : void 0;
|
|
5241
|
+
const testId = typeof record.test_id === "string" ? record.test_id : typeof record.source === "object" && record.source !== null ? record.source.session_id : void 0;
|
|
5242
|
+
if (targetFilter && target !== targetFilter) continue;
|
|
5243
|
+
if (experimentFilter && experiment !== experimentFilter) continue;
|
|
5244
|
+
const match = regex2.exec(line);
|
|
5245
|
+
if (match) {
|
|
5246
|
+
matches.push({
|
|
5247
|
+
file: filePath,
|
|
5248
|
+
id: testId ?? `line-${i + 1}`,
|
|
5249
|
+
lineNumber: i + 1,
|
|
5250
|
+
snippet: extractSnippet(line, match.index, match[0].length),
|
|
5251
|
+
target,
|
|
5252
|
+
experiment,
|
|
5253
|
+
score
|
|
5254
|
+
});
|
|
5255
|
+
}
|
|
5256
|
+
}
|
|
5257
|
+
return matches;
|
|
5258
|
+
}
|
|
5259
|
+
function discoverSources(basePath, cwd) {
|
|
5260
|
+
if (basePath) {
|
|
5261
|
+
const resolved = path11.isAbsolute(basePath) ? basePath : path11.resolve(cwd, basePath);
|
|
5262
|
+
if (!existsSync4(resolved)) {
|
|
5263
|
+
console.error(`${c.red}Error:${c.reset} Path does not exist: ${resolved}`);
|
|
5264
|
+
process.exit(1);
|
|
5265
|
+
}
|
|
5266
|
+
try {
|
|
5267
|
+
if (statSync3(resolved).isDirectory()) {
|
|
5268
|
+
return collectJsonlFiles(resolved);
|
|
5269
|
+
}
|
|
5270
|
+
} catch {
|
|
5271
|
+
}
|
|
5272
|
+
return [resolved];
|
|
5273
|
+
}
|
|
5274
|
+
const sources = [];
|
|
5275
|
+
sources.push(...collectJsonlFiles(path11.join(cwd, ".agentv", "results", "runs")));
|
|
5276
|
+
sources.push(...collectJsonlFiles(path11.join(cwd, ".agentv", "transcripts")));
|
|
5277
|
+
return sources;
|
|
5278
|
+
}
|
|
5279
|
+
function formatSearchResults(matches, pattern) {
|
|
5280
|
+
const lines = [];
|
|
5281
|
+
if (matches.length === 0) {
|
|
5282
|
+
lines.push(`${c.yellow}No matches found for pattern: ${pattern}${c.reset}`);
|
|
5283
|
+
return lines.join("\n");
|
|
5284
|
+
}
|
|
5285
|
+
lines.push("");
|
|
5286
|
+
lines.push(`${c.bold}Search Results${c.reset} ${c.dim}pattern: /${pattern}/${c.reset}`);
|
|
5287
|
+
lines.push(
|
|
5288
|
+
`${c.dim}${matches.length} record${matches.length !== 1 ? "s" : ""} matched${c.reset}`
|
|
5289
|
+
);
|
|
5290
|
+
lines.push("");
|
|
5291
|
+
const byFile = /* @__PURE__ */ new Map();
|
|
5292
|
+
for (const match of matches) {
|
|
5293
|
+
const existing = byFile.get(match.file) ?? [];
|
|
5294
|
+
existing.push(match);
|
|
5295
|
+
byFile.set(match.file, existing);
|
|
5296
|
+
}
|
|
5297
|
+
for (const [file, fileMatches] of byFile) {
|
|
5298
|
+
lines.push(`${c.cyan}${file}${c.reset}`);
|
|
5299
|
+
for (const match of fileMatches) {
|
|
5300
|
+
const meta = [];
|
|
5301
|
+
if (match.target) meta.push(`target:${match.target}`);
|
|
5302
|
+
if (match.experiment) meta.push(`exp:${match.experiment}`);
|
|
5303
|
+
if (match.score !== void 0) meta.push(`score:${match.score}`);
|
|
5304
|
+
const metaStr = meta.length > 0 ? ` ${c.dim}[${meta.join(", ")}]${c.reset}` : "";
|
|
5305
|
+
lines.push(
|
|
5306
|
+
` ${c.bold}${match.id}${c.reset} ${c.dim}(line ${match.lineNumber})${c.reset}${metaStr}`
|
|
5307
|
+
);
|
|
5308
|
+
lines.push(` ${match.snippet}`);
|
|
5309
|
+
}
|
|
5310
|
+
lines.push("");
|
|
5311
|
+
}
|
|
5312
|
+
return lines.join("\n");
|
|
5313
|
+
}
|
|
5314
|
+
var inspectSearchCommand = command({
|
|
5315
|
+
name: "search",
|
|
5316
|
+
description: "Search across evaluation results and transcripts for a regex pattern",
|
|
5317
|
+
args: {
|
|
5318
|
+
pattern: option({
|
|
5319
|
+
type: string,
|
|
5320
|
+
long: "pattern",
|
|
5321
|
+
short: "p",
|
|
5322
|
+
description: "Regex pattern to search for in result/transcript content"
|
|
5323
|
+
}),
|
|
5324
|
+
path: positional({
|
|
5325
|
+
type: optional(string),
|
|
5326
|
+
displayName: "path",
|
|
5327
|
+
description: "Directory or file to search (default: .agentv/results/runs/ and .agentv/transcripts/)"
|
|
5328
|
+
}),
|
|
5329
|
+
target: option({
|
|
5330
|
+
type: optional(string),
|
|
5331
|
+
long: "target",
|
|
5332
|
+
description: "Filter results to a specific target name"
|
|
5333
|
+
}),
|
|
5334
|
+
experiment: option({
|
|
5335
|
+
type: optional(string),
|
|
5336
|
+
long: "experiment",
|
|
5337
|
+
description: "Filter results to a specific experiment name"
|
|
5338
|
+
}),
|
|
5339
|
+
dir: option({
|
|
5340
|
+
type: optional(string),
|
|
5341
|
+
long: "dir",
|
|
5342
|
+
short: "d",
|
|
5343
|
+
description: "Working directory (default: current directory)"
|
|
5344
|
+
}),
|
|
5345
|
+
format: option({
|
|
5346
|
+
type: optional(oneOf(["table", "json"])),
|
|
5347
|
+
long: "format",
|
|
5348
|
+
short: "f",
|
|
5349
|
+
description: "Output format: table (default) or json"
|
|
5350
|
+
})
|
|
5351
|
+
},
|
|
5352
|
+
handler: async ({ pattern, path: searchPath, target, experiment, dir, format }) => {
|
|
5353
|
+
const cwd = dir ?? process.cwd();
|
|
5354
|
+
let regex2;
|
|
5355
|
+
try {
|
|
5356
|
+
regex2 = new RegExp(pattern, "i");
|
|
5357
|
+
} catch (err2) {
|
|
5358
|
+
console.error(`${c.red}Error:${c.reset} Invalid regex pattern: ${err2.message}`);
|
|
5359
|
+
process.exit(1);
|
|
5360
|
+
}
|
|
5361
|
+
const sources = discoverSources(searchPath, cwd);
|
|
5362
|
+
if (sources.length === 0) {
|
|
5363
|
+
console.error(`${c.yellow}No JSONL files found to search.${c.reset}`);
|
|
5364
|
+
console.error(`${c.dim}Run an evaluation first, or specify a path to search.${c.reset}`);
|
|
5365
|
+
process.exit(0);
|
|
5366
|
+
}
|
|
5367
|
+
const allMatches = [];
|
|
5368
|
+
for (const source of sources) {
|
|
5369
|
+
const fileMatches = searchJsonlFile(source, regex2, target, experiment);
|
|
5370
|
+
allMatches.push(...fileMatches);
|
|
5371
|
+
}
|
|
5372
|
+
if (format === "json") {
|
|
5373
|
+
console.log(JSON.stringify(toSnakeCaseDeep(allMatches), null, 2));
|
|
5374
|
+
} else {
|
|
5375
|
+
console.log(formatSearchResults(allMatches, pattern));
|
|
5376
|
+
}
|
|
5377
|
+
}
|
|
5378
|
+
});
|
|
5379
|
+
|
|
5190
5380
|
// src/commands/inspect/show.ts
|
|
5191
5381
|
function renderFlatTrace(result) {
|
|
5192
5382
|
const trace = getTraceSummary(result);
|
|
@@ -5214,8 +5404,8 @@ function renderFlatTrace(result) {
|
|
|
5214
5404
|
}
|
|
5215
5405
|
function renderScores(scores) {
|
|
5216
5406
|
return scores.map((s) => {
|
|
5217
|
-
const scoreColor = s.score >= 0.9 ?
|
|
5218
|
-
return `${s.name} ${scoreColor}${formatScore(s.score)}${
|
|
5407
|
+
const scoreColor = s.score >= 0.9 ? c.green : s.score >= 0.5 ? c.yellow : c.red;
|
|
5408
|
+
return `${s.name} ${scoreColor}${formatScore(s.score)}${c.reset}`;
|
|
5219
5409
|
}).join(" | ");
|
|
5220
5410
|
}
|
|
5221
5411
|
function renderTree(result) {
|
|
@@ -5228,7 +5418,7 @@ function renderTree(result) {
|
|
|
5228
5418
|
if (getTraceSummary(result) || result.duration_ms !== void 0 || result.cost_usd !== void 0) {
|
|
5229
5419
|
return renderFlatTrace(result);
|
|
5230
5420
|
}
|
|
5231
|
-
return `${
|
|
5421
|
+
return `${c.dim}No trace data available${c.reset}`;
|
|
5232
5422
|
}
|
|
5233
5423
|
const lines = [];
|
|
5234
5424
|
const testId = result.test_id ?? result.eval_id ?? "unknown";
|
|
@@ -5238,7 +5428,7 @@ function renderTree(result) {
|
|
|
5238
5428
|
if (totalDuration !== void 0) rootParts.push(formatDuration(totalDuration));
|
|
5239
5429
|
if (totalTokens !== void 0) rootParts.push(`${formatNumber(totalTokens)} tok`);
|
|
5240
5430
|
if (result.cost_usd !== void 0) rootParts.push(formatCost(result.cost_usd));
|
|
5241
|
-
lines.push(`${
|
|
5431
|
+
lines.push(`${c.bold}${rootParts.join(", ")}${c.reset}`);
|
|
5242
5432
|
const steps = [];
|
|
5243
5433
|
for (let i = 0; i < messages.length; i++) {
|
|
5244
5434
|
const msg = messages[i];
|
|
@@ -5256,7 +5446,7 @@ function renderTree(result) {
|
|
|
5256
5446
|
const connector = isLast ? "\u2514\u2500" : "\u251C\u2500";
|
|
5257
5447
|
const childPrefix = isLast ? " " : "\u2502 ";
|
|
5258
5448
|
if (step.type === "llm") {
|
|
5259
|
-
const parts = [`${
|
|
5449
|
+
const parts = [`${c.cyan}model${c.reset}`];
|
|
5260
5450
|
if (step.msg.duration_ms !== void 0) parts.push(formatDuration(step.msg.duration_ms));
|
|
5261
5451
|
if (step.msg.token_usage) {
|
|
5262
5452
|
const tok = step.msg.token_usage.input + step.msg.token_usage.output;
|
|
@@ -5267,18 +5457,18 @@ function renderTree(result) {
|
|
|
5267
5457
|
const toolCalls = step.msg.tool_calls ?? [];
|
|
5268
5458
|
if (toolCalls.length === 1) {
|
|
5269
5459
|
const tc = toolCalls[0];
|
|
5270
|
-
const parts = [`${
|
|
5460
|
+
const parts = [`${c.yellow}${tc.tool}${c.reset}`];
|
|
5271
5461
|
if (tc.duration_ms !== void 0) parts.push(formatDuration(tc.duration_ms));
|
|
5272
5462
|
lines.push(`${connector} ${parts.join(", ")}`);
|
|
5273
5463
|
} else {
|
|
5274
|
-
const parts = [`${
|
|
5464
|
+
const parts = [`${c.dim}tools${c.reset}`];
|
|
5275
5465
|
if (step.msg.duration_ms !== void 0) parts.push(formatDuration(step.msg.duration_ms));
|
|
5276
5466
|
lines.push(`${connector} ${parts.join(", ")}`);
|
|
5277
5467
|
for (let ti = 0; ti < toolCalls.length; ti++) {
|
|
5278
5468
|
const tc = toolCalls[ti];
|
|
5279
5469
|
const isLastTool = ti === toolCalls.length - 1;
|
|
5280
5470
|
const toolConnector = isLastTool ? "\u2514\u2500" : "\u251C\u2500";
|
|
5281
|
-
const tcParts = [`${
|
|
5471
|
+
const tcParts = [`${c.yellow}${tc.tool}${c.reset}`];
|
|
5282
5472
|
if (tc.duration_ms !== void 0) tcParts.push(formatDuration(tc.duration_ms));
|
|
5283
5473
|
lines.push(`${childPrefix}${toolConnector} ${tcParts.join(", ")}`);
|
|
5284
5474
|
}
|
|
@@ -5287,7 +5477,7 @@ function renderTree(result) {
|
|
|
5287
5477
|
}
|
|
5288
5478
|
if (result.scores && result.scores.length > 0) {
|
|
5289
5479
|
lines.push("");
|
|
5290
|
-
lines.push(`${
|
|
5480
|
+
lines.push(`${c.dim}Scores:${c.reset} ${renderScores(result.scores)}`);
|
|
5291
5481
|
}
|
|
5292
5482
|
return lines.join("\n");
|
|
5293
5483
|
}
|
|
@@ -5299,11 +5489,11 @@ function renderSpanTree(result, spans) {
|
|
|
5299
5489
|
if (result.duration_ms !== void 0) rootParts.push(formatDuration(result.duration_ms));
|
|
5300
5490
|
if (totalTokens !== void 0) rootParts.push(`${formatNumber(totalTokens)} tok`);
|
|
5301
5491
|
if (result.cost_usd !== void 0) rootParts.push(formatCost(result.cost_usd));
|
|
5302
|
-
lines.push(`${
|
|
5492
|
+
lines.push(`${c.bold}${rootParts.join(", ")}${c.reset}`);
|
|
5303
5493
|
spans.forEach((span, index) => {
|
|
5304
5494
|
const connector = index === spans.length - 1 ? "\u2514\u2500" : "\u251C\u2500";
|
|
5305
|
-
const color = span.type === "llm" ?
|
|
5306
|
-
const parts = [`${color}${span.name}${
|
|
5495
|
+
const color = span.type === "llm" ? c.cyan : c.yellow;
|
|
5496
|
+
const parts = [`${color}${span.name}${c.reset}`];
|
|
5307
5497
|
if (span.duration_ms !== void 0) {
|
|
5308
5498
|
parts.push(formatDuration(span.duration_ms));
|
|
5309
5499
|
}
|
|
@@ -5311,7 +5501,7 @@ function renderSpanTree(result, spans) {
|
|
|
5311
5501
|
});
|
|
5312
5502
|
if (result.scores && result.scores.length > 0) {
|
|
5313
5503
|
lines.push("");
|
|
5314
|
-
lines.push(`${
|
|
5504
|
+
lines.push(`${c.dim}Scores:${c.reset} ${renderScores(result.scores)}`);
|
|
5315
5505
|
}
|
|
5316
5506
|
return lines.join("\n");
|
|
5317
5507
|
}
|
|
@@ -5322,30 +5512,30 @@ function formatResultDetail(result, index, tree) {
|
|
|
5322
5512
|
lines.push(renderTree(result));
|
|
5323
5513
|
return lines.join("\n");
|
|
5324
5514
|
}
|
|
5325
|
-
const scoreColor = result.score >= 0.9 ?
|
|
5515
|
+
const scoreColor = result.score >= 0.9 ? c.green : result.score >= 0.5 ? c.yellow : c.red;
|
|
5326
5516
|
lines.push(
|
|
5327
|
-
`${
|
|
5517
|
+
`${c.bold}${testId}${c.reset} ${scoreColor}${formatScore(result.score)}${c.reset}${result.target ? ` ${c.dim}target: ${result.target}${c.reset}` : ""}${result.suite ? ` ${c.dim}suite: ${result.suite}${c.reset}` : ""}`
|
|
5328
5518
|
);
|
|
5329
5519
|
if (result.error) {
|
|
5330
|
-
lines.push(` ${
|
|
5520
|
+
lines.push(` ${c.red}Error: ${result.error}${c.reset}`);
|
|
5331
5521
|
}
|
|
5332
5522
|
if (result.assertions && result.assertions.length > 0) {
|
|
5333
5523
|
const passed = result.assertions.filter((a) => a.passed);
|
|
5334
5524
|
const failed = result.assertions.filter((a) => !a.passed);
|
|
5335
5525
|
if (passed.length > 0)
|
|
5336
5526
|
lines.push(
|
|
5337
|
-
` ${
|
|
5527
|
+
` ${c.green}\u2713 Passed:${c.reset} ${passed.map((a) => a.text).join(", ")}`
|
|
5338
5528
|
);
|
|
5339
5529
|
if (failed.length > 0)
|
|
5340
5530
|
lines.push(
|
|
5341
|
-
` ${
|
|
5531
|
+
` ${c.red}\u2717 Failed:${c.reset} ${failed.map((a) => a.text).join(", ")}`
|
|
5342
5532
|
);
|
|
5343
5533
|
}
|
|
5344
5534
|
if (result.scores && result.scores.length > 0) {
|
|
5345
|
-
lines.push(` ${
|
|
5535
|
+
lines.push(` ${c.dim}Scores:${c.reset} ${renderScores(result.scores)}`);
|
|
5346
5536
|
}
|
|
5347
5537
|
if (result.trace || result.duration_ms !== void 0 || result.cost_usd !== void 0) {
|
|
5348
|
-
lines.push(` ${
|
|
5538
|
+
lines.push(` ${c.dim}Trace:${c.reset} ${renderFlatTrace(result)}`);
|
|
5349
5539
|
}
|
|
5350
5540
|
if (result.assertions && result.assertions.length > 0) {
|
|
5351
5541
|
const withEvidence = result.assertions.filter((a) => a.evidence);
|
|
@@ -5353,7 +5543,7 @@ function formatResultDetail(result, index, tree) {
|
|
|
5353
5543
|
const maxLen = 200;
|
|
5354
5544
|
const evidence = withEvidence[0].evidence;
|
|
5355
5545
|
const truncated = evidence.length > maxLen ? `${evidence.slice(0, maxLen)}...` : evidence;
|
|
5356
|
-
lines.push(` ${
|
|
5546
|
+
lines.push(` ${c.dim}Evidence: ${truncated}${c.reset}`);
|
|
5357
5547
|
}
|
|
5358
5548
|
}
|
|
5359
5549
|
return lines.join("\n");
|
|
@@ -5364,9 +5554,9 @@ function formatShowTable(results, filePath, testIdFilter, tree) {
|
|
|
5364
5554
|
if (testIdFilter) {
|
|
5365
5555
|
filtered = results.filter((r) => (r.test_id ?? r.eval_id) === testIdFilter);
|
|
5366
5556
|
if (filtered.length === 0) {
|
|
5367
|
-
lines.push(`${
|
|
5557
|
+
lines.push(`${c.yellow}No results found with test ID "${testIdFilter}"${c.reset}`);
|
|
5368
5558
|
lines.push("");
|
|
5369
|
-
lines.push(`${
|
|
5559
|
+
lines.push(`${c.dim}Available test IDs:${c.reset}`);
|
|
5370
5560
|
for (const r of results) {
|
|
5371
5561
|
lines.push(` ${r.test_id ?? r.eval_id ?? "(unnamed)"}`);
|
|
5372
5562
|
}
|
|
@@ -5374,19 +5564,19 @@ function formatShowTable(results, filePath, testIdFilter, tree) {
|
|
|
5374
5564
|
}
|
|
5375
5565
|
}
|
|
5376
5566
|
lines.push("");
|
|
5377
|
-
lines.push(`${
|
|
5567
|
+
lines.push(`${c.bold}Results:${c.reset} ${c.cyan}${filePath}${c.reset}`);
|
|
5378
5568
|
const totalTests = filtered.length;
|
|
5379
5569
|
const passCount = filtered.filter((r) => r.score >= 1).length;
|
|
5380
5570
|
const failCount = totalTests - passCount;
|
|
5381
5571
|
const avgScore = totalTests > 0 ? filtered.reduce((sum, r) => sum + r.score, 0) / totalTests : 0;
|
|
5382
5572
|
lines.push(
|
|
5383
|
-
`${
|
|
5573
|
+
`${c.dim}${totalTests} test${totalTests !== 1 ? "s" : ""} | ${c.green}${passCount} passed${c.reset}${c.dim}${failCount > 0 ? ` | ${c.red}${failCount} failed${c.reset}${c.dim}` : ""} | avg score: ${formatScore(avgScore)}${c.reset}`
|
|
5384
5574
|
);
|
|
5385
5575
|
lines.push("");
|
|
5386
5576
|
for (let i = 0; i < filtered.length; i++) {
|
|
5387
5577
|
lines.push(formatResultDetail(filtered[i], i, tree ?? false));
|
|
5388
5578
|
if (i < filtered.length - 1) {
|
|
5389
|
-
lines.push(`${
|
|
5579
|
+
lines.push(`${c.dim}${"\u2500".repeat(60)}${c.reset}`);
|
|
5390
5580
|
}
|
|
5391
5581
|
}
|
|
5392
5582
|
lines.push("");
|
|
@@ -5515,32 +5705,32 @@ function groupResults(results, groupBy2) {
|
|
|
5515
5705
|
function formatStatsTable(groups, filePath) {
|
|
5516
5706
|
const lines = [];
|
|
5517
5707
|
lines.push("");
|
|
5518
|
-
lines.push(`${
|
|
5708
|
+
lines.push(`${c.bold}Statistics:${c.reset} ${c.cyan}${filePath}${c.reset}`);
|
|
5519
5709
|
for (const group of groups) {
|
|
5520
5710
|
if (groups.length > 1 || group.label !== "all") {
|
|
5521
5711
|
lines.push("");
|
|
5522
5712
|
lines.push(
|
|
5523
|
-
`${
|
|
5713
|
+
`${c.bold}Group: ${group.label}${c.reset} ${c.dim}(${group.results.length} tests)${c.reset}`
|
|
5524
5714
|
);
|
|
5525
5715
|
} else {
|
|
5526
|
-
lines.push(`${
|
|
5716
|
+
lines.push(`${c.dim}${group.results.length} tests${c.reset}`);
|
|
5527
5717
|
}
|
|
5528
5718
|
lines.push("");
|
|
5529
5719
|
const metrics = collectMetrics(group.results);
|
|
5530
5720
|
if (metrics.length === 0) {
|
|
5531
|
-
lines.push(`${
|
|
5721
|
+
lines.push(`${c.yellow}No trace metrics available${c.reset}`);
|
|
5532
5722
|
continue;
|
|
5533
5723
|
}
|
|
5534
5724
|
const nameWidth = Math.max(12, ...metrics.map((m) => m.name.length));
|
|
5535
5725
|
const colWidth = 10;
|
|
5536
|
-
const header = ` ${
|
|
5537
|
-
lines.push(`${
|
|
5726
|
+
const header = ` ${padRight("Metric", nameWidth)} ${padLeft("Mean", colWidth)} ${padLeft("P50", colWidth)} ${padLeft("P90", colWidth)} ${padLeft("P95", colWidth)} ${padLeft("P99", colWidth)}`;
|
|
5727
|
+
lines.push(`${c.dim}${header}${c.reset}`);
|
|
5538
5728
|
lines.push(
|
|
5539
|
-
`${
|
|
5729
|
+
`${c.dim} ${"\u2500".repeat(nameWidth)} ${"\u2500".repeat(colWidth)} ${"\u2500".repeat(colWidth)} ${"\u2500".repeat(colWidth)} ${"\u2500".repeat(colWidth)} ${"\u2500".repeat(colWidth)}${c.reset}`
|
|
5540
5730
|
);
|
|
5541
5731
|
for (const metric of metrics) {
|
|
5542
5732
|
const sorted = [...metric.values].sort((a, b) => a - b);
|
|
5543
|
-
const row = ` ${
|
|
5733
|
+
const row = ` ${padRight(metric.name, nameWidth)} ${padLeft(metric.formatter(mean(sorted)), colWidth)} ${padLeft(metric.formatter(percentile(sorted, 50)), colWidth)} ${padLeft(metric.formatter(percentile(sorted, 90)), colWidth)} ${padLeft(metric.formatter(percentile(sorted, 95)), colWidth)} ${padLeft(metric.formatter(percentile(sorted, 99)), colWidth)}`;
|
|
5544
5734
|
lines.push(row);
|
|
5545
5735
|
}
|
|
5546
5736
|
}
|
|
@@ -5620,15 +5810,17 @@ var inspectCommand = subcommands({
|
|
|
5620
5810
|
name: "inspect",
|
|
5621
5811
|
description: "Inspect and analyze evaluation results",
|
|
5622
5812
|
cmds: {
|
|
5813
|
+
filter: inspectFilterCommand,
|
|
5623
5814
|
list: traceListCommand,
|
|
5624
5815
|
score: traceScoreCommand,
|
|
5816
|
+
search: inspectSearchCommand,
|
|
5625
5817
|
show: traceShowCommand,
|
|
5626
5818
|
stats: traceStatsCommand
|
|
5627
5819
|
}
|
|
5628
5820
|
});
|
|
5629
5821
|
|
|
5630
5822
|
// src/commands/pipeline/bench.ts
|
|
5631
|
-
import { existsSync as
|
|
5823
|
+
import { existsSync as existsSync5 } from "node:fs";
|
|
5632
5824
|
import { readFile as readFile2, readdir, writeFile as writeFile5 } from "node:fs/promises";
|
|
5633
5825
|
import { join } from "node:path";
|
|
5634
5826
|
var evalBenchCommand = command({
|
|
@@ -5739,7 +5931,7 @@ var evalBenchCommand = command({
|
|
|
5739
5931
|
}));
|
|
5740
5932
|
let executionStatus = "ok";
|
|
5741
5933
|
const timingPath = join(testDir, "timing.json");
|
|
5742
|
-
if (
|
|
5934
|
+
if (existsSync5(timingPath)) {
|
|
5743
5935
|
try {
|
|
5744
5936
|
const timing = JSON.parse(await readFile2(timingPath, "utf8"));
|
|
5745
5937
|
if (typeof timing.execution_status === "string") {
|
|
@@ -5748,7 +5940,7 @@ var evalBenchCommand = command({
|
|
|
5748
5940
|
} catch {
|
|
5749
5941
|
}
|
|
5750
5942
|
}
|
|
5751
|
-
const hasResponse =
|
|
5943
|
+
const hasResponse = existsSync5(join(testDir, "response.md"));
|
|
5752
5944
|
indexLines.push(
|
|
5753
5945
|
JSON.stringify({
|
|
5754
5946
|
timestamp: manifest.timestamp,
|
|
@@ -5796,6 +5988,33 @@ var evalBenchCommand = command({
|
|
|
5796
5988
|
"utf8"
|
|
5797
5989
|
);
|
|
5798
5990
|
console.log(`Benchmark: ${testIds.length} test(s), pass_rate=${passRateStats.mean}`);
|
|
5991
|
+
const results = indexLines.map((line) => JSON.parse(line));
|
|
5992
|
+
await maybeAutoExportRunArtifacts({
|
|
5993
|
+
cwd: process.cwd(),
|
|
5994
|
+
run_dir: exportDir,
|
|
5995
|
+
experiment,
|
|
5996
|
+
test_files: manifest.eval_file ? [manifest.eval_file] : [],
|
|
5997
|
+
results: results.map((result) => ({
|
|
5998
|
+
testId: result.test_id,
|
|
5999
|
+
score: result.score,
|
|
6000
|
+
executionStatus: result.execution_status,
|
|
6001
|
+
target: result.target,
|
|
6002
|
+
timestamp: result.timestamp
|
|
6003
|
+
})),
|
|
6004
|
+
eval_summaries: [
|
|
6005
|
+
{
|
|
6006
|
+
eval_file: manifest.eval_file ?? "pipeline",
|
|
6007
|
+
total: results.length,
|
|
6008
|
+
passed: results.filter((result) => result.score >= DEFAULT_THRESHOLD).length,
|
|
6009
|
+
avg_score: results.length > 0 ? results.reduce((sum, result) => sum + result.score, 0) / results.length : 0,
|
|
6010
|
+
results: results.map((result) => ({
|
|
6011
|
+
test_id: result.test_id,
|
|
6012
|
+
score: result.score,
|
|
6013
|
+
status: result.execution_status === "execution_error" ? "ERROR" : result.score >= DEFAULT_THRESHOLD ? "PASS" : "FAIL"
|
|
6014
|
+
}))
|
|
6015
|
+
}
|
|
6016
|
+
]
|
|
6017
|
+
});
|
|
5799
6018
|
}
|
|
5800
6019
|
});
|
|
5801
6020
|
function computeStats(values) {
|
|
@@ -5987,7 +6206,7 @@ var evalInputCommand = command({
|
|
|
5987
6206
|
out: option({
|
|
5988
6207
|
type: optional(string),
|
|
5989
6208
|
long: "out",
|
|
5990
|
-
description: "Output directory for extracted inputs (default: .agentv/results/runs/<timestamp>)"
|
|
6209
|
+
description: "Output directory for extracted inputs (default: .agentv/results/runs/<experiment>/<timestamp>)"
|
|
5991
6210
|
}),
|
|
5992
6211
|
experiment: option({
|
|
5993
6212
|
type: optional(string),
|
|
@@ -5997,7 +6216,7 @@ var evalInputCommand = command({
|
|
|
5997
6216
|
},
|
|
5998
6217
|
handler: async ({ evalPath, out, experiment }) => {
|
|
5999
6218
|
const resolvedEvalPath = resolve(evalPath);
|
|
6000
|
-
const outDir = resolve(out ?? buildDefaultRunDir(process.cwd()));
|
|
6219
|
+
const outDir = resolve(out ?? buildDefaultRunDir(process.cwd(), experiment));
|
|
6001
6220
|
const repoRoot = await findRepoRoot(dirname(resolvedEvalPath));
|
|
6002
6221
|
const evalDir = dirname(resolvedEvalPath);
|
|
6003
6222
|
const category = deriveCategory(relative(process.cwd(), resolvedEvalPath));
|
|
@@ -6147,7 +6366,7 @@ async function writeJson(filePath, data) {
|
|
|
6147
6366
|
|
|
6148
6367
|
// src/commands/pipeline/run.ts
|
|
6149
6368
|
import { exec } from "node:child_process";
|
|
6150
|
-
import { existsSync as
|
|
6369
|
+
import { existsSync as existsSync6, readFileSync as readFileSync6, unlinkSync } from "node:fs";
|
|
6151
6370
|
import { mkdir as mkdir7, readFile as readFile5, readdir as readdir3, writeFile as writeFile8 } from "node:fs/promises";
|
|
6152
6371
|
import { tmpdir } from "node:os";
|
|
6153
6372
|
import { dirname as dirname2, join as join4, relative as relative2, resolve as resolve2 } from "node:path";
|
|
@@ -6161,9 +6380,9 @@ function loadEnvFile(dir) {
|
|
|
6161
6380
|
let current = resolve2(dir);
|
|
6162
6381
|
while (true) {
|
|
6163
6382
|
const candidate = join4(current, ".env");
|
|
6164
|
-
if (
|
|
6383
|
+
if (existsSync6(candidate)) {
|
|
6165
6384
|
const env2 = {};
|
|
6166
|
-
for (const line of
|
|
6385
|
+
for (const line of readFileSync6(candidate, "utf8").split("\n")) {
|
|
6167
6386
|
const trimmed = line.trim();
|
|
6168
6387
|
if (!trimmed || trimmed.startsWith("#")) continue;
|
|
6169
6388
|
const eqIdx = trimmed.indexOf("=");
|
|
@@ -6190,7 +6409,7 @@ var evalRunCommand2 = command({
|
|
|
6190
6409
|
out: option({
|
|
6191
6410
|
type: optional(string),
|
|
6192
6411
|
long: "out",
|
|
6193
|
-
description: "Output directory for results (default: .agentv/results/runs/<timestamp>)"
|
|
6412
|
+
description: "Output directory for results (default: .agentv/results/runs/<experiment>/<timestamp>)"
|
|
6194
6413
|
}),
|
|
6195
6414
|
workers: option({
|
|
6196
6415
|
type: optional(number),
|
|
@@ -6210,7 +6429,7 @@ var evalRunCommand2 = command({
|
|
|
6210
6429
|
},
|
|
6211
6430
|
handler: async ({ evalPath, out, workers, experiment, graderType }) => {
|
|
6212
6431
|
const resolvedEvalPath = resolve2(evalPath);
|
|
6213
|
-
const outDir = resolve2(out ?? buildDefaultRunDir(process.cwd()));
|
|
6432
|
+
const outDir = resolve2(out ?? buildDefaultRunDir(process.cwd(), experiment));
|
|
6214
6433
|
const repoRoot = await findRepoRoot(dirname2(resolvedEvalPath));
|
|
6215
6434
|
const evalDir = dirname2(resolvedEvalPath);
|
|
6216
6435
|
const category = deriveCategory(relative2(process.cwd(), resolvedEvalPath));
|
|
@@ -6348,8 +6567,8 @@ var evalRunCommand2 = command({
|
|
|
6348
6567
|
});
|
|
6349
6568
|
const durationMs = Math.round(performance.now() - start);
|
|
6350
6569
|
let response;
|
|
6351
|
-
if (
|
|
6352
|
-
response =
|
|
6570
|
+
if (existsSync6(outputFile)) {
|
|
6571
|
+
response = readFileSync6(outputFile, "utf8");
|
|
6353
6572
|
} else {
|
|
6354
6573
|
response = "ERROR: No output file generated";
|
|
6355
6574
|
}
|
|
@@ -6381,8 +6600,8 @@ var evalRunCommand2 = command({
|
|
|
6381
6600
|
invCompleted++;
|
|
6382
6601
|
writeInvProgress();
|
|
6383
6602
|
try {
|
|
6384
|
-
if (
|
|
6385
|
-
if (
|
|
6603
|
+
if (existsSync6(promptFile)) unlinkSync(promptFile);
|
|
6604
|
+
if (existsSync6(outputFile)) unlinkSync(outputFile);
|
|
6386
6605
|
} catch {
|
|
6387
6606
|
}
|
|
6388
6607
|
}
|
|
@@ -6469,7 +6688,7 @@ async function writeGraderConfigs2(testDir, assertions, evalDir) {
|
|
|
6469
6688
|
let promptContent = "";
|
|
6470
6689
|
if (config.resolvedPromptPath) {
|
|
6471
6690
|
try {
|
|
6472
|
-
promptContent =
|
|
6691
|
+
promptContent = readFileSync6(config.resolvedPromptPath, "utf8");
|
|
6473
6692
|
} catch {
|
|
6474
6693
|
promptContent = typeof config.prompt === "string" ? config.prompt : "";
|
|
6475
6694
|
}
|
|
@@ -6500,10 +6719,10 @@ var pipelineCommand = subcommands({
|
|
|
6500
6719
|
});
|
|
6501
6720
|
|
|
6502
6721
|
// src/commands/results/export.ts
|
|
6503
|
-
import
|
|
6722
|
+
import path12 from "node:path";
|
|
6504
6723
|
|
|
6505
6724
|
// src/commands/results/shared.ts
|
|
6506
|
-
import { existsSync as
|
|
6725
|
+
import { existsSync as existsSync7 } from "node:fs";
|
|
6507
6726
|
var sourceArg = positional({
|
|
6508
6727
|
type: optional(string),
|
|
6509
6728
|
displayName: "source",
|
|
@@ -6513,7 +6732,7 @@ async function resolveSourceFile(source, cwd) {
|
|
|
6513
6732
|
let sourceFile;
|
|
6514
6733
|
if (source) {
|
|
6515
6734
|
sourceFile = resolveResultSourcePath(source, cwd);
|
|
6516
|
-
if (!
|
|
6735
|
+
if (!existsSync7(sourceFile)) {
|
|
6517
6736
|
console.error(`Error: File not found: ${sourceFile}`);
|
|
6518
6737
|
process.exit(1);
|
|
6519
6738
|
}
|
|
@@ -6521,7 +6740,7 @@ async function resolveSourceFile(source, cwd) {
|
|
|
6521
6740
|
} else {
|
|
6522
6741
|
const cache = await loadRunCache(cwd);
|
|
6523
6742
|
const cachedFile = cache ? resolveRunCacheFile(cache) : "";
|
|
6524
|
-
if (cachedFile &&
|
|
6743
|
+
if (cachedFile && existsSync7(cachedFile)) {
|
|
6525
6744
|
sourceFile = cachedFile;
|
|
6526
6745
|
} else {
|
|
6527
6746
|
const metas = listResultFiles(cwd, 1);
|
|
@@ -6547,14 +6766,20 @@ async function loadResults(source, cwd) {
|
|
|
6547
6766
|
|
|
6548
6767
|
// src/commands/results/export.ts
|
|
6549
6768
|
function deriveOutputDir(cwd, sourceFile) {
|
|
6550
|
-
if (
|
|
6769
|
+
if (path12.basename(sourceFile) !== RESULT_INDEX_FILENAME) {
|
|
6551
6770
|
throw new Error(`Expected a run manifest named ${RESULT_INDEX_FILENAME}: ${sourceFile}`);
|
|
6552
6771
|
}
|
|
6553
|
-
const
|
|
6772
|
+
const runDir = path12.dirname(sourceFile);
|
|
6773
|
+
const segments = path12.normalize(runDir).split(path12.sep).filter(Boolean);
|
|
6774
|
+
const runsIndex = segments.lastIndexOf("runs");
|
|
6775
|
+
if (runsIndex >= 0 && runsIndex < segments.length - 1) {
|
|
6776
|
+
return path12.join(cwd, ".agentv", "results", "export", ...segments.slice(runsIndex + 1));
|
|
6777
|
+
}
|
|
6778
|
+
const parentDir = path12.basename(runDir);
|
|
6554
6779
|
if (parentDir.startsWith("eval_")) {
|
|
6555
|
-
return
|
|
6780
|
+
return path12.join(cwd, ".agentv", "results", "export", parentDir.slice(5));
|
|
6556
6781
|
}
|
|
6557
|
-
return
|
|
6782
|
+
return path12.join(cwd, ".agentv", "results", "export", parentDir);
|
|
6558
6783
|
}
|
|
6559
6784
|
async function loadExportSource(source, cwd) {
|
|
6560
6785
|
const { sourceFile } = await resolveSourceFile(source, cwd);
|
|
@@ -6587,7 +6812,7 @@ var resultsExportCommand = command({
|
|
|
6587
6812
|
const cwd = dir ?? process.cwd();
|
|
6588
6813
|
try {
|
|
6589
6814
|
const { sourceFile, results } = await loadExportSource(source, cwd);
|
|
6590
|
-
const outputDir = out ?
|
|
6815
|
+
const outputDir = out ? path12.isAbsolute(out) ? out : path12.resolve(cwd, out) : deriveOutputDir(cwd, sourceFile);
|
|
6591
6816
|
await writeArtifactsFromResults(results, outputDir, {
|
|
6592
6817
|
evalFile: sourceFile
|
|
6593
6818
|
});
|
|
@@ -6728,7 +6953,7 @@ var resultsShowCommand = command({
|
|
|
6728
6953
|
});
|
|
6729
6954
|
|
|
6730
6955
|
// src/commands/results/summary.ts
|
|
6731
|
-
import { existsSync as
|
|
6956
|
+
import { existsSync as existsSync8, readFileSync as readFileSync7 } from "node:fs";
|
|
6732
6957
|
function formatSummary(results, grading) {
|
|
6733
6958
|
const total = results.length;
|
|
6734
6959
|
let passed;
|
|
@@ -6779,9 +7004,9 @@ var resultsSummaryCommand = command({
|
|
|
6779
7004
|
const { results, sourceFile } = await loadResults(source, cwd);
|
|
6780
7005
|
let grading;
|
|
6781
7006
|
const gradingPath = sourceFile.replace(/\.jsonl$/, ".grading.json");
|
|
6782
|
-
if (
|
|
7007
|
+
if (existsSync8(gradingPath)) {
|
|
6783
7008
|
try {
|
|
6784
|
-
grading = JSON.parse(
|
|
7009
|
+
grading = JSON.parse(readFileSync7(gradingPath, "utf8"));
|
|
6785
7010
|
} catch {
|
|
6786
7011
|
}
|
|
6787
7012
|
}
|
|
@@ -6794,16 +7019,17 @@ var resultsSummaryCommand = command({
|
|
|
6794
7019
|
});
|
|
6795
7020
|
|
|
6796
7021
|
// src/commands/results/validate.ts
|
|
6797
|
-
import { existsSync as
|
|
6798
|
-
import
|
|
7022
|
+
import { existsSync as existsSync9, readFileSync as readFileSync8, statSync as statSync4 } from "node:fs";
|
|
7023
|
+
import path13 from "node:path";
|
|
6799
7024
|
function checkDirectoryNaming(runDir) {
|
|
6800
|
-
const dirName =
|
|
6801
|
-
const
|
|
7025
|
+
const dirName = path13.basename(runDir);
|
|
7026
|
+
const pathSegments = path13.normalize(runDir).split(path13.sep).filter(Boolean);
|
|
7027
|
+
const runsIndex = pathSegments.lastIndexOf("runs");
|
|
6802
7028
|
const diagnostics = [];
|
|
6803
|
-
if (
|
|
7029
|
+
if (runsIndex < 0 || runsIndex >= pathSegments.length - 1) {
|
|
6804
7030
|
diagnostics.push({
|
|
6805
7031
|
severity: "warning",
|
|
6806
|
-
message:
|
|
7032
|
+
message: "Directory is not under a 'runs/' tree. Expected: .agentv/results/runs/<experiment>/<run-dir>"
|
|
6807
7033
|
});
|
|
6808
7034
|
}
|
|
6809
7035
|
const isNewFormat = /^\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}-\d{3}Z$/.test(dirName);
|
|
@@ -6816,15 +7042,25 @@ function checkDirectoryNaming(runDir) {
|
|
|
6816
7042
|
}
|
|
6817
7043
|
return diagnostics;
|
|
6818
7044
|
}
|
|
7045
|
+
function validateRunDirectory(runDir) {
|
|
7046
|
+
const diagnostics = [];
|
|
7047
|
+
diagnostics.push(...checkDirectoryNaming(runDir));
|
|
7048
|
+
const { diagnostics: indexDiags, entries: entries2 } = checkIndexJsonl(runDir);
|
|
7049
|
+
diagnostics.push(...indexDiags);
|
|
7050
|
+
if (entries2.length > 0) {
|
|
7051
|
+
diagnostics.push(...checkArtifactFiles(runDir, entries2));
|
|
7052
|
+
}
|
|
7053
|
+
return { diagnostics, entries: entries2 };
|
|
7054
|
+
}
|
|
6819
7055
|
function checkIndexJsonl(runDir) {
|
|
6820
|
-
const indexPath =
|
|
7056
|
+
const indexPath = path13.join(runDir, "index.jsonl");
|
|
6821
7057
|
const diagnostics = [];
|
|
6822
7058
|
const entries2 = [];
|
|
6823
|
-
if (!
|
|
7059
|
+
if (!existsSync9(indexPath)) {
|
|
6824
7060
|
diagnostics.push({ severity: "error", message: "index.jsonl is missing" });
|
|
6825
7061
|
return { diagnostics, entries: entries2 };
|
|
6826
7062
|
}
|
|
6827
|
-
const content =
|
|
7063
|
+
const content = readFileSync8(indexPath, "utf8");
|
|
6828
7064
|
const lines = content.split("\n").filter((l) => l.trim().length > 0);
|
|
6829
7065
|
if (lines.length === 0) {
|
|
6830
7066
|
diagnostics.push({ severity: "error", message: "index.jsonl is empty" });
|
|
@@ -6916,15 +7152,15 @@ function checkArtifactFiles(runDir, entries2) {
|
|
|
6916
7152
|
for (const entry of entries2) {
|
|
6917
7153
|
const testId = entry.test_id ?? "?";
|
|
6918
7154
|
if (entry.grading_path) {
|
|
6919
|
-
const gradingPath =
|
|
6920
|
-
if (!
|
|
7155
|
+
const gradingPath = path13.join(runDir, entry.grading_path);
|
|
7156
|
+
if (!existsSync9(gradingPath)) {
|
|
6921
7157
|
diagnostics.push({
|
|
6922
7158
|
severity: "error",
|
|
6923
7159
|
message: `${testId}: grading.json not found at '${entry.grading_path}'`
|
|
6924
7160
|
});
|
|
6925
7161
|
} else {
|
|
6926
7162
|
try {
|
|
6927
|
-
const grading = JSON.parse(
|
|
7163
|
+
const grading = JSON.parse(readFileSync8(gradingPath, "utf8"));
|
|
6928
7164
|
if (!grading.assertions || !Array.isArray(grading.assertions)) {
|
|
6929
7165
|
diagnostics.push({
|
|
6930
7166
|
severity: "error",
|
|
@@ -6946,8 +7182,8 @@ function checkArtifactFiles(runDir, entries2) {
|
|
|
6946
7182
|
}
|
|
6947
7183
|
}
|
|
6948
7184
|
if (entry.timing_path) {
|
|
6949
|
-
const timingPath =
|
|
6950
|
-
if (!
|
|
7185
|
+
const timingPath = path13.join(runDir, entry.timing_path);
|
|
7186
|
+
if (!existsSync9(timingPath)) {
|
|
6951
7187
|
diagnostics.push({
|
|
6952
7188
|
severity: "warning",
|
|
6953
7189
|
message: `${testId}: timing.json not found at '${entry.timing_path}'`
|
|
@@ -6955,8 +7191,8 @@ function checkArtifactFiles(runDir, entries2) {
|
|
|
6955
7191
|
}
|
|
6956
7192
|
}
|
|
6957
7193
|
}
|
|
6958
|
-
const benchmarkPath =
|
|
6959
|
-
if (!
|
|
7194
|
+
const benchmarkPath = path13.join(runDir, "benchmark.json");
|
|
7195
|
+
if (!existsSync9(benchmarkPath)) {
|
|
6960
7196
|
diagnostics.push({ severity: "warning", message: "benchmark.json is missing" });
|
|
6961
7197
|
}
|
|
6962
7198
|
return diagnostics;
|
|
@@ -6972,18 +7208,12 @@ var resultsValidateCommand = command({
|
|
|
6972
7208
|
})
|
|
6973
7209
|
},
|
|
6974
7210
|
handler: async ({ runDir }) => {
|
|
6975
|
-
const resolvedDir =
|
|
6976
|
-
if (!
|
|
7211
|
+
const resolvedDir = path13.resolve(runDir);
|
|
7212
|
+
if (!existsSync9(resolvedDir) || !statSync4(resolvedDir).isDirectory()) {
|
|
6977
7213
|
console.error(`Error: '${runDir}' is not a directory`);
|
|
6978
7214
|
process.exit(1);
|
|
6979
7215
|
}
|
|
6980
|
-
const allDiagnostics =
|
|
6981
|
-
allDiagnostics.push(...checkDirectoryNaming(resolvedDir));
|
|
6982
|
-
const { diagnostics: indexDiags, entries: entries2 } = checkIndexJsonl(resolvedDir);
|
|
6983
|
-
allDiagnostics.push(...indexDiags);
|
|
6984
|
-
if (entries2.length > 0) {
|
|
6985
|
-
allDiagnostics.push(...checkArtifactFiles(resolvedDir, entries2));
|
|
6986
|
-
}
|
|
7216
|
+
const { diagnostics: allDiagnostics, entries: entries2 } = validateRunDirectory(resolvedDir);
|
|
6987
7217
|
const errors = allDiagnostics.filter((d) => d.severity === "error");
|
|
6988
7218
|
const warnings = allDiagnostics.filter((d) => d.severity === "warning");
|
|
6989
7219
|
if (allDiagnostics.length === 0) {
|
|
@@ -7020,15 +7250,15 @@ var resultsCommand = subcommands({
|
|
|
7020
7250
|
});
|
|
7021
7251
|
|
|
7022
7252
|
// src/commands/results/serve.ts
|
|
7023
|
-
import { existsSync as
|
|
7024
|
-
import
|
|
7253
|
+
import { existsSync as existsSync12, readFileSync as readFileSync10, readdirSync as readdirSync4, statSync as statSync5, writeFileSync as writeFileSync4 } from "node:fs";
|
|
7254
|
+
import path16 from "node:path";
|
|
7025
7255
|
import { fileURLToPath as fileURLToPath2 } from "node:url";
|
|
7026
7256
|
import { Hono } from "hono";
|
|
7027
7257
|
|
|
7028
7258
|
// src/commands/results/eval-runner.ts
|
|
7029
7259
|
import { spawn } from "node:child_process";
|
|
7030
|
-
import { existsSync as
|
|
7031
|
-
import
|
|
7260
|
+
import { existsSync as existsSync10 } from "node:fs";
|
|
7261
|
+
import path14 from "node:path";
|
|
7032
7262
|
var activeRuns = /* @__PURE__ */ new Map();
|
|
7033
7263
|
function generateRunId() {
|
|
7034
7264
|
const now = /* @__PURE__ */ new Date();
|
|
@@ -7049,16 +7279,16 @@ async function discoverTargetsInProject(cwd) {
|
|
|
7049
7279
|
const repoRoot = await findRepoRoot(cwd) ?? cwd;
|
|
7050
7280
|
let targetsFilePath;
|
|
7051
7281
|
for (const candidate of TARGET_FILE_CANDIDATES) {
|
|
7052
|
-
const fullPath =
|
|
7053
|
-
if (
|
|
7282
|
+
const fullPath = path14.join(cwd, candidate);
|
|
7283
|
+
if (existsSync10(fullPath)) {
|
|
7054
7284
|
targetsFilePath = fullPath;
|
|
7055
7285
|
break;
|
|
7056
7286
|
}
|
|
7057
7287
|
}
|
|
7058
7288
|
if (!targetsFilePath) {
|
|
7059
7289
|
for (const candidate of TARGET_FILE_CANDIDATES) {
|
|
7060
|
-
const fullPath =
|
|
7061
|
-
if (
|
|
7290
|
+
const fullPath = path14.join(repoRoot, candidate);
|
|
7291
|
+
if (existsSync10(fullPath)) {
|
|
7062
7292
|
targetsFilePath = fullPath;
|
|
7063
7293
|
break;
|
|
7064
7294
|
}
|
|
@@ -7107,22 +7337,23 @@ function buildCliPreview(args) {
|
|
|
7107
7337
|
}
|
|
7108
7338
|
/**
 * Locate the CLI entry point to launch with bun.
 *
 * Lookup order (the first existing file wins):
 *   1. `apps/cli/src/cli.ts`, then `apps/cli/dist/cli.js`, under `cwd`.
 *   2. `../../../cli.ts`, then `../../cli.js`, relative to this module's directory.
 *
 * @param {string} cwd - Directory under which the `apps/cli/...` candidates are checked.
 * @returns {{ bunPath: string, cliPath: string } | undefined} The launcher
 *   (always `"bun"`) paired with the entry point that was found, or
 *   `undefined` when none of the candidates exists.
 */
function resolveCliPath(cwd) {
  const projectCandidates = [
    path14.join(cwd, "apps/cli/src/cli.ts"),
    path14.join(cwd, "apps/cli/dist/cli.js")
  ];
  for (const candidate of projectCandidates) {
    if (existsSync10(candidate)) {
      return { bunPath: "bun", cliPath: candidate };
    }
  }
  // Convert the module URL with fileURLToPath rather than reading
  // `URL#pathname`: the pathname stays percent-encoded (a space is "%20") and
  // keeps a leading "/" in front of Windows drive letters, so the relative
  // lookups below would probe paths that do not exist on disk.
  const currentDir = typeof __dirname !== "undefined" ? __dirname : path14.dirname(fileURLToPath2(import.meta.url));
  const fromSrc = path14.resolve(currentDir, "../../../cli.ts");
  const fromDist = path14.resolve(currentDir, "../../cli.js");
  if (existsSync10(fromSrc)) return { bunPath: "bun", cliPath: fromSrc };
  if (existsSync10(fromDist)) return { bunPath: "bun", cliPath: fromDist };
  return void 0;
}
|
|
7125
|
-
function registerEvalRoutes(app2, getCwd) {
|
|
7355
|
+
function registerEvalRoutes(app2, getCwd, options) {
|
|
7356
|
+
const readOnly = options?.readOnly === true;
|
|
7126
7357
|
app2.get("/api/eval/discover", async (c4) => {
|
|
7127
7358
|
const cwd = getCwd(c4);
|
|
7128
7359
|
try {
|
|
@@ -7148,6 +7379,9 @@ function registerEvalRoutes(app2, getCwd) {
|
|
|
7148
7379
|
}
|
|
7149
7380
|
});
|
|
7150
7381
|
app2.post("/api/eval/run", async (c4) => {
|
|
7382
|
+
if (readOnly) {
|
|
7383
|
+
return c4.json({ error: "Studio is running in read-only mode" }, 403);
|
|
7384
|
+
}
|
|
7151
7385
|
const cwd = getCwd(c4);
|
|
7152
7386
|
let body;
|
|
7153
7387
|
try {
|
|
@@ -7389,18 +7623,18 @@ Process error: ${err2.message}`;
|
|
|
7389
7623
|
}
|
|
7390
7624
|
|
|
7391
7625
|
// src/commands/results/studio-config.ts
|
|
7392
|
-
import { existsSync as
|
|
7393
|
-
import
|
|
7626
|
+
import { existsSync as existsSync11, mkdirSync as mkdirSync2, readFileSync as readFileSync9, writeFileSync as writeFileSync3 } from "node:fs";
|
|
7627
|
+
import path15 from "node:path";
|
|
7394
7628
|
import { parse as parseYaml, stringify as stringifyYaml2 } from "yaml";
|
|
7395
7629
|
var DEFAULTS = {
|
|
7396
7630
|
threshold: DEFAULT_THRESHOLD
|
|
7397
7631
|
};
|
|
7398
7632
|
function loadStudioConfig(agentvDir) {
|
|
7399
|
-
const configPath =
|
|
7400
|
-
if (!
|
|
7633
|
+
const configPath = path15.join(agentvDir, "config.yaml");
|
|
7634
|
+
if (!existsSync11(configPath)) {
|
|
7401
7635
|
return { ...DEFAULTS };
|
|
7402
7636
|
}
|
|
7403
|
-
const raw =
|
|
7637
|
+
const raw = readFileSync9(configPath, "utf-8");
|
|
7404
7638
|
const parsed = parseYaml(raw);
|
|
7405
7639
|
if (!parsed || typeof parsed !== "object") {
|
|
7406
7640
|
return { ...DEFAULTS };
|
|
@@ -7422,13 +7656,13 @@ function loadStudioConfig(agentvDir) {
|
|
|
7422
7656
|
};
|
|
7423
7657
|
}
|
|
7424
7658
|
function saveStudioConfig(agentvDir, config) {
|
|
7425
|
-
if (!
|
|
7659
|
+
if (!existsSync11(agentvDir)) {
|
|
7426
7660
|
mkdirSync2(agentvDir, { recursive: true });
|
|
7427
7661
|
}
|
|
7428
|
-
const configPath =
|
|
7662
|
+
const configPath = path15.join(agentvDir, "config.yaml");
|
|
7429
7663
|
let existing = {};
|
|
7430
|
-
if (
|
|
7431
|
-
const raw =
|
|
7664
|
+
if (existsSync11(configPath)) {
|
|
7665
|
+
const raw = readFileSync9(configPath, "utf-8");
|
|
7432
7666
|
const parsed = parseYaml(raw);
|
|
7433
7667
|
if (parsed && typeof parsed === "object" && !Array.isArray(parsed)) {
|
|
7434
7668
|
existing = parsed;
|
|
@@ -7451,7 +7685,7 @@ function saveStudioConfig(agentvDir, config) {
|
|
|
7451
7685
|
async function resolveSourceFile2(source, cwd) {
|
|
7452
7686
|
if (source) {
|
|
7453
7687
|
let resolved = resolveResultSourcePath(source, cwd);
|
|
7454
|
-
if (!
|
|
7688
|
+
if (!existsSync12(resolved)) {
|
|
7455
7689
|
throw new Error(`Source file not found: ${resolved}`);
|
|
7456
7690
|
}
|
|
7457
7691
|
resolved = resolveRunManifestPath(resolved);
|
|
@@ -7459,7 +7693,7 @@ async function resolveSourceFile2(source, cwd) {
|
|
|
7459
7693
|
}
|
|
7460
7694
|
const cache = await loadRunCache(cwd);
|
|
7461
7695
|
const cachedFile = cache ? resolveRunCacheFile(cache) : "";
|
|
7462
|
-
if (cachedFile &&
|
|
7696
|
+
if (cachedFile && existsSync12(cachedFile)) {
|
|
7463
7697
|
return cachedFile;
|
|
7464
7698
|
}
|
|
7465
7699
|
const metas = listResultFiles(cwd, 10);
|
|
@@ -7479,16 +7713,25 @@ Serving most recent: ${metas[0].path}
|
|
|
7479
7713
|
}
|
|
7480
7714
|
return metas[0].path;
|
|
7481
7715
|
}
|
|
7716
|
+
/**
 * Decide whether the multi-project dashboard is used and whether the
 * multi-project warning should be shown.
 *
 * - `options.single === true` forces single-project mode; the warning is
 *   raised only if `options.multi` was also set.
 * - otherwise `options.multi === true` forces multi-project mode and raises
 *   the warning.
 * - otherwise the mode follows the project count (more than one project
 *   means multi-project) with no warning.
 *
 * @param {number} projectCount - Number of projects being considered.
 * @param {{ single?: boolean, multi?: boolean } | null | undefined} options -
 *   Mode flags; a missing object is treated as "no flags set".
 * @returns {{ isMultiProject: boolean, showMultiWarning: boolean }}
 */
function resolveDashboardMode(projectCount, options) {
  // Optional chaining keeps a missing options object from throwing, matching
  // how the other option readers in this file access `options`.
  const forceSingle = options?.single === true;
  const forceMulti = options?.multi === true;
  if (forceSingle) {
    return { isMultiProject: false, showMultiWarning: forceMulti };
  }
  if (forceMulti) {
    return { isMultiProject: true, showMultiWarning: true };
  }
  return { isMultiProject: projectCount > 1, showMultiWarning: false };
}
|
|
7482
7725
|
/**
 * Build the path of the `feedback.json` file inside a result directory.
 *
 * @param {string} resultDir - Directory that holds (or will hold) the feedback file.
 * @returns {string} `resultDir` joined with `feedback.json`.
 */
function feedbackPath(resultDir) {
  const feedbackFile = path16.join(resultDir, "feedback.json");
  return feedbackFile;
}
|
|
7485
7728
|
function readFeedback(cwd) {
|
|
7486
7729
|
const fp = feedbackPath(cwd);
|
|
7487
|
-
if (!
|
|
7730
|
+
if (!existsSync12(fp)) {
|
|
7488
7731
|
return { reviews: [] };
|
|
7489
7732
|
}
|
|
7490
7733
|
try {
|
|
7491
|
-
return JSON.parse(
|
|
7734
|
+
return JSON.parse(readFileSync10(fp, "utf8"));
|
|
7492
7735
|
} catch (err2) {
|
|
7493
7736
|
console.error(`Warning: could not parse ${fp}, starting fresh: ${err2.message}`);
|
|
7494
7737
|
return { reviews: [] };
|
|
@@ -7499,16 +7742,16 @@ function writeFeedback(cwd, data) {
|
|
|
7499
7742
|
`, "utf8");
|
|
7500
7743
|
}
|
|
7501
7744
|
function buildFileTree(dirPath, relativeTo) {
|
|
7502
|
-
if (!
|
|
7745
|
+
if (!existsSync12(dirPath) || !statSync5(dirPath).isDirectory()) {
|
|
7503
7746
|
return [];
|
|
7504
7747
|
}
|
|
7505
|
-
const entries2 =
|
|
7748
|
+
const entries2 = readdirSync4(dirPath, { withFileTypes: true });
|
|
7506
7749
|
return entries2.sort((a, b) => {
|
|
7507
7750
|
if (a.isDirectory() !== b.isDirectory()) return a.isDirectory() ? -1 : 1;
|
|
7508
7751
|
return a.name.localeCompare(b.name);
|
|
7509
7752
|
}).map((entry) => {
|
|
7510
|
-
const fullPath =
|
|
7511
|
-
const relPath =
|
|
7753
|
+
const fullPath = path16.join(dirPath, entry.name);
|
|
7754
|
+
const relPath = path16.relative(relativeTo, fullPath);
|
|
7512
7755
|
if (entry.isDirectory()) {
|
|
7513
7756
|
return {
|
|
7514
7757
|
name: entry.name,
|
|
@@ -7521,7 +7764,7 @@ function buildFileTree(dirPath, relativeTo) {
|
|
|
7521
7764
|
});
|
|
7522
7765
|
}
|
|
7523
7766
|
function inferLanguage(filePath) {
|
|
7524
|
-
const ext =
|
|
7767
|
+
const ext = path16.extname(filePath).toLowerCase();
|
|
7525
7768
|
const langMap = {
|
|
7526
7769
|
".json": "json",
|
|
7527
7770
|
".jsonl": "json",
|
|
@@ -7559,8 +7802,8 @@ function stripHeavyFields(results) {
|
|
|
7559
7802
|
};
|
|
7560
7803
|
});
|
|
7561
7804
|
}
|
|
7562
|
-
function handleRuns(c4, { searchDir }) {
|
|
7563
|
-
const metas =
|
|
7805
|
+
async function handleRuns(c4, { searchDir }) {
|
|
7806
|
+
const { runs: metas } = await listMergedResultFiles(searchDir);
|
|
7564
7807
|
return c4.json({
|
|
7565
7808
|
runs: metas.map((m) => {
|
|
7566
7809
|
let target;
|
|
@@ -7575,32 +7818,38 @@ function handleRuns(c4, { searchDir }) {
|
|
|
7575
7818
|
}
|
|
7576
7819
|
return {
|
|
7577
7820
|
filename: m.filename,
|
|
7821
|
+
display_name: m.displayName,
|
|
7578
7822
|
path: m.path,
|
|
7579
7823
|
timestamp: m.timestamp,
|
|
7580
7824
|
test_count: m.testCount,
|
|
7581
7825
|
pass_rate: m.passRate,
|
|
7582
7826
|
avg_score: m.avgScore,
|
|
7583
7827
|
size_bytes: m.sizeBytes,
|
|
7828
|
+
source: m.source,
|
|
7584
7829
|
...target && { target },
|
|
7585
7830
|
...experiment && { experiment }
|
|
7586
7831
|
};
|
|
7587
7832
|
})
|
|
7588
7833
|
});
|
|
7589
7834
|
}
|
|
7590
|
-
function handleRunDetail(c4, { searchDir }) {
|
|
7591
|
-
const filename = c4.req.param("filename");
|
|
7592
|
-
const meta =
|
|
7835
|
+
/**
 * Respond with the results of a single run.
 *
 * The run is looked up by the `filename` route parameter. The response body
 * carries the run's results (passed through `stripHeavyFields`) together
 * with the run's `source` and its display name as `source_label`.
 *
 * @param c4 - Request context; provides `req.param()` and `json()`.
 * @param {{ searchDir: string }} ctx - Directory in which runs are looked up.
 * @returns A JSON response: 404 when the run is unknown, 500 when loading
 *   its results throws, otherwise the payload described above.
 */
async function handleRunDetail(c4, { searchDir }) {
  const runId = c4.req.param("filename") ?? "";
  const run = await findRunById(searchDir, runId);
  if (!run) {
    return c4.json({ error: "Run not found" }, 404);
  }
  try {
    const results = stripHeavyFields(loadManifestResults(run.path));
    return c4.json({
      results,
      source: run.source,
      source_label: run.displayName
    });
  } catch {
    return c4.json({ error: "Failed to load run" }, 500);
  }
}
|
|
7601
|
-
function handleRunSuites(c4, { searchDir, agentvDir }) {
|
|
7602
|
-
const filename = c4.req.param("filename");
|
|
7603
|
-
const meta =
|
|
7850
|
+
async function handleRunSuites(c4, { searchDir, agentvDir }) {
|
|
7851
|
+
const filename = c4.req.param("filename") ?? "";
|
|
7852
|
+
const meta = await findRunById(searchDir, filename);
|
|
7604
7853
|
if (!meta) return c4.json({ error: "Run not found" }, 404);
|
|
7605
7854
|
try {
|
|
7606
7855
|
const loaded = loadManifestResults(meta.path);
|
|
@@ -7626,9 +7875,9 @@ function handleRunSuites(c4, { searchDir, agentvDir }) {
|
|
|
7626
7875
|
return c4.json({ error: "Failed to load suites" }, 500);
|
|
7627
7876
|
}
|
|
7628
7877
|
}
|
|
7629
|
-
function handleRunCategories(c4, { searchDir, agentvDir }) {
|
|
7630
|
-
const filename = c4.req.param("filename");
|
|
7631
|
-
const meta =
|
|
7878
|
+
async function handleRunCategories(c4, { searchDir, agentvDir }) {
|
|
7879
|
+
const filename = c4.req.param("filename") ?? "";
|
|
7880
|
+
const meta = await findRunById(searchDir, filename);
|
|
7632
7881
|
if (!meta) return c4.json({ error: "Run not found" }, 404);
|
|
7633
7882
|
try {
|
|
7634
7883
|
const loaded = loadManifestResults(meta.path);
|
|
@@ -7661,10 +7910,10 @@ function handleRunCategories(c4, { searchDir, agentvDir }) {
|
|
|
7661
7910
|
return c4.json({ error: "Failed to load categories" }, 500);
|
|
7662
7911
|
}
|
|
7663
7912
|
}
|
|
7664
|
-
function handleCategorySuites(c4, { searchDir, agentvDir }) {
|
|
7665
|
-
const filename = c4.req.param("filename");
|
|
7913
|
+
async function handleCategorySuites(c4, { searchDir, agentvDir }) {
|
|
7914
|
+
const filename = c4.req.param("filename") ?? "";
|
|
7666
7915
|
const category = decodeURIComponent(c4.req.param("category") ?? "");
|
|
7667
|
-
const meta =
|
|
7916
|
+
const meta = await findRunById(searchDir, filename);
|
|
7668
7917
|
if (!meta) return c4.json({ error: "Run not found" }, 404);
|
|
7669
7918
|
try {
|
|
7670
7919
|
const loaded = loadManifestResults(meta.path);
|
|
@@ -7691,10 +7940,10 @@ function handleCategorySuites(c4, { searchDir, agentvDir }) {
|
|
|
7691
7940
|
return c4.json({ error: "Failed to load suites" }, 500);
|
|
7692
7941
|
}
|
|
7693
7942
|
}
|
|
7694
|
-
function handleEvalDetail(c4, { searchDir }) {
|
|
7695
|
-
const filename = c4.req.param("filename");
|
|
7943
|
+
async function handleEvalDetail(c4, { searchDir }) {
|
|
7944
|
+
const filename = c4.req.param("filename") ?? "";
|
|
7696
7945
|
const evalId = c4.req.param("evalId");
|
|
7697
|
-
const meta =
|
|
7946
|
+
const meta = await findRunById(searchDir, filename);
|
|
7698
7947
|
if (!meta) return c4.json({ error: "Run not found" }, 404);
|
|
7699
7948
|
try {
|
|
7700
7949
|
const loaded = loadManifestResults(meta.path);
|
|
@@ -7705,17 +7954,17 @@ function handleEvalDetail(c4, { searchDir }) {
|
|
|
7705
7954
|
return c4.json({ error: "Failed to load eval" }, 500);
|
|
7706
7955
|
}
|
|
7707
7956
|
}
|
|
7708
|
-
function handleEvalFiles(c4, { searchDir }) {
|
|
7709
|
-
const filename = c4.req.param("filename");
|
|
7957
|
+
async function handleEvalFiles(c4, { searchDir }) {
|
|
7958
|
+
const filename = c4.req.param("filename") ?? "";
|
|
7710
7959
|
const evalId = c4.req.param("evalId");
|
|
7711
|
-
const meta =
|
|
7960
|
+
const meta = await findRunById(searchDir, filename);
|
|
7712
7961
|
if (!meta) return c4.json({ error: "Run not found" }, 404);
|
|
7713
7962
|
try {
|
|
7714
|
-
const content =
|
|
7963
|
+
const content = readFileSync10(meta.path, "utf8");
|
|
7715
7964
|
const records = parseResultManifest(content);
|
|
7716
7965
|
const record = records.find((r) => r.test_id === evalId);
|
|
7717
7966
|
if (!record) return c4.json({ error: "Eval not found" }, 404);
|
|
7718
|
-
const baseDir =
|
|
7967
|
+
const baseDir = path16.dirname(meta.path);
|
|
7719
7968
|
const knownPaths = [
|
|
7720
7969
|
record.grading_path,
|
|
7721
7970
|
record.timing_path,
|
|
@@ -7724,47 +7973,46 @@ function handleEvalFiles(c4, { searchDir }) {
|
|
|
7724
7973
|
record.response_path
|
|
7725
7974
|
].filter((p) => !!p);
|
|
7726
7975
|
if (knownPaths.length === 0) return c4.json({ files: [] });
|
|
7727
|
-
const artifactDirs = knownPaths.map((p) =>
|
|
7976
|
+
const artifactDirs = knownPaths.map((p) => path16.dirname(p));
|
|
7728
7977
|
let commonDir = artifactDirs[0];
|
|
7729
7978
|
for (const dir of artifactDirs) {
|
|
7730
7979
|
while (!dir.startsWith(commonDir)) {
|
|
7731
|
-
commonDir =
|
|
7980
|
+
commonDir = path16.dirname(commonDir);
|
|
7732
7981
|
}
|
|
7733
7982
|
}
|
|
7734
|
-
const artifactAbsDir =
|
|
7983
|
+
const artifactAbsDir = path16.join(baseDir, commonDir);
|
|
7735
7984
|
const files = buildFileTree(artifactAbsDir, baseDir);
|
|
7736
7985
|
return c4.json({ files });
|
|
7737
7986
|
} catch {
|
|
7738
7987
|
return c4.json({ error: "Failed to load file tree" }, 500);
|
|
7739
7988
|
}
|
|
7740
7989
|
}
|
|
7741
|
-
function handleEvalFileContent(c4, { searchDir }) {
|
|
7742
|
-
const filename = c4.req.param("filename");
|
|
7743
|
-
const
|
|
7744
|
-
const meta = listResultFiles(searchDir).find((m) => m.filename === filename);
|
|
7990
|
+
/**
 * Serve the text content of one file that lives next to a run's manifest.
 *
 * The file is addressed by whatever follows the first `/files/` segment of
 * the request path and is resolved relative to the directory containing the
 * run's manifest (`meta.path`).
 *
 * @param c4 - Request context; provides `req.param()`, `req.path` and `json()`.
 * @param {{ searchDir: string }} ctx - Directory in which runs are looked up.
 * @returns A JSON response:
 *   - 404 `Run not found` when the run is unknown,
 *   - 400 when no file path follows the marker,
 *   - 403 when the resolved path falls outside the run directory,
 *   - 404 `File not found` when the target is missing or not a regular file,
 *   - 500 when reading fails,
 *   - otherwise `{ content, language }`.
 */
async function handleEvalFileContent(c4, { searchDir }) {
  const runId = c4.req.param("filename") ?? "";
  const run = await findRunById(searchDir, runId);
  if (!run) {
    return c4.json({ error: "Run not found" }, 404);
  }

  const filesMarker = "/files/";
  const markerAt = c4.req.path.indexOf(filesMarker);
  const requestedFile = markerAt < 0 ? "" : c4.req.path.slice(markerAt + filesMarker.length);
  if (!requestedFile) {
    return c4.json({ error: "No file path specified" }, 400);
  }

  const runDir = path16.dirname(run.path);
  const runRoot = path16.resolve(runDir);
  const targetPath = path16.resolve(runDir, requestedFile);
  // Appending the separator before the prefix test keeps a sibling directory
  // that merely shares the run directory's name prefix from passing the check.
  const insideRunDir = targetPath === runRoot || targetPath.startsWith(runRoot + path16.sep);
  if (!insideRunDir) {
    return c4.json({ error: "Path traversal not allowed" }, 403);
  }

  const isRegularFile = existsSync12(targetPath) && statSync5(targetPath).isFile();
  if (!isRegularFile) {
    return c4.json({ error: "File not found" }, 404);
  }

  try {
    const content = readFileSync10(targetPath, "utf8");
    return c4.json({ content, language: inferLanguage(targetPath) });
  } catch {
    return c4.json({ error: "Failed to read file" }, 500);
  }
}
|
|
7766
|
-
function handleExperiments(c4, { searchDir, agentvDir }) {
|
|
7767
|
-
const metas =
|
|
8014
|
+
async function handleExperiments(c4, { searchDir, agentvDir }) {
|
|
8015
|
+
const { runs: metas } = await listMergedResultFiles(searchDir);
|
|
7768
8016
|
const { threshold: pass_threshold } = loadStudioConfig(agentvDir);
|
|
7769
8017
|
const experimentMap = /* @__PURE__ */ new Map();
|
|
7770
8018
|
for (const m of metas) {
|
|
@@ -7802,8 +8050,70 @@ function handleExperiments(c4, { searchDir, agentvDir }) {
|
|
|
7802
8050
|
}));
|
|
7803
8051
|
return c4.json({ experiments });
|
|
7804
8052
|
}
|
|
7805
|
-
function
|
|
7806
|
-
const metas =
|
|
8053
|
+
/**
 * Build the experiment × target comparison matrix across all runs.
 *
 * Every result record of every run is bucketed by its
 * `(experiment, target)` pair, with `"default"` standing in for a missing
 * value. For each bucket the response reports the record count, the number
 * of records whose score reaches the configured threshold, the derived pass
 * rate and average score, and a per-test list.
 *
 * @param c4 - Request context; provides `json()`.
 * @param {{ searchDir: string, agentvDir: string }} ctx - Where runs are
 *   listed from and where the studio config (pass threshold) is loaded from.
 * @returns A JSON response `{ experiments, targets, cells }` where the
 *   first two are sorted name lists.
 */
async function handleCompare(c4, { searchDir, agentvDir }) {
  const { runs } = await listMergedResultFiles(searchDir);
  const { threshold: passThreshold } = loadStudioConfig(agentvDir);
  const cellsByKey = /* @__PURE__ */ new Map();
  const experimentNames = /* @__PURE__ */ new Set();
  const targetNames = /* @__PURE__ */ new Set();

  for (const run of runs) {
    try {
      for (const record of loadLightweightResults(run.path)) {
        const experiment = record.experiment ?? "default";
        const target = record.target ?? "default";
        experimentNames.add(experiment);
        targetNames.add(target);

        // A JSON-encoded pair cannot collide the way a plain string
        // concatenation of the two names could.
        const key = JSON.stringify([experiment, target]);
        let cell = cellsByKey.get(key);
        if (!cell) {
          cell = {
            experiment,
            target,
            evalCount: 0,
            passedCount: 0,
            scoreSum: 0,
            tests: []
          };
          cellsByKey.set(key, cell);
        }

        const passed = record.score >= passThreshold;
        cell.evalCount++;
        if (passed) cell.passedCount++;
        cell.scoreSum += record.score;
        cell.tests.push({
          test_id: record.testId,
          score: record.score,
          passed,
          execution_status: record.executionStatus
        });
      }
    } catch {
      // A run whose results cannot be loaded contributes nothing further;
      // the remaining runs are still processed.
    }
  }

  const MAX_TESTS_PER_CELL = 100;
  const cells = Array.from(cellsByKey.values(), (cell) => {
    // Keep one entry per test id: a later record overwrites an earlier one
    // with the same id. The counters above were accumulated per record, so
    // they still include the overwritten duplicates.
    const latestByTestId = /* @__PURE__ */ new Map();
    for (const test of cell.tests) {
      latestByTestId.set(test.test_id, test);
    }
    const tests = Array.from(latestByTestId.values()).slice(-MAX_TESTS_PER_CELL);
    const hasEvals = cell.evalCount > 0;
    return {
      experiment: cell.experiment,
      target: cell.target,
      eval_count: cell.evalCount,
      passed_count: cell.passedCount,
      pass_rate: hasEvals ? cell.passedCount / cell.evalCount : 0,
      avg_score: hasEvals ? cell.scoreSum / cell.evalCount : 0,
      tests
    };
  });

  return c4.json({
    experiments: [...experimentNames].sort(),
    targets: [...targetNames].sort(),
    cells
  });
}
|
|
8115
|
+
async function handleTargets(c4, { searchDir, agentvDir }) {
|
|
8116
|
+
const { runs: metas } = await listMergedResultFiles(searchDir);
|
|
7807
8117
|
const { threshold: pass_threshold } = loadStudioConfig(agentvDir);
|
|
7808
8118
|
const targetMap = /* @__PURE__ */ new Map();
|
|
7809
8119
|
for (const m of metas) {
|
|
@@ -7836,29 +8146,38 @@ function handleTargets(c4, { searchDir, agentvDir }) {
|
|
|
7836
8146
|
}));
|
|
7837
8147
|
return c4.json({ targets });
|
|
7838
8148
|
}
|
|
7839
|
-
function handleConfig(c4, { agentvDir }) {
|
|
7840
|
-
return c4.json(
|
|
8149
|
+
/**
 * Respond with the studio configuration plus server-side flags.
 *
 * The loaded studio config is spread first, then `read_only`,
 * `project_name` (the base name of `searchDir`) and
 * `multi_project_dashboard` are set on top of it.
 *
 * @param c4 - Request context; provides `json()`.
 * @param {{ agentvDir: string, searchDir: string }} ctx - Config directory
 *   and the directory whose base name is reported as the project name.
 * @param {{ readOnly?: boolean, multiProjectDashboard?: boolean }} [options] -
 *   Each flag is reported as `true` only when it is strictly `true`.
 * @returns The JSON response produced by `c4.json`.
 */
function handleConfig(c4, { agentvDir, searchDir }, options) {
  const studioConfig = loadStudioConfig(agentvDir);
  const payload = {
    ...studioConfig,
    read_only: options?.readOnly === true,
    project_name: path16.basename(searchDir),
    multi_project_dashboard: options?.multiProjectDashboard === true
  };
  return c4.json(payload);
}
|
|
7842
8157
|
/**
 * Respond with the stored feedback for a project directory.
 *
 * Feedback is read from `<searchDir>/.agentv/results` when that directory
 * exists, and from `searchDir` itself otherwise.
 *
 * @param c4 - Request context; provides `json()`.
 * @param {{ searchDir: string }} ctx - Project directory to read feedback for.
 * @returns The JSON response produced by `c4.json`.
 */
function handleFeedbackRead(c4, { searchDir }) {
  const resultsDir = path16.join(searchDir, ".agentv", "results");
  const feedbackDir = existsSync12(resultsDir) ? resultsDir : searchDir;
  const feedback = readFeedback(feedbackDir);
  return c4.json(feedback);
}
|
|
7846
8161
|
function createApp(results, resultDir, cwd, sourceFile, options) {
|
|
7847
8162
|
const searchDir = cwd ?? resultDir;
|
|
7848
|
-
const agentvDir =
|
|
8163
|
+
const agentvDir = path16.join(searchDir, ".agentv");
|
|
7849
8164
|
const defaultCtx = { searchDir, agentvDir };
|
|
8165
|
+
const readOnly = options?.readOnly === true;
|
|
7850
8166
|
const app2 = new Hono();
|
|
7851
8167
|
function withProject(c4, handler) {
|
|
7852
8168
|
const project = getProject(c4.req.param("projectId") ?? "");
|
|
7853
|
-
if (!project || !
|
|
8169
|
+
if (!project || !existsSync12(project.path)) {
|
|
7854
8170
|
return c4.json({ error: "Project not found" }, 404);
|
|
7855
8171
|
}
|
|
7856
8172
|
return handler(c4, {
|
|
7857
8173
|
searchDir: project.path,
|
|
7858
|
-
agentvDir:
|
|
8174
|
+
agentvDir: path16.join(project.path, ".agentv")
|
|
7859
8175
|
});
|
|
7860
8176
|
}
|
|
7861
8177
|
app2.post("/api/config", async (c4) => {
|
|
8178
|
+
if (readOnly) {
|
|
8179
|
+
return c4.json({ error: "Studio is running in read-only mode" }, 403);
|
|
8180
|
+
}
|
|
7862
8181
|
try {
|
|
7863
8182
|
const body = await c4.req.json();
|
|
7864
8183
|
const current = loadStudioConfig(agentvDir);
|
|
@@ -7881,32 +8200,37 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
|
|
|
7881
8200
|
last_opened_at: entry.lastOpenedAt
|
|
7882
8201
|
};
|
|
7883
8202
|
}
|
|
7884
|
-
app2.get("/api/projects", (c4) => {
|
|
8203
|
+
app2.get("/api/projects", async (c4) => {
|
|
7885
8204
|
const registry = loadProjectRegistry();
|
|
7886
|
-
const projects =
|
|
7887
|
-
|
|
7888
|
-
|
|
7889
|
-
|
|
7890
|
-
|
|
7891
|
-
|
|
7892
|
-
|
|
7893
|
-
|
|
7894
|
-
|
|
7895
|
-
|
|
7896
|
-
|
|
8205
|
+
const projects = await Promise.all(
|
|
8206
|
+
registry.projects.map(async (p) => {
|
|
8207
|
+
let runCount = 0;
|
|
8208
|
+
let passRate = 0;
|
|
8209
|
+
let lastRun = null;
|
|
8210
|
+
try {
|
|
8211
|
+
const { runs: metas } = await listMergedResultFiles(p.path);
|
|
8212
|
+
runCount = metas.length;
|
|
8213
|
+
if (metas.length > 0) {
|
|
8214
|
+
const totalPassRate = metas.reduce((sum, m) => sum + m.passRate, 0);
|
|
8215
|
+
passRate = totalPassRate / metas.length;
|
|
8216
|
+
lastRun = metas[0].timestamp;
|
|
8217
|
+
}
|
|
8218
|
+
} catch {
|
|
7897
8219
|
}
|
|
7898
|
-
|
|
7899
|
-
|
|
7900
|
-
|
|
7901
|
-
|
|
7902
|
-
|
|
7903
|
-
|
|
7904
|
-
|
|
7905
|
-
|
|
7906
|
-
});
|
|
8220
|
+
return {
|
|
8221
|
+
...projectEntryToWire(p),
|
|
8222
|
+
run_count: runCount,
|
|
8223
|
+
pass_rate: passRate,
|
|
8224
|
+
last_run: lastRun
|
|
8225
|
+
};
|
|
8226
|
+
})
|
|
8227
|
+
);
|
|
7907
8228
|
return c4.json({ projects });
|
|
7908
8229
|
});
|
|
7909
8230
|
app2.post("/api/projects", async (c4) => {
|
|
8231
|
+
if (readOnly) {
|
|
8232
|
+
return c4.json({ error: "Studio is running in read-only mode" }, 403);
|
|
8233
|
+
}
|
|
7910
8234
|
try {
|
|
7911
8235
|
const body = await c4.req.json();
|
|
7912
8236
|
if (!body.path) return c4.json({ error: "Missing path" }, 400);
|
|
@@ -7917,15 +8241,18 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
|
|
|
7917
8241
|
}
|
|
7918
8242
|
});
|
|
7919
8243
|
app2.delete("/api/projects/:projectId", (c4) => {
|
|
8244
|
+
if (readOnly) {
|
|
8245
|
+
return c4.json({ error: "Studio is running in read-only mode" }, 403);
|
|
8246
|
+
}
|
|
7920
8247
|
const removed = removeProject(c4.req.param("projectId") ?? "");
|
|
7921
8248
|
if (!removed) return c4.json({ error: "Project not found" }, 404);
|
|
7922
8249
|
return c4.json({ ok: true });
|
|
7923
8250
|
});
|
|
7924
|
-
app2.get("/api/projects/:projectId/summary", (c4) => {
|
|
8251
|
+
app2.get("/api/projects/:projectId/summary", async (c4) => {
|
|
7925
8252
|
const project = getProject(c4.req.param("projectId") ?? "");
|
|
7926
8253
|
if (!project) return c4.json({ error: "Project not found" }, 404);
|
|
7927
8254
|
try {
|
|
7928
|
-
const metas =
|
|
8255
|
+
const { runs: metas } = await listMergedResultFiles(project.path);
|
|
7929
8256
|
const runCount = metas.length;
|
|
7930
8257
|
const passRate = runCount > 0 ? metas.reduce((s, m) => s + m.passRate, 0) / runCount : 0;
|
|
7931
8258
|
const lastRun = metas.length > 0 ? metas[0].timestamp : null;
|
|
@@ -7942,6 +8269,9 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
|
|
|
7942
8269
|
}
|
|
7943
8270
|
});
|
|
7944
8271
|
app2.post("/api/projects/discover", async (c4) => {
|
|
8272
|
+
if (readOnly) {
|
|
8273
|
+
return c4.json({ error: "Studio is running in read-only mode" }, 403);
|
|
8274
|
+
}
|
|
7945
8275
|
try {
|
|
7946
8276
|
const body = await c4.req.json();
|
|
7947
8277
|
if (!body.path) return c4.json({ error: "Missing path" }, 400);
|
|
@@ -7952,12 +8282,12 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
|
|
|
7952
8282
|
return c4.json({ error: err2.message }, 400);
|
|
7953
8283
|
}
|
|
7954
8284
|
});
|
|
7955
|
-
app2.get("/api/projects/all-runs", (c4) => {
|
|
8285
|
+
app2.get("/api/projects/all-runs", async (c4) => {
|
|
7956
8286
|
const registry = loadProjectRegistry();
|
|
7957
8287
|
const allRuns = [];
|
|
7958
8288
|
for (const p of registry.projects) {
|
|
7959
8289
|
try {
|
|
7960
|
-
const metas =
|
|
8290
|
+
const { runs: metas } = await listMergedResultFiles(p.path);
|
|
7961
8291
|
for (const m of metas) {
|
|
7962
8292
|
let target;
|
|
7963
8293
|
let experiment;
|
|
@@ -7971,12 +8301,14 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
|
|
|
7971
8301
|
}
|
|
7972
8302
|
allRuns.push({
|
|
7973
8303
|
filename: m.filename,
|
|
8304
|
+
display_name: m.displayName,
|
|
7974
8305
|
path: m.path,
|
|
7975
8306
|
timestamp: m.timestamp,
|
|
7976
8307
|
test_count: m.testCount,
|
|
7977
8308
|
pass_rate: m.passRate,
|
|
7978
8309
|
avg_score: m.avgScore,
|
|
7979
8310
|
size_bytes: m.sizeBytes,
|
|
8311
|
+
source: m.source,
|
|
7980
8312
|
...target && { target },
|
|
7981
8313
|
...experiment && { experiment },
|
|
7982
8314
|
project_id: p.id,
|
|
@@ -7989,7 +8321,15 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
|
|
|
7989
8321
|
allRuns.sort((a, b) => b.timestamp.localeCompare(a.timestamp));
|
|
7990
8322
|
return c4.json({ runs: allRuns });
|
|
7991
8323
|
});
|
|
7992
|
-
app2.get(
|
|
8324
|
+
app2.get(
|
|
8325
|
+
"/api/config",
|
|
8326
|
+
(c4) => handleConfig(c4, defaultCtx, {
|
|
8327
|
+
readOnly,
|
|
8328
|
+
multiProjectDashboard: options?.multiProjectDashboard
|
|
8329
|
+
})
|
|
8330
|
+
);
|
|
8331
|
+
app2.get("/api/remote/status", async (c4) => c4.json(await getRemoteResultsStatus(searchDir)));
|
|
8332
|
+
app2.post("/api/remote/sync", async (c4) => c4.json(await syncRemoteResults(searchDir)));
|
|
7993
8333
|
app2.get("/api/runs", (c4) => handleRuns(c4, defaultCtx));
|
|
7994
8334
|
app2.get("/api/runs/:filename", (c4) => handleRunDetail(c4, defaultCtx));
|
|
7995
8335
|
app2.get("/api/runs/:filename/suites", (c4) => handleRunSuites(c4, defaultCtx));
|
|
@@ -8002,12 +8342,16 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
|
|
|
8002
8342
|
app2.get("/api/runs/:filename/evals/:evalId/files", (c4) => handleEvalFiles(c4, defaultCtx));
|
|
8003
8343
|
app2.get("/api/runs/:filename/evals/:evalId/files/*", (c4) => handleEvalFileContent(c4, defaultCtx));
|
|
8004
8344
|
app2.get("/api/experiments", (c4) => handleExperiments(c4, defaultCtx));
|
|
8345
|
+
app2.get("/api/compare", (c4) => handleCompare(c4, defaultCtx));
|
|
8005
8346
|
app2.get("/api/targets", (c4) => handleTargets(c4, defaultCtx));
|
|
8006
8347
|
app2.get("/api/feedback", (c4) => {
|
|
8007
8348
|
const data = readFeedback(resultDir);
|
|
8008
8349
|
return c4.json(data);
|
|
8009
8350
|
});
|
|
8010
8351
|
app2.post("/api/feedback", async (c4) => {
|
|
8352
|
+
if (readOnly) {
|
|
8353
|
+
return c4.json({ error: "Studio is running in read-only mode" }, 403);
|
|
8354
|
+
}
|
|
8011
8355
|
let body;
|
|
8012
8356
|
try {
|
|
8013
8357
|
body = await c4.req.json();
|
|
@@ -8045,8 +8389,8 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
|
|
|
8045
8389
|
writeFeedback(resultDir, existing);
|
|
8046
8390
|
return c4.json(existing);
|
|
8047
8391
|
});
|
|
8048
|
-
app2.get("/api/index", (c4) => {
|
|
8049
|
-
const metas =
|
|
8392
|
+
app2.get("/api/index", async (c4) => {
|
|
8393
|
+
const { runs: metas } = await listMergedResultFiles(searchDir);
|
|
8050
8394
|
const entries2 = metas.map((m) => {
|
|
8051
8395
|
let totalCostUsd = 0;
|
|
8052
8396
|
try {
|
|
@@ -8056,6 +8400,7 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
|
|
|
8056
8400
|
}
|
|
8057
8401
|
return {
|
|
8058
8402
|
run_filename: m.filename,
|
|
8403
|
+
display_name: m.displayName,
|
|
8059
8404
|
test_count: m.testCount,
|
|
8060
8405
|
pass_rate: m.passRate,
|
|
8061
8406
|
avg_score: m.avgScore,
|
|
@@ -8065,7 +8410,27 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
|
|
|
8065
8410
|
});
|
|
8066
8411
|
return c4.json({ entries: entries2 });
|
|
8067
8412
|
});
|
|
8068
|
-
app2.get(
|
|
8413
|
+
app2.get(
|
|
8414
|
+
"/api/projects/:projectId/config",
|
|
8415
|
+
(c4) => withProject(
|
|
8416
|
+
c4,
|
|
8417
|
+
(ctx, dataCtx) => handleConfig(ctx, dataCtx, {
|
|
8418
|
+
readOnly,
|
|
8419
|
+
multiProjectDashboard: options?.multiProjectDashboard
|
|
8420
|
+
})
|
|
8421
|
+
)
|
|
8422
|
+
);
|
|
8423
|
+
app2.get(
|
|
8424
|
+
"/api/projects/:projectId/remote/status",
|
|
8425
|
+
(c4) => withProject(
|
|
8426
|
+
c4,
|
|
8427
|
+
async (ctx, dataCtx) => ctx.json(await getRemoteResultsStatus(dataCtx.searchDir))
|
|
8428
|
+
)
|
|
8429
|
+
);
|
|
8430
|
+
app2.post(
|
|
8431
|
+
"/api/projects/:projectId/remote/sync",
|
|
8432
|
+
(c4) => withProject(c4, async (ctx, dataCtx) => ctx.json(await syncRemoteResults(dataCtx.searchDir)))
|
|
8433
|
+
);
|
|
8069
8434
|
app2.get("/api/projects/:projectId/runs", (c4) => withProject(c4, handleRuns));
|
|
8070
8435
|
app2.get("/api/projects/:projectId/runs/:filename", (c4) => withProject(c4, handleRunDetail));
|
|
8071
8436
|
app2.get("/api/projects/:projectId/runs/:filename/suites", (c4) => withProject(c4, handleRunSuites));
|
|
@@ -8090,31 +8455,36 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
|
|
|
8090
8455
|
(c4) => withProject(c4, handleEvalFileContent)
|
|
8091
8456
|
);
|
|
8092
8457
|
app2.get("/api/projects/:projectId/experiments", (c4) => withProject(c4, handleExperiments));
|
|
8458
|
+
app2.get("/api/projects/:projectId/compare", (c4) => withProject(c4, handleCompare));
|
|
8093
8459
|
app2.get("/api/projects/:projectId/targets", (c4) => withProject(c4, handleTargets));
|
|
8094
8460
|
app2.get("/api/projects/:projectId/feedback", (c4) => withProject(c4, handleFeedbackRead));
|
|
8095
|
-
registerEvalRoutes(
|
|
8096
|
-
|
|
8097
|
-
|
|
8098
|
-
const
|
|
8099
|
-
if (
|
|
8100
|
-
|
|
8101
|
-
|
|
8102
|
-
|
|
8461
|
+
registerEvalRoutes(
|
|
8462
|
+
app2,
|
|
8463
|
+
(c4) => {
|
|
8464
|
+
const projectId = c4.req.param("projectId");
|
|
8465
|
+
if (projectId) {
|
|
8466
|
+
const project = getProject(projectId);
|
|
8467
|
+
if (project) return project.path;
|
|
8468
|
+
}
|
|
8469
|
+
return searchDir;
|
|
8470
|
+
},
|
|
8471
|
+
{ readOnly }
|
|
8472
|
+
);
|
|
8103
8473
|
const studioDistPath = options?.studioDir ?? resolveStudioDistDir();
|
|
8104
|
-
if (!studioDistPath || !
|
|
8474
|
+
if (!studioDistPath || !existsSync12(path16.join(studioDistPath, "index.html"))) {
|
|
8105
8475
|
throw new Error('Studio dist not found. Run "bun run build" in apps/studio/ to build the SPA.');
|
|
8106
8476
|
}
|
|
8107
8477
|
app2.get("/", (c4) => {
|
|
8108
|
-
const indexPath =
|
|
8109
|
-
if (
|
|
8478
|
+
const indexPath = path16.join(studioDistPath, "index.html");
|
|
8479
|
+
if (existsSync12(indexPath)) return c4.html(readFileSync10(indexPath, "utf8"));
|
|
8110
8480
|
return c4.notFound();
|
|
8111
8481
|
});
|
|
8112
8482
|
app2.get("/assets/*", (c4) => {
|
|
8113
8483
|
const assetPath = c4.req.path;
|
|
8114
|
-
const filePath =
|
|
8115
|
-
if (!
|
|
8116
|
-
const content =
|
|
8117
|
-
const ext =
|
|
8484
|
+
const filePath = path16.join(studioDistPath, assetPath);
|
|
8485
|
+
if (!existsSync12(filePath)) return c4.notFound();
|
|
8486
|
+
const content = readFileSync10(filePath);
|
|
8487
|
+
const ext = path16.extname(filePath);
|
|
8118
8488
|
const mimeTypes = {
|
|
8119
8489
|
".js": "application/javascript",
|
|
8120
8490
|
".css": "text/css",
|
|
@@ -8135,26 +8505,26 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
|
|
|
8135
8505
|
});
|
|
8136
8506
|
app2.get("*", (c4) => {
|
|
8137
8507
|
if (c4.req.path.startsWith("/api/")) return c4.json({ error: "Not found" }, 404);
|
|
8138
|
-
const indexPath =
|
|
8139
|
-
if (
|
|
8508
|
+
const indexPath = path16.join(studioDistPath, "index.html");
|
|
8509
|
+
if (existsSync12(indexPath)) return c4.html(readFileSync10(indexPath, "utf8"));
|
|
8140
8510
|
return c4.notFound();
|
|
8141
8511
|
});
|
|
8142
8512
|
return app2;
|
|
8143
8513
|
}
|
|
8144
8514
|
function resolveStudioDistDir() {
|
|
8145
|
-
const currentDir = typeof __dirname !== "undefined" ? __dirname :
|
|
8515
|
+
const currentDir = typeof __dirname !== "undefined" ? __dirname : path16.dirname(fileURLToPath2(import.meta.url));
|
|
8146
8516
|
const candidates = [
|
|
8147
8517
|
// From src/commands/results/ → sibling apps/studio/dist
|
|
8148
|
-
|
|
8518
|
+
path16.resolve(currentDir, "../../../../studio/dist"),
|
|
8149
8519
|
// From dist/ → sibling apps/studio/dist (monorepo dev)
|
|
8150
|
-
|
|
8520
|
+
path16.resolve(currentDir, "../../studio/dist"),
|
|
8151
8521
|
// Bundled inside CLI dist (published package: dist/studio/)
|
|
8152
|
-
|
|
8522
|
+
path16.resolve(currentDir, "studio"),
|
|
8153
8523
|
// From dist/ in monorepo root context
|
|
8154
|
-
|
|
8524
|
+
path16.resolve(currentDir, "../../../apps/studio/dist")
|
|
8155
8525
|
];
|
|
8156
8526
|
for (const candidate of candidates) {
|
|
8157
|
-
if (
|
|
8527
|
+
if (existsSync12(candidate) && existsSync12(path16.join(candidate, "index.html"))) {
|
|
8158
8528
|
return candidate;
|
|
8159
8529
|
}
|
|
8160
8530
|
}
|
|
@@ -8183,7 +8553,11 @@ var resultsServeCommand = command({
|
|
|
8183
8553
|
}),
|
|
8184
8554
|
multi: flag({
|
|
8185
8555
|
long: "multi",
|
|
8186
|
-
description: "Launch in multi-project dashboard mode"
|
|
8556
|
+
description: "Launch in multi-project dashboard mode (deprecated; use auto-detect or --single)"
|
|
8557
|
+
}),
|
|
8558
|
+
single: flag({
|
|
8559
|
+
long: "single",
|
|
8560
|
+
description: "Force single-project dashboard mode"
|
|
8187
8561
|
}),
|
|
8188
8562
|
add: option({
|
|
8189
8563
|
type: optional(string),
|
|
@@ -8199,9 +8573,13 @@ var resultsServeCommand = command({
|
|
|
8199
8573
|
type: optional(string),
|
|
8200
8574
|
long: "discover",
|
|
8201
8575
|
description: "Scan a directory tree for repos with .agentv/"
|
|
8576
|
+
}),
|
|
8577
|
+
readOnly: flag({
|
|
8578
|
+
long: "read-only",
|
|
8579
|
+
description: "Disable write operations and launch Studio in read-only leaderboard mode"
|
|
8202
8580
|
})
|
|
8203
8581
|
},
|
|
8204
|
-
handler: async ({ source, port, dir, multi, add, remove, discover }) => {
|
|
8582
|
+
handler: async ({ source, port, dir, multi, single, add, remove, discover, readOnly }) => {
|
|
8205
8583
|
const cwd = dir ?? process.cwd();
|
|
8206
8584
|
const listenPort = port ?? (process.env.PORT ? Number(process.env.PORT) : 3117);
|
|
8207
8585
|
if (add) {
|
|
@@ -8239,7 +8617,10 @@ Discovered ${discovered.length} project(s).`);
|
|
|
8239
8617
|
return;
|
|
8240
8618
|
}
|
|
8241
8619
|
const registry = loadProjectRegistry();
|
|
8242
|
-
const isMultiProject
|
|
8620
|
+
const { isMultiProject, showMultiWarning } = resolveDashboardMode(registry.projects.length, {
|
|
8621
|
+
multi,
|
|
8622
|
+
single
|
|
8623
|
+
});
|
|
8243
8624
|
try {
|
|
8244
8625
|
let results = [];
|
|
8245
8626
|
let sourceFile;
|
|
@@ -8249,7 +8630,7 @@ Discovered ${discovered.length} project(s).`);
|
|
|
8249
8630
|
} else {
|
|
8250
8631
|
const cache = await loadRunCache(cwd);
|
|
8251
8632
|
const cachedFile = cache ? resolveRunCacheFile(cache) : "";
|
|
8252
|
-
if (cachedFile &&
|
|
8633
|
+
if (cachedFile && existsSync12(cachedFile)) {
|
|
8253
8634
|
sourceFile = cachedFile;
|
|
8254
8635
|
results = loadManifestResults(cachedFile);
|
|
8255
8636
|
} else {
|
|
@@ -8260,8 +8641,16 @@ Discovered ${discovered.length} project(s).`);
|
|
|
8260
8641
|
}
|
|
8261
8642
|
}
|
|
8262
8643
|
}
|
|
8263
|
-
const resultDir = sourceFile ?
|
|
8264
|
-
const app2 = createApp(results, resultDir, cwd, sourceFile
|
|
8644
|
+
const resultDir = sourceFile ? path16.dirname(path16.resolve(sourceFile)) : cwd;
|
|
8645
|
+
const app2 = createApp(results, resultDir, cwd, sourceFile, {
|
|
8646
|
+
readOnly,
|
|
8647
|
+
multiProjectDashboard: isMultiProject
|
|
8648
|
+
});
|
|
8649
|
+
if (showMultiWarning) {
|
|
8650
|
+
console.warn(
|
|
8651
|
+
"Warning: --multi is deprecated. Studio now auto-detects multi-project mode when multiple projects are registered. Use --single to force the single-project view."
|
|
8652
|
+
);
|
|
8653
|
+
}
|
|
8265
8654
|
if (isMultiProject) {
|
|
8266
8655
|
console.log(`Multi-project mode: ${registry.projects.length} project(s) registered`);
|
|
8267
8656
|
} else if (results.length > 0 && sourceFile) {
|
|
@@ -8377,7 +8766,7 @@ var selfCommand = subcommands({
|
|
|
8377
8766
|
|
|
8378
8767
|
// src/commands/transpile/index.ts
|
|
8379
8768
|
import { writeFileSync as writeFileSync5 } from "node:fs";
|
|
8380
|
-
import
|
|
8769
|
+
import path17 from "node:path";
|
|
8381
8770
|
var transpileCommand = command({
|
|
8382
8771
|
name: "transpile",
|
|
8383
8772
|
description: "Convert an EVAL.yaml file to Agent Skills evals.json format",
|
|
@@ -8401,7 +8790,7 @@ var transpileCommand = command({
|
|
|
8401
8790
|
handler: async ({ input, outDir, stdout }) => {
|
|
8402
8791
|
let result;
|
|
8403
8792
|
try {
|
|
8404
|
-
result = transpileEvalYamlFile(
|
|
8793
|
+
result = transpileEvalYamlFile(path17.resolve(input));
|
|
8405
8794
|
} catch (error) {
|
|
8406
8795
|
console.error(`Error: ${error.message}`);
|
|
8407
8796
|
process.exit(1);
|
|
@@ -8425,11 +8814,11 @@ var transpileCommand = command({
|
|
|
8425
8814
|
process.stdout.write("\n");
|
|
8426
8815
|
return;
|
|
8427
8816
|
}
|
|
8428
|
-
const outputDir = outDir ?
|
|
8817
|
+
const outputDir = outDir ? path17.resolve(outDir) : path17.dirname(path17.resolve(input));
|
|
8429
8818
|
const fileNames = getOutputFilenames(result);
|
|
8430
8819
|
for (const [skill, evalsJson] of result.files) {
|
|
8431
8820
|
const fileName = fileNames.get(skill) ?? "evals.json";
|
|
8432
|
-
const outputPath =
|
|
8821
|
+
const outputPath = path17.join(outputDir, fileName);
|
|
8433
8822
|
writeFileSync5(outputPath, `${JSON.stringify(evalsJson, null, 2)}
|
|
8434
8823
|
`);
|
|
8435
8824
|
console.log(`Transpiled to ${outputPath}`);
|
|
@@ -8438,8 +8827,8 @@ var transpileCommand = command({
|
|
|
8438
8827
|
});
|
|
8439
8828
|
|
|
8440
8829
|
// src/commands/trend/index.ts
|
|
8441
|
-
import
|
|
8442
|
-
var
|
|
8830
|
+
import path18 from "node:path";
|
|
8831
|
+
var colors2 = {
|
|
8443
8832
|
reset: "\x1B[0m",
|
|
8444
8833
|
bold: "\x1B[1m",
|
|
8445
8834
|
dim: "\x1B[2m",
|
|
@@ -8449,18 +8838,18 @@ var colors3 = {
|
|
|
8449
8838
|
cyan: "\x1B[36m",
|
|
8450
8839
|
gray: "\x1B[90m"
|
|
8451
8840
|
};
|
|
8452
|
-
var
|
|
8453
|
-
var c3 =
|
|
8454
|
-
var
|
|
8455
|
-
function
|
|
8456
|
-
return str.replace(
|
|
8457
|
-
}
|
|
8458
|
-
function
|
|
8459
|
-
const plainLen =
|
|
8841
|
+
var noColor2 = process.env.NO_COLOR !== void 0 || !process.stdout.isTTY;
|
|
8842
|
+
var c3 = noColor2 ? Object.fromEntries(Object.keys(colors2).map((k) => [k, ""])) : colors2;
|
|
8843
|
+
var ansiPattern2 = new RegExp(`${String.fromCharCode(27)}\\[[0-9;]*m`, "g");
|
|
8844
|
+
function stripAnsi3(str) {
|
|
8845
|
+
return str.replace(ansiPattern2, "");
|
|
8846
|
+
}
|
|
8847
|
+
function padRight4(str, len) {
|
|
8848
|
+
const plainLen = stripAnsi3(str).length;
|
|
8460
8849
|
return str + " ".repeat(Math.max(0, len - plainLen));
|
|
8461
8850
|
}
|
|
8462
8851
|
function padLeft3(str, len) {
|
|
8463
|
-
const plainLen =
|
|
8852
|
+
const plainLen = stripAnsi3(str).length;
|
|
8464
8853
|
return " ".repeat(Math.max(0, len - plainLen)) + str;
|
|
8465
8854
|
}
|
|
8466
8855
|
function formatSignedNumber(value, digits = 3) {
|
|
@@ -8488,7 +8877,7 @@ function colorizeSlope(value) {
|
|
|
8488
8877
|
}
|
|
8489
8878
|
function ensureTrendIndexPath(source, cwd) {
|
|
8490
8879
|
const resolved = resolveResultSourcePath(source, cwd);
|
|
8491
|
-
if (
|
|
8880
|
+
if (path18.basename(resolved) !== RESULT_INDEX_FILENAME) {
|
|
8492
8881
|
throw new Error(
|
|
8493
8882
|
`Unsupported result source for trend: ${source}. Use a run workspace directory or ${RESULT_INDEX_FILENAME} manifest.`
|
|
8494
8883
|
);
|
|
@@ -8508,7 +8897,7 @@ function resolveTrendSources(cwd, sources, last) {
|
|
|
8508
8897
|
if (last < 2) {
|
|
8509
8898
|
throw new Error("--last must be at least 2");
|
|
8510
8899
|
}
|
|
8511
|
-
const metas = listResultFiles(cwd).filter((meta) =>
|
|
8900
|
+
const metas = listResultFiles(cwd).filter((meta) => path18.basename(meta.path) === RESULT_INDEX_FILENAME).slice(0, last);
|
|
8512
8901
|
if (metas.length < 2) {
|
|
8513
8902
|
throw new Error(
|
|
8514
8903
|
"Trend analysis requires at least 2 canonical run workspaces in .agentv/results/runs/"
|
|
@@ -8523,10 +8912,10 @@ function getRunLabel(sourcePath, timestamp) {
|
|
|
8523
8912
|
if (timestamp) {
|
|
8524
8913
|
return timestamp;
|
|
8525
8914
|
}
|
|
8526
|
-
return
|
|
8915
|
+
return path18.basename(path18.dirname(sourcePath));
|
|
8527
8916
|
}
|
|
8528
8917
|
function getRunSortKey(sourcePath, timestamp) {
|
|
8529
|
-
return timestamp ??
|
|
8918
|
+
return timestamp ?? path18.basename(path18.dirname(sourcePath));
|
|
8530
8919
|
}
|
|
8531
8920
|
function mean2(values) {
|
|
8532
8921
|
return values.reduce((sum, value) => sum + value, 0) / values.length;
|
|
@@ -8673,14 +9062,14 @@ function formatTrendTable(output) {
|
|
|
8673
9062
|
`${c3.bold}Matched Tests:${c3.reset} ${output.summary.matchedTestCount} | ${c3.bold}Verdict:${c3.reset} ${colorizeDirection(output.summary.direction)}`
|
|
8674
9063
|
);
|
|
8675
9064
|
lines.push("");
|
|
8676
|
-
const header = ` ${
|
|
9065
|
+
const header = ` ${padRight4("Run", runLabelWidth)} ${padLeft3("Tests", matchWidth)} ${padLeft3("Mean Score", scoreWidth)}`;
|
|
8677
9066
|
lines.push(`${c3.dim}${header}${c3.reset}`);
|
|
8678
9067
|
lines.push(
|
|
8679
9068
|
`${c3.dim} ${"\u2500".repeat(runLabelWidth)} ${"\u2500".repeat(matchWidth)} ${"\u2500".repeat(scoreWidth)}${c3.reset}`
|
|
8680
9069
|
);
|
|
8681
9070
|
for (const run2 of output.runs) {
|
|
8682
9071
|
lines.push(
|
|
8683
|
-
` ${
|
|
9072
|
+
` ${padRight4(run2.label, runLabelWidth)} ${padLeft3(String(run2.matchedTestCount), matchWidth)} ${padLeft3(run2.meanScore.toFixed(3), scoreWidth)}`
|
|
8684
9073
|
);
|
|
8685
9074
|
}
|
|
8686
9075
|
lines.push("");
|
|
@@ -8781,7 +9170,7 @@ var trendCommand = command({
|
|
|
8781
9170
|
});
|
|
8782
9171
|
|
|
8783
9172
|
// src/commands/trim/index.ts
|
|
8784
|
-
import { readFileSync as
|
|
9173
|
+
import { readFileSync as readFileSync11, writeFileSync as writeFileSync6 } from "node:fs";
|
|
8785
9174
|
var trimCommand = command({
|
|
8786
9175
|
name: "trim",
|
|
8787
9176
|
description: "Trim evaluation results for baseline storage (strips debug/audit fields)",
|
|
@@ -8800,7 +9189,7 @@ var trimCommand = command({
|
|
|
8800
9189
|
},
|
|
8801
9190
|
handler: async ({ input, out }) => {
|
|
8802
9191
|
try {
|
|
8803
|
-
const content =
|
|
9192
|
+
const content = readFileSync11(input, "utf8");
|
|
8804
9193
|
const lines = content.trim().split("\n").filter((line) => line.trim());
|
|
8805
9194
|
const trimmedLines = lines.map((line) => {
|
|
8806
9195
|
const record = JSON.parse(line);
|
|
@@ -8907,7 +9296,7 @@ function isTTY() {
|
|
|
8907
9296
|
// src/commands/validate/validate-files.ts
|
|
8908
9297
|
import { constants } from "node:fs";
|
|
8909
9298
|
import { access, readdir as readdir4, stat } from "node:fs/promises";
|
|
8910
|
-
import
|
|
9299
|
+
import path19 from "node:path";
|
|
8911
9300
|
import fg2 from "fast-glob";
|
|
8912
9301
|
async function validateFiles(paths) {
|
|
8913
9302
|
const filePaths = await expandPaths(paths);
|
|
@@ -8926,7 +9315,7 @@ async function validateFiles(paths) {
|
|
|
8926
9315
|
};
|
|
8927
9316
|
}
|
|
8928
9317
|
async function validateSingleFile(filePath) {
|
|
8929
|
-
const absolutePath =
|
|
9318
|
+
const absolutePath = path19.resolve(filePath);
|
|
8930
9319
|
const fileType = await detectFileType(absolutePath);
|
|
8931
9320
|
let result;
|
|
8932
9321
|
if (fileType === "eval") {
|
|
@@ -8951,7 +9340,7 @@ async function validateSingleFile(filePath) {
|
|
|
8951
9340
|
async function expandPaths(paths) {
|
|
8952
9341
|
const expanded = /* @__PURE__ */ new Set();
|
|
8953
9342
|
for (const inputPath of paths) {
|
|
8954
|
-
const absolutePath =
|
|
9343
|
+
const absolutePath = path19.resolve(inputPath);
|
|
8955
9344
|
try {
|
|
8956
9345
|
await access(absolutePath, constants.F_OK);
|
|
8957
9346
|
const stats = await stat(absolutePath);
|
|
@@ -8979,7 +9368,7 @@ async function expandPaths(paths) {
|
|
|
8979
9368
|
if (yamlMatches.length === 0) {
|
|
8980
9369
|
console.warn(`Warning: No YAML files matched pattern: ${inputPath}`);
|
|
8981
9370
|
}
|
|
8982
|
-
for (const f of yamlMatches) expanded.add(
|
|
9371
|
+
for (const f of yamlMatches) expanded.add(path19.normalize(f));
|
|
8983
9372
|
}
|
|
8984
9373
|
const sorted = Array.from(expanded);
|
|
8985
9374
|
sorted.sort();
|
|
@@ -8990,7 +9379,7 @@ async function findYamlFiles(dirPath) {
|
|
|
8990
9379
|
try {
|
|
8991
9380
|
const entries2 = await readdir4(dirPath, { withFileTypes: true });
|
|
8992
9381
|
for (const entry of entries2) {
|
|
8993
|
-
const fullPath =
|
|
9382
|
+
const fullPath = path19.join(dirPath, entry.name);
|
|
8994
9383
|
if (entry.isDirectory()) {
|
|
8995
9384
|
if (entry.name === "node_modules" || entry.name.startsWith(".")) {
|
|
8996
9385
|
continue;
|
|
@@ -9007,7 +9396,7 @@ async function findYamlFiles(dirPath) {
|
|
|
9007
9396
|
return results;
|
|
9008
9397
|
}
|
|
9009
9398
|
function isYamlFile(filePath) {
|
|
9010
|
-
const ext =
|
|
9399
|
+
const ext = path19.extname(filePath).toLowerCase();
|
|
9011
9400
|
return ext === ".yaml" || ext === ".yml";
|
|
9012
9401
|
}
|
|
9013
9402
|
|
|
@@ -9062,9 +9451,9 @@ var validateCommand = command({
|
|
|
9062
9451
|
});
|
|
9063
9452
|
|
|
9064
9453
|
// src/commands/workspace/clean.ts
|
|
9065
|
-
import { existsSync as
|
|
9454
|
+
import { existsSync as existsSync13 } from "node:fs";
|
|
9066
9455
|
import { readFile as readFile6, readdir as readdir5, rm } from "node:fs/promises";
|
|
9067
|
-
import
|
|
9456
|
+
import path20 from "node:path";
|
|
9068
9457
|
async function confirm(message) {
|
|
9069
9458
|
const readline2 = await import("node:readline");
|
|
9070
9459
|
const rl = readline2.createInterface({ input: process.stdin, output: process.stdout });
|
|
@@ -9091,7 +9480,7 @@ var cleanCommand = command({
|
|
|
9091
9480
|
},
|
|
9092
9481
|
handler: async ({ repo, force }) => {
|
|
9093
9482
|
const poolRoot = getWorkspacePoolRoot();
|
|
9094
|
-
if (!
|
|
9483
|
+
if (!existsSync13(poolRoot)) {
|
|
9095
9484
|
console.log("No workspace pool entries found.");
|
|
9096
9485
|
return;
|
|
9097
9486
|
}
|
|
@@ -9100,8 +9489,8 @@ var cleanCommand = command({
|
|
|
9100
9489
|
const poolDirs = entries2.filter((e) => e.isDirectory());
|
|
9101
9490
|
const matchingDirs = [];
|
|
9102
9491
|
for (const dir of poolDirs) {
|
|
9103
|
-
const poolDir =
|
|
9104
|
-
const metadataPath =
|
|
9492
|
+
const poolDir = path20.join(poolRoot, dir.name);
|
|
9493
|
+
const metadataPath = path20.join(poolDir, "metadata.json");
|
|
9105
9494
|
try {
|
|
9106
9495
|
const raw = await readFile6(metadataPath, "utf-8");
|
|
9107
9496
|
const metadata = JSON.parse(raw);
|
|
@@ -9132,7 +9521,7 @@ var cleanCommand = command({
|
|
|
9132
9521
|
}
|
|
9133
9522
|
for (const dir of matchingDirs) {
|
|
9134
9523
|
await rm(dir, { recursive: true, force: true });
|
|
9135
|
-
console.log(`Removed: ${
|
|
9524
|
+
console.log(`Removed: ${path20.basename(dir).slice(0, 12)}...`);
|
|
9136
9525
|
}
|
|
9137
9526
|
console.log("Done.");
|
|
9138
9527
|
} else {
|
|
@@ -9150,7 +9539,7 @@ var cleanCommand = command({
|
|
|
9150
9539
|
});
|
|
9151
9540
|
|
|
9152
9541
|
// src/commands/workspace/deps.ts
|
|
9153
|
-
import
|
|
9542
|
+
import path21 from "node:path";
|
|
9154
9543
|
var depsCommand = command({
|
|
9155
9544
|
name: "deps",
|
|
9156
9545
|
description: "Scan eval files and list git repo dependencies needed by workspaces",
|
|
@@ -9174,7 +9563,7 @@ var depsCommand = command({
|
|
|
9174
9563
|
const resolvedPaths = await resolveEvalPaths(evalPaths, cwd);
|
|
9175
9564
|
const result = await scanRepoDeps(resolvedPaths);
|
|
9176
9565
|
for (const err2 of result.errors) {
|
|
9177
|
-
console.error(`warning: ${
|
|
9566
|
+
console.error(`warning: ${path21.relative(cwd, err2.file)}: ${err2.message}`);
|
|
9178
9567
|
}
|
|
9179
9568
|
const output = {
|
|
9180
9569
|
repos: result.repos.map((r) => ({
|
|
@@ -9182,7 +9571,7 @@ var depsCommand = command({
|
|
|
9182
9571
|
...r.ref !== void 0 && { ref: r.ref },
|
|
9183
9572
|
...r.clone !== void 0 && { clone: r.clone },
|
|
9184
9573
|
...r.checkout !== void 0 && { checkout: r.checkout },
|
|
9185
|
-
...usedBy && { used_by: r.usedBy.map((p) =>
|
|
9574
|
+
...usedBy && { used_by: r.usedBy.map((p) => path21.relative(cwd, p)) }
|
|
9186
9575
|
}))
|
|
9187
9576
|
};
|
|
9188
9577
|
console.log(JSON.stringify(output, null, 2));
|
|
@@ -9190,15 +9579,15 @@ var depsCommand = command({
|
|
|
9190
9579
|
});
|
|
9191
9580
|
|
|
9192
9581
|
// src/commands/workspace/list.ts
|
|
9193
|
-
import { existsSync as
|
|
9582
|
+
import { existsSync as existsSync14 } from "node:fs";
|
|
9194
9583
|
import { readFile as readFile7, readdir as readdir6, stat as stat2 } from "node:fs/promises";
|
|
9195
|
-
import
|
|
9584
|
+
import path22 from "node:path";
|
|
9196
9585
|
async function getDirectorySize(dirPath) {
|
|
9197
9586
|
let totalSize = 0;
|
|
9198
9587
|
try {
|
|
9199
9588
|
const entries2 = await readdir6(dirPath, { withFileTypes: true });
|
|
9200
9589
|
for (const entry of entries2) {
|
|
9201
|
-
const fullPath =
|
|
9590
|
+
const fullPath = path22.join(dirPath, entry.name);
|
|
9202
9591
|
if (entry.isDirectory()) {
|
|
9203
9592
|
totalSize += await getDirectorySize(fullPath);
|
|
9204
9593
|
} else {
|
|
@@ -9222,7 +9611,7 @@ var listCommand = command({
|
|
|
9222
9611
|
args: {},
|
|
9223
9612
|
handler: async () => {
|
|
9224
9613
|
const poolRoot = getWorkspacePoolRoot();
|
|
9225
|
-
if (!
|
|
9614
|
+
if (!existsSync14(poolRoot)) {
|
|
9226
9615
|
console.log("No workspace pool entries found.");
|
|
9227
9616
|
return;
|
|
9228
9617
|
}
|
|
@@ -9233,11 +9622,11 @@ var listCommand = command({
|
|
|
9233
9622
|
return;
|
|
9234
9623
|
}
|
|
9235
9624
|
for (const dir of poolDirs) {
|
|
9236
|
-
const poolDir =
|
|
9625
|
+
const poolDir = path22.join(poolRoot, dir.name);
|
|
9237
9626
|
const fingerprint = dir.name;
|
|
9238
9627
|
const poolEntries = await readdir6(poolDir, { withFileTypes: true });
|
|
9239
9628
|
const slots = poolEntries.filter((e) => e.isDirectory() && e.name.startsWith("slot-"));
|
|
9240
|
-
const metadataPath =
|
|
9629
|
+
const metadataPath = path22.join(poolDir, "metadata.json");
|
|
9241
9630
|
let metadata = null;
|
|
9242
9631
|
try {
|
|
9243
9632
|
const raw = await readFile7(metadataPath, "utf-8");
|
|
@@ -9284,8 +9673,8 @@ var CHECK_INTERVAL_MS = 24 * 60 * 60 * 1e3;
|
|
|
9284
9673
|
var AGENTV_DIR = getAgentvHome();
|
|
9285
9674
|
var CACHE_FILE = "version-check.json";
|
|
9286
9675
|
var NPM_REGISTRY_URL = "https://registry.npmjs.org/agentv/latest";
|
|
9287
|
-
async function getCachedUpdateInfo(
|
|
9288
|
-
const filePath =
|
|
9676
|
+
async function getCachedUpdateInfo(path23) {
|
|
9677
|
+
const filePath = path23 ?? join5(AGENTV_DIR, CACHE_FILE);
|
|
9289
9678
|
try {
|
|
9290
9679
|
const raw = await readFile8(filePath, "utf-8");
|
|
9291
9680
|
const data = JSON.parse(raw);
|
|
@@ -9446,4 +9835,4 @@ export {
|
|
|
9446
9835
|
preprocessArgv,
|
|
9447
9836
|
runCli
|
|
9448
9837
|
};
|
|
9449
|
-
//# sourceMappingURL=chunk-
|
|
9838
|
+
//# sourceMappingURL=chunk-YLVQNF23.js.map
|