agentv 4.16.0 → 4.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-TA2KF32T.js → chunk-MQCJCM3I.js} +1921 -148
- package/dist/chunk-MQCJCM3I.js.map +1 -0
- package/dist/{chunk-NENUAMR5.js → chunk-YA2D37ZO.js} +2 -2
- package/dist/{chunk-NENUAMR5.js.map → chunk-YA2D37ZO.js.map} +1 -1
- package/dist/cli.js +2 -2
- package/dist/index.js +2 -2
- package/dist/{interactive-IWGFP7T2.js → interactive-3GW5UCBK.js} +2 -2
- package/dist/studio/assets/index-Bhv1TEO2.js +116 -0
- package/dist/studio/assets/index-DcpbFwI5.css +1 -0
- package/dist/studio/assets/{index-CEDEXdPp.js → index-vZYHIvCH.js} +1 -1
- package/dist/studio/index.html +2 -2
- package/package.json +1 -1
- package/dist/chunk-TA2KF32T.js.map +0 -1
- package/dist/studio/assets/index-C0HOd-aF.js +0 -65
- package/dist/studio/assets/index-_W-FWSZU.css +0 -1
- package/dist/{interactive-IWGFP7T2.js.map → interactive-3GW5UCBK.js.map} +0 -0
|
@@ -44,7 +44,7 @@ import {
|
|
|
44
44
|
validateTargetsFile,
|
|
45
45
|
validateWorkspacePaths,
|
|
46
46
|
writeArtifactsFromResults
|
|
47
|
-
} from "./chunk-NENUAMR5.js";
|
|
47
|
+
} from "./chunk-YA2D37ZO.js";
|
|
48
48
|
import {
|
|
49
49
|
DEFAULT_CATEGORY,
|
|
50
50
|
DEFAULT_THRESHOLD,
|
|
@@ -2831,6 +2831,10 @@ function loadCombinedResults(filePath) {
|
|
|
2831
2831
|
}
|
|
2832
2832
|
return groups;
|
|
2833
2833
|
}
|
|
2834
|
+
/**
 * Computes the normalized gain of a candidate score over a baseline score:
 * the achieved improvement divided by the maximum possible improvement
 * (1 - baselineScore).
 *
 * @param {number} baselineScore - Score of the baseline run.
 * @param {number} candidateScore - Score of the candidate run.
 * @returns {number|null} The normalized gain, or null when it is undefined:
 *   either the baseline is already >= 1 (no headroom, division by zero) or
 *   the computation does not yield a finite number (e.g. a missing/NaN score).
 */
function computeNormalizedGain(baselineScore, candidateScore) {
  // No headroom left to improve on; the ratio would divide by zero or flip sign.
  if (baselineScore >= 1) return null;
  const gain = (candidateScore - baselineScore) / (1 - baselineScore);
  // Callers drop null entries before averaging; returning NaN/Infinity here
  // would slip past that filter and poison the aggregated mean.
  return Number.isFinite(gain) ? gain : null;
}
|
|
2834
2838
|
function classifyOutcome(delta, threshold) {
|
|
2835
2839
|
if (delta >= threshold) return "win";
|
|
2836
2840
|
if (delta <= -threshold) return "loss";
|
|
@@ -2850,6 +2854,7 @@ function compareResults(results1, results2, threshold) {
|
|
|
2850
2854
|
score1,
|
|
2851
2855
|
score2,
|
|
2852
2856
|
delta,
|
|
2857
|
+
normalizedGain: computeNormalizedGain(score1, score2),
|
|
2853
2858
|
outcome: classifyOutcome(delta, threshold)
|
|
2854
2859
|
});
|
|
2855
2860
|
matchedIds.add(testId);
|
|
@@ -2861,6 +2866,8 @@ function compareResults(results1, results2, threshold) {
|
|
|
2861
2866
|
const losses = matched.filter((m) => m.outcome === "loss").length;
|
|
2862
2867
|
const ties = matched.filter((m) => m.outcome === "tie").length;
|
|
2863
2868
|
const meanDelta = matched.length > 0 ? matched.reduce((sum, m) => sum + m.delta, 0) / matched.length : 0;
|
|
2869
|
+
const gainValues = matched.map((m) => m.normalizedGain).filter((g) => g !== null);
|
|
2870
|
+
const meanNormalizedGain = gainValues.length > 0 ? Math.round(gainValues.reduce((sum, g) => sum + g, 0) / gainValues.length * 1e3) / 1e3 : null;
|
|
2864
2871
|
return {
|
|
2865
2872
|
matched,
|
|
2866
2873
|
unmatched: { file1: unmatchedFile1, file2: unmatchedFile2 },
|
|
@@ -2870,7 +2877,8 @@ function compareResults(results1, results2, threshold) {
|
|
|
2870
2877
|
wins,
|
|
2871
2878
|
losses,
|
|
2872
2879
|
ties,
|
|
2873
|
-
meanDelta: Math.round(meanDelta * 1e3) / 1e3
|
|
2880
|
+
meanDelta: Math.round(meanDelta * 1e3) / 1e3,
|
|
2881
|
+
meanNormalizedGain
|
|
2874
2882
|
}
|
|
2875
2883
|
};
|
|
2876
2884
|
}
|
|
@@ -2989,16 +2997,21 @@ function formatTable(comparison, file1, file2) {
|
|
|
2989
2997
|
);
|
|
2990
2998
|
}
|
|
2991
2999
|
lines.push("");
|
|
2992
|
-
const { wins, losses, ties, meanDelta } = comparison.summary;
|
|
3000
|
+
const { wins, losses, ties, meanDelta, meanNormalizedGain } = comparison.summary;
|
|
2993
3001
|
const winStr = wins > 0 ? `${c2.green}${wins} win${wins !== 1 ? "s" : ""}${c2.reset}` : `${wins} wins`;
|
|
2994
3002
|
const lossStr = losses > 0 ? `${c2.red}${losses} loss${losses !== 1 ? "es" : ""}${c2.reset}` : `${losses} losses`;
|
|
2995
3003
|
const tieStr = `${ties} tie${ties !== 1 ? "s" : ""}`;
|
|
2996
3004
|
const deltaColor = meanDelta > 0 ? c2.green : meanDelta < 0 ? c2.red : c2.gray;
|
|
2997
3005
|
const deltaSign = meanDelta >= 0 ? "+" : "";
|
|
2998
3006
|
const status = meanDelta > 0 ? `${c2.green}improved${c2.reset}` : meanDelta < 0 ? `${c2.red}regressed${c2.reset}` : `${c2.gray}neutral${c2.reset}`;
|
|
2999
|
-
|
|
3000
|
-
|
|
3001
|
-
|
|
3007
|
+
let summaryLine = `${c2.bold}Summary:${c2.reset} ${winStr}, ${lossStr}, ${tieStr} | Mean \u0394: ${deltaColor}${deltaSign}${meanDelta.toFixed(3)}${c2.reset}`;
|
|
3008
|
+
if (meanNormalizedGain != null) {
|
|
3009
|
+
const gColor = meanNormalizedGain > 0 ? c2.green : meanNormalizedGain < 0 ? c2.red : c2.gray;
|
|
3010
|
+
const gSign = meanNormalizedGain >= 0 ? "+" : "";
|
|
3011
|
+
summaryLine += ` | g: ${gColor}${gSign}${meanNormalizedGain.toFixed(3)}${c2.reset}`;
|
|
3012
|
+
}
|
|
3013
|
+
summaryLine += ` | Status: ${status}`;
|
|
3014
|
+
lines.push(summaryLine);
|
|
3002
3015
|
lines.push("");
|
|
3003
3016
|
return lines.join("\n");
|
|
3004
3017
|
}
|
|
@@ -3054,13 +3067,18 @@ function formatMatrix(matrixOutput, baselineTarget) {
|
|
|
3054
3067
|
...pairwise.map((pw) => ` ${pw.baseline} \u2192 ${pw.candidate}:`.length)
|
|
3055
3068
|
);
|
|
3056
3069
|
for (const p of pairwise) {
|
|
3057
|
-
const { wins, losses, ties, meanDelta } = p.summary;
|
|
3070
|
+
const { wins, losses, ties, meanDelta, meanNormalizedGain } = p.summary;
|
|
3058
3071
|
const sign = meanDelta >= 0 ? "+" : "";
|
|
3059
3072
|
const deltaColor = meanDelta > 0 ? c2.green : meanDelta < 0 ? c2.red : c2.gray;
|
|
3060
3073
|
const label = ` ${p.baseline} \u2192 ${p.candidate}:`;
|
|
3061
|
-
|
|
3062
|
-
|
|
3063
|
-
|
|
3074
|
+
let pairLine = `${padRight2(label, maxLabelLen)} ${wins} win${wins !== 1 ? "s" : ""}, ${losses} loss${losses !== 1 ? "es" : ""}, ${ties} tie${ties !== 1 ? "s" : ""} (${c2.bold}\u0394${c2.reset} ${deltaColor}${sign}${meanDelta.toFixed(3)}${c2.reset}`;
|
|
3075
|
+
if (meanNormalizedGain != null) {
|
|
3076
|
+
const gColor = meanNormalizedGain > 0 ? c2.green : meanNormalizedGain < 0 ? c2.red : c2.gray;
|
|
3077
|
+
const gSign = meanNormalizedGain >= 0 ? "+" : "";
|
|
3078
|
+
pairLine += `, ${c2.bold}g${c2.reset} ${gColor}${gSign}${meanNormalizedGain.toFixed(3)}${c2.reset}`;
|
|
3079
|
+
}
|
|
3080
|
+
pairLine += ")";
|
|
3081
|
+
lines.push(pairLine);
|
|
3064
3082
|
}
|
|
3065
3083
|
}
|
|
3066
3084
|
lines.push("");
|
|
@@ -3929,7 +3947,7 @@ var evalRunCommand = command({
|
|
|
3929
3947
|
},
|
|
3930
3948
|
handler: async (args) => {
|
|
3931
3949
|
if (args.evalPaths.length === 0 && process.stdin.isTTY) {
|
|
3932
|
-
const { launchInteractiveWizard } = await import("./interactive-IWGFP7T2.js");
|
|
3950
|
+
const { launchInteractiveWizard } = await import("./interactive-3GW5UCBK.js");
|
|
3933
3951
|
await launchInteractiveWizard();
|
|
3934
3952
|
return;
|
|
3935
3953
|
}
|
|
@@ -6972,6 +6990,1739 @@ var resultsFailuresCommand = command({
|
|
|
6972
6990
|
}
|
|
6973
6991
|
});
|
|
6974
6992
|
|
|
6993
|
+
// src/commands/results/report.ts
|
|
6994
|
+
import { existsSync as existsSync8, mkdirSync as mkdirSync2, readFileSync as readFileSync7, writeFileSync as writeFileSync3 } from "node:fs";
|
|
6995
|
+
import path13 from "node:path";
|
|
6996
|
+
|
|
6997
|
+
// src/commands/results/report-template.ts
|
|
6998
|
+
var RESULTS_REPORT_TEMPLATE = `<!DOCTYPE html>
|
|
6999
|
+
<html lang="en">
|
|
7000
|
+
<head>
|
|
7001
|
+
<meta charset="utf-8">
|
|
7002
|
+
<meta name="viewport" content="width=device-width, initial-scale=1">
|
|
7003
|
+
<title>AgentV Evaluation Report</title>
|
|
7004
|
+
<style>
|
|
7005
|
+
* { box-sizing: border-box; }
|
|
7006
|
+
html, body { margin: 0; padding: 0; }
|
|
7007
|
+
:root {
|
|
7008
|
+
color-scheme: dark;
|
|
7009
|
+
--canvas: #030712;
|
|
7010
|
+
--surface: #111827;
|
|
7011
|
+
--surface-muted: rgba(17, 24, 39, 0.5);
|
|
7012
|
+
--surface-hover: rgba(17, 24, 39, 0.3);
|
|
7013
|
+
--border: #1f2937;
|
|
7014
|
+
--text: #d1d5db;
|
|
7015
|
+
--text-muted: #9ca3af;
|
|
7016
|
+
--text-subtle: #6b7280;
|
|
7017
|
+
--heading: #ffffff;
|
|
7018
|
+
--accent: #22d3ee;
|
|
7019
|
+
--accent-weak: rgba(8, 145, 178, 0.2);
|
|
7020
|
+
--pass: #34d399;
|
|
7021
|
+
--warn: #facc15;
|
|
7022
|
+
--fail: #f87171;
|
|
7023
|
+
--track: #1f2937;
|
|
7024
|
+
--pill-gradient: linear-gradient(90deg, #60a5fa 0%, #2563eb 100%);
|
|
7025
|
+
--font: ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
|
|
7026
|
+
--radius-md: 6px;
|
|
7027
|
+
--radius-lg: 8px;
|
|
7028
|
+
}
|
|
7029
|
+
|
|
7030
|
+
:root.light {
|
|
7031
|
+
color-scheme: light;
|
|
7032
|
+
--canvas: #f8fafc;
|
|
7033
|
+
--surface: #ffffff;
|
|
7034
|
+
--surface-muted: rgba(241, 245, 249, 0.8);
|
|
7035
|
+
--surface-hover: rgba(241, 245, 249, 0.5);
|
|
7036
|
+
--border: #e2e8f0;
|
|
7037
|
+
--text: #1e293b;
|
|
7038
|
+
--text-muted: #64748b;
|
|
7039
|
+
--text-subtle: #94a3b8;
|
|
7040
|
+
--heading: #0f172a;
|
|
7041
|
+
--accent: #0891b2;
|
|
7042
|
+
--accent-weak: rgba(8, 145, 178, 0.1);
|
|
7043
|
+
--pass: #16a34a;
|
|
7044
|
+
--warn: #ca8a04;
|
|
7045
|
+
--fail: #dc2626;
|
|
7046
|
+
--track: #e2e8f0;
|
|
7047
|
+
--pill-gradient: linear-gradient(90deg, #3b82f6 0%, #2563eb 100%);
|
|
7048
|
+
}
|
|
7049
|
+
|
|
7050
|
+
:root.light .chip {
|
|
7051
|
+
border-color: rgba(8, 145, 178, 0.3);
|
|
7052
|
+
background: rgba(8, 145, 178, 0.08);
|
|
7053
|
+
color: #0891b2;
|
|
7054
|
+
}
|
|
7055
|
+
|
|
7056
|
+
:root.light .assertion-badge {
|
|
7057
|
+
border-color: rgba(8, 145, 178, 0.3);
|
|
7058
|
+
background: rgba(8, 145, 178, 0.08);
|
|
7059
|
+
color: #0891b2;
|
|
7060
|
+
}
|
|
7061
|
+
|
|
7062
|
+
:root.light .pass-rate-label {
|
|
7063
|
+
color: #ffffff;
|
|
7064
|
+
}
|
|
7065
|
+
|
|
7066
|
+
:root.light .group-header {
|
|
7067
|
+
background: rgba(241, 245, 249, 0.8);
|
|
7068
|
+
}
|
|
7069
|
+
|
|
7070
|
+
:root.light .status-pill {
|
|
7071
|
+
background: rgba(241, 245, 249, 0.8);
|
|
7072
|
+
}
|
|
7073
|
+
|
|
7074
|
+
:root.light .assertion-item {
|
|
7075
|
+
background: rgba(241, 245, 249, 0.5);
|
|
7076
|
+
border-color: var(--border);
|
|
7077
|
+
}
|
|
7078
|
+
|
|
7079
|
+
:root.light .detail-row td {
|
|
7080
|
+
background: rgba(248, 250, 252, 0.8);
|
|
7081
|
+
}
|
|
7082
|
+
|
|
7083
|
+
body {
|
|
7084
|
+
background: var(--canvas);
|
|
7085
|
+
color: var(--text);
|
|
7086
|
+
font-family: var(--font);
|
|
7087
|
+
font-size: 14px;
|
|
7088
|
+
line-height: 1.5;
|
|
7089
|
+
}
|
|
7090
|
+
|
|
7091
|
+
button, input, select {
|
|
7092
|
+
font: inherit;
|
|
7093
|
+
}
|
|
7094
|
+
|
|
7095
|
+
.tabular,
|
|
7096
|
+
.num,
|
|
7097
|
+
.pass-rate-label,
|
|
7098
|
+
.metric-value,
|
|
7099
|
+
.count-label,
|
|
7100
|
+
.table-num {
|
|
7101
|
+
font-variant-numeric: tabular-nums;
|
|
7102
|
+
}
|
|
7103
|
+
|
|
7104
|
+
.shell {
|
|
7105
|
+
min-height: 100vh;
|
|
7106
|
+
}
|
|
7107
|
+
|
|
7108
|
+
.header {
|
|
7109
|
+
border-bottom: 1px solid var(--border);
|
|
7110
|
+
background: var(--surface);
|
|
7111
|
+
padding: 16px 24px;
|
|
7112
|
+
display: flex;
|
|
7113
|
+
align-items: center;
|
|
7114
|
+
justify-content: space-between;
|
|
7115
|
+
gap: 16px;
|
|
7116
|
+
}
|
|
7117
|
+
|
|
7118
|
+
.header-title-wrap {
|
|
7119
|
+
display: flex;
|
|
7120
|
+
flex-direction: column;
|
|
7121
|
+
gap: 4px;
|
|
7122
|
+
}
|
|
7123
|
+
|
|
7124
|
+
.eyebrow {
|
|
7125
|
+
color: var(--text-muted);
|
|
7126
|
+
font-size: 12px;
|
|
7127
|
+
letter-spacing: 0.08em;
|
|
7128
|
+
text-transform: uppercase;
|
|
7129
|
+
}
|
|
7130
|
+
|
|
7131
|
+
.header-title {
|
|
7132
|
+
color: var(--heading);
|
|
7133
|
+
font-size: 24px;
|
|
7134
|
+
font-weight: 600;
|
|
7135
|
+
margin: 0;
|
|
7136
|
+
}
|
|
7137
|
+
|
|
7138
|
+
.header-subtitle {
|
|
7139
|
+
color: var(--text-muted);
|
|
7140
|
+
margin: 0;
|
|
7141
|
+
}
|
|
7142
|
+
|
|
7143
|
+
.header-meta {
|
|
7144
|
+
display: flex;
|
|
7145
|
+
flex-wrap: wrap;
|
|
7146
|
+
justify-content: flex-end;
|
|
7147
|
+
align-items: center;
|
|
7148
|
+
gap: 8px;
|
|
7149
|
+
}
|
|
7150
|
+
|
|
7151
|
+
.chip {
|
|
7152
|
+
border: 1px solid rgba(8, 145, 178, 0.45);
|
|
7153
|
+
background: rgba(8, 145, 178, 0.16);
|
|
7154
|
+
color: #67e8f9;
|
|
7155
|
+
border-radius: var(--radius-md);
|
|
7156
|
+
padding: 4px 8px;
|
|
7157
|
+
font-size: 12px;
|
|
7158
|
+
font-weight: 500;
|
|
7159
|
+
}
|
|
7160
|
+
|
|
7161
|
+
.tabs {
|
|
7162
|
+
display: flex;
|
|
7163
|
+
gap: 4px;
|
|
7164
|
+
padding: 0 24px;
|
|
7165
|
+
border-bottom: 1px solid var(--border);
|
|
7166
|
+
background: var(--canvas);
|
|
7167
|
+
}
|
|
7168
|
+
|
|
7169
|
+
.tab {
|
|
7170
|
+
background: transparent;
|
|
7171
|
+
border: none;
|
|
7172
|
+
border-bottom: 2px solid transparent;
|
|
7173
|
+
color: var(--text-muted);
|
|
7174
|
+
cursor: pointer;
|
|
7175
|
+
padding: 12px 8px 10px;
|
|
7176
|
+
transition: color 120ms ease, border-color 120ms ease;
|
|
7177
|
+
}
|
|
7178
|
+
|
|
7179
|
+
.tab:hover {
|
|
7180
|
+
color: var(--text);
|
|
7181
|
+
}
|
|
7182
|
+
|
|
7183
|
+
.tab.active {
|
|
7184
|
+
color: var(--accent);
|
|
7185
|
+
border-bottom-color: var(--accent);
|
|
7186
|
+
}
|
|
7187
|
+
|
|
7188
|
+
.app {
|
|
7189
|
+
padding: 24px;
|
|
7190
|
+
display: flex;
|
|
7191
|
+
flex-direction: column;
|
|
7192
|
+
gap: 24px;
|
|
7193
|
+
}
|
|
7194
|
+
|
|
7195
|
+
.empty-state {
|
|
7196
|
+
border: 1px solid var(--border);
|
|
7197
|
+
border-radius: var(--radius-lg);
|
|
7198
|
+
background: var(--surface);
|
|
7199
|
+
padding: 32px;
|
|
7200
|
+
text-align: center;
|
|
7201
|
+
}
|
|
7202
|
+
|
|
7203
|
+
.empty-state h2 {
|
|
7204
|
+
margin: 0 0 8px;
|
|
7205
|
+
color: var(--heading);
|
|
7206
|
+
font-size: 18px;
|
|
7207
|
+
font-weight: 500;
|
|
7208
|
+
}
|
|
7209
|
+
|
|
7210
|
+
.empty-state p {
|
|
7211
|
+
margin: 0;
|
|
7212
|
+
color: var(--text-muted);
|
|
7213
|
+
}
|
|
7214
|
+
|
|
7215
|
+
.stats-grid {
|
|
7216
|
+
display: grid;
|
|
7217
|
+
grid-template-columns: repeat(auto-fit, minmax(160px, 1fr));
|
|
7218
|
+
gap: 12px;
|
|
7219
|
+
}
|
|
7220
|
+
|
|
7221
|
+
.stat-card {
|
|
7222
|
+
border: 1px solid var(--border);
|
|
7223
|
+
border-radius: var(--radius-lg);
|
|
7224
|
+
background: var(--surface);
|
|
7225
|
+
padding: 16px;
|
|
7226
|
+
display: flex;
|
|
7227
|
+
flex-direction: column;
|
|
7228
|
+
gap: 6px;
|
|
7229
|
+
min-height: 96px;
|
|
7230
|
+
}
|
|
7231
|
+
|
|
7232
|
+
.stat-label {
|
|
7233
|
+
color: var(--text-muted);
|
|
7234
|
+
font-size: 12px;
|
|
7235
|
+
letter-spacing: 0.04em;
|
|
7236
|
+
text-transform: uppercase;
|
|
7237
|
+
}
|
|
7238
|
+
|
|
7239
|
+
.stat-value {
|
|
7240
|
+
color: var(--heading);
|
|
7241
|
+
font-size: 28px;
|
|
7242
|
+
font-weight: 600;
|
|
7243
|
+
line-height: 1.1;
|
|
7244
|
+
}
|
|
7245
|
+
|
|
7246
|
+
.tone-pass .stat-value,
|
|
7247
|
+
.text-pass { color: var(--pass); }
|
|
7248
|
+
.tone-warn .stat-value,
|
|
7249
|
+
.text-warn { color: var(--warn); }
|
|
7250
|
+
.tone-fail .stat-value,
|
|
7251
|
+
.text-fail { color: var(--fail); }
|
|
7252
|
+
|
|
7253
|
+
.section {
|
|
7254
|
+
display: flex;
|
|
7255
|
+
flex-direction: column;
|
|
7256
|
+
gap: 12px;
|
|
7257
|
+
}
|
|
7258
|
+
|
|
7259
|
+
.section-heading {
|
|
7260
|
+
display: flex;
|
|
7261
|
+
align-items: baseline;
|
|
7262
|
+
justify-content: space-between;
|
|
7263
|
+
gap: 12px;
|
|
7264
|
+
flex-wrap: wrap;
|
|
7265
|
+
}
|
|
7266
|
+
|
|
7267
|
+
.section-heading h2 {
|
|
7268
|
+
margin: 0;
|
|
7269
|
+
color: var(--heading);
|
|
7270
|
+
font-size: 20px;
|
|
7271
|
+
font-weight: 600;
|
|
7272
|
+
}
|
|
7273
|
+
|
|
7274
|
+
.section-heading p {
|
|
7275
|
+
margin: 0;
|
|
7276
|
+
color: var(--text-muted);
|
|
7277
|
+
}
|
|
7278
|
+
|
|
7279
|
+
.table-wrap {
|
|
7280
|
+
overflow-x: auto;
|
|
7281
|
+
border: 1px solid var(--border);
|
|
7282
|
+
border-radius: var(--radius-lg);
|
|
7283
|
+
background: var(--surface);
|
|
7284
|
+
}
|
|
7285
|
+
|
|
7286
|
+
table {
|
|
7287
|
+
width: 100%;
|
|
7288
|
+
border-collapse: collapse;
|
|
7289
|
+
font-size: 14px;
|
|
7290
|
+
}
|
|
7291
|
+
|
|
7292
|
+
thead {
|
|
7293
|
+
border-bottom: 1px solid var(--border);
|
|
7294
|
+
background: var(--surface-muted);
|
|
7295
|
+
}
|
|
7296
|
+
|
|
7297
|
+
th {
|
|
7298
|
+
padding: 12px 16px;
|
|
7299
|
+
text-align: left;
|
|
7300
|
+
font-weight: 500;
|
|
7301
|
+
color: var(--text-muted);
|
|
7302
|
+
white-space: nowrap;
|
|
7303
|
+
}
|
|
7304
|
+
|
|
7305
|
+
td {
|
|
7306
|
+
padding: 12px 16px;
|
|
7307
|
+
border-top: 1px solid rgba(31, 41, 55, 0.5);
|
|
7308
|
+
vertical-align: top;
|
|
7309
|
+
}
|
|
7310
|
+
|
|
7311
|
+
tbody tr {
|
|
7312
|
+
transition: background-color 120ms ease;
|
|
7313
|
+
}
|
|
7314
|
+
|
|
7315
|
+
tbody tr:hover {
|
|
7316
|
+
background: var(--surface-hover);
|
|
7317
|
+
}
|
|
7318
|
+
|
|
7319
|
+
.sortable {
|
|
7320
|
+
cursor: pointer;
|
|
7321
|
+
user-select: none;
|
|
7322
|
+
}
|
|
7323
|
+
|
|
7324
|
+
.sortable:hover {
|
|
7325
|
+
color: var(--text);
|
|
7326
|
+
}
|
|
7327
|
+
|
|
7328
|
+
.table-num {
|
|
7329
|
+
text-align: right;
|
|
7330
|
+
color: var(--text);
|
|
7331
|
+
white-space: nowrap;
|
|
7332
|
+
}
|
|
7333
|
+
|
|
7334
|
+
.table-muted {
|
|
7335
|
+
color: var(--text-muted);
|
|
7336
|
+
}
|
|
7337
|
+
|
|
7338
|
+
.id-cell {
|
|
7339
|
+
font-weight: 500;
|
|
7340
|
+
color: var(--heading);
|
|
7341
|
+
}
|
|
7342
|
+
|
|
7343
|
+
.status-pill {
|
|
7344
|
+
display: inline-flex;
|
|
7345
|
+
align-items: center;
|
|
7346
|
+
gap: 6px;
|
|
7347
|
+
padding: 2px 8px;
|
|
7348
|
+
border-radius: 999px;
|
|
7349
|
+
border: 1px solid var(--border);
|
|
7350
|
+
background: rgba(17, 24, 39, 0.6);
|
|
7351
|
+
color: var(--text);
|
|
7352
|
+
font-size: 12px;
|
|
7353
|
+
white-space: nowrap;
|
|
7354
|
+
}
|
|
7355
|
+
|
|
7356
|
+
.status-dot {
|
|
7357
|
+
width: 8px;
|
|
7358
|
+
height: 8px;
|
|
7359
|
+
border-radius: 999px;
|
|
7360
|
+
background: var(--text-subtle);
|
|
7361
|
+
}
|
|
7362
|
+
|
|
7363
|
+
.status-pass .status-dot { background: var(--pass); }
|
|
7364
|
+
.status-fail .status-dot { background: var(--fail); }
|
|
7365
|
+
.status-error .status-dot { background: var(--warn); }
|
|
7366
|
+
|
|
7367
|
+
.pass-rate-track {
|
|
7368
|
+
width: 80px;
|
|
7369
|
+
height: 20px;
|
|
7370
|
+
overflow: hidden;
|
|
7371
|
+
border-radius: 999px;
|
|
7372
|
+
background: var(--track);
|
|
7373
|
+
position: relative;
|
|
7374
|
+
border: 1px solid rgba(31, 41, 55, 0.8);
|
|
7375
|
+
}
|
|
7376
|
+
|
|
7377
|
+
.pass-rate-fill {
|
|
7378
|
+
position: absolute;
|
|
7379
|
+
inset: 0 auto 0 0;
|
|
7380
|
+
background: var(--pill-gradient);
|
|
7381
|
+
border-radius: 999px;
|
|
7382
|
+
}
|
|
7383
|
+
|
|
7384
|
+
.pass-rate-label {
|
|
7385
|
+
position: absolute;
|
|
7386
|
+
inset: 0;
|
|
7387
|
+
display: flex;
|
|
7388
|
+
align-items: center;
|
|
7389
|
+
justify-content: center;
|
|
7390
|
+
font-size: 12px;
|
|
7391
|
+
font-weight: 600;
|
|
7392
|
+
color: #ffffff;
|
|
7393
|
+
white-space: nowrap;
|
|
7394
|
+
z-index: 1;
|
|
7395
|
+
}
|
|
7396
|
+
|
|
7397
|
+
.histogram {
|
|
7398
|
+
border: 1px solid var(--border);
|
|
7399
|
+
border-radius: var(--radius-lg);
|
|
7400
|
+
background: var(--surface);
|
|
7401
|
+
padding: 16px;
|
|
7402
|
+
display: flex;
|
|
7403
|
+
flex-direction: column;
|
|
7404
|
+
gap: 10px;
|
|
7405
|
+
}
|
|
7406
|
+
|
|
7407
|
+
.hist-row {
|
|
7408
|
+
display: grid;
|
|
7409
|
+
grid-template-columns: 72px 1fr 40px;
|
|
7410
|
+
gap: 12px;
|
|
7411
|
+
align-items: center;
|
|
7412
|
+
}
|
|
7413
|
+
|
|
7414
|
+
.hist-label,
|
|
7415
|
+
.hist-count {
|
|
7416
|
+
color: var(--text-muted);
|
|
7417
|
+
font-size: 12px;
|
|
7418
|
+
}
|
|
7419
|
+
|
|
7420
|
+
.hist-bar-track {
|
|
7421
|
+
height: 16px;
|
|
7422
|
+
border-radius: 999px;
|
|
7423
|
+
background: var(--track);
|
|
7424
|
+
overflow: hidden;
|
|
7425
|
+
}
|
|
7426
|
+
|
|
7427
|
+
.hist-bar-fill {
|
|
7428
|
+
height: 100%;
|
|
7429
|
+
border-radius: 999px;
|
|
7430
|
+
background: var(--pill-gradient);
|
|
7431
|
+
}
|
|
7432
|
+
|
|
7433
|
+
.filter-bar {
|
|
7434
|
+
display: flex;
|
|
7435
|
+
flex-wrap: wrap;
|
|
7436
|
+
gap: 8px;
|
|
7437
|
+
align-items: center;
|
|
7438
|
+
}
|
|
7439
|
+
|
|
7440
|
+
.filter-input,
|
|
7441
|
+
.filter-select {
|
|
7442
|
+
border: 1px solid #374151;
|
|
7443
|
+
border-radius: var(--radius-md);
|
|
7444
|
+
background: var(--canvas);
|
|
7445
|
+
color: var(--text);
|
|
7446
|
+
padding: 8px 10px;
|
|
7447
|
+
}
|
|
7448
|
+
|
|
7449
|
+
.filter-input::placeholder {
|
|
7450
|
+
color: var(--text-subtle);
|
|
7451
|
+
}
|
|
7452
|
+
|
|
7453
|
+
.filter-input:focus,
|
|
7454
|
+
.filter-select:focus {
|
|
7455
|
+
outline: none;
|
|
7456
|
+
border-color: var(--accent);
|
|
7457
|
+
box-shadow: 0 0 0 1px var(--accent);
|
|
7458
|
+
}
|
|
7459
|
+
|
|
7460
|
+
.filter-input {
|
|
7461
|
+
min-width: 260px;
|
|
7462
|
+
flex: 1 1 260px;
|
|
7463
|
+
}
|
|
7464
|
+
|
|
7465
|
+
.filter-count {
|
|
7466
|
+
margin-left: auto;
|
|
7467
|
+
color: var(--text-muted);
|
|
7468
|
+
font-size: 12px;
|
|
7469
|
+
}
|
|
7470
|
+
|
|
7471
|
+
.group-list {
|
|
7472
|
+
display: flex;
|
|
7473
|
+
flex-direction: column;
|
|
7474
|
+
gap: 16px;
|
|
7475
|
+
}
|
|
7476
|
+
|
|
7477
|
+
.group-card {
|
|
7478
|
+
border: 1px solid var(--border);
|
|
7479
|
+
border-radius: var(--radius-lg);
|
|
7480
|
+
overflow: hidden;
|
|
7481
|
+
background: var(--surface);
|
|
7482
|
+
}
|
|
7483
|
+
|
|
7484
|
+
.group-header {
|
|
7485
|
+
padding: 16px;
|
|
7486
|
+
border-bottom: 1px solid var(--border);
|
|
7487
|
+
display: flex;
|
|
7488
|
+
justify-content: space-between;
|
|
7489
|
+
align-items: center;
|
|
7490
|
+
gap: 12px;
|
|
7491
|
+
flex-wrap: wrap;
|
|
7492
|
+
background: rgba(17, 24, 39, 0.65);
|
|
7493
|
+
}
|
|
7494
|
+
|
|
7495
|
+
.group-title-wrap {
|
|
7496
|
+
display: flex;
|
|
7497
|
+
flex-direction: column;
|
|
7498
|
+
gap: 4px;
|
|
7499
|
+
}
|
|
7500
|
+
|
|
7501
|
+
.group-title {
|
|
7502
|
+
margin: 0;
|
|
7503
|
+
color: var(--heading);
|
|
7504
|
+
font-size: 18px;
|
|
7505
|
+
font-weight: 600;
|
|
7506
|
+
}
|
|
7507
|
+
|
|
7508
|
+
.group-subtitle {
|
|
7509
|
+
margin: 0;
|
|
7510
|
+
color: var(--text-muted);
|
|
7511
|
+
font-size: 13px;
|
|
7512
|
+
}
|
|
7513
|
+
|
|
7514
|
+
.group-metrics {
|
|
7515
|
+
display: flex;
|
|
7516
|
+
flex-wrap: wrap;
|
|
7517
|
+
align-items: center;
|
|
7518
|
+
gap: 10px 14px;
|
|
7519
|
+
}
|
|
7520
|
+
|
|
7521
|
+
.metric {
|
|
7522
|
+
display: flex;
|
|
7523
|
+
flex-direction: column;
|
|
7524
|
+
gap: 2px;
|
|
7525
|
+
}
|
|
7526
|
+
|
|
7527
|
+
.metric-label {
|
|
7528
|
+
color: var(--text-muted);
|
|
7529
|
+
font-size: 12px;
|
|
7530
|
+
}
|
|
7531
|
+
|
|
7532
|
+
.metric-value {
|
|
7533
|
+
color: var(--heading);
|
|
7534
|
+
font-weight: 500;
|
|
7535
|
+
}
|
|
7536
|
+
|
|
7537
|
+
.test-row {
|
|
7538
|
+
cursor: pointer;
|
|
7539
|
+
}
|
|
7540
|
+
|
|
7541
|
+
.expand-cell {
|
|
7542
|
+
width: 40px;
|
|
7543
|
+
color: var(--text-muted);
|
|
7544
|
+
text-align: center;
|
|
7545
|
+
}
|
|
7546
|
+
|
|
7547
|
+
.detail-row td {
|
|
7548
|
+
padding: 0;
|
|
7549
|
+
background: rgba(3, 7, 18, 0.5);
|
|
7550
|
+
}
|
|
7551
|
+
|
|
7552
|
+
.detail-panel {
|
|
7553
|
+
padding: 16px;
|
|
7554
|
+
display: flex;
|
|
7555
|
+
flex-direction: column;
|
|
7556
|
+
gap: 16px;
|
|
7557
|
+
}
|
|
7558
|
+
|
|
7559
|
+
.detail-grid {
|
|
7560
|
+
display: grid;
|
|
7561
|
+
grid-template-columns: repeat(auto-fit, minmax(280px, 1fr));
|
|
7562
|
+
gap: 16px;
|
|
7563
|
+
}
|
|
7564
|
+
|
|
7565
|
+
.detail-card,
|
|
7566
|
+
.assertion-card,
|
|
7567
|
+
.error-card {
|
|
7568
|
+
border: 1px solid var(--border);
|
|
7569
|
+
border-radius: var(--radius-lg);
|
|
7570
|
+
background: var(--surface);
|
|
7571
|
+
padding: 16px;
|
|
7572
|
+
}
|
|
7573
|
+
|
|
7574
|
+
.detail-card h4,
|
|
7575
|
+
.assertion-card h4,
|
|
7576
|
+
.error-card h4 {
|
|
7577
|
+
margin: 0 0 10px;
|
|
7578
|
+
color: var(--heading);
|
|
7579
|
+
font-size: 14px;
|
|
7580
|
+
font-weight: 500;
|
|
7581
|
+
}
|
|
7582
|
+
|
|
7583
|
+
pre {
|
|
7584
|
+
margin: 0;
|
|
7585
|
+
white-space: pre-wrap;
|
|
7586
|
+
word-break: break-word;
|
|
7587
|
+
font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
|
|
7588
|
+
font-size: 12px;
|
|
7589
|
+
color: var(--text);
|
|
7590
|
+
max-height: 320px;
|
|
7591
|
+
overflow: auto;
|
|
7592
|
+
}
|
|
7593
|
+
|
|
7594
|
+
.evaluator-table-wrap {
|
|
7595
|
+
border: 1px solid var(--border);
|
|
7596
|
+
border-radius: var(--radius-lg);
|
|
7597
|
+
overflow-x: auto;
|
|
7598
|
+
background: var(--surface);
|
|
7599
|
+
}
|
|
7600
|
+
|
|
7601
|
+
.evaluator-table-wrap th,
|
|
7602
|
+
.evaluator-table-wrap td {
|
|
7603
|
+
padding: 14px 20px;
|
|
7604
|
+
}
|
|
7605
|
+
|
|
7606
|
+
.assertion-grid {
|
|
7607
|
+
display: grid;
|
|
7608
|
+
grid-template-columns: repeat(auto-fit, minmax(280px, 1fr));
|
|
7609
|
+
gap: 16px;
|
|
7610
|
+
}
|
|
7611
|
+
|
|
7612
|
+
.assertion-list {
|
|
7613
|
+
list-style: none;
|
|
7614
|
+
margin: 0;
|
|
7615
|
+
padding: 0;
|
|
7616
|
+
display: flex;
|
|
7617
|
+
flex-direction: column;
|
|
7618
|
+
gap: 10px;
|
|
7619
|
+
}
|
|
7620
|
+
|
|
7621
|
+
.assertion-item {
|
|
7622
|
+
border: 1px solid rgba(31, 41, 55, 0.8);
|
|
7623
|
+
border-radius: var(--radius-md);
|
|
7624
|
+
background: rgba(3, 7, 18, 0.45);
|
|
7625
|
+
padding: 10px 12px;
|
|
7626
|
+
display: flex;
|
|
7627
|
+
flex-direction: column;
|
|
7628
|
+
gap: 8px;
|
|
7629
|
+
}
|
|
7630
|
+
|
|
7631
|
+
.assertion-topline {
|
|
7632
|
+
display: flex;
|
|
7633
|
+
align-items: center;
|
|
7634
|
+
flex-wrap: wrap;
|
|
7635
|
+
gap: 8px;
|
|
7636
|
+
}
|
|
7637
|
+
|
|
7638
|
+
.assertion-badge {
|
|
7639
|
+
border: 1px solid rgba(8, 145, 178, 0.45);
|
|
7640
|
+
background: rgba(8, 145, 178, 0.16);
|
|
7641
|
+
color: #67e8f9;
|
|
7642
|
+
border-radius: var(--radius-md);
|
|
7643
|
+
padding: 2px 8px;
|
|
7644
|
+
font-size: 12px;
|
|
7645
|
+
font-weight: 500;
|
|
7646
|
+
align-self: flex-start;
|
|
7647
|
+
}
|
|
7648
|
+
|
|
7649
|
+
.assertion-status {
|
|
7650
|
+
font-size: 12px;
|
|
7651
|
+
font-weight: 500;
|
|
7652
|
+
}
|
|
7653
|
+
|
|
7654
|
+
.assertion-text {
|
|
7655
|
+
color: var(--text);
|
|
7656
|
+
}
|
|
7657
|
+
|
|
7658
|
+
.assertion-evidence {
|
|
7659
|
+
color: var(--text-muted);
|
|
7660
|
+
font-size: 12px;
|
|
7661
|
+
white-space: pre-wrap;
|
|
7662
|
+
word-break: break-word;
|
|
7663
|
+
}
|
|
7664
|
+
|
|
7665
|
+
.meta-row {
|
|
7666
|
+
display: flex;
|
|
7667
|
+
flex-wrap: wrap;
|
|
7668
|
+
gap: 8px 16px;
|
|
7669
|
+
color: var(--text-muted);
|
|
7670
|
+
font-size: 12px;
|
|
7671
|
+
border-top: 1px solid var(--border);
|
|
7672
|
+
padding-top: 12px;
|
|
7673
|
+
}
|
|
7674
|
+
|
|
7675
|
+
.error-card {
|
|
7676
|
+
border-color: rgba(248, 113, 113, 0.45);
|
|
7677
|
+
background: rgba(127, 29, 29, 0.18);
|
|
7678
|
+
}
|
|
7679
|
+
|
|
7680
|
+
.error-card h4,
|
|
7681
|
+
.error-card pre {
|
|
7682
|
+
color: #fecaca;
|
|
7683
|
+
}
|
|
7684
|
+
|
|
7685
|
+
.muted {
|
|
7686
|
+
color: var(--text-muted);
|
|
7687
|
+
}
|
|
7688
|
+
|
|
7689
|
+
.hidden {
|
|
7690
|
+
display: none;
|
|
7691
|
+
}
|
|
7692
|
+
|
|
7693
|
+
.theme-toggle {
|
|
7694
|
+
background: var(--surface);
|
|
7695
|
+
border: 1px solid var(--border);
|
|
7696
|
+
border-radius: var(--radius-md);
|
|
7697
|
+
color: var(--text-muted);
|
|
7698
|
+
cursor: pointer;
|
|
7699
|
+
padding: 6px 10px;
|
|
7700
|
+
font-size: 13px;
|
|
7701
|
+
transition: color 120ms ease, border-color 120ms ease;
|
|
7702
|
+
white-space: nowrap;
|
|
7703
|
+
}
|
|
7704
|
+
|
|
7705
|
+
.theme-toggle:hover {
|
|
7706
|
+
color: var(--text);
|
|
7707
|
+
border-color: var(--text-muted);
|
|
7708
|
+
}
|
|
7709
|
+
|
|
7710
|
+
.criteria-cell {
|
|
7711
|
+
color: var(--text-muted);
|
|
7712
|
+
font-size: 13px;
|
|
7713
|
+
max-width: 320px;
|
|
7714
|
+
}
|
|
7715
|
+
|
|
7716
|
+
.io-toggle {
|
|
7717
|
+
cursor: pointer;
|
|
7718
|
+
user-select: none;
|
|
7719
|
+
color: var(--text-muted);
|
|
7720
|
+
font-size: 13px;
|
|
7721
|
+
border: 1px solid var(--border);
|
|
7722
|
+
border-radius: var(--radius-md);
|
|
7723
|
+
background: var(--surface);
|
|
7724
|
+
padding: 8px 12px;
|
|
7725
|
+
display: inline-flex;
|
|
7726
|
+
align-items: center;
|
|
7727
|
+
gap: 6px;
|
|
7728
|
+
}
|
|
7729
|
+
|
|
7730
|
+
.io-toggle:hover {
|
|
7731
|
+
color: var(--text);
|
|
7732
|
+
}
|
|
7733
|
+
|
|
7734
|
+
.io-toggle[open] > summary {
|
|
7735
|
+
margin-bottom: 12px;
|
|
7736
|
+
}
|
|
7737
|
+
|
|
7738
|
+
.io-toggle > summary {
|
|
7739
|
+
list-style: none;
|
|
7740
|
+
}
|
|
7741
|
+
|
|
7742
|
+
.io-toggle > summary::-webkit-details-marker {
|
|
7743
|
+
display: none;
|
|
7744
|
+
}
|
|
7745
|
+
|
|
7746
|
+
@media (max-width: 900px) {
|
|
7747
|
+
.header,
|
|
7748
|
+
.group-header,
|
|
7749
|
+
.section-heading {
|
|
7750
|
+
align-items: flex-start;
|
|
7751
|
+
}
|
|
7752
|
+
|
|
7753
|
+
.filter-count {
|
|
7754
|
+
width: 100%;
|
|
7755
|
+
margin-left: 0;
|
|
7756
|
+
}
|
|
7757
|
+
}
|
|
7758
|
+
|
|
7759
|
+
@media (prefers-reduced-motion: reduce) {
|
|
7760
|
+
*, *::before, *::after {
|
|
7761
|
+
transition: none !important;
|
|
7762
|
+
scroll-behavior: auto !important;
|
|
7763
|
+
}
|
|
7764
|
+
}
|
|
7765
|
+
</style>
|
|
7766
|
+
</head>
|
|
7767
|
+
<body>
|
|
7768
|
+
<div class="shell">
|
|
7769
|
+
<header class="header">
|
|
7770
|
+
<div class="header-title-wrap">
|
|
7771
|
+
<div class="eyebrow">AgentV static export</div>
|
|
7772
|
+
<h1 class="header-title">Evaluation Report</h1>
|
|
7773
|
+
<p class="header-subtitle">Studio-themed HTML generated from an existing AgentV results workspace.</p>
|
|
7774
|
+
</div>
|
|
7775
|
+
<div class="header-meta" id="header-meta"></div>
|
|
7776
|
+
</header>
|
|
7777
|
+
|
|
7778
|
+
<nav class="tabs" aria-label="Report sections" id="tab-nav">
|
|
7779
|
+
<button class="tab active" type="button" data-tab="overview">Overview</button>
|
|
7780
|
+
<button class="tab" type="button" data-tab="tests">Test Cases</button>
|
|
7781
|
+
</nav>
|
|
7782
|
+
|
|
7783
|
+
<main class="app" id="app"></main>
|
|
7784
|
+
</div>
|
|
7785
|
+
|
|
7786
|
+
<script>
|
|
7787
|
+
const RAW_DATA = __DATA_PLACEHOLDER__;
|
|
7788
|
+
|
|
7789
|
+
(function () {
|
|
7790
|
+
function pick(obj, keys, fallback) {
|
|
7791
|
+
for (let i = 0; i < keys.length; i += 1) {
|
|
7792
|
+
const key = keys[i];
|
|
7793
|
+
if (obj && Object.prototype.hasOwnProperty.call(obj, key) && obj[key] !== undefined && obj[key] !== null) {
|
|
7794
|
+
return obj[key];
|
|
7795
|
+
}
|
|
7796
|
+
}
|
|
7797
|
+
return fallback;
|
|
7798
|
+
}
|
|
7799
|
+
|
|
7800
|
+
function asString(value, fallback) {
|
|
7801
|
+
return typeof value === 'string' ? value : fallback;
|
|
7802
|
+
}
|
|
7803
|
+
|
|
7804
|
+
function asNumber(value, fallback) {
|
|
7805
|
+
return typeof value === 'number' && Number.isFinite(value) ? value : fallback;
|
|
7806
|
+
}
|
|
7807
|
+
|
|
7808
|
+
function asArray(value) {
|
|
7809
|
+
return Array.isArray(value) ? value : [];
|
|
7810
|
+
}
|
|
7811
|
+
|
|
7812
|
+
function normalizeAssertion(value, assertionType) {
|
|
7813
|
+
if (!value || typeof value !== 'object' || Array.isArray(value)) return null;
|
|
7814
|
+
return {
|
|
7815
|
+
text: asString(value.text, 'Untitled assertion'),
|
|
7816
|
+
passed: Boolean(value.passed),
|
|
7817
|
+
evidence: asString(value.evidence, ''),
|
|
7818
|
+
assertion_type: asString(value.assertion_type, '') || assertionType,
|
|
7819
|
+
};
|
|
7820
|
+
}
|
|
7821
|
+
|
|
7822
|
+
/**
 * Normalize a raw list of score entries.
 *
 * Entries that are not plain objects are dropped. For each remaining entry the
 * display name prefers the name key and falls back to type; the type prefers
 * type and falls back to name; both default to 'assertion'. Nested assertions
 * inherit the entry name as their default assertion type.
 *
 * @param {*} value - Raw scores value; non-arrays are treated as empty.
 * @returns {Array<{name: string, type: string, score: number, assertions: Array}>}
 */
function normalizeScores(value) {
  const normalized = [];
  for (const entry of asArray(value)) {
    if (!entry || typeof entry !== 'object' || Array.isArray(entry)) continue;

    const scoreName = asString(pick(entry, ['name', 'type'], 'assertion'), 'assertion');
    const scoreType = asString(pick(entry, ['type', 'name'], 'assertion'), 'assertion');

    const assertions = [];
    for (const candidate of asArray(entry.assertions)) {
      const assertion = normalizeAssertion(candidate, scoreName);
      if (assertion) assertions.push(assertion);
    }

    normalized.push({
      name: scoreName,
      type: scoreType,
      score: asNumber(entry.score, 0),
      assertions,
    });
  }
  return normalized;
}
|
|
7840
|
+
|
|
7841
|
+
/**
 * Normalize a token-usage record to numeric input, output and reasoning counts.
 *
 * Each counter is read from the first usable key among the bare name, the
 * name with a _tokens suffix, and the name with a Tokens suffix. Missing or
 * non-finite values become 0.
 *
 * @param {*} value - Raw usage record; non-objects and arrays yield all zeros.
 * @returns {{input: number, output: number, reasoning: number}}
 */
function normalizeTokenUsage(value) {
  const usage = { input: 0, output: 0, reasoning: 0 };
  if (!value || typeof value !== 'object' || Array.isArray(value)) return usage;

  Object.keys(usage).forEach((field) => {
    const aliases = [field, field + '_tokens', field + 'Tokens'];
    usage[field] = asNumber(pick(value, aliases, 0), 0);
  });
  return usage;
}
|
|
7851
|
+
|
|
7852
|
+
/**
 * Convert one raw result row into the canonical row object used by the report.
 *
 * Source keys are accepted in snake_case or camelCase. Missing text fields get
 * the defaults below, missing numeric fields become 0, and total_tokens is
 * initialised to 0 here. Assertions nested under score entries take precedence;
 * the row-level assertions list is used only when no score carries any.
 *
 * @param {*} raw - Raw result row as parsed from the results data.
 * @param {number} index - Position of the row; appended to _key so keys stay distinct.
 * @returns {Object} Normalized row, including the untouched raw value.
 */
function normalizeResult(raw, index) {
  const readText = (keys, fallback) => asString(pick(raw, keys, fallback), fallback);
  const readNumber = (keys) => asNumber(pick(raw, keys, 0), 0);

  const scores = normalizeScores(pick(raw, ['scores'], []));
  const nestedAssertions = scores.flatMap((score) => score.assertions);
  const topLevelAssertions = asArray(pick(raw, ['assertions'], []))
    .map((assertion) => normalizeAssertion(assertion, 'assertion'))
    .filter(Boolean);

  const testId = readText(['test_id', 'testId'], 'unknown');
  const target = readText(['target'], 'unknown');
  const evalFile = readText(['eval_file', 'evalFile'], 'ungrouped');

  return {
    _key: evalFile + ':' + target + ':' + testId + ':' + index,
    raw,
    eval_file: evalFile,
    test_id: testId,
    target,
    score: readNumber(['score']),
    duration_ms: readNumber(['duration_ms', 'durationMs']),
    cost_usd: readNumber(['cost_usd', 'costUsd']),
    timestamp: readText(['timestamp'], ''),
    execution_status: readText(['execution_status', 'executionStatus'], ''),
    error: readText(['error'], ''),
    input: pick(raw, ['input'], null),
    output: pick(raw, ['output'], null),
    token_usage: normalizeTokenUsage(pick(raw, ['token_usage', 'tokenUsage'], {})),
    total_tokens: 0,
    scores,
    assertions: nestedAssertions.length > 0 ? nestedAssertions : topLevelAssertions,
  };
}
|
|
7883
|
+
|
|
7884
|
+
/**
 * Escape a value for safe interpolation into HTML text or a quoted attribute.
 *
 * The value is stringified first, then the five characters that can change
 * HTML parsing context are replaced by entities. The ampersand is handled in
 * the same single pass as the others, so already-escaped input is escaped
 * again rather than passed through.
 *
 * @param {*} value - Any value; converted with String().
 * @returns {string} The escaped text.
 */
function escapeHtml(value) {
  const entities = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;',
  };
  return String(value).replace(/[&<>"']/g, (ch) => entities[ch]);
}
|
|
7891
|
+
|
|
7892
|
+
/**
 * Classify a result row as 'pass', 'fail' or 'error'.
 *
 * A recognised execution_status decides the outcome outright. Otherwise a
 * non-empty error means 'error', and the row passes when its score is at
 * least 0.5.
 *
 * @param {{execution_status: string, error: string, score: number}} result
 * @returns {string} One of 'pass', 'fail', 'error'.
 */
function statusOf(result) {
  switch (result.execution_status) {
    case 'execution_error':
      return 'error';
    case 'quality_failure':
      return 'fail';
    case 'ok':
      return 'pass';
    default:
      break;
  }
  if (result.error) return 'error';
  if (result.score >= 0.5) return 'pass';
  return 'fail';
}
|
|
7899
|
+
|
|
7900
|
+
/**
 * Human-readable label for a status code.
 *
 * @param {string} status - 'pass', 'fail', or anything else.
 * @returns {string} 'Passed', 'Failed', or 'Error' for every other value.
 */
function statusLabel(status) {
  switch (status) {
    case 'pass':
      return 'Passed';
    case 'fail':
      return 'Failed';
    default:
      return 'Error';
  }
}
|
|
7905
|
+
|
|
7906
|
+
/**
 * Text colour class for a score: pass at 0.8 and above, warn at 0.5 and above, fail below.
 *
 * @param {number} value - Score to classify.
 * @returns {string} 'text-pass', 'text-warn' or 'text-fail'.
 */
function toneForScore(value) {
  if (value >= 0.8) return 'text-pass';
  return value >= 0.5 ? 'text-warn' : 'text-fail';
}
|
|
7911
|
+
|
|
7912
|
+
/**
 * Card tone class for a pass rate: pass at 0.8 and above, warn at 0.5 and above, fail below.
 *
 * @param {number} value - Rate to classify.
 * @returns {string} 'tone-pass', 'tone-warn' or 'tone-fail'.
 */
function toneClassForRate(value) {
  if (value >= 0.8) return 'tone-pass';
  return value >= 0.5 ? 'tone-warn' : 'tone-fail';
}
|
|
7917
|
+
|
|
7918
|
+
/**
 * Format a fraction as a percentage with one decimal place.
 *
 * @param {number} value - Fraction to format; it is multiplied by 100.
 * @returns {string} For example '12.5%' for 0.125.
 */
function formatPercent(value) {
  const scaled = value * 100;
  return scaled.toFixed(1) + '%';
}
|
|
7921
|
+
|
|
7922
|
+
/**
 * Format a millisecond duration as a short human-readable string.
 *
 * Output forms: '-' for zero or missing values, whole milliseconds below one
 * second, seconds with one decimal below one minute, and minutes plus whole
 * seconds above that. Each tier is chosen from the rounded value, so a
 * duration that rounds up to the next unit is shown in that unit instead of
 * as '1000ms', '60.0s' or '1m 60s'.
 *
 * @param {number} ms - Duration in milliseconds.
 * @returns {string} The formatted duration.
 */
function formatDuration(ms) {
  if (!ms) return '-';

  const wholeMs = Math.round(ms);
  if (wholeMs < 1000) return wholeMs + 'ms';

  if (ms < 60000) {
    const secondsText = (ms / 1000).toFixed(1);
    // Values just under a minute round to 60.0; show those as a minute instead.
    if (secondsText !== '60.0') return secondsText + 's';
  }

  // Round once on the total so the seconds part can never reach 60.
  const totalSeconds = Math.round(ms / 1000);
  const minutes = Math.floor(totalSeconds / 60);
  const seconds = totalSeconds % 60;
  return minutes + 'm ' + seconds + 's';
}
|
|
7930
|
+
|
|
7931
|
+
/**
 * Format a token count compactly: plain digits below 1000, one-decimal K
 * below a million, one-decimal M otherwise. Zero or missing values show '-'.
 *
 * Counts just under a million that would round to '1000.0K' are shown in
 * millions instead.
 *
 * @param {number} value - Token count.
 * @returns {string} The formatted count.
 */
function formatTokens(value) {
  if (!value) return '-';
  if (value < 1000) return String(value);

  if (value < 1000000) {
    const thousandsText = (value / 1000).toFixed(1);
    if (thousandsText !== '1000.0') return thousandsText + 'K';
  }
  return (value / 1000000).toFixed(1) + 'M';
}
|
|
7937
|
+
|
|
7938
|
+
/**
 * Format a US-dollar cost with two decimals.
 *
 * @param {number} value - Cost in dollars.
 * @returns {string} '-' for zero or missing values, '<$0.01' below one cent,
 *   otherwise a dollar sign followed by the amount with two decimals.
 */
function formatCost(value) {
  if (!value) return '-';
  return value < 0.01 ? '<$0.01' : '$' + value.toFixed(2);
}
|
|
7943
|
+
|
|
7944
|
+
/**
 * Sum the input, output and reasoning token counts of a row.
 * Missing or zero counters contribute 0.
 *
 * @param {{token_usage: {input: number, output: number, reasoning: number}}} result
 * @returns {number} Combined token count.
 */
function totalTokens(result) {
  const { input, output, reasoning } = result.token_usage;
  return (input || 0) + (output || 0) + (reasoning || 0);
}
|
|
7947
|
+
|
|
7948
|
+
/**
 * Aggregate counts and totals over a set of normalized rows.
 *
 * Rows are classified with statusOf. Scores of rows classified as 'error' are
 * left out of the scores list, and pass_rate is passed / (passed + failed),
 * or 0 when nothing was graded.
 *
 * @param {Array<Object>} rows - Normalized result rows.
 * @returns {Object} Stats with total, passed, failed, errors, total_duration_ms,
 *   total_tokens, total_cost_usd, scores and pass_rate.
 */
function computeStats(rows) {
  let passed = 0;
  let failed = 0;
  let errors = 0;
  let durationMs = 0;
  let tokens = 0;
  let costUsd = 0;
  const scores = [];

  for (const row of rows) {
    const status = statusOf(row);
    if (status === 'pass') passed += 1;
    else if (status === 'fail') failed += 1;
    else errors += 1;

    durationMs += row.duration_ms || 0;
    tokens += totalTokens(row);
    costUsd += row.cost_usd || 0;
    if (status !== 'error') scores.push(row.score);
  }

  const graded = passed + failed;
  return {
    total: rows.length,
    passed,
    failed,
    errors,
    total_duration_ms: durationMs,
    total_tokens: tokens,
    total_cost_usd: costUsd,
    scores,
    pass_rate: graded > 0 ? passed / graded : 0,
  };
}
|
|
7973
|
+
|
|
7974
|
+
/**
 * Partition rows by the value of one property and compute stats per group.
 * Groups appear in the order their key value is first seen.
 *
 * @param {Array<Object>} rows - Rows to partition.
 * @param {string} key - Property name whose value identifies the group.
 * @returns {Array<{name: *, rows: Array<Object>, stats: Object}>}
 */
function groupBy(rows, key) {
  const buckets = new Map();
  for (const row of rows) {
    const bucketName = row[key];
    const members = buckets.get(bucketName);
    if (members) {
      members.push(row);
    } else {
      buckets.set(bucketName, [row]);
    }
  }
  return Array.from(buckets, ([name, members]) => ({
    name,
    rows: members,
    stats: computeStats(members),
  }));
}
|
|
7987
|
+
|
|
7988
|
+
/**
 * Collect the distinct score names used across all rows, in default sort order.
 *
 * @param {Array<{scores: Array<{name: string}>}>} rows
 * @returns {string[]} Sorted, de-duplicated score names.
 */
function uniqueScoreNames(rows) {
  const allNames = rows.flatMap((row) => row.scores.map((score) => score.name));
  return Array.from(new Set(allNames)).sort();
}
|
|
7995
|
+
|
|
7996
|
+
/**
 * Find the score value of the first score entry with the given name.
 *
 * @param {{scores: Array<{name: string, score: number}>}} row
 * @param {string} scoreName - Name to look for.
 * @returns {number|null} The matching score, or null when the row has no such entry.
 */
function scoreForName(row, scoreName) {
  for (const candidate of row.scores) {
    if (candidate.name === scoreName) return candidate.score;
  }
  return null;
}
|
|
8000
|
+
|
|
8001
|
+
/**
 * Return a sorted copy of rows; the input array is left untouched.
 *
 * The sort column may be 'status' (ordered pass, fail, error), a name prefixed
 * with 'score:' (ordered by that named score), or any row property. Rows whose
 * sort value is null or undefined always go last, whatever the direction.
 * If either value is a string the pair is compared as text; otherwise
 * numerically.
 *
 * @param {Array<Object>} rows - Rows to sort.
 * @param {{col: string, dir: string}} sort - Column and direction; any dir other than 'asc' sorts descending.
 * @returns {Array<Object>} New sorted array.
 */
function sortRows(rows, sort) {
  const direction = sort.dir === 'asc' ? 1 : -1;
  const statusRank = { pass: 0, fail: 1, error: 2 };

  // Resolve the value a row is ordered by for the active column.
  const sortValue = (row) => {
    if (sort.col === 'status') return statusRank[statusOf(row)];
    if (sort.col.startsWith('score:')) {
      return scoreForName(row, sort.col.slice('score:'.length));
    }
    return row[sort.col];
  };

  const compare = (left, right) => {
    const a = sortValue(left);
    const b = sortValue(right);
    const aMissing = a == null;
    const bMissing = b == null;
    if (aMissing || bMissing) {
      if (aMissing && bMissing) return 0;
      return aMissing ? 1 : -1;
    }
    const asText = typeof a === 'string' || typeof b === 'string';
    const order = asText ? String(a).localeCompare(String(b)) : a - b;
    return order * direction;
  };

  return rows.slice().sort(compare);
}
|
|
8027
|
+
|
|
8028
|
+
/**
 * Case-insensitive substring match of a search term against a row.
 *
 * The searched text is built from the test id, target, eval file, error,
 * the criteria summary from extractCriteria, and the type, text and evidence
 * of every assertion. An empty search matches every row.
 *
 * @param {Object} row - Normalized result row.
 * @param {string} search - Search text typed by the user.
 * @returns {boolean} True when the row should be shown.
 */
function matchesSearch(row, search) {
  if (!search) return true;

  const assertionText = row.assertions
    .map((assertion) => assertion.assertion_type + ' ' + assertion.text + ' ' + assertion.evidence)
    .join(' ');
  const fields = [
    row.test_id,
    row.target,
    row.eval_file,
    row.error,
    extractCriteria(row),
    assertionText,
  ];
  const haystack = fields.join(' ').toLowerCase();
  const needle = search.toLowerCase();
  return haystack.includes(needle);
}
|
|
8040
|
+
|
|
8041
|
+
/**
 * Keep only the rows that satisfy the active status, target and search filters.
 * The value 'all' disables the status or target filter.
 *
 * @param {Array<Object>} rows - Normalized result rows.
 * @param {{status: string, target: string, search: string}} filter - Active filter settings.
 * @returns {Array<Object>} New array with the matching rows.
 */
function filterRows(rows, filter) {
  const statusMatches = (row) => filter.status === 'all' || statusOf(row) === filter.status;
  const targetMatches = (row) => filter.target === 'all' || row.target === filter.target;

  return rows.filter(
    (row) => statusMatches(row) && targetMatches(row) && matchesSearch(row, filter.search)
  );
}
|
|
8050
|
+
|
|
8051
|
+
function renderPassRate(rate) {
|
|
8052
|
+
return \`
|
|
8053
|
+
<span class="pass-rate-track">
|
|
8054
|
+
<span class="pass-rate-fill" style="width:\${Math.max(0, Math.min(100, rate * 100))}%"></span>
|
|
8055
|
+
<span class="pass-rate-label">\${formatPercent(rate)}</span>
|
|
8056
|
+
</span>
|
|
8057
|
+
\`;
|
|
8058
|
+
}
|
|
8059
|
+
|
|
8060
|
+
/**
 * Render a value as display text: strings unchanged, null and undefined as an
 * empty string, everything else as JSON indented by two spaces. Values that
 * JSON.stringify rejects fall back to String(value).
 *
 * @param {*} value - Value to render.
 * @returns {string} Display text.
 */
function formatJson(value) {
  if (value == null) return '';
  if (typeof value === 'string') return value;

  let rendered;
  try {
    rendered = JSON.stringify(value, null, 2);
  } catch (error) {
    rendered = String(value);
  }
  return rendered;
}
|
|
8069
|
+
|
|
8070
|
+
function renderStatusPill(status) {
|
|
8071
|
+
return \`
|
|
8072
|
+
<span class="status-pill status-\${status}">
|
|
8073
|
+
<span class="status-dot"></span>
|
|
8074
|
+
<span>\${statusLabel(status)}</span>
|
|
8075
|
+
</span>
|
|
8076
|
+
\`;
|
|
8077
|
+
}
|
|
8078
|
+
|
|
8079
|
+
function renderAssertions(assertions) {
|
|
8080
|
+
if (assertions.length === 0) {
|
|
8081
|
+
return '<p class="muted">No assertions.</p>';
|
|
8082
|
+
}
|
|
8083
|
+
return \`
|
|
8084
|
+
<ul class="assertion-list">
|
|
8085
|
+
\${assertions.map((assertion) => \`
|
|
8086
|
+
<li class="assertion-item">
|
|
8087
|
+
<div class="assertion-topline">
|
|
8088
|
+
<span class="assertion-status \${assertion.passed ? 'text-pass' : 'text-fail'}">\${assertion.passed ? '\\u2713' : '\\u2717'}</span>
|
|
8089
|
+
\${assertion.assertion_type ? \`<span class="assertion-badge">\${escapeHtml(assertion.assertion_type)}</span>\` : ''}
|
|
8090
|
+
<span class="assertion-text">\${escapeHtml(assertion.text)}</span>
|
|
8091
|
+
</div>
|
|
8092
|
+
\${assertion.evidence ? \`<div class="assertion-evidence">\${escapeHtml(assertion.evidence)}</div>\` : ''}
|
|
8093
|
+
</li>
|
|
8094
|
+
\`).join('')}
|
|
8095
|
+
</ul>
|
|
8096
|
+
\`;
|
|
8097
|
+
}
|
|
8098
|
+
|
|
8099
|
+
/**
 * Derive a short criteria summary for a row.
 *
 * Preference order: the prompt of an object input; a non-empty string input of
 * at most 200 characters; otherwise the distinct assertion types joined by
 * commas, a colon, and the text of the first assertion, cut to 120 characters
 * with a trailing ellipsis. Returns an empty string when none applies.
 *
 * @param {{input: *, assertions: Array<{assertion_type: string, text: string}>}} row
 * @returns {string} The summary, possibly empty.
 */
function extractCriteria(row) {
  const input = row.input;
  if (input && typeof input === 'object' && input.prompt) {
    return String(input.prompt);
  }
  if (typeof input === 'string' && input.length > 0 && input.length <= 200) {
    return input;
  }

  const assertions = row.assertions;
  if (!(assertions && assertions.length > 0)) return '';

  const distinctTypes = [...new Set(assertions.map((a) => a.assertion_type).filter(Boolean))];
  const prefix = distinctTypes.length > 0 ? distinctTypes.join(', ') + ': ' : '';
  const summary = prefix + (assertions[0].text || '');
  if (summary.length > 120) return summary.slice(0, 117) + '...';
  return summary;
}
|
|
8115
|
+
|
|
8116
|
+
function renderDetail(row, evaluatorNames, hasAnyCost) {
|
|
8117
|
+
const meta = [];
|
|
8118
|
+
if (row.timestamp) meta.push(escapeHtml(row.timestamp));
|
|
8119
|
+
if (row.target) meta.push(escapeHtml(row.target));
|
|
8120
|
+
if (row.duration_ms) meta.push(escapeHtml(formatDuration(row.duration_ms)));
|
|
8121
|
+
if (totalTokens(row)) meta.push(escapeHtml(\`\${formatTokens(totalTokens(row))} tokens\`));
|
|
8122
|
+
if (hasAnyCost && row.cost_usd) meta.push(escapeHtml(formatCost(row.cost_usd)));
|
|
8123
|
+
|
|
8124
|
+
const hasIo = row.input != null || row.output != null;
|
|
8125
|
+
|
|
8126
|
+
return \`
|
|
8127
|
+
<div class="detail-panel">
|
|
8128
|
+
<div class="assertion-card">
|
|
8129
|
+
<h4>Assertions (\${row.assertions.length})</h4>
|
|
8130
|
+
\${renderAssertions(row.assertions)}
|
|
8131
|
+
</div>
|
|
8132
|
+
|
|
8133
|
+
\${row.error ? \`
|
|
8134
|
+
<div class="error-card">
|
|
8135
|
+
<h4>Error</h4>
|
|
8136
|
+
<pre>\${escapeHtml(row.error)}</pre>
|
|
8137
|
+
</div>
|
|
8138
|
+
\` : ''}
|
|
8139
|
+
|
|
8140
|
+
\${hasIo ? \`
|
|
8141
|
+
<details class="io-toggle">
|
|
8142
|
+
<summary>\\u25B8 Input / Output</summary>
|
|
8143
|
+
<div class="detail-grid">
|
|
8144
|
+
<div class="detail-card">
|
|
8145
|
+
<h4>Input</h4>
|
|
8146
|
+
<pre>\${escapeHtml(formatJson(row.input))}</pre>
|
|
8147
|
+
</div>
|
|
8148
|
+
<div class="detail-card">
|
|
8149
|
+
<h4>Output</h4>
|
|
8150
|
+
<pre>\${escapeHtml(formatJson(row.output))}</pre>
|
|
8151
|
+
</div>
|
|
8152
|
+
</div>
|
|
8153
|
+
</details>
|
|
8154
|
+
\` : ''}
|
|
8155
|
+
|
|
8156
|
+
<div class="meta-row">\${meta.join('<span>·</span>')}</div>
|
|
8157
|
+
</div>
|
|
8158
|
+
\`;
|
|
8159
|
+
}
|
|
8160
|
+
|
|
8161
|
+
const DATA = asArray(RAW_DATA).map(normalizeResult);
|
|
8162
|
+
DATA.forEach((row) => {
|
|
8163
|
+
row.total_tokens = totalTokens(row);
|
|
8164
|
+
});
|
|
8165
|
+
const STATE = {
|
|
8166
|
+
tab: 'overview',
|
|
8167
|
+
filter: { status: 'all', target: 'all', search: '' },
|
|
8168
|
+
sort: { col: 'test_id', dir: 'asc' },
|
|
8169
|
+
expanded: {},
|
|
8170
|
+
};
|
|
8171
|
+
|
|
8172
|
+
const APP = document.getElementById('app');
|
|
8173
|
+
const HEADER_META = document.getElementById('header-meta');
|
|
8174
|
+
const TAB_BUTTONS = Array.from(document.querySelectorAll('.tab'));
|
|
8175
|
+
const TARGETS = Array.from(new Set(DATA.map((row) => row.target))).sort();
|
|
8176
|
+
const EVALUATOR_NAMES = uniqueScoreNames(DATA);
|
|
8177
|
+
const HAS_ANY_COST = DATA.some((row) => row.cost_usd > 0);
|
|
8178
|
+
const OVERALL_STATS = computeStats(DATA);
|
|
8179
|
+
const EVAL_GROUPS = groupBy(DATA, 'eval_file').sort((a, b) => a.name.localeCompare(b.name));
|
|
8180
|
+
const TARGET_GROUPS = groupBy(DATA, 'target').sort((a, b) => a.name.localeCompare(b.name));
|
|
8181
|
+
|
|
8182
|
+
function syncHeaderMeta() {
|
|
8183
|
+
const themeLabel = document.documentElement.classList.contains('light') ? '\\u263E Dark' : '\\u2600 Light';
|
|
8184
|
+
HEADER_META.innerHTML = \`
|
|
8185
|
+
<span class="chip">\${escapeHtml(String(DATA.length))} tests</span>
|
|
8186
|
+
<span class="chip">\${escapeHtml(String(EVAL_GROUPS.length))} eval files</span>
|
|
8187
|
+
<span class="chip">\${escapeHtml(String(TARGETS.length))} targets</span>
|
|
8188
|
+
<button class="theme-toggle" id="theme-btn" type="button">\${themeLabel}</button>
|
|
8189
|
+
\`;
|
|
8190
|
+
document.getElementById('theme-btn').addEventListener('click', toggleTheme);
|
|
8191
|
+
}
|
|
8192
|
+
|
|
8193
|
+
function toggleTheme() {
|
|
8194
|
+
document.documentElement.classList.toggle('light');
|
|
8195
|
+
syncHeaderMeta();
|
|
8196
|
+
}
|
|
8197
|
+
|
|
8198
|
+
function setTab(tab) {
|
|
8199
|
+
STATE.tab = tab;
|
|
8200
|
+
TAB_BUTTONS.forEach((button) => {
|
|
8201
|
+
button.classList.toggle('active', button.getAttribute('data-tab') === tab);
|
|
8202
|
+
});
|
|
8203
|
+
render();
|
|
8204
|
+
}
|
|
8205
|
+
|
|
8206
|
+
TAB_BUTTONS.forEach((button) => {
|
|
8207
|
+
button.addEventListener('click', () => setTab(button.getAttribute('data-tab')));
|
|
8208
|
+
});
|
|
8209
|
+
|
|
8210
|
+
function renderStatCard(label, value, tone) {
|
|
8211
|
+
return \`
|
|
8212
|
+
<div class="stat-card \${tone || ''}">
|
|
8213
|
+
<div class="stat-label">\${escapeHtml(label)}</div>
|
|
8214
|
+
<div class="stat-value">\${escapeHtml(value)}</div>
|
|
8215
|
+
</div>
|
|
8216
|
+
\`;
|
|
8217
|
+
}
|
|
8218
|
+
|
|
8219
|
+
function renderOverview() {
|
|
8220
|
+
if (DATA.length === 0) {
|
|
8221
|
+
APP.innerHTML = \`
|
|
8222
|
+
<div class="empty-state">
|
|
8223
|
+
<h2>No results loaded</h2>
|
|
8224
|
+
<p>Add one or more parsed JSONL result rows to the template placeholder.</p>
|
|
8225
|
+
</div>
|
|
8226
|
+
\`;
|
|
8227
|
+
return;
|
|
8228
|
+
}
|
|
8229
|
+
|
|
8230
|
+
const overviewSections = [];
|
|
8231
|
+
overviewSections.push(\`
|
|
8232
|
+
<section class="section">
|
|
8233
|
+
<div class="stats-grid">
|
|
8234
|
+
\${renderStatCard('Total tests', String(OVERALL_STATS.total))}
|
|
8235
|
+
\${renderStatCard('Eval files', String(EVAL_GROUPS.length))}
|
|
8236
|
+
\${renderStatCard('Targets', String(TARGETS.length))}
|
|
8237
|
+
\${renderStatCard('Passed', String(OVERALL_STATS.passed), 'tone-pass')}
|
|
8238
|
+
\${renderStatCard('Failed', String(OVERALL_STATS.failed), 'tone-fail')}
|
|
8239
|
+
\${renderStatCard('Errors', String(OVERALL_STATS.errors), OVERALL_STATS.errors > 0 ? 'tone-warn' : '')}
|
|
8240
|
+
\${renderStatCard('Pass rate', formatPercent(OVERALL_STATS.pass_rate), toneClassForRate(OVERALL_STATS.pass_rate))}
|
|
8241
|
+
\${renderStatCard('Duration', formatDuration(OVERALL_STATS.total_duration_ms))}
|
|
8242
|
+
\${renderStatCard('Tokens', formatTokens(OVERALL_STATS.total_tokens))}
|
|
8243
|
+
\${HAS_ANY_COST ? renderStatCard('Cost', formatCost(OVERALL_STATS.total_cost_usd)) : ''}
|
|
8244
|
+
</div>
|
|
8245
|
+
</section>
|
|
8246
|
+
\`);
|
|
8247
|
+
|
|
8248
|
+
overviewSections.push(\`
|
|
8249
|
+
<section class="section">
|
|
8250
|
+
<div class="section-heading">
|
|
8251
|
+
<div>
|
|
8252
|
+
<h2>Eval Files</h2>
|
|
8253
|
+
<p>Grouped exactly as the exported template will render them in the Test Cases tab.</p>
|
|
8254
|
+
</div>
|
|
8255
|
+
</div>
|
|
8256
|
+
<div class="table-wrap">
|
|
8257
|
+
<table>
|
|
8258
|
+
<thead>
|
|
8259
|
+
<tr>
|
|
8260
|
+
<th>Eval file</th>
|
|
8261
|
+
<th>Pass rate</th>
|
|
8262
|
+
<th class="table-num">Passed</th>
|
|
8263
|
+
<th class="table-num">Failed</th>
|
|
8264
|
+
<th class="table-num">Errors</th>
|
|
8265
|
+
<th class="table-num">Tests</th>
|
|
8266
|
+
<th class="table-num">Duration</th>
|
|
8267
|
+
<th class="table-num">Tokens</th>
|
|
8268
|
+
\${HAS_ANY_COST ? '<th class="table-num">Cost</th>' : ''}
|
|
8269
|
+
</tr>
|
|
8270
|
+
</thead>
|
|
8271
|
+
<tbody>
|
|
8272
|
+
\${EVAL_GROUPS.map((group) => \`
|
|
8273
|
+
<tr>
|
|
8274
|
+
<td class="id-cell">\${escapeHtml(group.name)}</td>
|
|
8275
|
+
<td>\${renderPassRate(group.stats.pass_rate)}</td>
|
|
8276
|
+
<td class="table-num text-pass">\${escapeHtml(String(group.stats.passed))}</td>
|
|
8277
|
+
<td class="table-num text-fail">\${escapeHtml(String(group.stats.failed))}</td>
|
|
8278
|
+
<td class="table-num text-warn">\${escapeHtml(String(group.stats.errors))}</td>
|
|
8279
|
+
<td class="table-num">\${escapeHtml(String(group.stats.total))}</td>
|
|
8280
|
+
<td class="table-num">\${escapeHtml(formatDuration(group.stats.total_duration_ms))}</td>
|
|
8281
|
+
<td class="table-num">\${escapeHtml(formatTokens(group.stats.total_tokens))}</td>
|
|
8282
|
+
\${HAS_ANY_COST ? \`<td class="table-num">\${escapeHtml(formatCost(group.stats.total_cost_usd))}</td>\` : ''}
|
|
8283
|
+
</tr>
|
|
8284
|
+
\`).join('')}
|
|
8285
|
+
</tbody>
|
|
8286
|
+
</table>
|
|
8287
|
+
</div>
|
|
8288
|
+
</section>
|
|
8289
|
+
\`);
|
|
8290
|
+
|
|
8291
|
+
if (TARGET_GROUPS.length > 1) {
|
|
8292
|
+
overviewSections.push(\`
|
|
8293
|
+
<section class="section">
|
|
8294
|
+
<div class="section-heading">
|
|
8295
|
+
<div>
|
|
8296
|
+
<h2>Targets</h2>
|
|
8297
|
+
<p>Cross-target summary for the loaded result set.</p>
|
|
8298
|
+
</div>
|
|
8299
|
+
</div>
|
|
8300
|
+
<div class="table-wrap">
|
|
8301
|
+
<table>
|
|
8302
|
+
<thead>
|
|
8303
|
+
<tr>
|
|
8304
|
+
<th>Target</th>
|
|
8305
|
+
<th>Pass rate</th>
|
|
8306
|
+
<th class="table-num">Passed</th>
|
|
8307
|
+
<th class="table-num">Failed</th>
|
|
8308
|
+
<th class="table-num">Errors</th>
|
|
8309
|
+
<th class="table-num">Avg score</th>
|
|
8310
|
+
<th class="table-num">Duration</th>
|
|
8311
|
+
<th class="table-num">Tokens</th>
|
|
8312
|
+
\${HAS_ANY_COST ? '<th class="table-num">Cost</th>' : ''}
|
|
8313
|
+
</tr>
|
|
8314
|
+
</thead>
|
|
8315
|
+
<tbody>
|
|
8316
|
+
\${TARGET_GROUPS.map((group) => {
|
|
8317
|
+
const gradedRows = group.rows.filter((row) => statusOf(row) !== 'error');
|
|
8318
|
+
const avgScore = gradedRows.length > 0
|
|
8319
|
+
? gradedRows.reduce((sum, row) => sum + row.score, 0) / gradedRows.length
|
|
8320
|
+
: 0;
|
|
8321
|
+
return \`
|
|
8322
|
+
<tr>
|
|
8323
|
+
<td class="id-cell">\${escapeHtml(group.name)}</td>
|
|
8324
|
+
<td>\${renderPassRate(group.stats.pass_rate)}</td>
|
|
8325
|
+
<td class="table-num text-pass">\${escapeHtml(String(group.stats.passed))}</td>
|
|
8326
|
+
<td class="table-num text-fail">\${escapeHtml(String(group.stats.failed))}</td>
|
|
8327
|
+
<td class="table-num text-warn">\${escapeHtml(String(group.stats.errors))}</td>
|
|
8328
|
+
<td class="table-num \${toneForScore(avgScore)}">\${escapeHtml(formatPercent(avgScore))}</td>
|
|
8329
|
+
<td class="table-num">\${escapeHtml(formatDuration(group.stats.total_duration_ms))}</td>
|
|
8330
|
+
<td class="table-num">\${escapeHtml(formatTokens(group.stats.total_tokens))}</td>
|
|
8331
|
+
\${HAS_ANY_COST ? \`<td class="table-num">\${escapeHtml(formatCost(group.stats.total_cost_usd))}</td>\` : ''}
|
|
8332
|
+
</tr>
|
|
8333
|
+
\`;
|
|
8334
|
+
}).join('')}
|
|
8335
|
+
</tbody>
|
|
8336
|
+
</table>
|
|
8337
|
+
</div>
|
|
8338
|
+
</section>
|
|
8339
|
+
\`);
|
|
8340
|
+
}
|
|
8341
|
+
|
|
8342
|
+
if (OVERALL_STATS.scores.length > 0) {
|
|
8343
|
+
const buckets = [0, 0, 0, 0, 0];
|
|
8344
|
+
OVERALL_STATS.scores.forEach((score) => {
|
|
8345
|
+
const index = Math.min(Math.floor(score * 5), 4);
|
|
8346
|
+
buckets[index] += 1;
|
|
8347
|
+
});
|
|
8348
|
+
const maxBucket = Math.max.apply(null, buckets);
|
|
8349
|
+
const labels = ['0-20%', '20-40%', '40-60%', '60-80%', '80-100%'];
|
|
8350
|
+
overviewSections.push(\`
|
|
8351
|
+
<section class="section">
|
|
8352
|
+
<div class="section-heading">
|
|
8353
|
+
<div>
|
|
8354
|
+
<h2>Score Distribution</h2>
|
|
8355
|
+
<p>Histogram across all non-error result rows.</p>
|
|
8356
|
+
</div>
|
|
8357
|
+
</div>
|
|
8358
|
+
<div class="histogram">
|
|
8359
|
+
\${buckets.map((count, index) => \`
|
|
8360
|
+
<div class="hist-row">
|
|
8361
|
+
<div class="hist-label">\${labels[index]}</div>
|
|
8362
|
+
<div class="hist-bar-track">
|
|
8363
|
+
<div class="hist-bar-fill" style="width:\${maxBucket > 0 ? (count / maxBucket) * 100 : 0}%"></div>
|
|
8364
|
+
</div>
|
|
8365
|
+
<div class="hist-count tabular">\${count}</div>
|
|
8366
|
+
</div>
|
|
8367
|
+
\`).join('')}
|
|
8368
|
+
</div>
|
|
8369
|
+
</section>
|
|
8370
|
+
\`);
|
|
8371
|
+
}
|
|
8372
|
+
|
|
8373
|
+
APP.innerHTML = overviewSections.join('');
|
|
8374
|
+
}
|
|
8375
|
+
|
|
8376
|
+
function renderSortableHeader(label, col) {
|
|
8377
|
+
const arrow = STATE.sort.col === col ? (STATE.sort.dir === 'asc' ? ' \u2191' : ' \u2193') : '';
|
|
8378
|
+
return \`<th class="sortable" data-sort="\${escapeHtml(col)}">\${escapeHtml(label)}\${arrow}</th>\`;
|
|
8379
|
+
}
|
|
8380
|
+
|
|
8381
|
+
function renderTestGroups() {
|
|
8382
|
+
const filteredRows = filterRows(DATA, STATE.filter);
|
|
8383
|
+
const filteredGroups = groupBy(filteredRows, 'eval_file').sort((a, b) => a.name.localeCompare(b.name));
|
|
8384
|
+
const hasMultipleTargets = TARGETS.length > 1;
|
|
8385
|
+
const hasCriteria = DATA.some((row) => extractCriteria(row).length > 0);
|
|
8386
|
+
if (filteredRows.length === 0) {
|
|
8387
|
+
return '<div class="empty-state"><h2>No matching tests</h2><p>Adjust the filters or search text.</p></div>';
|
|
8388
|
+
}
|
|
8389
|
+
|
|
8390
|
+
const colCount = 4 + (hasCriteria ? 1 : 0) + (hasMultipleTargets ? 1 : 0) + (HAS_ANY_COST ? 1 : 0);
|
|
8391
|
+
|
|
8392
|
+
return \`
|
|
8393
|
+
<div class="group-list">
|
|
8394
|
+
\${filteredGroups.map((group) => {
|
|
8395
|
+
const sortedRows = sortRows(group.rows, STATE.sort);
|
|
8396
|
+
const groupSummary = [];
|
|
8397
|
+
groupSummary.push(\`\${group.stats.passed}/\${group.stats.total} passed\`);
|
|
8398
|
+
if (group.stats.failed > 0) groupSummary.push(\`\${group.stats.failed} failed\`);
|
|
8399
|
+
if (group.stats.errors > 0) groupSummary.push(\`\${group.stats.errors} errors\`);
|
|
8400
|
+
return \`
|
|
8401
|
+
<section class="group-card">
|
|
8402
|
+
<div class="group-header">
|
|
8403
|
+
<div class="group-title-wrap">
|
|
8404
|
+
<h2 class="group-title">\${escapeHtml(group.name)}</h2>
|
|
8405
|
+
<p class="group-subtitle">\${escapeHtml(groupSummary.join(' | '))}</p>
|
|
8406
|
+
</div>
|
|
8407
|
+
<div class="group-metrics">
|
|
8408
|
+
<div class="metric">
|
|
8409
|
+
<span class="metric-label">Pass rate</span>
|
|
8410
|
+
<span class="metric-value">\${escapeHtml(formatPercent(group.stats.pass_rate))}</span>
|
|
8411
|
+
</div>
|
|
8412
|
+
<div class="metric">
|
|
8413
|
+
<span class="metric-label">Tests</span>
|
|
8414
|
+
<span class="metric-value">\${escapeHtml(String(group.stats.total))}</span>
|
|
8415
|
+
</div>
|
|
8416
|
+
<div class="metric">
|
|
8417
|
+
<span class="metric-label">Duration</span>
|
|
8418
|
+
<span class="metric-value">\${escapeHtml(formatDuration(group.stats.total_duration_ms))}</span>
|
|
8419
|
+
</div>
|
|
8420
|
+
</div>
|
|
8421
|
+
</div>
|
|
8422
|
+
<div class="table-wrap">
|
|
8423
|
+
<table>
|
|
8424
|
+
<thead>
|
|
8425
|
+
<tr>
|
|
8426
|
+
<th class="expand-cell"></th>
|
|
8427
|
+
\${renderSortableHeader('Test ID', 'test_id')}
|
|
8428
|
+
\${hasCriteria ? '<th>Criteria</th>' : ''}
|
|
8429
|
+
\${hasMultipleTargets ? renderSortableHeader('Target', 'target') : ''}
|
|
8430
|
+
\${renderSortableHeader('Score', 'score')}
|
|
8431
|
+
\${renderSortableHeader('Result', 'status')}
|
|
8432
|
+
\${HAS_ANY_COST ? renderSortableHeader('Cost', 'cost_usd') : ''}
|
|
8433
|
+
</tr>
|
|
8434
|
+
</thead>
|
|
8435
|
+
<tbody>
|
|
8436
|
+
\${sortedRows.map((row) => {
|
|
8437
|
+
const expanded = Boolean(STATE.expanded[row._key]);
|
|
8438
|
+
const criteria = extractCriteria(row);
|
|
8439
|
+
return \`
|
|
8440
|
+
<tr class="test-row" data-expand="\${escapeHtml(row._key)}">
|
|
8441
|
+
<td class="expand-cell">\${expanded ? '\\u25BE' : '\\u25B8'}</td>
|
|
8442
|
+
<td class="id-cell">\${escapeHtml(row.test_id)}</td>
|
|
8443
|
+
\${hasCriteria ? \`<td class="criteria-cell">\${escapeHtml(criteria)}</td>\` : ''}
|
|
8444
|
+
\${hasMultipleTargets ? \`<td class="table-muted">\${escapeHtml(row.target)}</td>\` : ''}
|
|
8445
|
+
<td class="table-num \${toneForScore(row.score)}">\${escapeHtml(formatPercent(row.score))}</td>
|
|
8446
|
+
<td>\${renderStatusPill(statusOf(row))}</td>
|
|
8447
|
+
\${HAS_ANY_COST ? \`<td class="table-num">\${escapeHtml(formatCost(row.cost_usd))}</td>\` : ''}
|
|
8448
|
+
</tr>
|
|
8449
|
+
\${expanded ? \`<tr class="detail-row"><td colspan="\${colCount}">\${renderDetail(row, EVALUATOR_NAMES, HAS_ANY_COST)}</td></tr>\` : ''}
|
|
8450
|
+
\`;
|
|
8451
|
+
}).join('')}
|
|
8452
|
+
</tbody>
|
|
8453
|
+
</table>
|
|
8454
|
+
</div>
|
|
8455
|
+
</section>
|
|
8456
|
+
\`;
|
|
8457
|
+
}).join('')}
|
|
8458
|
+
</div>
|
|
8459
|
+
\`;
|
|
8460
|
+
}
|
|
8461
|
+
|
|
8462
|
+
/**
 * Attach event handlers to the freshly rendered filter bar and test tables.
 *
 * Looks up the status select, target select and search input by id, plus every
 * element carrying a data-sort or data-expand attribute. Each control is first
 * synced from STATE, then wired so that a change updates STATE and calls
 * render(). Controls that are not present in the current markup are skipped.
 *
 * Because render() replaces the markup that contains the search input, the
 * search handler moves keyboard focus and the caret onto the replacement
 * input so typing is not interrupted after every character.
 */
function bindTestEvents() {
  const statusInput = document.getElementById('filter-status');
  const targetInput = document.getElementById('filter-target');
  const searchInput = document.getElementById('filter-search');
  const sortableHeaders = Array.from(document.querySelectorAll('[data-sort]'));
  const expandableRows = Array.from(document.querySelectorAll('[data-expand]'));

  // Sync one select with a STATE.filter key and re-render when it changes.
  const bindSelect = (element, filterKey) => {
    if (!element) return;
    element.value = STATE.filter[filterKey];
    element.addEventListener('change', (event) => {
      STATE.filter[filterKey] = event.target.value;
      render();
    });
  };

  bindSelect(statusInput, 'status');
  bindSelect(targetInput, 'target');

  if (searchInput) {
    searchInput.value = STATE.filter.search;
    searchInput.addEventListener('input', (event) => {
      // Capture the caret before render() discards this element.
      const caretStart = event.target.selectionStart;
      const caretEnd = event.target.selectionEnd;
      STATE.filter.search = event.target.value;
      render();

      const replacement = document.getElementById('filter-search');
      if (!replacement) return;
      replacement.focus();
      if (caretStart !== null && caretEnd !== null) {
        replacement.setSelectionRange(caretStart, caretEnd);
      }
    });
  }

  sortableHeaders.forEach((header) => {
    header.addEventListener('click', () => {
      const col = header.getAttribute('data-sort');
      if (STATE.sort.col === col) {
        // Clicking the active column flips its direction.
        STATE.sort.dir = STATE.sort.dir === 'asc' ? 'desc' : 'asc';
      } else {
        STATE.sort.col = col;
        STATE.sort.dir = 'asc';
      }
      render();
    });
  });

  expandableRows.forEach((row) => {
    row.addEventListener('click', () => {
      const key = row.getAttribute('data-expand');
      STATE.expanded[key] = !STATE.expanded[key];
      render();
    });
  });
}
|
|
8514
|
+
|
|
8515
|
+
function renderTests() {
|
|
8516
|
+
const filteredRows = filterRows(DATA, STATE.filter);
|
|
8517
|
+
APP.innerHTML = \`
|
|
8518
|
+
<section class="section">
|
|
8519
|
+
<div class="section-heading">
|
|
8520
|
+
<div>
|
|
8521
|
+
<h2>Test Cases</h2>
|
|
8522
|
+
<p>Grouped by source eval file with assertion type badges carried down from parent score entries.</p>
|
|
8523
|
+
</div>
|
|
8524
|
+
</div>
|
|
8525
|
+
<div class="filter-bar">
|
|
8526
|
+
<select class="filter-select" id="filter-status" aria-label="Filter by status">
|
|
8527
|
+
<option value="all">All status</option>
|
|
8528
|
+
<option value="pass">Passed</option>
|
|
8529
|
+
<option value="fail">Failed</option>
|
|
8530
|
+
<option value="error">Errors</option>
|
|
8531
|
+
</select>
|
|
8532
|
+
\${TARGETS.length > 1 ? \`
|
|
8533
|
+
<select class="filter-select" id="filter-target" aria-label="Filter by target">
|
|
8534
|
+
<option value="all">All targets</option>
|
|
8535
|
+
\${TARGETS.map((target) => \`<option value="\${escapeHtml(target)}">\${escapeHtml(target)}</option>\`).join('')}
|
|
8536
|
+
</select>
|
|
8537
|
+
\` : ''}
|
|
8538
|
+
<input class="filter-input" id="filter-search" type="search" placeholder="Search test IDs, targets, eval files, or assertion text">
|
|
8539
|
+
<span class="filter-count">\${escapeHtml(String(filteredRows.length))} of \${escapeHtml(String(DATA.length))} tests</span>
|
|
8540
|
+
</div>
|
|
8541
|
+
\${renderTestGroups()}
|
|
8542
|
+
</section>
|
|
8543
|
+
\`;
|
|
8544
|
+
bindTestEvents();
|
|
8545
|
+
}
|
|
8546
|
+
|
|
8547
|
+
const IS_SMALL = DATA.length <= 20;
|
|
8548
|
+
const TAB_NAV = document.getElementById('tab-nav');
|
|
8549
|
+
|
|
8550
|
+
function render() {
|
|
8551
|
+
if (IS_SMALL) {
|
|
8552
|
+
TAB_NAV.classList.add('hidden');
|
|
8553
|
+
renderSinglePage();
|
|
8554
|
+
} else {
|
|
8555
|
+
TAB_NAV.classList.remove('hidden');
|
|
8556
|
+
if (STATE.tab === 'overview') renderOverview();
|
|
8557
|
+
else renderTests();
|
|
8558
|
+
}
|
|
8559
|
+
}
|
|
8560
|
+
|
|
8561
|
+
function renderSinglePage() {
|
|
8562
|
+
if (DATA.length === 0) {
|
|
8563
|
+
APP.innerHTML = \`
|
|
8564
|
+
<div class="empty-state">
|
|
8565
|
+
<h2>No results loaded</h2>
|
|
8566
|
+
<p>Add one or more parsed JSONL result rows to the template placeholder.</p>
|
|
8567
|
+
</div>
|
|
8568
|
+
\`;
|
|
8569
|
+
return;
|
|
8570
|
+
}
|
|
8571
|
+
|
|
8572
|
+
const sections = [];
|
|
8573
|
+
sections.push(\`
|
|
8574
|
+
<section class="section">
|
|
8575
|
+
<div class="stats-grid">
|
|
8576
|
+
\${renderStatCard('Total tests', String(OVERALL_STATS.total))}
|
|
8577
|
+
\${renderStatCard('Passed', String(OVERALL_STATS.passed), 'tone-pass')}
|
|
8578
|
+
\${renderStatCard('Failed', String(OVERALL_STATS.failed), OVERALL_STATS.failed > 0 ? 'tone-fail' : '')}
|
|
8579
|
+
\${renderStatCard('Pass rate', formatPercent(OVERALL_STATS.pass_rate), toneClassForRate(OVERALL_STATS.pass_rate))}
|
|
8580
|
+
\${renderStatCard('Duration', formatDuration(OVERALL_STATS.total_duration_ms))}
|
|
8581
|
+
\${HAS_ANY_COST ? renderStatCard('Cost', formatCost(OVERALL_STATS.total_cost_usd)) : ''}
|
|
8582
|
+
</div>
|
|
8583
|
+
</section>
|
|
8584
|
+
\`);
|
|
8585
|
+
|
|
8586
|
+
sections.push(\`
|
|
8587
|
+
<section class="section">
|
|
8588
|
+
<div class="filter-bar">
|
|
8589
|
+
<select class="filter-select" id="filter-status" aria-label="Filter by status">
|
|
8590
|
+
<option value="all">All status</option>
|
|
8591
|
+
<option value="pass">Passed</option>
|
|
8592
|
+
<option value="fail">Failed</option>
|
|
8593
|
+
<option value="error">Errors</option>
|
|
8594
|
+
</select>
|
|
8595
|
+
<input class="filter-input" id="filter-search" type="search" placeholder="Search test IDs, targets, or assertion text">
|
|
8596
|
+
</div>
|
|
8597
|
+
\${renderTestGroups()}
|
|
8598
|
+
</section>
|
|
8599
|
+
\`);
|
|
8600
|
+
|
|
8601
|
+
APP.innerHTML = sections.join('');
|
|
8602
|
+
bindTestEvents();
|
|
8603
|
+
}
|
|
8604
|
+
|
|
8605
|
+
syncHeaderMeta();
|
|
8606
|
+
render();
|
|
8607
|
+
})();
|
|
8608
|
+
</script>
|
|
8609
|
+
</body>
|
|
8610
|
+
</html>
|
|
8611
|
+
`;
|
|
8612
|
+
|
|
8613
|
+
// src/commands/results/report.ts
|
|
8614
|
+
/**
 * Turn an eval-file path (or bare file name) into a short display label.
 *
 * Takes the base name of the trimmed value and strips the known suffixes, each
 * at most once and in this order (case-insensitive): `.results.jsonl`,
 * `.eval.yaml` / `.eval.yml`, `.yaml` / `.yml`, `.jsonl`.
 *
 * @param {unknown} value - Candidate path or file name; may be missing.
 * @returns {string | undefined} The label, or `undefined` when `value` is not a
 *   string, is blank, or reduces to an empty label (for example ".yaml").
 */
function normalizeEvalFileLabel(value) {
  // Callers pass fields read from parsed JSON, so a non-string can show up here;
  // treat it as "no label" instead of throwing on `.trim()`.
  if (typeof value !== "string") {
    return void 0;
  }
  const trimmed = value.trim();
  if (!trimmed) {
    return void 0;
  }
  const label = path13
    .basename(trimmed)
    .replace(/\.results\.jsonl$/i, "")
    .replace(/\.eval\.ya?ml$/i, "")
    .replace(/\.ya?ml$/i, "")
    .replace(/\.jsonl$/i, "");
  // An empty label must be reported as undefined so that `??` fallback chains
  // in callers move on to the next candidate.
  return label || void 0;
}
|
|
8621
|
+
/**
 * Look up the eval-file label recorded in the `benchmark.json` that sits in the
 * same directory as `sourceFile`.
 *
 * Best effort: a missing file, a read error, invalid JSON, or JSON without a
 * usable `metadata.eval_file` all yield `undefined`.
 *
 * @param {string} sourceFile - Path of the result source file.
 * @returns {string | undefined} Normalized label from `metadata.eval_file`.
 */
function readBenchmarkEvalFile(sourceFile) {
  const runDir = path13.dirname(sourceFile);
  const benchmarkFile = path13.join(runDir, "benchmark.json");
  let label = void 0;
  if (existsSync8(benchmarkFile)) {
    try {
      const rawJson = readFileSync7(benchmarkFile, "utf8");
      const parsed = JSON.parse(rawJson);
      label = normalizeEvalFileLabel(parsed.metadata?.eval_file);
    } catch {
      // Unreadable or malformed benchmark.json: report "no label".
      label = void 0;
    }
  }
  return label;
}
|
|
8633
|
+
/**
 * Default report location: `report.html` in the directory of `sourceFile`.
 *
 * @param {string} sourceFile - Path of the result source file.
 * @returns {string} Path of the HTML report next to the source file.
 */
function deriveReportPath(sourceFile) {
  const runDir = path13.dirname(sourceFile);
  return path13.join(runDir, "report.html");
}
|
|
8636
|
+
/**
 * Build the plain row object embedded in the HTML report for one result.
 *
 * Copies the result's camelCase fields onto snake_case keys and adds an
 * `eval_file` label chosen from the first available of:
 *   1. the manifest record's `eval_file` (normalized),
 *   2. `benchmarkEvalFile`,
 *   3. the result's `suite` (normalized),
 *   4. the name of the directory containing `sourceFile`.
 *
 * @param {object} result - Loaded evaluation result.
 * @param {string} sourceFile - Path of the result source file.
 * @param {object | undefined} manifestRecord - Manifest record paired with the result, if any.
 * @param {string | undefined} benchmarkEvalFile - Label read from benchmark.json, if any.
 * @returns {object} Row object for the report.
 */
function serializeReportResult(result, sourceFile, manifestRecord, benchmarkEvalFile) {
  // Walk the fallback chain lazily; `== null` matches exactly null/undefined.
  let evalFileLabel = normalizeEvalFileLabel(manifestRecord?.eval_file);
  if (evalFileLabel == null) {
    evalFileLabel = benchmarkEvalFile;
  }
  if (evalFileLabel == null) {
    evalFileLabel = normalizeEvalFileLabel(result.suite);
  }
  if (evalFileLabel == null) {
    evalFileLabel = path13.basename(path13.dirname(sourceFile));
  }

  const row = {};
  row.timestamp = result.timestamp;
  row.test_id = result.testId;
  row.suite = result.suite;
  row.category = result.category;
  row.target = result.target;
  row.score = result.score;
  row.scores = result.scores;
  row.execution_status = result.executionStatus;
  row.error = result.error;
  row.duration_ms = result.durationMs;
  row.token_usage = result.tokenUsage;
  row.cost_usd = result.costUsd;
  row.input = result.input;
  row.output = result.output;
  row.assertions = result.assertions;
  row.eval_file = evalFileLabel;
  return row;
}
|
|
8657
|
+
/**
 * Resolve the report's source file and load everything needed to render it.
 *
 * @param {string | undefined} source - Source argument; resolution is delegated
 *   to `resolveSourceFile`.
 * @param {string} cwd - Working directory used for resolution.
 * @returns {Promise<{sourceFile: string, results: Array, records: Array, benchmarkEvalFile: (string | undefined)}>}
 *   The resolved source path, the loaded results, the parsed manifest records,
 *   and the eval-file label from a sibling benchmark.json (if any).
 * @throws {Error} When no results are found in the resolved source file.
 */
async function loadReportSource(source, cwd) {
  const resolution = await resolveSourceFile(source, cwd);
  const manifestPath = resolveResultSourcePath(resolution.sourceFile, cwd);
  // The manifest text is parsed into records here; results are loaded
  // separately from the same path by loadManifestResults.
  const manifestText = readFileSync7(manifestPath, "utf8");
  const records = parseResultManifest(manifestText);
  const results = loadManifestResults(manifestPath);
  if (results.length === 0) {
    throw new Error(`No results found in ${manifestPath}`);
  }
  const benchmarkEvalFile = readBenchmarkEvalFile(manifestPath);
  return {
    sourceFile: manifestPath,
    results,
    records,
    benchmarkEvalFile
  };
}
|
|
8673
|
+
/**
 * Render the static HTML report by injecting the serialized result rows into
 * the report template at its `__DATA_PLACEHOLDER__` marker.
 *
 * @param {Array} results - Loaded evaluation results.
 * @param {string} sourceFile - Path of the result source file.
 * @param {Array} records - Manifest records; `records[i]` is paired with
 *   `results[i]` (NOTE(review): assumes both lists share order — confirm).
 * @param {string | undefined} benchmarkEvalFile - Label read from benchmark.json, if any.
 * @returns {string} Complete HTML document.
 * @throws {Error} When the template does not contain the placeholder.
 */
function renderResultsReport(results, sourceFile, records, benchmarkEvalFile) {
  if (!RESULTS_REPORT_TEMPLATE.includes("__DATA_PLACEHOLDER__")) {
    throw new Error("Report template is missing __DATA_PLACEHOLDER__");
  }
  const rows = results.map(
    (result, index) => serializeReportResult(result, sourceFile, records[index], benchmarkEvalFile)
  );
  // Escape every "<" (not just "</") so the payload can never contain
  // "</script", "<!--" or "<script", and escape U+2028/U+2029 for older
  // JavaScript parsers. In JSON these characters only occur inside strings,
  // so the \uXXXX forms decode to the same data in both JSON and JavaScript.
  const dataJson = JSON.stringify(rows)
    .replace(/</g, "\\u003c")
    .replace(/\u2028/g, "\\u2028")
    .replace(/\u2029/g, "\\u2029");
  // Use a replacer function: with a string replacement, "$&", "$'", "$`" and
  // "$$" inside the result data would be expanded as replacement patterns and
  // corrupt the embedded JSON.
  return RESULTS_REPORT_TEMPLATE.replace("__DATA_PLACEHOLDER__", () => dataJson);
}
|
|
8683
|
+
/**
 * Load a result source, render the HTML report, and write it to disk.
 *
 * The output path is `outputPath` when absolute, `outputPath` resolved against
 * `cwd` when relative, and otherwise `report.html` next to the source file.
 * Parent directories are created as needed. The written file is read back and
 * rejected if it still contains the data placeholder.
 *
 * NOTE(review): the read-back check would presumably also fire if the result
 * data itself contains the literal placeholder token — confirm whether that
 * case matters.
 *
 * @param {string | undefined} source - Source argument passed to `loadReportSource`.
 * @param {string | undefined} outputPath - Explicit output file, if any.
 * @param {string} cwd - Working directory for resolving relative paths.
 * @returns {Promise<{sourceFile: string, outputPath: string, html: string}>}
 *   The resolved source, the file written, and its content as read back.
 * @throws {Error} When the written report still contains the placeholder.
 */
async function writeResultsReport(source, outputPath, cwd) {
  const loaded = await loadReportSource(source, cwd);

  let targetPath;
  if (!outputPath) {
    targetPath = deriveReportPath(loaded.sourceFile);
  } else if (path13.isAbsolute(outputPath)) {
    targetPath = outputPath;
  } else {
    targetPath = path13.resolve(cwd, outputPath);
  }

  const html = renderResultsReport(
    loaded.results,
    loaded.sourceFile,
    loaded.records,
    loaded.benchmarkEvalFile
  );
  mkdirSync2(path13.dirname(targetPath), { recursive: true });
  writeFileSync3(targetPath, html, "utf8");

  // Verify what actually landed on disk rather than the in-memory string.
  const written = readFileSync7(targetPath, "utf8");
  if (written.includes("__DATA_PLACEHOLDER__")) {
    throw new Error("Report placeholder substitution failed");
  }
  return { sourceFile: loaded.sourceFile, outputPath: targetPath, html: written };
}
|
|
8695
|
+
// `report` subcommand: renders a static HTML report for a result source.
// Options: -o/--out picks the output file, -d/--dir sets the working directory
// (falls back to process.cwd()). On failure the error message is printed to
// stderr and the process exits with status 1.
var resultsReportCommand = command({
  name: "report",
  description: "Generate a static HTML report from a run workspace or index.jsonl manifest",
  args: {
    source: sourceArg,
    out: option({
      type: optional(string),
      long: "out",
      short: "o",
      description: "Output HTML file (defaults to <run-dir>/report.html)"
    }),
    dir: option({
      type: optional(string),
      long: "dir",
      short: "d",
      description: "Working directory (default: current directory)"
    })
  },
  async handler(parsedArgs) {
    const workingDir = parsedArgs.dir ?? process.cwd();
    try {
      const report = await writeResultsReport(parsedArgs.source, parsedArgs.out, workingDir);
      console.log(`Report written to ${report.outputPath}`);
      console.log(`Source: ${report.sourceFile}`);
    } catch (failure) {
      console.error(`Error: ${failure.message}`);
      process.exit(1);
    }
  }
});
|
|
8725
|
+
|
|
6975
8726
|
// src/commands/results/show.ts
|
|
6976
8727
|
function findResult(results, testId) {
|
|
6977
8728
|
return results.find((r) => r.testId === testId);
|
|
@@ -7054,7 +8805,7 @@ var resultsShowCommand = command({
|
|
|
7054
8805
|
});
|
|
7055
8806
|
|
|
7056
8807
|
// src/commands/results/summary.ts
|
|
7057
|
-
import { existsSync as
|
|
8808
|
+
import { existsSync as existsSync9, readFileSync as readFileSync8 } from "node:fs";
|
|
7058
8809
|
function formatSummary(results, grading) {
|
|
7059
8810
|
const total = results.length;
|
|
7060
8811
|
let passed;
|
|
@@ -7105,9 +8856,9 @@ var resultsSummaryCommand = command({
|
|
|
7105
8856
|
const { results, sourceFile } = await loadResults(source, cwd);
|
|
7106
8857
|
let grading;
|
|
7107
8858
|
const gradingPath = sourceFile.replace(/\.jsonl$/, ".grading.json");
|
|
7108
|
-
if (
|
|
8859
|
+
if (existsSync9(gradingPath)) {
|
|
7109
8860
|
try {
|
|
7110
|
-
grading = JSON.parse(
|
|
8861
|
+
grading = JSON.parse(readFileSync8(gradingPath, "utf8"));
|
|
7111
8862
|
} catch {
|
|
7112
8863
|
}
|
|
7113
8864
|
}
|
|
@@ -7120,11 +8871,11 @@ var resultsSummaryCommand = command({
|
|
|
7120
8871
|
});
|
|
7121
8872
|
|
|
7122
8873
|
// src/commands/results/validate.ts
|
|
7123
|
-
import { existsSync as
|
|
7124
|
-
import
|
|
8874
|
+
import { existsSync as existsSync10, readFileSync as readFileSync9, statSync as statSync4 } from "node:fs";
|
|
8875
|
+
import path14 from "node:path";
|
|
7125
8876
|
function checkDirectoryNaming(runDir) {
|
|
7126
|
-
const dirName =
|
|
7127
|
-
const pathSegments =
|
|
8877
|
+
const dirName = path14.basename(runDir);
|
|
8878
|
+
const pathSegments = path14.normalize(runDir).split(path14.sep).filter(Boolean);
|
|
7128
8879
|
const runsIndex = pathSegments.lastIndexOf("runs");
|
|
7129
8880
|
const diagnostics = [];
|
|
7130
8881
|
if (runsIndex < 0 || runsIndex >= pathSegments.length - 1) {
|
|
@@ -7154,14 +8905,14 @@ function validateRunDirectory(runDir) {
|
|
|
7154
8905
|
return { diagnostics, entries: entries2 };
|
|
7155
8906
|
}
|
|
7156
8907
|
function checkIndexJsonl(runDir) {
|
|
7157
|
-
const indexPath =
|
|
8908
|
+
const indexPath = path14.join(runDir, "index.jsonl");
|
|
7158
8909
|
const diagnostics = [];
|
|
7159
8910
|
const entries2 = [];
|
|
7160
|
-
if (!
|
|
8911
|
+
if (!existsSync10(indexPath)) {
|
|
7161
8912
|
diagnostics.push({ severity: "error", message: "index.jsonl is missing" });
|
|
7162
8913
|
return { diagnostics, entries: entries2 };
|
|
7163
8914
|
}
|
|
7164
|
-
const content =
|
|
8915
|
+
const content = readFileSync9(indexPath, "utf8");
|
|
7165
8916
|
const lines = content.split("\n").filter((l) => l.trim().length > 0);
|
|
7166
8917
|
if (lines.length === 0) {
|
|
7167
8918
|
diagnostics.push({ severity: "error", message: "index.jsonl is empty" });
|
|
@@ -7253,15 +9004,15 @@ function checkArtifactFiles(runDir, entries2) {
|
|
|
7253
9004
|
for (const entry of entries2) {
|
|
7254
9005
|
const testId = entry.test_id ?? "?";
|
|
7255
9006
|
if (entry.grading_path) {
|
|
7256
|
-
const gradingPath =
|
|
7257
|
-
if (!
|
|
9007
|
+
const gradingPath = path14.join(runDir, entry.grading_path);
|
|
9008
|
+
if (!existsSync10(gradingPath)) {
|
|
7258
9009
|
diagnostics.push({
|
|
7259
9010
|
severity: "error",
|
|
7260
9011
|
message: `${testId}: grading.json not found at '${entry.grading_path}'`
|
|
7261
9012
|
});
|
|
7262
9013
|
} else {
|
|
7263
9014
|
try {
|
|
7264
|
-
const grading = JSON.parse(
|
|
9015
|
+
const grading = JSON.parse(readFileSync9(gradingPath, "utf8"));
|
|
7265
9016
|
if (!grading.assertions || !Array.isArray(grading.assertions)) {
|
|
7266
9017
|
diagnostics.push({
|
|
7267
9018
|
severity: "error",
|
|
@@ -7283,8 +9034,8 @@ function checkArtifactFiles(runDir, entries2) {
|
|
|
7283
9034
|
}
|
|
7284
9035
|
}
|
|
7285
9036
|
if (entry.timing_path) {
|
|
7286
|
-
const timingPath =
|
|
7287
|
-
if (!
|
|
9037
|
+
const timingPath = path14.join(runDir, entry.timing_path);
|
|
9038
|
+
if (!existsSync10(timingPath)) {
|
|
7288
9039
|
diagnostics.push({
|
|
7289
9040
|
severity: "warning",
|
|
7290
9041
|
message: `${testId}: timing.json not found at '${entry.timing_path}'`
|
|
@@ -7292,8 +9043,8 @@ function checkArtifactFiles(runDir, entries2) {
|
|
|
7292
9043
|
}
|
|
7293
9044
|
}
|
|
7294
9045
|
}
|
|
7295
|
-
const benchmarkPath =
|
|
7296
|
-
if (!
|
|
9046
|
+
const benchmarkPath = path14.join(runDir, "benchmark.json");
|
|
9047
|
+
if (!existsSync10(benchmarkPath)) {
|
|
7297
9048
|
diagnostics.push({ severity: "warning", message: "benchmark.json is missing" });
|
|
7298
9049
|
}
|
|
7299
9050
|
return diagnostics;
|
|
@@ -7309,8 +9060,8 @@ var resultsValidateCommand = command({
|
|
|
7309
9060
|
})
|
|
7310
9061
|
},
|
|
7311
9062
|
handler: async ({ runDir }) => {
|
|
7312
|
-
const resolvedDir =
|
|
7313
|
-
if (!
|
|
9063
|
+
const resolvedDir = path14.resolve(runDir);
|
|
9064
|
+
if (!existsSync10(resolvedDir) || !statSync4(resolvedDir).isDirectory()) {
|
|
7314
9065
|
console.error(`Error: '${runDir}' is not a directory`);
|
|
7315
9066
|
process.exit(1);
|
|
7316
9067
|
}
|
|
@@ -7343,6 +9094,7 @@ var resultsCommand = subcommands({
|
|
|
7343
9094
|
description: "Inspect, export, and manage evaluation results",
|
|
7344
9095
|
cmds: {
|
|
7345
9096
|
export: resultsExportCommand,
|
|
9097
|
+
report: resultsReportCommand,
|
|
7346
9098
|
summary: resultsSummaryCommand,
|
|
7347
9099
|
failures: resultsFailuresCommand,
|
|
7348
9100
|
show: resultsShowCommand,
|
|
@@ -7351,15 +9103,15 @@ var resultsCommand = subcommands({
|
|
|
7351
9103
|
});
|
|
7352
9104
|
|
|
7353
9105
|
// src/commands/results/serve.ts
|
|
7354
|
-
import { existsSync as
|
|
7355
|
-
import
|
|
9106
|
+
import { existsSync as existsSync14, readFileSync as readFileSync12, readdirSync as readdirSync4, statSync as statSync5, writeFileSync as writeFileSync6 } from "node:fs";
|
|
9107
|
+
import path18 from "node:path";
|
|
7356
9108
|
import { fileURLToPath as fileURLToPath3 } from "node:url";
|
|
7357
9109
|
import { Hono } from "hono";
|
|
7358
9110
|
|
|
7359
9111
|
// src/commands/results/eval-runner.ts
|
|
7360
9112
|
import { execFileSync, spawn } from "node:child_process";
|
|
7361
|
-
import { existsSync as
|
|
7362
|
-
import
|
|
9113
|
+
import { existsSync as existsSync11 } from "node:fs";
|
|
9114
|
+
import path15 from "node:path";
|
|
7363
9115
|
import { fileURLToPath as fileURLToPath2 } from "node:url";
|
|
7364
9116
|
var activeRuns = /* @__PURE__ */ new Map();
|
|
7365
9117
|
function generateRunId() {
|
|
@@ -7381,16 +9133,16 @@ async function discoverTargetsInProject(cwd) {
|
|
|
7381
9133
|
const repoRoot = await findRepoRoot(cwd) ?? cwd;
|
|
7382
9134
|
let targetsFilePath;
|
|
7383
9135
|
for (const candidate of TARGET_FILE_CANDIDATES) {
|
|
7384
|
-
const fullPath =
|
|
7385
|
-
if (
|
|
9136
|
+
const fullPath = path15.join(cwd, candidate);
|
|
9137
|
+
if (existsSync11(fullPath)) {
|
|
7386
9138
|
targetsFilePath = fullPath;
|
|
7387
9139
|
break;
|
|
7388
9140
|
}
|
|
7389
9141
|
}
|
|
7390
9142
|
if (!targetsFilePath) {
|
|
7391
9143
|
for (const candidate of TARGET_FILE_CANDIDATES) {
|
|
7392
|
-
const fullPath =
|
|
7393
|
-
if (
|
|
9144
|
+
const fullPath = path15.join(repoRoot, candidate);
|
|
9145
|
+
if (existsSync11(fullPath)) {
|
|
7394
9146
|
targetsFilePath = fullPath;
|
|
7395
9147
|
break;
|
|
7396
9148
|
}
|
|
@@ -7439,19 +9191,19 @@ function buildCliPreview(args) {
|
|
|
7439
9191
|
}
|
|
7440
9192
|
function resolveCliPath(cwd) {
|
|
7441
9193
|
const candidates = [
|
|
7442
|
-
|
|
7443
|
-
|
|
9194
|
+
path15.join(cwd, "apps/cli/src/cli.ts"),
|
|
9195
|
+
path15.join(cwd, "apps/cli/dist/cli.js")
|
|
7444
9196
|
];
|
|
7445
9197
|
for (const c4 of candidates) {
|
|
7446
|
-
if (
|
|
9198
|
+
if (existsSync11(c4)) {
|
|
7447
9199
|
return { binPath: "bun", args: [c4] };
|
|
7448
9200
|
}
|
|
7449
9201
|
}
|
|
7450
|
-
const currentDir = typeof __dirname !== "undefined" ? __dirname :
|
|
7451
|
-
const fromSrc =
|
|
7452
|
-
const fromDist =
|
|
7453
|
-
if (
|
|
7454
|
-
if (
|
|
9202
|
+
const currentDir = typeof __dirname !== "undefined" ? __dirname : path15.dirname(fileURLToPath2(import.meta.url));
|
|
9203
|
+
const fromSrc = path15.resolve(currentDir, "../../../cli.ts");
|
|
9204
|
+
const fromDist = path15.resolve(currentDir, "../../cli.js");
|
|
9205
|
+
if (existsSync11(fromSrc)) return { binPath: "bun", args: [fromSrc] };
|
|
9206
|
+
if (existsSync11(fromDist)) return { binPath: "bun", args: [fromDist] };
|
|
7455
9207
|
if (isCommandAvailable("agentv")) {
|
|
7456
9208
|
return { binPath: "agentv", args: [] };
|
|
7457
9209
|
}
|
|
@@ -7739,19 +9491,19 @@ Process error: ${err2.message}`;
|
|
|
7739
9491
|
}
|
|
7740
9492
|
|
|
7741
9493
|
// src/commands/results/run-tags.ts
|
|
7742
|
-
import { existsSync as
|
|
7743
|
-
import
|
|
9494
|
+
import { existsSync as existsSync12, readFileSync as readFileSync10, unlinkSync as unlinkSync2, writeFileSync as writeFileSync4 } from "node:fs";
|
|
9495
|
+
import path16 from "node:path";
|
|
7744
9496
|
var RUN_TAGS_FILENAME = "tags.json";
|
|
7745
9497
|
var MAX_TAGS_PER_RUN = 20;
|
|
7746
9498
|
var MAX_TAG_LENGTH = 60;
|
|
7747
9499
|
function runTagsPath(manifestPath) {
|
|
7748
|
-
return
|
|
9500
|
+
return path16.join(path16.dirname(manifestPath), RUN_TAGS_FILENAME);
|
|
7749
9501
|
}
|
|
7750
9502
|
function readRunTags(manifestPath) {
|
|
7751
9503
|
const fp = runTagsPath(manifestPath);
|
|
7752
|
-
if (!
|
|
9504
|
+
if (!existsSync12(fp)) return void 0;
|
|
7753
9505
|
try {
|
|
7754
|
-
const parsed = JSON.parse(
|
|
9506
|
+
const parsed = JSON.parse(readFileSync10(fp, "utf8"));
|
|
7755
9507
|
if (!parsed || typeof parsed !== "object") return void 0;
|
|
7756
9508
|
const record = parsed;
|
|
7757
9509
|
if (!Array.isArray(record.tags)) return void 0;
|
|
@@ -7777,13 +9529,13 @@ function writeRunTags(manifestPath, tags) {
|
|
|
7777
9529
|
tags: cleaned,
|
|
7778
9530
|
updated_at: (/* @__PURE__ */ new Date()).toISOString()
|
|
7779
9531
|
};
|
|
7780
|
-
|
|
9532
|
+
writeFileSync4(runTagsPath(manifestPath), `${JSON.stringify(entry, null, 2)}
|
|
7781
9533
|
`, "utf8");
|
|
7782
9534
|
return entry;
|
|
7783
9535
|
}
|
|
7784
9536
|
function deleteRunTags(manifestPath) {
|
|
7785
9537
|
const fp = runTagsPath(manifestPath);
|
|
7786
|
-
if (
|
|
9538
|
+
if (existsSync12(fp)) {
|
|
7787
9539
|
unlinkSync2(fp);
|
|
7788
9540
|
}
|
|
7789
9541
|
}
|
|
@@ -7816,18 +9568,18 @@ function normalizeTags(tags) {
|
|
|
7816
9568
|
}
|
|
7817
9569
|
|
|
7818
9570
|
// src/commands/results/studio-config.ts
|
|
7819
|
-
import { existsSync as
|
|
7820
|
-
import
|
|
9571
|
+
import { existsSync as existsSync13, mkdirSync as mkdirSync3, readFileSync as readFileSync11, writeFileSync as writeFileSync5 } from "node:fs";
|
|
9572
|
+
import path17 from "node:path";
|
|
7821
9573
|
import { parse as parseYaml, stringify as stringifyYaml2 } from "yaml";
|
|
7822
9574
|
var DEFAULTS = {
|
|
7823
9575
|
threshold: DEFAULT_THRESHOLD
|
|
7824
9576
|
};
|
|
7825
9577
|
function loadStudioConfig(agentvDir) {
|
|
7826
|
-
const configPath =
|
|
7827
|
-
if (!
|
|
9578
|
+
const configPath = path17.join(agentvDir, "config.yaml");
|
|
9579
|
+
if (!existsSync13(configPath)) {
|
|
7828
9580
|
return { ...DEFAULTS };
|
|
7829
9581
|
}
|
|
7830
|
-
const raw =
|
|
9582
|
+
const raw = readFileSync11(configPath, "utf-8");
|
|
7831
9583
|
const parsed = parseYaml(raw);
|
|
7832
9584
|
if (!parsed || typeof parsed !== "object") {
|
|
7833
9585
|
return { ...DEFAULTS };
|
|
@@ -7849,13 +9601,13 @@ function loadStudioConfig(agentvDir) {
|
|
|
7849
9601
|
};
|
|
7850
9602
|
}
|
|
7851
9603
|
function saveStudioConfig(agentvDir, config) {
|
|
7852
|
-
if (!
|
|
7853
|
-
|
|
9604
|
+
if (!existsSync13(agentvDir)) {
|
|
9605
|
+
mkdirSync3(agentvDir, { recursive: true });
|
|
7854
9606
|
}
|
|
7855
|
-
const configPath =
|
|
9607
|
+
const configPath = path17.join(agentvDir, "config.yaml");
|
|
7856
9608
|
let existing = {};
|
|
7857
|
-
if (
|
|
7858
|
-
const raw =
|
|
9609
|
+
if (existsSync13(configPath)) {
|
|
9610
|
+
const raw = readFileSync11(configPath, "utf-8");
|
|
7859
9611
|
const parsed = parseYaml(raw);
|
|
7860
9612
|
if (parsed && typeof parsed === "object" && !Array.isArray(parsed)) {
|
|
7861
9613
|
existing = parsed;
|
|
@@ -7871,14 +9623,14 @@ function saveStudioConfig(agentvDir, config) {
|
|
|
7871
9623
|
existing.studio = { ...config };
|
|
7872
9624
|
}
|
|
7873
9625
|
const yamlStr = stringifyYaml2(existing);
|
|
7874
|
-
|
|
9626
|
+
writeFileSync5(configPath, yamlStr, "utf-8");
|
|
7875
9627
|
}
|
|
7876
9628
|
|
|
7877
9629
|
// src/commands/results/serve.ts
|
|
7878
9630
|
async function resolveSourceFile2(source, cwd) {
|
|
7879
9631
|
if (source) {
|
|
7880
9632
|
let resolved = resolveResultSourcePath(source, cwd);
|
|
7881
|
-
if (!
|
|
9633
|
+
if (!existsSync14(resolved)) {
|
|
7882
9634
|
throw new Error(`Source file not found: ${resolved}`);
|
|
7883
9635
|
}
|
|
7884
9636
|
resolved = resolveRunManifestPath(resolved);
|
|
@@ -7886,7 +9638,7 @@ async function resolveSourceFile2(source, cwd) {
|
|
|
7886
9638
|
}
|
|
7887
9639
|
const cache = await loadRunCache(cwd);
|
|
7888
9640
|
const cachedFile = cache ? resolveRunCacheFile(cache) : "";
|
|
7889
|
-
if (cachedFile &&
|
|
9641
|
+
if (cachedFile && existsSync14(cachedFile)) {
|
|
7890
9642
|
return cachedFile;
|
|
7891
9643
|
}
|
|
7892
9644
|
const metas = listResultFiles(cwd, 10);
|
|
@@ -7916,26 +9668,26 @@ function resolveDashboardMode(projectCount, options) {
|
|
|
7916
9668
|
return { isMultiProject: projectCount > 1, showMultiWarning: false };
|
|
7917
9669
|
}
|
|
7918
9670
|
function feedbackPath(resultDir) {
|
|
7919
|
-
return
|
|
9671
|
+
return path18.join(resultDir, "feedback.json");
|
|
7920
9672
|
}
|
|
7921
9673
|
function readFeedback(cwd) {
|
|
7922
9674
|
const fp = feedbackPath(cwd);
|
|
7923
|
-
if (!
|
|
9675
|
+
if (!existsSync14(fp)) {
|
|
7924
9676
|
return { reviews: [] };
|
|
7925
9677
|
}
|
|
7926
9678
|
try {
|
|
7927
|
-
return JSON.parse(
|
|
9679
|
+
return JSON.parse(readFileSync12(fp, "utf8"));
|
|
7928
9680
|
} catch (err2) {
|
|
7929
9681
|
console.error(`Warning: could not parse ${fp}, starting fresh: ${err2.message}`);
|
|
7930
9682
|
return { reviews: [] };
|
|
7931
9683
|
}
|
|
7932
9684
|
}
|
|
7933
9685
|
function writeFeedback(cwd, data) {
|
|
7934
|
-
|
|
9686
|
+
writeFileSync6(feedbackPath(cwd), `${JSON.stringify(data, null, 2)}
|
|
7935
9687
|
`, "utf8");
|
|
7936
9688
|
}
|
|
7937
9689
|
function buildFileTree(dirPath, relativeTo) {
|
|
7938
|
-
if (!
|
|
9690
|
+
if (!existsSync14(dirPath) || !statSync5(dirPath).isDirectory()) {
|
|
7939
9691
|
return [];
|
|
7940
9692
|
}
|
|
7941
9693
|
const entries2 = readdirSync4(dirPath, { withFileTypes: true });
|
|
@@ -7943,8 +9695,8 @@ function buildFileTree(dirPath, relativeTo) {
|
|
|
7943
9695
|
if (a.isDirectory() !== b.isDirectory()) return a.isDirectory() ? -1 : 1;
|
|
7944
9696
|
return a.name.localeCompare(b.name);
|
|
7945
9697
|
}).map((entry) => {
|
|
7946
|
-
const fullPath =
|
|
7947
|
-
const relPath =
|
|
9698
|
+
const fullPath = path18.join(dirPath, entry.name);
|
|
9699
|
+
const relPath = path18.relative(relativeTo, fullPath);
|
|
7948
9700
|
if (entry.isDirectory()) {
|
|
7949
9701
|
return {
|
|
7950
9702
|
name: entry.name,
|
|
@@ -7957,7 +9709,7 @@ function buildFileTree(dirPath, relativeTo) {
|
|
|
7957
9709
|
});
|
|
7958
9710
|
}
|
|
7959
9711
|
function inferLanguage(filePath) {
|
|
7960
|
-
const ext =
|
|
9712
|
+
const ext = path18.extname(filePath).toLowerCase();
|
|
7961
9713
|
const langMap = {
|
|
7962
9714
|
".json": "json",
|
|
7963
9715
|
".jsonl": "json",
|
|
@@ -8158,11 +9910,11 @@ async function handleEvalFiles(c4, { searchDir }) {
|
|
|
8158
9910
|
const meta = await findRunById(searchDir, filename);
|
|
8159
9911
|
if (!meta) return c4.json({ error: "Run not found" }, 404);
|
|
8160
9912
|
try {
|
|
8161
|
-
const content =
|
|
9913
|
+
const content = readFileSync12(meta.path, "utf8");
|
|
8162
9914
|
const records = parseResultManifest(content);
|
|
8163
9915
|
const record = records.find((r) => r.test_id === evalId);
|
|
8164
9916
|
if (!record) return c4.json({ error: "Eval not found" }, 404);
|
|
8165
|
-
const baseDir =
|
|
9917
|
+
const baseDir = path18.dirname(meta.path);
|
|
8166
9918
|
const knownPaths = [
|
|
8167
9919
|
record.grading_path,
|
|
8168
9920
|
record.timing_path,
|
|
@@ -8171,14 +9923,14 @@ async function handleEvalFiles(c4, { searchDir }) {
|
|
|
8171
9923
|
record.response_path
|
|
8172
9924
|
].filter((p) => !!p);
|
|
8173
9925
|
if (knownPaths.length === 0) return c4.json({ files: [] });
|
|
8174
|
-
const artifactDirs = knownPaths.map((p) =>
|
|
9926
|
+
const artifactDirs = knownPaths.map((p) => path18.dirname(p));
|
|
8175
9927
|
let commonDir = artifactDirs[0];
|
|
8176
9928
|
for (const dir of artifactDirs) {
|
|
8177
9929
|
while (!dir.startsWith(commonDir)) {
|
|
8178
|
-
commonDir =
|
|
9930
|
+
commonDir = path18.dirname(commonDir);
|
|
8179
9931
|
}
|
|
8180
9932
|
}
|
|
8181
|
-
const artifactAbsDir =
|
|
9933
|
+
const artifactAbsDir = path18.join(baseDir, commonDir);
|
|
8182
9934
|
const files = buildFileTree(artifactAbsDir, baseDir);
|
|
8183
9935
|
return c4.json({ files });
|
|
8184
9936
|
} catch {
|
|
@@ -8193,16 +9945,16 @@ async function handleEvalFileContent(c4, { searchDir }) {
|
|
|
8193
9945
|
const markerIdx = c4.req.path.indexOf(marker);
|
|
8194
9946
|
const filePath = markerIdx >= 0 ? c4.req.path.slice(markerIdx + marker.length) : "";
|
|
8195
9947
|
if (!filePath) return c4.json({ error: "No file path specified" }, 400);
|
|
8196
|
-
const baseDir =
|
|
8197
|
-
const absolutePath =
|
|
8198
|
-
if (!absolutePath.startsWith(
|
|
9948
|
+
const baseDir = path18.dirname(meta.path);
|
|
9949
|
+
const absolutePath = path18.resolve(baseDir, filePath);
|
|
9950
|
+
if (!absolutePath.startsWith(path18.resolve(baseDir) + path18.sep) && absolutePath !== path18.resolve(baseDir)) {
|
|
8199
9951
|
return c4.json({ error: "Path traversal not allowed" }, 403);
|
|
8200
9952
|
}
|
|
8201
|
-
if (!
|
|
9953
|
+
if (!existsSync14(absolutePath) || !statSync5(absolutePath).isFile()) {
|
|
8202
9954
|
return c4.json({ error: "File not found" }, 404);
|
|
8203
9955
|
}
|
|
8204
9956
|
try {
|
|
8205
|
-
const fileContent =
|
|
9957
|
+
const fileContent = readFileSync12(absolutePath, "utf8");
|
|
8206
9958
|
const language = inferLanguage(absolutePath);
|
|
8207
9959
|
return c4.json({ content: fileContent, language });
|
|
8208
9960
|
} catch {
|
|
@@ -8331,6 +10083,18 @@ async function handleCompare(c4, { searchDir, agentvDir }) {
|
|
|
8331
10083
|
} catch {
|
|
8332
10084
|
}
|
|
8333
10085
|
}
|
|
10086
|
+
const baselineTarget = c4.req.query("baseline") ?? "";
|
|
10087
|
+
if (baselineTarget && !targetsSet.has(baselineTarget)) {
|
|
10088
|
+
return c4.json({ error: `Baseline target "${baselineTarget}" does not exist in the data` }, 400);
|
|
10089
|
+
}
|
|
10090
|
+
const baselineScores = /* @__PURE__ */ new Map();
|
|
10091
|
+
if (baselineTarget) {
|
|
10092
|
+
for (const entry of cellMap.values()) {
|
|
10093
|
+
if (entry.target === baselineTarget && entry.evalCount > 0) {
|
|
10094
|
+
baselineScores.set(entry.experiment, entry.scoreSum / entry.evalCount);
|
|
10095
|
+
}
|
|
10096
|
+
}
|
|
10097
|
+
}
|
|
8334
10098
|
const cells = [...cellMap.values()].map((entry) => {
|
|
8335
10099
|
const dedupMap = /* @__PURE__ */ new Map();
|
|
8336
10100
|
for (const t of entry.tests) {
|
|
@@ -8338,15 +10102,24 @@ async function handleCompare(c4, { searchDir, agentvDir }) {
|
|
|
8338
10102
|
}
|
|
8339
10103
|
const dedupedTests = [...dedupMap.values()];
|
|
8340
10104
|
const cappedTests = dedupedTests.slice(-MAX_TESTS_PER_CELL);
|
|
8341
|
-
|
|
10105
|
+
const avgScore = entry.evalCount > 0 ? entry.scoreSum / entry.evalCount : 0;
|
|
10106
|
+
const cell = {
|
|
8342
10107
|
experiment: entry.experiment,
|
|
8343
10108
|
target: entry.target,
|
|
8344
10109
|
eval_count: entry.evalCount,
|
|
8345
10110
|
passed_count: entry.passedCount,
|
|
8346
10111
|
pass_rate: entry.evalCount > 0 ? entry.passedCount / entry.evalCount : 0,
|
|
8347
|
-
avg_score:
|
|
10112
|
+
avg_score: avgScore,
|
|
8348
10113
|
tests: cappedTests
|
|
8349
10114
|
};
|
|
10115
|
+
if (baselineTarget && entry.target !== baselineTarget) {
|
|
10116
|
+
const baseAvg = baselineScores.get(entry.experiment);
|
|
10117
|
+
if (baseAvg !== void 0) {
|
|
10118
|
+
cell.delta = Math.round((avgScore - baseAvg) * 1e3) / 1e3;
|
|
10119
|
+
cell.normalized_gain = baseAvg >= 1 ? null : Math.round((avgScore - baseAvg) / (1 - baseAvg) * 1e3) / 1e3;
|
|
10120
|
+
}
|
|
10121
|
+
}
|
|
10122
|
+
return cell;
|
|
8350
10123
|
});
|
|
8351
10124
|
runEntries.sort((a, b) => b.started_at.localeCompare(a.started_at));
|
|
8352
10125
|
return c4.json({
|
|
@@ -8394,13 +10167,13 @@ function handleConfig(c4, { agentvDir, searchDir }, options) {
|
|
|
8394
10167
|
return c4.json({
|
|
8395
10168
|
...loadStudioConfig(agentvDir),
|
|
8396
10169
|
read_only: options?.readOnly === true,
|
|
8397
|
-
project_name:
|
|
10170
|
+
project_name: path18.basename(searchDir),
|
|
8398
10171
|
multi_project_dashboard: options?.multiProjectDashboard === true
|
|
8399
10172
|
});
|
|
8400
10173
|
}
|
|
8401
10174
|
function handleFeedbackRead(c4, { searchDir }) {
|
|
8402
|
-
const resultsDir =
|
|
8403
|
-
return c4.json(readFeedback(
|
|
10175
|
+
const resultsDir = path18.join(searchDir, ".agentv", "results");
|
|
10176
|
+
return c4.json(readFeedback(existsSync14(resultsDir) ? resultsDir : searchDir));
|
|
8404
10177
|
}
|
|
8405
10178
|
async function handleRunTagsPut(c4, { searchDir }) {
|
|
8406
10179
|
const filename = c4.req.param("filename") ?? "";
|
|
@@ -8448,18 +10221,18 @@ async function handleRunTagsDelete(c4, { searchDir }) {
|
|
|
8448
10221
|
}
|
|
8449
10222
|
function createApp(results, resultDir, cwd, sourceFile, options) {
|
|
8450
10223
|
const searchDir = cwd ?? resultDir;
|
|
8451
|
-
const agentvDir =
|
|
10224
|
+
const agentvDir = path18.join(searchDir, ".agentv");
|
|
8452
10225
|
const defaultCtx = { searchDir, agentvDir };
|
|
8453
10226
|
const readOnly = options?.readOnly === true;
|
|
8454
10227
|
const app2 = new Hono();
|
|
8455
10228
|
function withBenchmark(c4, handler) {
|
|
8456
10229
|
const benchmark = getBenchmark(c4.req.param("benchmarkId") ?? "");
|
|
8457
|
-
if (!benchmark || !
|
|
10230
|
+
if (!benchmark || !existsSync14(benchmark.path)) {
|
|
8458
10231
|
return c4.json({ error: "Project not found" }, 404);
|
|
8459
10232
|
}
|
|
8460
10233
|
return handler(c4, {
|
|
8461
10234
|
searchDir: benchmark.path,
|
|
8462
|
-
agentvDir:
|
|
10235
|
+
agentvDir: path18.join(benchmark.path, ".agentv")
|
|
8463
10236
|
});
|
|
8464
10237
|
}
|
|
8465
10238
|
app2.post("/api/config", async (c4) => {
|
|
@@ -8786,20 +10559,20 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
|
|
|
8786
10559
|
{ readOnly }
|
|
8787
10560
|
);
|
|
8788
10561
|
const studioDistPath = options?.studioDir ?? resolveStudioDistDir();
|
|
8789
|
-
if (!studioDistPath || !
|
|
10562
|
+
if (!studioDistPath || !existsSync14(path18.join(studioDistPath, "index.html"))) {
|
|
8790
10563
|
throw new Error('Studio dist not found. Run "bun run build" in apps/studio/ to build the SPA.');
|
|
8791
10564
|
}
|
|
8792
10565
|
app2.get("/", (c4) => {
|
|
8793
|
-
const indexPath =
|
|
8794
|
-
if (
|
|
10566
|
+
const indexPath = path18.join(studioDistPath, "index.html");
|
|
10567
|
+
if (existsSync14(indexPath)) return c4.html(readFileSync12(indexPath, "utf8"));
|
|
8795
10568
|
return c4.notFound();
|
|
8796
10569
|
});
|
|
8797
10570
|
app2.get("/assets/*", (c4) => {
|
|
8798
10571
|
const assetPath = c4.req.path;
|
|
8799
|
-
const filePath =
|
|
8800
|
-
if (!
|
|
8801
|
-
const content =
|
|
8802
|
-
const ext =
|
|
10572
|
+
const filePath = path18.join(studioDistPath, assetPath);
|
|
10573
|
+
if (!existsSync14(filePath)) return c4.notFound();
|
|
10574
|
+
const content = readFileSync12(filePath);
|
|
10575
|
+
const ext = path18.extname(filePath);
|
|
8803
10576
|
const mimeTypes = {
|
|
8804
10577
|
".js": "application/javascript",
|
|
8805
10578
|
".css": "text/css",
|
|
@@ -8820,26 +10593,26 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
|
|
|
8820
10593
|
});
|
|
8821
10594
|
app2.get("*", (c4) => {
|
|
8822
10595
|
if (c4.req.path.startsWith("/api/")) return c4.json({ error: "Not found" }, 404);
|
|
8823
|
-
const indexPath =
|
|
8824
|
-
if (
|
|
10596
|
+
const indexPath = path18.join(studioDistPath, "index.html");
|
|
10597
|
+
if (existsSync14(indexPath)) return c4.html(readFileSync12(indexPath, "utf8"));
|
|
8825
10598
|
return c4.notFound();
|
|
8826
10599
|
});
|
|
8827
10600
|
return app2;
|
|
8828
10601
|
}
|
|
8829
10602
|
function resolveStudioDistDir() {
|
|
8830
|
-
const currentDir = typeof __dirname !== "undefined" ? __dirname :
|
|
10603
|
+
const currentDir = typeof __dirname !== "undefined" ? __dirname : path18.dirname(fileURLToPath3(import.meta.url));
|
|
8831
10604
|
const candidates = [
|
|
8832
10605
|
// From src/commands/results/ → sibling apps/studio/dist
|
|
8833
|
-
|
|
10606
|
+
path18.resolve(currentDir, "../../../../studio/dist"),
|
|
8834
10607
|
// From dist/ → sibling apps/studio/dist (monorepo dev)
|
|
8835
|
-
|
|
10608
|
+
path18.resolve(currentDir, "../../studio/dist"),
|
|
8836
10609
|
// Bundled inside CLI dist (published package: dist/studio/)
|
|
8837
|
-
|
|
10610
|
+
path18.resolve(currentDir, "studio"),
|
|
8838
10611
|
// From dist/ in monorepo root context
|
|
8839
|
-
|
|
10612
|
+
path18.resolve(currentDir, "../../../apps/studio/dist")
|
|
8840
10613
|
];
|
|
8841
10614
|
for (const candidate of candidates) {
|
|
8842
|
-
if (
|
|
10615
|
+
if (existsSync14(candidate) && existsSync14(path18.join(candidate, "index.html"))) {
|
|
8843
10616
|
return candidate;
|
|
8844
10617
|
}
|
|
8845
10618
|
}
|
|
@@ -8945,7 +10718,7 @@ Discovered ${discovered.length} project(s).`);
|
|
|
8945
10718
|
} else {
|
|
8946
10719
|
const cache = await loadRunCache(cwd);
|
|
8947
10720
|
const cachedFile = cache ? resolveRunCacheFile(cache) : "";
|
|
8948
|
-
if (cachedFile &&
|
|
10721
|
+
if (cachedFile && existsSync14(cachedFile)) {
|
|
8949
10722
|
sourceFile = cachedFile;
|
|
8950
10723
|
results = loadManifestResults(cachedFile);
|
|
8951
10724
|
} else {
|
|
@@ -8956,7 +10729,7 @@ Discovered ${discovered.length} project(s).`);
|
|
|
8956
10729
|
}
|
|
8957
10730
|
}
|
|
8958
10731
|
}
|
|
8959
|
-
const resultDir = sourceFile ?
|
|
10732
|
+
const resultDir = sourceFile ? path18.dirname(path18.resolve(sourceFile)) : cwd;
|
|
8960
10733
|
const app2 = createApp(results, resultDir, cwd, sourceFile, {
|
|
8961
10734
|
readOnly,
|
|
8962
10735
|
multiProjectDashboard: isMultiProject
|
|
@@ -9080,8 +10853,8 @@ var selfCommand = subcommands({
|
|
|
9080
10853
|
});
|
|
9081
10854
|
|
|
9082
10855
|
// src/commands/transpile/index.ts
|
|
9083
|
-
import { writeFileSync as
|
|
9084
|
-
import
|
|
10856
|
+
import { writeFileSync as writeFileSync7 } from "node:fs";
|
|
10857
|
+
import path19 from "node:path";
|
|
9085
10858
|
var transpileCommand = command({
|
|
9086
10859
|
name: "transpile",
|
|
9087
10860
|
description: "Convert an EVAL.yaml file to Agent Skills evals.json format",
|
|
@@ -9105,7 +10878,7 @@ var transpileCommand = command({
|
|
|
9105
10878
|
handler: async ({ input, outDir, stdout }) => {
|
|
9106
10879
|
let result;
|
|
9107
10880
|
try {
|
|
9108
|
-
result = transpileEvalYamlFile(
|
|
10881
|
+
result = transpileEvalYamlFile(path19.resolve(input));
|
|
9109
10882
|
} catch (error) {
|
|
9110
10883
|
console.error(`Error: ${error.message}`);
|
|
9111
10884
|
process.exit(1);
|
|
@@ -9129,12 +10902,12 @@ var transpileCommand = command({
|
|
|
9129
10902
|
process.stdout.write("\n");
|
|
9130
10903
|
return;
|
|
9131
10904
|
}
|
|
9132
|
-
const outputDir = outDir ?
|
|
10905
|
+
const outputDir = outDir ? path19.resolve(outDir) : path19.dirname(path19.resolve(input));
|
|
9133
10906
|
const fileNames = getOutputFilenames(result);
|
|
9134
10907
|
for (const [skill, evalsJson] of result.files) {
|
|
9135
10908
|
const fileName = fileNames.get(skill) ?? "evals.json";
|
|
9136
|
-
const outputPath =
|
|
9137
|
-
|
|
10909
|
+
const outputPath = path19.join(outputDir, fileName);
|
|
10910
|
+
writeFileSync7(outputPath, `${JSON.stringify(evalsJson, null, 2)}
|
|
9138
10911
|
`);
|
|
9139
10912
|
console.log(`Transpiled to ${outputPath}`);
|
|
9140
10913
|
}
|
|
@@ -9142,7 +10915,7 @@ var transpileCommand = command({
|
|
|
9142
10915
|
});
|
|
9143
10916
|
|
|
9144
10917
|
// src/commands/trend/index.ts
|
|
9145
|
-
import
|
|
10918
|
+
import path20 from "node:path";
|
|
9146
10919
|
var colors2 = {
|
|
9147
10920
|
reset: "\x1B[0m",
|
|
9148
10921
|
bold: "\x1B[1m",
|
|
@@ -9192,7 +10965,7 @@ function colorizeSlope(value) {
|
|
|
9192
10965
|
}
|
|
9193
10966
|
function ensureTrendIndexPath(source, cwd) {
|
|
9194
10967
|
const resolved = resolveResultSourcePath(source, cwd);
|
|
9195
|
-
if (
|
|
10968
|
+
if (path20.basename(resolved) !== RESULT_INDEX_FILENAME) {
|
|
9196
10969
|
throw new Error(
|
|
9197
10970
|
`Unsupported result source for trend: ${source}. Use a run workspace directory or ${RESULT_INDEX_FILENAME} manifest.`
|
|
9198
10971
|
);
|
|
@@ -9212,7 +10985,7 @@ function resolveTrendSources(cwd, sources, last) {
|
|
|
9212
10985
|
if (last < 2) {
|
|
9213
10986
|
throw new Error("--last must be at least 2");
|
|
9214
10987
|
}
|
|
9215
|
-
const metas = listResultFiles(cwd).filter((meta) =>
|
|
10988
|
+
const metas = listResultFiles(cwd).filter((meta) => path20.basename(meta.path) === RESULT_INDEX_FILENAME).slice(0, last);
|
|
9216
10989
|
if (metas.length < 2) {
|
|
9217
10990
|
throw new Error(
|
|
9218
10991
|
"Trend analysis requires at least 2 canonical run workspaces in .agentv/results/runs/"
|
|
@@ -9227,10 +11000,10 @@ function getRunLabel(sourcePath, timestamp) {
|
|
|
9227
11000
|
if (timestamp) {
|
|
9228
11001
|
return timestamp;
|
|
9229
11002
|
}
|
|
9230
|
-
return
|
|
11003
|
+
return path20.basename(path20.dirname(sourcePath));
|
|
9231
11004
|
}
|
|
9232
11005
|
function getRunSortKey(sourcePath, timestamp) {
|
|
9233
|
-
return timestamp ??
|
|
11006
|
+
return timestamp ?? path20.basename(path20.dirname(sourcePath));
|
|
9234
11007
|
}
|
|
9235
11008
|
function mean2(values) {
|
|
9236
11009
|
return values.reduce((sum, value) => sum + value, 0) / values.length;
|
|
@@ -9485,7 +11258,7 @@ var trendCommand = command({
|
|
|
9485
11258
|
});
|
|
9486
11259
|
|
|
9487
11260
|
// src/commands/trim/index.ts
|
|
9488
|
-
import { readFileSync as
|
|
11261
|
+
import { readFileSync as readFileSync13, writeFileSync as writeFileSync8 } from "node:fs";
|
|
9489
11262
|
var trimCommand = command({
|
|
9490
11263
|
name: "trim",
|
|
9491
11264
|
description: "Trim evaluation results for baseline storage (strips debug/audit fields)",
|
|
@@ -9504,7 +11277,7 @@ var trimCommand = command({
|
|
|
9504
11277
|
},
|
|
9505
11278
|
handler: async ({ input, out }) => {
|
|
9506
11279
|
try {
|
|
9507
|
-
const content =
|
|
11280
|
+
const content = readFileSync13(input, "utf8");
|
|
9508
11281
|
const lines = content.trim().split("\n").filter((line) => line.trim());
|
|
9509
11282
|
const trimmedLines = lines.map((line) => {
|
|
9510
11283
|
const record = JSON.parse(line);
|
|
@@ -9516,7 +11289,7 @@ var trimCommand = command({
|
|
|
9516
11289
|
const output = `${trimmedLines.join("\n")}
|
|
9517
11290
|
`;
|
|
9518
11291
|
if (out) {
|
|
9519
|
-
|
|
11292
|
+
writeFileSync8(out, output, "utf8");
|
|
9520
11293
|
console.error(`Trimmed ${lines.length} record(s) \u2192 ${out}`);
|
|
9521
11294
|
} else {
|
|
9522
11295
|
process.stdout.write(output);
|
|
@@ -9611,7 +11384,7 @@ function isTTY() {
|
|
|
9611
11384
|
// src/commands/validate/validate-files.ts
|
|
9612
11385
|
import { constants } from "node:fs";
|
|
9613
11386
|
import { access, readdir as readdir4, stat } from "node:fs/promises";
|
|
9614
|
-
import
|
|
11387
|
+
import path21 from "node:path";
|
|
9615
11388
|
import fg2 from "fast-glob";
|
|
9616
11389
|
async function validateFiles(paths) {
|
|
9617
11390
|
const filePaths = await expandPaths(paths);
|
|
@@ -9626,7 +11399,7 @@ async function validateFiles(paths) {
|
|
|
9626
11399
|
};
|
|
9627
11400
|
}
|
|
9628
11401
|
async function validateSingleFile(filePath) {
|
|
9629
|
-
const absolutePath =
|
|
11402
|
+
const absolutePath = path21.resolve(filePath);
|
|
9630
11403
|
const fileType = await detectFileType(absolutePath);
|
|
9631
11404
|
let result;
|
|
9632
11405
|
if (fileType === "eval") {
|
|
@@ -9670,7 +11443,7 @@ async function validateSingleFile(filePath) {
|
|
|
9670
11443
|
async function expandPaths(paths) {
|
|
9671
11444
|
const expanded = /* @__PURE__ */ new Set();
|
|
9672
11445
|
for (const inputPath of paths) {
|
|
9673
|
-
const absolutePath =
|
|
11446
|
+
const absolutePath = path21.resolve(inputPath);
|
|
9674
11447
|
try {
|
|
9675
11448
|
await access(absolutePath, constants.F_OK);
|
|
9676
11449
|
const stats = await stat(absolutePath);
|
|
@@ -9698,7 +11471,7 @@ async function expandPaths(paths) {
|
|
|
9698
11471
|
if (yamlMatches.length === 0) {
|
|
9699
11472
|
console.warn(`Warning: No YAML files matched pattern: ${inputPath}`);
|
|
9700
11473
|
}
|
|
9701
|
-
for (const f of yamlMatches) expanded.add(
|
|
11474
|
+
for (const f of yamlMatches) expanded.add(path21.normalize(f));
|
|
9702
11475
|
}
|
|
9703
11476
|
const sorted = Array.from(expanded);
|
|
9704
11477
|
sorted.sort();
|
|
@@ -9709,7 +11482,7 @@ async function findYamlFiles(dirPath) {
|
|
|
9709
11482
|
try {
|
|
9710
11483
|
const entries2 = await readdir4(dirPath, { withFileTypes: true });
|
|
9711
11484
|
for (const entry of entries2) {
|
|
9712
|
-
const fullPath =
|
|
11485
|
+
const fullPath = path21.join(dirPath, entry.name);
|
|
9713
11486
|
if (entry.isDirectory()) {
|
|
9714
11487
|
if (entry.name === "node_modules" || entry.name.startsWith(".")) {
|
|
9715
11488
|
continue;
|
|
@@ -9726,11 +11499,11 @@ async function findYamlFiles(dirPath) {
|
|
|
9726
11499
|
return results;
|
|
9727
11500
|
}
|
|
9728
11501
|
function isYamlFile(filePath) {
|
|
9729
|
-
const ext =
|
|
11502
|
+
const ext = path21.extname(filePath).toLowerCase();
|
|
9730
11503
|
return ext === ".yaml" || ext === ".yml";
|
|
9731
11504
|
}
|
|
9732
11505
|
function isEvalYamlFile(filePath) {
|
|
9733
|
-
const lower =
|
|
11506
|
+
const lower = path21.basename(filePath).toLowerCase();
|
|
9734
11507
|
return lower.endsWith(".eval.yaml") || lower.endsWith(".eval.yml");
|
|
9735
11508
|
}
|
|
9736
11509
|
|
|
@@ -9785,9 +11558,9 @@ var validateCommand = command({
|
|
|
9785
11558
|
});
|
|
9786
11559
|
|
|
9787
11560
|
// src/commands/workspace/clean.ts
|
|
9788
|
-
import { existsSync as
|
|
11561
|
+
import { existsSync as existsSync15 } from "node:fs";
|
|
9789
11562
|
import { readFile as readFile6, readdir as readdir5, rm } from "node:fs/promises";
|
|
9790
|
-
import
|
|
11563
|
+
import path22 from "node:path";
|
|
9791
11564
|
async function confirm(message) {
|
|
9792
11565
|
const readline2 = await import("node:readline");
|
|
9793
11566
|
const rl = readline2.createInterface({ input: process.stdin, output: process.stdout });
|
|
@@ -9814,7 +11587,7 @@ var cleanCommand = command({
|
|
|
9814
11587
|
},
|
|
9815
11588
|
handler: async ({ repo, force }) => {
|
|
9816
11589
|
const poolRoot = getWorkspacePoolRoot();
|
|
9817
|
-
if (!
|
|
11590
|
+
if (!existsSync15(poolRoot)) {
|
|
9818
11591
|
console.log("No workspace pool entries found.");
|
|
9819
11592
|
return;
|
|
9820
11593
|
}
|
|
@@ -9823,8 +11596,8 @@ var cleanCommand = command({
|
|
|
9823
11596
|
const poolDirs = entries2.filter((e) => e.isDirectory());
|
|
9824
11597
|
const matchingDirs = [];
|
|
9825
11598
|
for (const dir of poolDirs) {
|
|
9826
|
-
const poolDir =
|
|
9827
|
-
const metadataPath =
|
|
11599
|
+
const poolDir = path22.join(poolRoot, dir.name);
|
|
11600
|
+
const metadataPath = path22.join(poolDir, "metadata.json");
|
|
9828
11601
|
try {
|
|
9829
11602
|
const raw = await readFile6(metadataPath, "utf-8");
|
|
9830
11603
|
const metadata = JSON.parse(raw);
|
|
@@ -9855,7 +11628,7 @@ var cleanCommand = command({
|
|
|
9855
11628
|
}
|
|
9856
11629
|
for (const dir of matchingDirs) {
|
|
9857
11630
|
await rm(dir, { recursive: true, force: true });
|
|
9858
|
-
console.log(`Removed: ${
|
|
11631
|
+
console.log(`Removed: ${path22.basename(dir).slice(0, 12)}...`);
|
|
9859
11632
|
}
|
|
9860
11633
|
console.log("Done.");
|
|
9861
11634
|
} else {
|
|
@@ -9873,7 +11646,7 @@ var cleanCommand = command({
|
|
|
9873
11646
|
});
|
|
9874
11647
|
|
|
9875
11648
|
// src/commands/workspace/deps.ts
|
|
9876
|
-
import
|
|
11649
|
+
import path23 from "node:path";
|
|
9877
11650
|
var depsCommand = command({
|
|
9878
11651
|
name: "deps",
|
|
9879
11652
|
description: "Scan eval files and list git repo dependencies needed by workspaces",
|
|
@@ -9897,7 +11670,7 @@ var depsCommand = command({
|
|
|
9897
11670
|
const resolvedPaths = await resolveEvalPaths(evalPaths, cwd);
|
|
9898
11671
|
const result = await scanRepoDeps(resolvedPaths);
|
|
9899
11672
|
for (const err2 of result.errors) {
|
|
9900
|
-
console.error(`warning: ${
|
|
11673
|
+
console.error(`warning: ${path23.relative(cwd, err2.file)}: ${err2.message}`);
|
|
9901
11674
|
}
|
|
9902
11675
|
const output = {
|
|
9903
11676
|
repos: result.repos.map((r) => ({
|
|
@@ -9905,7 +11678,7 @@ var depsCommand = command({
|
|
|
9905
11678
|
...r.ref !== void 0 && { ref: r.ref },
|
|
9906
11679
|
...r.clone !== void 0 && { clone: r.clone },
|
|
9907
11680
|
...r.checkout !== void 0 && { checkout: r.checkout },
|
|
9908
|
-
...usedBy && { used_by: r.usedBy.map((p) =>
|
|
11681
|
+
...usedBy && { used_by: r.usedBy.map((p) => path23.relative(cwd, p)) }
|
|
9909
11682
|
}))
|
|
9910
11683
|
};
|
|
9911
11684
|
console.log(JSON.stringify(output, null, 2));
|
|
@@ -9913,15 +11686,15 @@ var depsCommand = command({
|
|
|
9913
11686
|
});
|
|
9914
11687
|
|
|
9915
11688
|
// src/commands/workspace/list.ts
|
|
9916
|
-
import { existsSync as
|
|
11689
|
+
import { existsSync as existsSync16 } from "node:fs";
|
|
9917
11690
|
import { readFile as readFile7, readdir as readdir6, stat as stat2 } from "node:fs/promises";
|
|
9918
|
-
import
|
|
11691
|
+
import path24 from "node:path";
|
|
9919
11692
|
async function getDirectorySize(dirPath) {
|
|
9920
11693
|
let totalSize = 0;
|
|
9921
11694
|
try {
|
|
9922
11695
|
const entries2 = await readdir6(dirPath, { withFileTypes: true });
|
|
9923
11696
|
for (const entry of entries2) {
|
|
9924
|
-
const fullPath =
|
|
11697
|
+
const fullPath = path24.join(dirPath, entry.name);
|
|
9925
11698
|
if (entry.isDirectory()) {
|
|
9926
11699
|
totalSize += await getDirectorySize(fullPath);
|
|
9927
11700
|
} else {
|
|
@@ -9945,7 +11718,7 @@ var listCommand = command({
|
|
|
9945
11718
|
args: {},
|
|
9946
11719
|
handler: async () => {
|
|
9947
11720
|
const poolRoot = getWorkspacePoolRoot();
|
|
9948
|
-
if (!
|
|
11721
|
+
if (!existsSync16(poolRoot)) {
|
|
9949
11722
|
console.log("No workspace pool entries found.");
|
|
9950
11723
|
return;
|
|
9951
11724
|
}
|
|
@@ -9956,11 +11729,11 @@ var listCommand = command({
|
|
|
9956
11729
|
return;
|
|
9957
11730
|
}
|
|
9958
11731
|
for (const dir of poolDirs) {
|
|
9959
|
-
const poolDir =
|
|
11732
|
+
const poolDir = path24.join(poolRoot, dir.name);
|
|
9960
11733
|
const fingerprint = dir.name;
|
|
9961
11734
|
const poolEntries = await readdir6(poolDir, { withFileTypes: true });
|
|
9962
11735
|
const slots = poolEntries.filter((e) => e.isDirectory() && e.name.startsWith("slot-"));
|
|
9963
|
-
const metadataPath =
|
|
11736
|
+
const metadataPath = path24.join(poolDir, "metadata.json");
|
|
9964
11737
|
let metadata = null;
|
|
9965
11738
|
try {
|
|
9966
11739
|
const raw = await readFile7(metadataPath, "utf-8");
|
|
@@ -10007,8 +11780,8 @@ var CHECK_INTERVAL_MS = 24 * 60 * 60 * 1e3;
|
|
|
10007
11780
|
var CONFIG_DIR = getAgentvConfigDir();
|
|
10008
11781
|
var CACHE_FILE = "version-check.json";
|
|
10009
11782
|
var NPM_REGISTRY_URL = "https://registry.npmjs.org/agentv/latest";
|
|
10010
|
-
async function getCachedUpdateInfo(
|
|
10011
|
-
const filePath =
|
|
11783
|
+
async function getCachedUpdateInfo(path25) {
|
|
11784
|
+
const filePath = path25 ?? join5(CONFIG_DIR, CACHE_FILE);
|
|
10012
11785
|
try {
|
|
10013
11786
|
const raw = await readFile8(filePath, "utf-8");
|
|
10014
11787
|
const data = JSON.parse(raw);
|
|
@@ -10169,4 +11942,4 @@ export {
|
|
|
10169
11942
|
preprocessArgv,
|
|
10170
11943
|
runCli
|
|
10171
11944
|
};
|
|
10172
|
-
//# sourceMappingURL=chunk-
|
|
11945
|
+
//# sourceMappingURL=chunk-MQCJCM3I.js.map
|