agentv 4.0.0 → 4.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +59 -533
- package/dist/{chunk-OT2J474N.js → chunk-QCKPJPYC.js} +5 -8
- package/dist/chunk-QCKPJPYC.js.map +1 -0
- package/dist/{chunk-E3VSJJI4.js → chunk-TDY2FQN5.js} +60 -720
- package/dist/chunk-TDY2FQN5.js.map +1 -0
- package/dist/{chunk-OXBBWZOY.js → chunk-XEAW7OQT.js} +4 -3
- package/dist/chunk-XEAW7OQT.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-3Z22B6SU.js → dist-2JUUJ6PT.js} +2 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-D5UTP72M.js → interactive-ASB4FU3J.js} +3 -3
- package/dist/studio/assets/index-CDGReinH.js +71 -0
- package/dist/studio/assets/{index-CE3-mmv0.js → index-DofvSOmX.js} +1 -1
- package/dist/studio/assets/index-izxfmBKC.css +1 -0
- package/dist/studio/index.html +2 -2
- package/package.json +1 -1
- package/dist/chunk-E3VSJJI4.js.map +0 -1
- package/dist/chunk-OT2J474N.js.map +0 -1
- package/dist/chunk-OXBBWZOY.js.map +0 -1
- package/dist/studio/assets/index-BuKVkxFj.css +0 -1
- package/dist/studio/assets/index-DBU720Fm.js +0 -71
- /package/dist/{dist-3Z22B6SU.js.map → dist-2JUUJ6PT.js.map} +0 -0
- /package/dist/{interactive-D5UTP72M.js.map → interactive-ASB4FU3J.js.map} +0 -0
|
@@ -24,7 +24,7 @@ import {
|
|
|
24
24
|
validateFileReferences,
|
|
25
25
|
validateTargetsFile,
|
|
26
26
|
writeArtifactsFromResults
|
|
27
|
-
} from "./chunk-OT2J474N.js";
|
|
27
|
+
} from "./chunk-QCKPJPYC.js";
|
|
28
28
|
import {
|
|
29
29
|
DEFAULT_CATEGORY,
|
|
30
30
|
createBuiltinRegistry,
|
|
@@ -43,7 +43,7 @@ import {
|
|
|
43
43
|
toSnakeCaseDeep as toSnakeCaseDeep2,
|
|
44
44
|
transpileEvalYamlFile,
|
|
45
45
|
trimBaselineResult
|
|
46
|
-
} from "./chunk-OXBBWZOY.js";
|
|
46
|
+
} from "./chunk-XEAW7OQT.js";
|
|
47
47
|
import {
|
|
48
48
|
__commonJS,
|
|
49
49
|
__esm,
|
|
@@ -4217,7 +4217,7 @@ var evalRunCommand = command({
|
|
|
4217
4217
|
},
|
|
4218
4218
|
handler: async (args) => {
|
|
4219
4219
|
if (args.evalPaths.length === 0 && process.stdin.isTTY) {
|
|
4220
|
-
const { launchInteractiveWizard } = await import("./interactive-D5UTP72M.js");
|
|
4220
|
+
const { launchInteractiveWizard } = await import("./interactive-ASB4FU3J.js");
|
|
4221
4221
|
await launchInteractiveWizard();
|
|
4222
4222
|
return;
|
|
4223
4223
|
}
|
|
@@ -4972,9 +4972,14 @@ var evalRunCommand2 = command({
|
|
|
4972
4972
|
type: optional(string),
|
|
4973
4973
|
long: "experiment",
|
|
4974
4974
|
description: "Experiment label (e.g. with_skills, without_skills)"
|
|
4975
|
+
}),
|
|
4976
|
+
graderType: option({
|
|
4977
|
+
type: optional(oneOf(["code", "none"])),
|
|
4978
|
+
long: "grader-type",
|
|
4979
|
+
description: 'Which grading phase to run: "code" runs code-graders inline, omit to skip grading (use pipeline grade separately)'
|
|
4975
4980
|
})
|
|
4976
4981
|
},
|
|
4977
|
-
handler: async ({ evalPath, out, workers, experiment }) => {
|
|
4982
|
+
handler: async ({ evalPath, out, workers, experiment, graderType }) => {
|
|
4978
4983
|
const resolvedEvalPath = resolve2(evalPath);
|
|
4979
4984
|
const outDir = resolve2(out ?? buildDefaultRunDir(process.cwd()));
|
|
4980
4985
|
const repoRoot = await findRepoRoot(dirname2(resolvedEvalPath));
|
|
@@ -5143,6 +5148,14 @@ var evalRunCommand2 = command({
|
|
|
5143
5148
|
} else {
|
|
5144
5149
|
console.log("Subagent-as-target mode \u2014 skipping CLI invocation.");
|
|
5145
5150
|
}
|
|
5151
|
+
if (graderType !== "code") {
|
|
5152
|
+
console.log(`
|
|
5153
|
+
Done. Results in ${outDir}`);
|
|
5154
|
+
console.log(
|
|
5155
|
+
"To run code graders: agentv pipeline grade <run-dir> (or re-run with --grader-type code)"
|
|
5156
|
+
);
|
|
5157
|
+
return;
|
|
5158
|
+
}
|
|
5146
5159
|
let totalGraders = 0;
|
|
5147
5160
|
let totalPassed = 0;
|
|
5148
5161
|
for (const testId of testIds) {
|
|
@@ -6261,15 +6274,16 @@ function writeFeedback(cwd, data) {
|
|
|
6261
6274
|
function createApp(results, resultDir, cwd, sourceFile, options) {
|
|
6262
6275
|
const searchDir = cwd ?? resultDir;
|
|
6263
6276
|
const app2 = new Hono();
|
|
6264
|
-
const studioDistPath = options?.studioDir
|
|
6277
|
+
const studioDistPath = options?.studioDir ?? resolveStudioDistDir();
|
|
6278
|
+
if (!studioDistPath || !existsSync7(path9.join(studioDistPath, "index.html"))) {
|
|
6279
|
+
throw new Error('Studio dist not found. Run "bun run build" in apps/studio/ to build the SPA.');
|
|
6280
|
+
}
|
|
6265
6281
|
app2.get("/", (c3) => {
|
|
6266
|
-
|
|
6267
|
-
|
|
6268
|
-
|
|
6269
|
-
return c3.html(readFileSync8(indexPath, "utf8"));
|
|
6270
|
-
}
|
|
6282
|
+
const indexPath = path9.join(studioDistPath, "index.html");
|
|
6283
|
+
if (existsSync7(indexPath)) {
|
|
6284
|
+
return c3.html(readFileSync8(indexPath, "utf8"));
|
|
6271
6285
|
}
|
|
6272
|
-
return c3.
|
|
6286
|
+
return c3.notFound();
|
|
6273
6287
|
});
|
|
6274
6288
|
app2.get("/api/runs", (c3) => {
|
|
6275
6289
|
const metas = listResultFiles(searchDir);
|
|
@@ -6684,44 +6698,42 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
|
|
|
6684
6698
|
}));
|
|
6685
6699
|
return c3.json({ targets });
|
|
6686
6700
|
});
|
|
6687
|
-
|
|
6688
|
-
|
|
6689
|
-
|
|
6690
|
-
|
|
6691
|
-
if (!existsSync7(filePath)) {
|
|
6692
|
-
return c3.notFound();
|
|
6693
|
-
}
|
|
6694
|
-
const content = readFileSync8(filePath);
|
|
6695
|
-
const ext = path9.extname(filePath);
|
|
6696
|
-
const mimeTypes = {
|
|
6697
|
-
".js": "application/javascript",
|
|
6698
|
-
".css": "text/css",
|
|
6699
|
-
".html": "text/html",
|
|
6700
|
-
".json": "application/json",
|
|
6701
|
-
".svg": "image/svg+xml",
|
|
6702
|
-
".png": "image/png",
|
|
6703
|
-
".woff2": "font/woff2",
|
|
6704
|
-
".woff": "font/woff"
|
|
6705
|
-
};
|
|
6706
|
-
const contentType = mimeTypes[ext] ?? "application/octet-stream";
|
|
6707
|
-
return new Response(content, {
|
|
6708
|
-
headers: {
|
|
6709
|
-
"Content-Type": contentType,
|
|
6710
|
-
"Cache-Control": "public, max-age=31536000, immutable"
|
|
6711
|
-
}
|
|
6712
|
-
});
|
|
6713
|
-
});
|
|
6714
|
-
app2.get("*", (c3) => {
|
|
6715
|
-
if (c3.req.path.startsWith("/api/")) {
|
|
6716
|
-
return c3.json({ error: "Not found" }, 404);
|
|
6717
|
-
}
|
|
6718
|
-
const indexPath = path9.join(studioDistPath, "index.html");
|
|
6719
|
-
if (existsSync7(indexPath)) {
|
|
6720
|
-
return c3.html(readFileSync8(indexPath, "utf8"));
|
|
6721
|
-
}
|
|
6701
|
+
app2.get("/assets/*", (c3) => {
|
|
6702
|
+
const assetPath = c3.req.path;
|
|
6703
|
+
const filePath = path9.join(studioDistPath, assetPath);
|
|
6704
|
+
if (!existsSync7(filePath)) {
|
|
6722
6705
|
return c3.notFound();
|
|
6706
|
+
}
|
|
6707
|
+
const content = readFileSync8(filePath);
|
|
6708
|
+
const ext = path9.extname(filePath);
|
|
6709
|
+
const mimeTypes = {
|
|
6710
|
+
".js": "application/javascript",
|
|
6711
|
+
".css": "text/css",
|
|
6712
|
+
".html": "text/html",
|
|
6713
|
+
".json": "application/json",
|
|
6714
|
+
".svg": "image/svg+xml",
|
|
6715
|
+
".png": "image/png",
|
|
6716
|
+
".woff2": "font/woff2",
|
|
6717
|
+
".woff": "font/woff"
|
|
6718
|
+
};
|
|
6719
|
+
const contentType = mimeTypes[ext] ?? "application/octet-stream";
|
|
6720
|
+
return new Response(content, {
|
|
6721
|
+
headers: {
|
|
6722
|
+
"Content-Type": contentType,
|
|
6723
|
+
"Cache-Control": "public, max-age=31536000, immutable"
|
|
6724
|
+
}
|
|
6723
6725
|
});
|
|
6724
|
-
}
|
|
6726
|
+
});
|
|
6727
|
+
app2.get("*", (c3) => {
|
|
6728
|
+
if (c3.req.path.startsWith("/api/")) {
|
|
6729
|
+
return c3.json({ error: "Not found" }, 404);
|
|
6730
|
+
}
|
|
6731
|
+
const indexPath = path9.join(studioDistPath, "index.html");
|
|
6732
|
+
if (existsSync7(indexPath)) {
|
|
6733
|
+
return c3.html(readFileSync8(indexPath, "utf8"));
|
|
6734
|
+
}
|
|
6735
|
+
return c3.notFound();
|
|
6736
|
+
});
|
|
6725
6737
|
return app2;
|
|
6726
6738
|
}
|
|
6727
6739
|
function resolveStudioDistDir() {
|
|
@@ -6755,675 +6767,6 @@ function stripHeavyFields(results) {
|
|
|
6755
6767
|
};
|
|
6756
6768
|
});
|
|
6757
6769
|
}
|
|
6758
|
-
function escapeHtml(s) {
|
|
6759
|
-
return s.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;");
|
|
6760
|
-
}
|
|
6761
|
-
function generateServeHtml(results, sourceFile) {
|
|
6762
|
-
const lightResults = stripHeavyFields(results);
|
|
6763
|
-
const dataJson = JSON.stringify(lightResults).replace(/</g, "\\u003c").replace(/>/g, "\\u003e").replace(/\u2028/g, "\\u2028").replace(/\u2029/g, "\\u2029");
|
|
6764
|
-
return `<!DOCTYPE html>
|
|
6765
|
-
<html lang="en">
|
|
6766
|
-
<head>
|
|
6767
|
-
<meta charset="utf-8">
|
|
6768
|
-
<meta name="viewport" content="width=device-width, initial-scale=1">
|
|
6769
|
-
<title>AgentV Results Review</title>
|
|
6770
|
-
<style>
|
|
6771
|
-
${SERVE_STYLES}
|
|
6772
|
-
</style>
|
|
6773
|
-
</head>
|
|
6774
|
-
<body>
|
|
6775
|
-
<header class="header">
|
|
6776
|
-
<div class="header-left">
|
|
6777
|
-
<h1 class="header-title">AgentV</h1>
|
|
6778
|
-
<span class="header-subtitle">Results Review</span>
|
|
6779
|
-
</div>
|
|
6780
|
-
<div class="header-center">
|
|
6781
|
-
<select id="run-picker" class="run-picker" title="Switch result file">
|
|
6782
|
-
<option value="">Loading runs...</option>
|
|
6783
|
-
</select>
|
|
6784
|
-
</div>
|
|
6785
|
-
<div class="header-right">
|
|
6786
|
-
<span class="timestamp">${escapeHtml((/* @__PURE__ */ new Date()).toISOString())}</span>
|
|
6787
|
-
</div>
|
|
6788
|
-
</header>
|
|
6789
|
-
<nav class="tabs" id="tabs">
|
|
6790
|
-
<button class="tab active" data-tab="overview">Overview</button>
|
|
6791
|
-
<button class="tab" data-tab="tests">Test Cases</button>
|
|
6792
|
-
</nav>
|
|
6793
|
-
<main id="app"></main>
|
|
6794
|
-
<script>
|
|
6795
|
-
var DATA = ${dataJson};
|
|
6796
|
-
var INITIAL_SOURCE = ${sourceFile ? JSON.stringify(path9.basename(sourceFile)).replace(/</g, "\\u003c").replace(/>/g, "\\u003e") : "null"};
|
|
6797
|
-
${SERVE_SCRIPT}
|
|
6798
|
-
</script>
|
|
6799
|
-
</body>
|
|
6800
|
-
</html>`;
|
|
6801
|
-
}
|
|
6802
|
-
var SERVE_STYLES = `
|
|
6803
|
-
*{margin:0;padding:0;box-sizing:border-box}
|
|
6804
|
-
:root{
|
|
6805
|
-
--bg:#f6f8fa;--surface:#fff;--border:#d0d7de;--border-light:#e8ebee;
|
|
6806
|
-
--text:#1f2328;--text-muted:#656d76;
|
|
6807
|
-
--primary:#0969da;--primary-bg:#ddf4ff;
|
|
6808
|
-
--success:#1a7f37;--success-bg:#dafbe1;
|
|
6809
|
-
--danger:#cf222e;--danger-bg:#ffebe9;
|
|
6810
|
-
--warning:#9a6700;--warning-bg:#fff8c5;
|
|
6811
|
-
--radius:6px;
|
|
6812
|
-
--shadow:0 1px 3px rgba(31,35,40,.04),0 1px 2px rgba(31,35,40,.06);
|
|
6813
|
-
--font:-apple-system,BlinkMacSystemFont,"Segoe UI","Noto Sans",Helvetica,Arial,sans-serif;
|
|
6814
|
-
--mono:ui-monospace,SFMono-Regular,"SF Mono",Menlo,Consolas,monospace;
|
|
6815
|
-
}
|
|
6816
|
-
body{font-family:var(--font);background:var(--bg);color:var(--text);line-height:1.5;font-size:14px}
|
|
6817
|
-
|
|
6818
|
-
/* Header */
|
|
6819
|
-
.header{background:var(--surface);border-bottom:1px solid var(--border);padding:12px 24px;display:flex;align-items:center;justify-content:space-between}
|
|
6820
|
-
.header-left{display:flex;align-items:baseline;gap:12px}
|
|
6821
|
-
.header-title{font-size:18px;font-weight:600}
|
|
6822
|
-
.header-subtitle{font-size:14px;color:var(--text-muted)}
|
|
6823
|
-
.header-center{flex:1;display:flex;justify-content:center;padding:0 16px}
|
|
6824
|
-
.run-picker{padding:6px 10px;border:1px solid var(--border);border-radius:var(--radius);font-size:13px;background:var(--surface);color:var(--text);font-family:var(--font);max-width:400px;width:100%;cursor:pointer}
|
|
6825
|
-
.run-picker:hover{border-color:var(--primary)}
|
|
6826
|
-
.run-picker:focus{outline:none;border-color:var(--primary);box-shadow:0 0 0 3px var(--primary-bg)}
|
|
6827
|
-
.timestamp{font-size:12px;color:var(--text-muted);font-family:var(--mono)}
|
|
6828
|
-
|
|
6829
|
-
/* Tabs */
|
|
6830
|
-
.tabs{background:var(--surface);border-bottom:1px solid var(--border);padding:0 24px;display:flex}
|
|
6831
|
-
.tab{background:none;border:none;padding:10px 16px;font-size:14px;color:var(--text-muted);cursor:pointer;border-bottom:2px solid transparent;font-family:var(--font);transition:color .15s,border-color .15s}
|
|
6832
|
-
.tab:hover{color:var(--text)}
|
|
6833
|
-
.tab.active{color:var(--text);font-weight:600;border-bottom-color:var(--primary)}
|
|
6834
|
-
|
|
6835
|
-
#app{max-width:1280px;margin:0 auto;padding:24px}
|
|
6836
|
-
|
|
6837
|
-
/* Stat cards */
|
|
6838
|
-
.stats-grid{display:grid;grid-template-columns:repeat(auto-fit,minmax(140px,1fr));gap:12px;margin-bottom:24px}
|
|
6839
|
-
.stat-card{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:16px;text-align:center;box-shadow:var(--shadow)}
|
|
6840
|
-
.stat-card.pass .stat-value{color:var(--success)}
|
|
6841
|
-
.stat-card.fail .stat-value{color:var(--danger)}
|
|
6842
|
-
.stat-card.error .stat-value{color:var(--danger)}
|
|
6843
|
-
.stat-card.warn .stat-value{color:var(--warning)}
|
|
6844
|
-
.stat-card.total .stat-value{color:var(--primary)}
|
|
6845
|
-
.stat-value{font-size:28px;font-weight:700;line-height:1.2}
|
|
6846
|
-
.stat-label{font-size:12px;color:var(--text-muted);text-transform:uppercase;letter-spacing:.5px;margin-top:4px}
|
|
6847
|
-
|
|
6848
|
-
/* Sections */
|
|
6849
|
-
.section{margin-bottom:24px}
|
|
6850
|
-
.section-title{font-size:16px;font-weight:600;margin-bottom:12px}
|
|
6851
|
-
|
|
6852
|
-
/* Tables */
|
|
6853
|
-
.table-wrap{overflow-x:auto;background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);box-shadow:var(--shadow)}
|
|
6854
|
-
.data-table{width:100%;border-collapse:collapse;font-size:13px}
|
|
6855
|
-
.data-table th{background:var(--bg);border-bottom:1px solid var(--border);padding:8px 12px;text-align:left;font-weight:600;font-size:12px;color:var(--text-muted);text-transform:uppercase;letter-spacing:.3px;white-space:nowrap}
|
|
6856
|
-
.data-table th.sortable{cursor:pointer;user-select:none}
|
|
6857
|
-
.data-table th.sortable:hover{color:var(--text)}
|
|
6858
|
-
.data-table td{padding:8px 12px;border-bottom:1px solid var(--border-light);vertical-align:middle}
|
|
6859
|
-
.data-table tbody tr:last-child td{border-bottom:none}
|
|
6860
|
-
|
|
6861
|
-
/* Status icons */
|
|
6862
|
-
.status-icon{display:inline-flex;align-items:center;justify-content:center;width:22px;height:22px;border-radius:50%;font-size:12px;font-weight:700}
|
|
6863
|
-
.status-icon.pass{background:var(--success-bg);color:var(--success)}
|
|
6864
|
-
.status-icon.fail{background:var(--danger-bg);color:var(--danger)}
|
|
6865
|
-
.status-icon.error{background:var(--warning-bg);color:var(--warning)}
|
|
6866
|
-
|
|
6867
|
-
/* Score colors */
|
|
6868
|
-
.score-high{color:var(--success);font-weight:600}
|
|
6869
|
-
.score-mid{color:var(--warning);font-weight:600}
|
|
6870
|
-
.score-low{color:var(--danger);font-weight:600}
|
|
6871
|
-
|
|
6872
|
-
/* Pass-rate bar */
|
|
6873
|
-
.bar-bg{width:100px;height:8px;background:var(--border-light);border-radius:4px;overflow:hidden}
|
|
6874
|
-
.bar-fill{height:100%;border-radius:4px;transition:width .3s}
|
|
6875
|
-
.bar-fill.score-high{background:var(--success)}
|
|
6876
|
-
.bar-fill.score-mid{background:var(--warning)}
|
|
6877
|
-
.bar-fill.score-low{background:var(--danger)}
|
|
6878
|
-
|
|
6879
|
-
/* Histogram */
|
|
6880
|
-
.histogram{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:16px;box-shadow:var(--shadow)}
|
|
6881
|
-
.hist-row{display:flex;align-items:center;gap:12px;margin-bottom:8px}
|
|
6882
|
-
.hist-row:last-child{margin-bottom:0}
|
|
6883
|
-
.hist-label{width:60px;font-size:12px;color:var(--text-muted);text-align:right;flex-shrink:0}
|
|
6884
|
-
.hist-bar-bg{flex:1;height:20px;background:var(--border-light);border-radius:3px;overflow:hidden}
|
|
6885
|
-
.hist-bar{height:100%;border-radius:3px;transition:width .3s}
|
|
6886
|
-
.hist-count{width:30px;font-size:12px;color:var(--text-muted);text-align:right;flex-shrink:0}
|
|
6887
|
-
|
|
6888
|
-
/* Filters */
|
|
6889
|
-
.filter-bar{display:flex;gap:8px;margin-bottom:16px;align-items:center;flex-wrap:wrap}
|
|
6890
|
-
.filter-select,.filter-search{padding:6px 10px;border:1px solid var(--border);border-radius:var(--radius);font-size:13px;background:var(--surface);color:var(--text);font-family:var(--font)}
|
|
6891
|
-
.filter-search{flex:1;min-width:200px}
|
|
6892
|
-
.filter-count{font-size:12px;color:var(--text-muted);margin-left:auto}
|
|
6893
|
-
|
|
6894
|
-
/* Test rows */
|
|
6895
|
-
.test-row{cursor:pointer;transition:background .1s}
|
|
6896
|
-
.test-row:hover{background:var(--bg)!important}
|
|
6897
|
-
.test-row.expanded{background:var(--primary-bg)!important}
|
|
6898
|
-
.expand-col{width:32px;text-align:center}
|
|
6899
|
-
.expand-icon{color:var(--text-muted);font-size:12px}
|
|
6900
|
-
.fw-medium{font-weight:500}
|
|
6901
|
-
.text-pass{color:var(--success)}.text-fail{color:var(--danger)}.text-error{color:var(--warning)}
|
|
6902
|
-
|
|
6903
|
-
/* Detail panel */
|
|
6904
|
-
.detail-row td{padding:0!important;background:var(--bg)!important}
|
|
6905
|
-
.detail-panel{padding:16px 24px}
|
|
6906
|
-
.detail-grid{display:grid;grid-template-columns:1fr 1fr;gap:16px;margin-bottom:16px}
|
|
6907
|
-
.detail-block h4{font-size:12px;color:var(--text-muted);text-transform:uppercase;letter-spacing:.3px;margin-bottom:6px}
|
|
6908
|
-
.detail-pre{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:12px;font-family:var(--mono);font-size:12px;white-space:pre-wrap;word-break:break-word;max-height:300px;overflow-y:auto;line-height:1.6}
|
|
6909
|
-
.detail-panel h4{font-size:13px;font-weight:600;margin:16px 0 8px}
|
|
6910
|
-
.eval-table{width:100%;border-collapse:collapse;font-size:13px;background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);margin-bottom:12px}
|
|
6911
|
-
.eval-table th{background:var(--bg);padding:6px 10px;text-align:left;font-size:11px;font-weight:600;color:var(--text-muted);text-transform:uppercase;border-bottom:1px solid var(--border)}
|
|
6912
|
-
.eval-table td{padding:8px 10px;border-bottom:1px solid var(--border-light)}
|
|
6913
|
-
.reasoning-cell{max-width:500px;font-size:12px;color:var(--text-muted)}
|
|
6914
|
-
.expect-list{list-style:none;padding:0;margin-bottom:12px}
|
|
6915
|
-
.expect-list li{padding:4px 8px 4px 24px;position:relative;font-size:13px}
|
|
6916
|
-
.expect-list.pass li::before{content:"\\2713";position:absolute;left:4px;color:var(--success);font-weight:700}
|
|
6917
|
-
.expect-list.fail li::before{content:"\\2717";position:absolute;left:4px;color:var(--danger);font-weight:700}
|
|
6918
|
-
.error-box{background:var(--danger-bg);border:1px solid var(--danger);border-radius:var(--radius);padding:12px;margin-bottom:12px}
|
|
6919
|
-
.error-box h4{color:var(--danger);margin:0 0 6px}
|
|
6920
|
-
.error-box pre{font-family:var(--mono);font-size:12px;white-space:pre-wrap;word-break:break-word}
|
|
6921
|
-
.detail-meta{font-size:12px;color:var(--text-muted);margin-top:12px;padding-top:12px;border-top:1px solid var(--border-light)}
|
|
6922
|
-
.tool-calls{display:flex;flex-wrap:wrap;gap:6px;margin-bottom:12px}
|
|
6923
|
-
.tool-tag{display:inline-block;padding:2px 10px;font-size:12px;font-family:var(--mono);background:var(--primary-bg);color:var(--primary);border:1px solid var(--border);border-radius:12px}
|
|
6924
|
-
.empty-state{text-align:center;padding:48px 24px;color:var(--text-muted)}
|
|
6925
|
-
.empty-state h3{font-size:16px;margin-bottom:8px;color:var(--text)}
|
|
6926
|
-
.welcome-state{text-align:center;padding:80px 24px;color:var(--text-muted)}
|
|
6927
|
-
.welcome-state h2{font-size:24px;margin-bottom:12px;color:var(--text);font-weight:600}
|
|
6928
|
-
.welcome-state p{font-size:15px;margin-bottom:8px;max-width:500px;margin-left:auto;margin-right:auto}
|
|
6929
|
-
.welcome-state code{font-family:var(--mono);background:var(--surface);border:1px solid var(--border);border-radius:3px;padding:2px 6px;font-size:13px}
|
|
6930
|
-
.welcome-state .hint{margin-top:24px;font-size:13px;color:var(--text-muted)}
|
|
6931
|
-
|
|
6932
|
-
/* Feedback */
|
|
6933
|
-
.feedback-section{margin-top:16px;padding-top:16px;border-top:1px solid var(--border-light)}
|
|
6934
|
-
.feedback-input{width:100%;min-height:80px;padding:8px 12px;border:1px solid var(--border);border-radius:var(--radius);font-family:var(--font);font-size:13px;resize:vertical;background:var(--surface);color:var(--text)}
|
|
6935
|
-
.feedback-input:focus{outline:none;border-color:var(--primary);box-shadow:0 0 0 3px var(--primary-bg)}
|
|
6936
|
-
.feedback-submit{margin-top:8px;padding:6px 16px;background:var(--primary);color:#fff;border:none;border-radius:var(--radius);font-size:13px;cursor:pointer;font-family:var(--font)}
|
|
6937
|
-
.feedback-submit:hover{opacity:.9}
|
|
6938
|
-
.feedback-submit:disabled{opacity:.5;cursor:default}
|
|
6939
|
-
.feedback-status{margin-left:8px;font-size:12px;color:var(--success)}
|
|
6940
|
-
`;
|
|
6941
|
-
var SERVE_SCRIPT = `
|
|
6942
|
-
(function(){
|
|
6943
|
-
/* ---- helpers ---- */
|
|
6944
|
-
function esc(s){
|
|
6945
|
-
if(s==null)return"";
|
|
6946
|
-
return String(s).replace(/&/g,"&amp;").replace(/</g,"&lt;").replace(/>/g,"&gt;").replace(/"/g,"&quot;");
|
|
6947
|
-
}
|
|
6948
|
-
function getStatus(r){
|
|
6949
|
-
if(r.executionStatus==="execution_error")return"error";
|
|
6950
|
-
if(r.executionStatus==="quality_failure")return"fail";
|
|
6951
|
-
if(r.executionStatus==="ok")return"pass";
|
|
6952
|
-
if(r.error)return"error";
|
|
6953
|
-
return r.score>=0.5?"pass":"fail";
|
|
6954
|
-
}
|
|
6955
|
-
function sIcon(s){
|
|
6956
|
-
if(s==="pass")return'<span class="status-icon pass">\\u2713</span>';
|
|
6957
|
-
if(s==="fail")return'<span class="status-icon fail">\\u2717</span>';
|
|
6958
|
-
return'<span class="status-icon error">!</span>';
|
|
6959
|
-
}
|
|
6960
|
-
function fmtDur(ms){
|
|
6961
|
-
if(ms==null)return"\\u2014";
|
|
6962
|
-
if(ms<1000)return ms+"ms";
|
|
6963
|
-
if(ms<60000)return(ms/1000).toFixed(1)+"s";
|
|
6964
|
-
return Math.floor(ms/60000)+"m "+Math.round((ms%60000)/1000)+"s";
|
|
6965
|
-
}
|
|
6966
|
-
function fmtTok(n){
|
|
6967
|
-
if(n==null)return"\\u2014";
|
|
6968
|
-
if(n>=1e6)return(n/1e6).toFixed(1)+"M";
|
|
6969
|
-
if(n>=1e3)return(n/1e3).toFixed(1)+"K";
|
|
6970
|
-
return String(n);
|
|
6971
|
-
}
|
|
6972
|
-
function fmtCost(u){if(u==null)return"\\u2014";if(u<0.01)return"<$0.01";return"$"+u.toFixed(2);}
|
|
6973
|
-
function fmtPct(v){if(v==null)return"\\u2014";return(v*100).toFixed(1)+"%";}
|
|
6974
|
-
function sCls(v){if(v==null)return"";if(v>=0.9)return"score-high";if(v>=0.5)return"score-mid";return"score-low";}
|
|
6975
|
-
|
|
6976
|
-
/* ---- feedback state ---- */
|
|
6977
|
-
var feedbackCache={};
|
|
6978
|
-
|
|
6979
|
-
function loadFeedback(){
|
|
6980
|
-
fetch("/api/feedback").then(function(r){return r.json();}).then(function(d){
|
|
6981
|
-
if(d&&d.reviews){
|
|
6982
|
-
for(var i=0;i<d.reviews.length;i++){
|
|
6983
|
-
feedbackCache[d.reviews[i].test_id]=d.reviews[i].comment;
|
|
6984
|
-
}
|
|
6985
|
-
populateFeedbackTextareas();
|
|
6986
|
-
}
|
|
6987
|
-
}).catch(function(){});
|
|
6988
|
-
}
|
|
6989
|
-
|
|
6990
|
-
function populateFeedbackTextareas(){
|
|
6991
|
-
var areas=document.querySelectorAll(".feedback-input");
|
|
6992
|
-
for(var i=0;i<areas.length;i++){
|
|
6993
|
-
var tid=areas[i].getAttribute("data-test-id");
|
|
6994
|
-
if(tid&&feedbackCache[tid]!=null){
|
|
6995
|
-
areas[i].value=feedbackCache[tid];
|
|
6996
|
-
}
|
|
6997
|
-
}
|
|
6998
|
-
}
|
|
6999
|
-
|
|
7000
|
-
function saveFeedback(testId,comment,statusEl,btn){
|
|
7001
|
-
btn.disabled=true;
|
|
7002
|
-
statusEl.textContent="Saving...";
|
|
7003
|
-
statusEl.style.color="var(--text-muted)";
|
|
7004
|
-
fetch("/api/feedback",{
|
|
7005
|
-
method:"POST",
|
|
7006
|
-
headers:{"Content-Type":"application/json"},
|
|
7007
|
-
body:JSON.stringify({reviews:[{test_id:testId,comment:comment}]})
|
|
7008
|
-
}).then(function(r){return r.json();}).then(function(){
|
|
7009
|
-
feedbackCache[testId]=comment;
|
|
7010
|
-
statusEl.textContent="Saved";
|
|
7011
|
-
statusEl.style.color="var(--success)";
|
|
7012
|
-
btn.disabled=false;
|
|
7013
|
-
setTimeout(function(){statusEl.textContent="";},2000);
|
|
7014
|
-
}).catch(function(){
|
|
7015
|
-
statusEl.textContent="Error saving";
|
|
7016
|
-
statusEl.style.color="var(--danger)";
|
|
7017
|
-
btn.disabled=false;
|
|
7018
|
-
});
|
|
7019
|
-
}
|
|
7020
|
-
|
|
7021
|
-
/* ---- compute stats ---- */
|
|
7022
|
-
function computeStats(d){
|
|
7023
|
-
var t=d.length,p=0,f=0,e=0,dur=0,ti=0,to=0,cost=0,sc=[],tc=0;
|
|
7024
|
-
for(var i=0;i<d.length;i++){
|
|
7025
|
-
var r=d[i],s=getStatus(r);
|
|
7026
|
-
if(s==="pass")p++;else if(s==="fail")f++;else e++;
|
|
7027
|
-
if(r.durationMs)dur+=r.durationMs;
|
|
7028
|
-
if(r.tokenUsage){ti+=(r.tokenUsage.input||0);to+=(r.tokenUsage.output||0);}
|
|
7029
|
-
if(r.costUsd)cost+=r.costUsd;
|
|
7030
|
-
if(s!=="error")sc.push(r.score);
|
|
7031
|
-
if(r._toolCalls){for(var k in r._toolCalls)tc+=r._toolCalls[k];}
|
|
7032
|
-
}
|
|
7033
|
-
var g=t-e;
|
|
7034
|
-
return{total:t,passed:p,failed:f,errors:e,passRate:g>0?p/g:0,dur:dur,tokens:ti+to,inTok:ti,outTok:to,cost:cost,scores:sc,toolCalls:tc};
|
|
7035
|
-
}
|
|
7036
|
-
function computeTargets(d){
|
|
7037
|
-
var m={};
|
|
7038
|
-
for(var i=0;i<d.length;i++){
|
|
7039
|
-
var r=d[i],tgt=r.target||"unknown";
|
|
7040
|
-
if(!m[tgt])m[tgt]={target:tgt,results:[],p:0,f:0,e:0,ts:0,sc:0,dur:0,tok:0,cost:0};
|
|
7041
|
-
var o=m[tgt];o.results.push(r);
|
|
7042
|
-
var s=getStatus(r);
|
|
7043
|
-
if(s==="pass")o.p++;else if(s==="fail")o.f++;else o.e++;
|
|
7044
|
-
if(s!=="error"){o.ts+=r.score;o.sc++;}
|
|
7045
|
-
if(r.durationMs)o.dur+=r.durationMs;
|
|
7046
|
-
if(r.tokenUsage)o.tok+=(r.tokenUsage.input||0)+(r.tokenUsage.output||0);
|
|
7047
|
-
if(r.costUsd)o.cost+=r.costUsd;
|
|
7048
|
-
}
|
|
7049
|
-
var a=[];for(var k in m)a.push(m[k]);return a;
|
|
7050
|
-
}
|
|
7051
|
-
function getEvalNames(){
|
|
7052
|
-
var n={};
|
|
7053
|
-
for(var i=0;i<DATA.length;i++){
|
|
7054
|
-
var sc=DATA[i].scores;
|
|
7055
|
-
if(sc)for(var j=0;j<sc.length;j++)n[sc[j].name]=true;
|
|
7056
|
-
}
|
|
7057
|
-
return Object.keys(n);
|
|
7058
|
-
}
|
|
7059
|
-
function getEvalScore(r,name){
|
|
7060
|
-
if(!r.scores)return null;
|
|
7061
|
-
for(var i=0;i<r.scores.length;i++)if(r.scores[i].name===name)return r.scores[i].score;
|
|
7062
|
-
return null;
|
|
7063
|
-
}
|
|
7064
|
-
|
|
7065
|
-
var stats=computeStats(DATA);
|
|
7066
|
-
var tgtStats=computeTargets(DATA);
|
|
7067
|
-
var tgtNames=tgtStats.map(function(t){return t.target;});
|
|
7068
|
-
|
|
7069
|
-
/* ---- state ---- */
|
|
7070
|
-
var state={tab:"overview",filter:{status:"all",target:"all",search:""},sort:{col:"testId",dir:"asc"},expanded:{}};
|
|
7071
|
-
|
|
7072
|
-
/* ---- DOM refs ---- */
|
|
7073
|
-
var app=document.getElementById("app");
|
|
7074
|
-
var tabBtns=document.querySelectorAll(".tab");
|
|
7075
|
-
|
|
7076
|
-
/* ---- tabs ---- */
|
|
7077
|
-
function setTab(t){
|
|
7078
|
-
state.tab=t;
|
|
7079
|
-
for(var i=0;i<tabBtns.length;i++)tabBtns[i].classList.toggle("active",tabBtns[i].getAttribute("data-tab")===t);
|
|
7080
|
-
render();
|
|
7081
|
-
}
|
|
7082
|
-
for(var i=0;i<tabBtns.length;i++){
|
|
7083
|
-
tabBtns[i].addEventListener("click",(function(b){return function(){setTab(b.getAttribute("data-tab"));};})(tabBtns[i]));
|
|
7084
|
-
}
|
|
7085
|
-
|
|
7086
|
-
/* ---- render ---- */
|
|
7087
|
-
function render(){
|
|
7088
|
-
if(DATA.length===0){
|
|
7089
|
-
app.innerHTML='<div class="welcome-state">'
|
|
7090
|
-
+'<h2>No results yet</h2>'
|
|
7091
|
-
+'<p>Run an evaluation or mount a results directory to see results here.</p>'
|
|
7092
|
-
+'<p><code>agentv eval &lt;eval-file&gt;</code></p>'
|
|
7093
|
-
+'<p class="hint">The dashboard will automatically detect new result files.</p>'
|
|
7094
|
-
+'</div>';
|
|
7095
|
-
return;
|
|
7096
|
-
}
|
|
7097
|
-
if(state.tab==="overview")renderOverview();else renderTests();
|
|
7098
|
-
}
|
|
7099
|
-
|
|
7100
|
-
/* ---- stat card helper ---- */
|
|
7101
|
-
function card(label,value,type){
|
|
7102
|
-
return'<div class="stat-card '+type+'"><div class="stat-value">'+value+'</div><div class="stat-label">'+label+"</div></div>";
|
|
7103
|
-
}
|
|
7104
|
-
|
|
7105
|
-
/* ---- overview ---- */
|
|
7106
|
-
function renderOverview(){
|
|
7107
|
-
var h='<div class="stats-grid">';
|
|
7108
|
-
h+=card("Total Tests",stats.total,"total");
|
|
7109
|
-
h+=card("Passed",stats.passed,"pass");
|
|
7110
|
-
h+=card("Failed",stats.failed,"fail");
|
|
7111
|
-
h+=card("Errors",stats.errors,"error");
|
|
7112
|
-
var prCls=stats.passRate>=0.9?"pass":stats.passRate>=0.5?"warn":"fail";
|
|
7113
|
-
h+=card("Pass Rate",fmtPct(stats.passRate),prCls);
|
|
7114
|
-
h+=card("Duration",fmtDur(stats.dur),"neutral");
|
|
7115
|
-
h+=card("Tokens",fmtTok(stats.tokens),"neutral");
|
|
7116
|
-
h+=card("Est. Cost",fmtCost(stats.cost),"neutral");
|
|
7117
|
-
if(stats.toolCalls>0)h+=card("Tool Calls",fmtTok(stats.toolCalls),"neutral");
|
|
7118
|
-
h+="</div>";
|
|
7119
|
-
|
|
7120
|
-
/* targets table */
|
|
7121
|
-
if(tgtStats.length>1){
|
|
7122
|
-
h+='<div class="section"><h2 class="section-title">Targets</h2><div class="table-wrap"><table class="data-table">';
|
|
7123
|
-
h+="<thead><tr><th>Target</th><th>Pass Rate</th><th></th><th>Passed</th><th>Failed</th><th>Errors</th><th>Avg Score</th><th>Duration</th><th>Tokens</th><th>Cost</th></tr></thead><tbody>";
|
|
7124
|
-
for(var i=0;i<tgtStats.length;i++){
|
|
7125
|
-
var t=tgtStats[i],g=t.p+t.f,pr=g>0?t.p/g:0,avg=t.sc>0?t.ts/t.sc:0;
|
|
7126
|
-
h+="<tr><td class=\\"fw-medium\\">"+esc(t.target)+"</td><td>"+fmtPct(pr)+'</td><td><div class="bar-bg"><div class="bar-fill '+sCls(pr)+'" style="width:'+(pr*100)+'%"></div></div></td>';
|
|
7127
|
-
h+='<td class="text-pass">'+t.p+'</td><td class="text-fail">'+t.f+'</td><td class="text-error">'+t.e+"</td>";
|
|
7128
|
-
h+='<td class="'+sCls(avg)+'">'+fmtPct(avg)+"</td><td>"+fmtDur(t.dur)+"</td><td>"+fmtTok(t.tok)+"</td><td>"+fmtCost(t.cost)+"</td></tr>";
|
|
7129
|
-
}
|
|
7130
|
-
h+="</tbody></table></div></div>";
|
|
7131
|
-
}
|
|
7132
|
-
|
|
7133
|
-
/* histogram */
|
|
7134
|
-
if(stats.scores.length>0){
|
|
7135
|
-
var bk=[0,0,0,0,0];
|
|
7136
|
-
for(var i=0;i<stats.scores.length;i++){var idx=Math.min(Math.floor(stats.scores[i]*5),4);bk[idx]++;}
|
|
7137
|
-
var mx=Math.max.apply(null,bk);
|
|
7138
|
-
var lb=["0\\u201320%","20\\u201340%","40\\u201360%","60\\u201380%","80\\u2013100%"];
|
|
7139
|
-
h+='<div class="section"><h2 class="section-title">Score Distribution</h2><div class="histogram">';
|
|
7140
|
-
for(var i=0;i<bk.length;i++){
|
|
7141
|
-
var pct=mx>0?(bk[i]/mx*100):0;
|
|
7142
|
-
h+='<div class="hist-row"><span class="hist-label">'+lb[i]+'</span><div class="hist-bar-bg"><div class="hist-bar '+(i>=4?"score-high":i>=2?"score-mid":"score-low")+'" style="width:'+pct+'%"></div></div><span class="hist-count">'+bk[i]+"</span></div>";
|
|
7143
|
-
}
|
|
7144
|
-
h+="</div></div>";
|
|
7145
|
-
}
|
|
7146
|
-
app.innerHTML=h;
|
|
7147
|
-
}
|
|
7148
|
-
|
|
7149
|
-
/* ---- test cases ---- */
|
|
7150
|
-
/**
 * Render the test-case view into the app container.
 *
 * Emits the filter bar (status select, target select when more than one
 * target exists, search box, result counter) followed by the sortable table
 * shell, then wires the controls to state.filter / state.sort and delegates
 * the row markup to renderRows().
 *
 * NOTE: this script is embedded in a template literal, so it deliberately
 * sticks to ES5 syntax and never uses backticks.
 */
function renderTests(){
  var multiTarget=tgtNames.length>1;
  var evaluatorNames=getEvalNames();
  var parts=[];
  var n;

  /* filter bar */
  parts.push('<div class="filter-bar">');
  parts.push('<select id="flt-status" class="filter-select"><option value="all">All Status</option><option value="pass">Passed</option><option value="fail">Failed</option><option value="error">Errors</option></select>');
  if(multiTarget){
    parts.push('<select id="flt-target" class="filter-select"><option value="all">All Targets</option>');
    for(n=0;n<tgtNames.length;n++){
      parts.push('<option value="'+esc(tgtNames[n])+'">'+esc(tgtNames[n])+"</option>");
    }
    parts.push("</select>");
  }
  parts.push('<input type="text" id="flt-search" class="filter-search" placeholder="Search tests..." value="'+esc(state.filter.search)+'">');
  parts.push('<span class="filter-count" id="flt-count"></span></div>');

  /* table shell: one sortable header per fixed column, one plain header per evaluator */
  parts.push('<div class="table-wrap"><table class="data-table" id="test-tbl"><thead><tr>');
  parts.push('<th class="expand-col"></th>');
  parts.push(sHdr("Status","status"));
  parts.push(sHdr("Test ID","testId"));
  if(multiTarget){
    parts.push(sHdr("Target","target"));
  }
  parts.push(sHdr("Score","score"));
  for(n=0;n<evaluatorNames.length;n++){
    parts.push("<th>"+esc(evaluatorNames[n])+"</th>");
  }
  parts.push(sHdr("Duration","durationMs"));
  parts.push(sHdr("Cost","costUsd"));
  parts.push('</tr></thead><tbody id="test-body"></tbody></table></div>');
  app.innerHTML=parts.join("");

  /* status filter */
  var statusSelect=document.getElementById("flt-status");
  statusSelect.value=state.filter.status;
  statusSelect.addEventListener("change",function(evt){
    state.filter.status=evt.target.value;
    renderRows();
  });

  /* target filter (only present when several targets exist) */
  var targetSelect=document.getElementById("flt-target");
  if(targetSelect){
    targetSelect.value=state.filter.target;
    targetSelect.addEventListener("change",function(evt){
      state.filter.target=evt.target.value;
      renderRows();
    });
  }

  /* free-text search */
  var searchBox=document.getElementById("flt-search");
  searchBox.addEventListener("input",function(evt){
    state.filter.search=evt.target.value;
    renderRows();
  });

  /* clicking the active sort column flips direction; any other column starts ascending */
  var headers=document.querySelectorAll("th[data-sort]");
  Array.prototype.forEach.call(headers,function(header){
    header.addEventListener("click",function(){
      var column=header.getAttribute("data-sort");
      if(state.sort.col!==column){
        state.sort.col=column;
        state.sort.dir="asc";
      }else if(state.sort.dir==="asc"){
        state.sort.dir="desc";
      }else{
        state.sort.dir="asc";
      }
      /* full re-render so the header arrows are refreshed */
      renderTests();
    });
  });

  renderRows();
}
|
|
7192
|
-
|
|
7193
|
-
/**
 * Build the markup for one sortable column header.
 *
 * @param label Visible header text (inserted as-is, callers pass constants).
 * @param col   Sort key stored in the data-sort attribute.
 * @returns A th element string; the active sort column gets an up or down
 *          arrow suffix according to state.sort.dir.
 */
function sHdr(label,col){
  var suffix;
  if(state.sort.col!==col){
    suffix="";
  }else if(state.sort.dir==="asc"){
    suffix=" \\u2191";
  }else{
    suffix=" \\u2193";
  }
  var opening='<th class="sortable" data-sort="'+col+'">';
  return opening+label+suffix+"</th>";
}
|
|
7198
|
-
|
|
7199
|
-
/**
 * Return the rows of DATA that pass the active filters, sorted by the
 * active sort column.
 *
 * Filters (all read from state.filter): status, target, and a
 * case-insensitive substring match on testId. Sorting (state.sort) puts rows
 * with a missing sort value last in both directions, compares strings with
 * localeCompare and everything else numerically.
 *
 * @returns A new array; DATA itself is not reordered.
 */
function filtered(){
  var flt=state.filter;
  /* lowercase the needle once instead of once per row */
  var needle=flt.search?flt.search.toLowerCase():"";

  var matches=DATA.filter(function(rec){
    var status=getStatus(rec);
    if(flt.status!=="all"&&status!==flt.status){
      return false;
    }
    if(flt.target!=="all"&&rec.target!==flt.target){
      return false;
    }
    if(needle&&(rec.testId||"").toLowerCase().indexOf(needle)===-1){
      return false;
    }
    return true;
  });

  var sortCol=state.sort.col;
  var sign=state.sort.dir==="asc"?1:-1;

  /* "status" is derived, every other column is read straight off the record */
  function sortValue(rec){
    return sortCol==="status"?getStatus(rec):rec[sortCol];
  }

  matches.sort(function(left,right){
    var lv=sortValue(left);
    var rv=sortValue(right);
    var leftMissing=lv==null;
    var rightMissing=rv==null;
    if(leftMissing||rightMissing){
      if(leftMissing&&rightMissing){
        return 0;
      }
      /* missing values sink to the bottom regardless of direction */
      return leftMissing?1:-1;
    }
    if(typeof lv==="string"){
      return lv.localeCompare(rv)*sign;
    }
    return (lv-rv)*sign;
  });

  return matches;
}
|
|
7217
|
-
|
|
7218
|
-
/**
 * Rebuild the body of the test table from the current filter/sort state.
 *
 * For every row returned by filtered() this emits one summary row and, when
 * the row is marked in state.expanded (keyed by "testId:target"), a detail
 * row produced by renderDetail(). Afterwards it re-attaches the click
 * handlers (row toggle, feedback save, textarea click guard) because the
 * previous DOM nodes are discarded by the innerHTML assignment.
 *
 * Fix: the detail and empty-state rows previously spanned one column too
 * few. The table has six fixed columns (expand toggle, status, test id,
 * score, duration, cost), plus one per evaluator and an optional target
 * column, so the base of the colspan is 6, not 5.
 */
function renderRows(){
  var rows=filtered(),evalNames=getEvalNames();
  var tbody=document.getElementById("test-body");
  var showTarget=tgtNames.length>1;
  /* must match the header built by renderTests(): 6 fixed + evaluators + optional target */
  var colSpan=6+evalNames.length+(showTarget?1:0);
  document.getElementById("flt-count").textContent=rows.length+" of "+DATA.length+" tests";

  var h="";
  var i,j,k;
  for(i=0;i<rows.length;i++){
    var r=rows[i],s=getStatus(r),key=r.testId+":"+r.target,exp=!!state.expanded[key];
    h+='<tr class="test-row '+s+(exp?" expanded":"")+'" data-key="'+esc(key)+'" data-test-id="'+esc(r.testId)+'">';
    h+='<td class="expand-col"><span class="expand-icon">'+(exp?"\\u25BE":"\\u25B8")+"</span></td>";
    h+="<td>"+sIcon(s)+"</td>";
    h+='<td class="fw-medium">'+esc(r.testId)+"</td>";
    if(showTarget)h+="<td>"+esc(r.target)+"</td>";
    h+='<td class="'+sCls(r.score)+'">'+fmtPct(r.score)+"</td>";
    for(j=0;j<evalNames.length;j++){
      var es=getEvalScore(r,evalNames[j]);
      /* an evaluator that did not run for this row renders as a dash */
      h+='<td class="'+sCls(es)+'">'+(es!=null?fmtPct(es):"\\u2014")+"</td>";
    }
    h+="<td>"+fmtDur(r.durationMs)+"</td><td>"+fmtCost(r.costUsd)+"</td></tr>";
    if(exp)h+='<tr class="detail-row"><td colspan="'+colSpan+'">'+renderDetail(r)+"</td></tr>";
  }
  if(rows.length===0)h+='<tr><td colspan="'+colSpan+'" class="empty-state">No matching tests</td></tr>';
  tbody.innerHTML=h;

  /* row click toggles the detail panel */
  var trs=tbody.querySelectorAll(".test-row");
  for(k=0;k<trs.length;k++){
    trs[k].addEventListener("click",(function(tr){return function(){
      var key=tr.getAttribute("data-key");
      state.expanded[key]=!state.expanded[key];
      renderRows();
    };})(trs[k]));
  }

  /* wire feedback buttons */
  var btns=tbody.querySelectorAll(".feedback-submit");
  for(k=0;k<btns.length;k++){
    btns[k].addEventListener("click",(function(btn){return function(ev){
      /* keep the click from reaching any ancestor handler */
      ev.stopPropagation();
      var tid=btn.getAttribute("data-test-id");
      var sec=btn.closest(".feedback-section");
      var ta=sec.querySelector(".feedback-input");
      var st=sec.querySelector(".feedback-status");
      saveFeedback(tid,ta.value,st,btn);
    };})(btns[k]));
  }

  /* prevent textarea clicks from toggling row */
  var tas=tbody.querySelectorAll(".feedback-input");
  for(k=0;k<tas.length;k++){
    tas[k].addEventListener("click",function(ev){ev.stopPropagation();});
  }

  populateFeedbackTextareas();
}
|
|
7273
|
-
|
|
7274
|
-
/* ---- detail panel ---- */
|
|
7275
|
-
/**
 * Build the HTML for the expanded detail panel of one result row.
 *
 * Sections, in order: input/output, per-evaluator results, passed and failed
 * assertions, tool-call counts, error, a one-line metadata summary, and the
 * feedback form (pre-filled from feedbackCache by test id).
 *
 * Changes compared with the previous version:
 *  - tool-call counts are passed through esc() like every other
 *    data-derived value instead of being concatenated raw into the markup;
 *  - only own properties of r._toolCalls are listed;
 *  - the duplicated passed/failed list markup lives in one local helper.
 *
 * @param r One result record from DATA.
 * @returns The panel markup as a string.
 */
function renderDetail(r){
  var h='<div class="detail-panel">';
  var i;

  /* Render one titled assertion list; yields "" when there is nothing to show. */
  function assertionList(title,cls,items){
    if(items.length===0)return "";
    var out="<h4>"+title+'</h4><ul class="expect-list '+cls+'">';
    for(var n=0;n<items.length;n++){
      var item=items[n];
      out+="<li>"+esc(item.text)+(item.evidence?' <span class="reasoning-cell">('+esc(item.evidence)+")</span>":"")+"</li>";
    }
    return out+"</ul>";
  }

  /* input / output */
  h+='<div class="detail-grid">';
  if(r.input!=null){
    h+='<div class="detail-block"><h4>Input</h4><pre class="detail-pre">'+esc(JSON.stringify(r.input,null,2))+"</pre></div>";
  }
  h+='<div class="detail-block"><h4>Output</h4><pre class="detail-pre">'+esc(r.output?JSON.stringify(r.output,null,2):"")+"</pre></div>";
  h+="</div>";

  /* evaluator results */
  if(r.scores&&r.scores.length>0){
    h+="<h4>Evaluator Results</h4>";
    h+='<table class="eval-table"><thead><tr><th>Evaluator</th><th>Score</th><th>Status</th><th>Assertions</th></tr></thead><tbody>';
    for(i=0;i<r.scores.length;i++){
      /* an evaluator counts as passed from a score of 0.5 upwards */
      var ev=r.scores[i],evS=ev.score>=0.5?"pass":"fail";
      var evAssertions=ev.assertions||[];
      var evSummary=evAssertions.map(function(a){return (a.passed?"\\u2713 ":"\\u2717 ")+a.text;}).join("; ");
      h+='<tr><td class="fw-medium">'+esc(ev.name)+'</td><td class="'+sCls(ev.score)+'">'+fmtPct(ev.score)+"</td><td>"+sIcon(evS)+'</td><td class="reasoning-cell">'+esc(evSummary)+"</td></tr>";
    }
    h+="</tbody></table>";
  }

  /* assertions */
  var passedA=r.assertions?r.assertions.filter(function(a){return a.passed;}):[];
  var failedA=r.assertions?r.assertions.filter(function(a){return !a.passed;}):[];
  h+=assertionList("Passed Assertions","pass",passedA);
  h+=assertionList("Failed Assertions","fail",failedA);

  /* tool calls, most frequently used first */
  if(r._toolCalls){
    var tc=r._toolCalls,tcArr=[];
    for(var toolName in tc){
      if(Object.prototype.hasOwnProperty.call(tc,toolName)){
        tcArr.push({name:toolName,count:tc[toolName]});
      }
    }
    tcArr.sort(function(a,b){return b.count-a.count;});
    h+='<h4>Tool Calls</h4><div class="tool-calls">';
    for(i=0;i<tcArr.length;i++){
      /* String() first: esc is only known to be called with strings elsewhere */
      h+='<span class="tool-tag">'+esc(tcArr[i].name)+": "+esc(String(tcArr[i].count))+"</span>";
    }
    h+="</div>";
  }

  /* error */
  if(r.error)h+='<div class="error-box"><h4>Error</h4><pre>'+esc(r.error)+"</pre></div>";

  /* metadata */
  h+='<div class="detail-meta">';
  var m=[];
  if(r.tokenUsage)m.push(fmtTok(r.tokenUsage.input)+" in / "+fmtTok(r.tokenUsage.output)+" out tokens");
  if(r.durationMs){
    if(r._graderDurationMs>0){
      /* split total duration into executor and grader time, never showing a negative executor share */
      var execMs=r.durationMs-r._graderDurationMs;
      m.push(fmtDur(execMs>0?execMs:0)+" executor + "+fmtDur(r._graderDurationMs)+" grader");
    }else{
      m.push(fmtDur(r.durationMs));
    }
  }
  if(r.target)m.push(r.target);
  if(r.costUsd)m.push(fmtCost(r.costUsd));
  if(r.timestamp)m.push(r.timestamp);
  h+=esc(m.join(" \\u00B7 "));
  h+="</div>";

  /* feedback section */
  var tid=r.testId||"";
  var existingComment=feedbackCache[tid]||"";
  h+='<div class="feedback-section">';
  h+='<h4>Feedback</h4>';
  h+='<textarea class="feedback-input" data-test-id="'+esc(tid)+'" placeholder="Add feedback for this test..." onclick="event.stopPropagation()">'+esc(existingComment)+'</textarea>';
  h+='<div style="display:flex;align-items:center">';
  h+='<button class="feedback-submit" data-test-id="'+esc(tid)+'">Save Feedback</button>';
  h+='<span class="feedback-status"></span>';
  h+='</div></div>';

  h+="</div>";
  return h;
}
|
|
7358
|
-
|
|
7359
|
-
/* ---- run picker ---- */
|
|
7360
|
-
var runPicker=document.getElementById("run-picker");
/* filenames seen on the previous poll, used to spot newly written runs */
var knownRunFilenames=[];
/* true once the run list has been fetched successfully at least once */
var runListPolled=false;

/**
 * Fetch the list of result files and refresh the run picker.
 *
 * Reads /api/runs (expects a payload with a runs array whose entries carry
 * filename, test_count and pass_rate), auto-loads a run that appears while
 * nothing is loaded, and rebuilds the picker options.
 *
 * Fixes compared with the previous version:
 *  - "has a poll already happened" is tracked with an explicit flag. It used
 *    to be inferred from knownRunFilenames being non-empty, so when the
 *    dashboard started with no result files at all the first run to appear
 *    was never auto-loaded.
 *  - the picker keeps the run the user selected. Rebuilding the options
 *    used to snap the selection back to INITIAL_SOURCE (or the placeholder)
 *    on every poll.
 */
function refreshRunList(){
  fetch("/api/runs").then(function(r){return r.json();}).then(function(d){
    if(!d||!d.runs)return;
    var runs=d.runs;
    var newFilenames=runs.map(function(r){return r.filename;});

    /* Detect new runs that appeared since last poll (never on the very first poll) */
    if(runListPolled){
      var hasNew=newFilenames.some(function(f){return knownRunFilenames.indexOf(f)===-1;});
      if(hasNew&&DATA.length===0){
        /* Auto-load the first run when starting from empty state.
           NOTE(review): assumes the API lists the most recent run first - confirm. */
        loadRun(runs[0].filename);
      }
    }
    runListPolled=true;
    knownRunFilenames=newFilenames;

    /* remember the current choice; assigning innerHTML below resets the select */
    var previousSelection=runPicker.value;

    /* Rebuild picker options */
    var h='<option value="">Select a result file...</option>';
    if(runs.length===0){
      h='<option value="">No result files</option>';
    }
    for(var i=0;i<runs.length;i++){
      var r=runs[i];
      var label=r.filename+" ("+r.test_count+" tests, "+(r.pass_rate*100).toFixed(0)+"% pass)";
      h+='<option value="'+esc(r.filename)+'">'+esc(label)+"</option>";
    }
    runPicker.innerHTML=h;

    if(previousSelection&&newFilenames.indexOf(previousSelection)!==-1){
      /* keep whatever run is currently selected */
      runPicker.value=previousSelection;
    }else if(INITIAL_SOURCE&&runs.length>0){
      /* Pre-select the initially loaded run */
      runPicker.value=INITIAL_SOURCE;
    }
  }).catch(function(err){console.warn("Failed to refresh run list:",err);});
}
|
|
7396
|
-
|
|
7397
|
-
/**
 * Fetch one result file and make it the displayed run.
 *
 * On success this replaces DATA and the derived stats/target tables, clears
 * the expanded rows and the feedback cache, reloads feedback, re-renders and
 * syncs the picker to the loaded file. Failures are logged and leave the
 * currently displayed run untouched.
 *
 * Fixes compared with the previous version:
 *  - the payload is validated before DATA is overwritten, so a response
 *    without a results array can no longer leave DATA undefined;
 *  - a target filter that names a target absent from the new run is reset
 *    to "all". Previously it stayed active and hid every row, with no
 *    control to clear it when the new run has a single target.
 *
 * @param filename Name of the result file, as listed by /api/runs.
 */
function loadRun(filename){
  fetch("/api/runs/"+encodeURIComponent(filename)).then(function(r){return r.json();}).then(function(d){
    if(d&&d.error){console.error(d.error);return;}
    if(!d||!Array.isArray(d.results)){
      console.error("Run "+filename+" did not return a results array");
      return;
    }
    DATA=d.results;
    stats=computeStats(DATA);
    tgtStats=computeTargets(DATA);
    tgtNames=tgtStats.map(function(t){return t.target;});
    /* drop a target filter that cannot match anything in this run */
    if(state.filter.target!=="all"&&tgtNames.indexOf(state.filter.target)===-1){
      state.filter.target="all";
    }
    state.expanded={};
    feedbackCache={};
    loadFeedback();
    render();
    /* Update picker selection */
    runPicker.value=filename;
  }).catch(function(err){console.error("Failed to load run:",err);});
}
|
|
7412
|
-
|
|
7413
|
-
/* Load the run the user picks; the empty placeholder option is ignored. */
runPicker.addEventListener("change",function onRunPicked(){
  var chosen=runPicker.value;
  if(!chosen){
    return;
  }
  loadRun(chosen);
});

/* Fetch the run list once right away, then poll every 5 seconds for new result files */
refreshRunList();
setInterval(refreshRunList,5000);

/* ---- init ---- */
/* first paint: load stored feedback, then render the initial data */
loadFeedback();
render();
|
|
7425
|
-
})();
|
|
7426
|
-
`;
|
|
7427
6770
|
var resultsServeCommand = command({
|
|
7428
6771
|
name: "studio",
|
|
7429
6772
|
description: "Start AgentV Studio \u2014 a local dashboard for reviewing evaluation results",
|
|
@@ -8952,8 +8295,6 @@ var app = subcommands({
|
|
|
8952
8295
|
results: resultsCommand,
|
|
8953
8296
|
self: selfCommand,
|
|
8954
8297
|
studio: resultsServeCommand,
|
|
8955
|
-
serve: resultsServeCommand,
|
|
8956
|
-
// hidden alias for backward compatibility
|
|
8957
8298
|
trace: traceCommand,
|
|
8958
8299
|
transpile: transpileCommand,
|
|
8959
8300
|
trim: trimCommand,
|
|
@@ -8971,7 +8312,6 @@ var TOP_LEVEL_COMMANDS = /* @__PURE__ */ new Set([
|
|
|
8971
8312
|
"pipeline",
|
|
8972
8313
|
"results",
|
|
8973
8314
|
"self",
|
|
8974
|
-
"serve",
|
|
8975
8315
|
"studio",
|
|
8976
8316
|
"trace",
|
|
8977
8317
|
"transpile",
|
|
@@ -9019,4 +8359,4 @@ export {
|
|
|
9019
8359
|
preprocessArgv,
|
|
9020
8360
|
runCli
|
|
9021
8361
|
};
|
|
9022
|
-
//# sourceMappingURL=chunk-
|
|
8362
|
+
//# sourceMappingURL=chunk-TDY2FQN5.js.map
|