agentv 4.3.3 → 4.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-HMOXP7T5.js → chunk-63NDZ6UC.js} +182 -60
- package/dist/chunk-63NDZ6UC.js.map +1 -0
- package/dist/{chunk-EW4COQU2.js → chunk-BAYNXTX6.js} +67 -16
- package/dist/{chunk-EW4COQU2.js.map → chunk-BAYNXTX6.js.map} +1 -1
- package/dist/{chunk-5DDVNHOS.js → chunk-YORCRL4G.js} +592 -392
- package/dist/chunk-YORCRL4G.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-M7R6II6Y.js → dist-P74O2P2I.js} +20 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-6BO4RY6U.js → interactive-YNSOO2BS.js} +3 -3
- package/dist/studio/assets/index-4pi03cUm.js +65 -0
- package/dist/studio/assets/{index-tOa8ADje.js → index-CnW7PJA8.js} +1 -1
- package/dist/studio/assets/index-jJVIJh8b.css +1 -0
- package/dist/studio/index.html +2 -2
- package/package.json +1 -1
- package/dist/chunk-5DDVNHOS.js.map +0 -1
- package/dist/chunk-HMOXP7T5.js.map +0 -1
- package/dist/studio/assets/index-CcrZuqEa.js +0 -65
- package/dist/studio/assets/index-xvMmIJ7Q.css +0 -1
- /package/dist/{dist-M7R6II6Y.js.map → dist-P74O2P2I.js.map} +0 -0
- /package/dist/{interactive-6BO4RY6U.js.map → interactive-YNSOO2BS.js.map} +0 -0
|
@@ -24,27 +24,32 @@ import {
|
|
|
24
24
|
validateFileReferences,
|
|
25
25
|
validateTargetsFile,
|
|
26
26
|
writeArtifactsFromResults
|
|
27
|
-
} from "./chunk-
|
|
27
|
+
} from "./chunk-BAYNXTX6.js";
|
|
28
28
|
import {
|
|
29
29
|
DEFAULT_CATEGORY,
|
|
30
30
|
PASS_THRESHOLD,
|
|
31
|
+
addProject,
|
|
31
32
|
createBuiltinRegistry,
|
|
32
33
|
deriveCategory,
|
|
34
|
+
discoverProjects,
|
|
33
35
|
executeScript,
|
|
34
36
|
getAgentvHome,
|
|
35
37
|
getOutputFilenames,
|
|
38
|
+
getProject,
|
|
36
39
|
getWorkspacePoolRoot,
|
|
37
40
|
isAgentSkillsFormat,
|
|
41
|
+
loadProjectRegistry,
|
|
38
42
|
loadTestById,
|
|
39
43
|
loadTestSuite,
|
|
40
44
|
loadTests,
|
|
41
45
|
normalizeLineEndings,
|
|
42
46
|
parseAgentSkillsEvals,
|
|
47
|
+
removeProject,
|
|
43
48
|
toCamelCaseDeep,
|
|
44
49
|
toSnakeCaseDeep as toSnakeCaseDeep2,
|
|
45
50
|
transpileEvalYamlFile,
|
|
46
51
|
trimBaselineResult
|
|
47
|
-
} from "./chunk-
|
|
52
|
+
} from "./chunk-63NDZ6UC.js";
|
|
48
53
|
import {
|
|
49
54
|
__commonJS,
|
|
50
55
|
__esm,
|
|
@@ -4218,7 +4223,7 @@ var evalRunCommand = command({
|
|
|
4218
4223
|
},
|
|
4219
4224
|
handler: async (args) => {
|
|
4220
4225
|
if (args.evalPaths.length === 0 && process.stdin.isTTY) {
|
|
4221
|
-
const { launchInteractiveWizard } = await import("./interactive-
|
|
4226
|
+
const { launchInteractiveWizard } = await import("./interactive-YNSOO2BS.js");
|
|
4222
4227
|
await launchInteractiveWizard();
|
|
4223
4228
|
return;
|
|
4224
4229
|
}
|
|
@@ -6255,7 +6260,16 @@ function loadStudioConfig(agentvDir) {
|
|
|
6255
6260
|
if (!parsed || typeof parsed !== "object") {
|
|
6256
6261
|
return { ...DEFAULTS };
|
|
6257
6262
|
}
|
|
6258
|
-
const
|
|
6263
|
+
const studio = parsed.studio;
|
|
6264
|
+
let threshold = DEFAULTS.pass_threshold;
|
|
6265
|
+
if (studio && typeof studio === "object" && !Array.isArray(studio)) {
|
|
6266
|
+
const studioThreshold = studio.pass_threshold;
|
|
6267
|
+
if (typeof studioThreshold === "number") {
|
|
6268
|
+
threshold = studioThreshold;
|
|
6269
|
+
}
|
|
6270
|
+
} else if (typeof parsed.pass_threshold === "number") {
|
|
6271
|
+
threshold = parsed.pass_threshold;
|
|
6272
|
+
}
|
|
6259
6273
|
return {
|
|
6260
6274
|
pass_threshold: Math.min(1, Math.max(0, threshold))
|
|
6261
6275
|
};
|
|
@@ -6265,7 +6279,18 @@ function saveStudioConfig(agentvDir, config) {
|
|
|
6265
6279
|
mkdirSync2(agentvDir, { recursive: true });
|
|
6266
6280
|
}
|
|
6267
6281
|
const configPath = path9.join(agentvDir, "config.yaml");
|
|
6268
|
-
|
|
6282
|
+
let existing = {};
|
|
6283
|
+
if (existsSync7(configPath)) {
|
|
6284
|
+
const raw = readFileSync8(configPath, "utf-8");
|
|
6285
|
+
const parsed = parseYaml(raw);
|
|
6286
|
+
if (parsed && typeof parsed === "object" && !Array.isArray(parsed)) {
|
|
6287
|
+
existing = parsed;
|
|
6288
|
+
}
|
|
6289
|
+
}
|
|
6290
|
+
const { pass_threshold: _, ...rest } = existing;
|
|
6291
|
+
existing = rest;
|
|
6292
|
+
existing.studio = { ...config };
|
|
6293
|
+
const yamlStr = stringifyYaml2(existing);
|
|
6269
6294
|
writeFileSync3(configPath, yamlStr, "utf-8");
|
|
6270
6295
|
}
|
|
6271
6296
|
|
|
@@ -6289,11 +6314,366 @@ function writeFeedback(cwd, data) {
|
|
|
6289
6314
|
writeFileSync4(feedbackPath(cwd), `${JSON.stringify(data, null, 2)}
|
|
6290
6315
|
`, "utf8");
|
|
6291
6316
|
}
|
|
6317
|
+
function buildFileTree(dirPath, relativeTo) {
|
|
6318
|
+
if (!existsSync8(dirPath) || !statSync4(dirPath).isDirectory()) {
|
|
6319
|
+
return [];
|
|
6320
|
+
}
|
|
6321
|
+
const entries2 = readdirSync3(dirPath, { withFileTypes: true });
|
|
6322
|
+
return entries2.sort((a, b) => {
|
|
6323
|
+
if (a.isDirectory() !== b.isDirectory()) return a.isDirectory() ? -1 : 1;
|
|
6324
|
+
return a.name.localeCompare(b.name);
|
|
6325
|
+
}).map((entry) => {
|
|
6326
|
+
const fullPath = path10.join(dirPath, entry.name);
|
|
6327
|
+
const relPath = path10.relative(relativeTo, fullPath);
|
|
6328
|
+
if (entry.isDirectory()) {
|
|
6329
|
+
return {
|
|
6330
|
+
name: entry.name,
|
|
6331
|
+
path: relPath,
|
|
6332
|
+
type: "dir",
|
|
6333
|
+
children: buildFileTree(fullPath, relativeTo)
|
|
6334
|
+
};
|
|
6335
|
+
}
|
|
6336
|
+
return { name: entry.name, path: relPath, type: "file" };
|
|
6337
|
+
});
|
|
6338
|
+
}
|
|
6339
|
+
function inferLanguage(filePath) {
|
|
6340
|
+
const ext = path10.extname(filePath).toLowerCase();
|
|
6341
|
+
const langMap = {
|
|
6342
|
+
".json": "json",
|
|
6343
|
+
".jsonl": "json",
|
|
6344
|
+
".ts": "typescript",
|
|
6345
|
+
".tsx": "typescript",
|
|
6346
|
+
".js": "javascript",
|
|
6347
|
+
".jsx": "javascript",
|
|
6348
|
+
".md": "markdown",
|
|
6349
|
+
".yaml": "yaml",
|
|
6350
|
+
".yml": "yaml",
|
|
6351
|
+
".log": "plaintext",
|
|
6352
|
+
".txt": "plaintext",
|
|
6353
|
+
".py": "python",
|
|
6354
|
+
".sh": "shell",
|
|
6355
|
+
".bash": "shell",
|
|
6356
|
+
".css": "css",
|
|
6357
|
+
".html": "html",
|
|
6358
|
+
".xml": "xml",
|
|
6359
|
+
".svg": "xml",
|
|
6360
|
+
".toml": "toml",
|
|
6361
|
+
".diff": "diff",
|
|
6362
|
+
".patch": "diff"
|
|
6363
|
+
};
|
|
6364
|
+
return langMap[ext] ?? "plaintext";
|
|
6365
|
+
}
|
|
6366
|
+
function stripHeavyFields(results) {
|
|
6367
|
+
return results.map((r) => {
|
|
6368
|
+
const { requests, trace, ...rest } = r;
|
|
6369
|
+
const toolCalls = trace?.toolCalls && Object.keys(trace.toolCalls).length > 0 ? trace.toolCalls : void 0;
|
|
6370
|
+
const graderDurationMs = (r.scores ?? []).reduce((sum, s) => sum + (s.durationMs ?? 0), 0);
|
|
6371
|
+
return {
|
|
6372
|
+
...rest,
|
|
6373
|
+
...toolCalls && { _toolCalls: toolCalls },
|
|
6374
|
+
...graderDurationMs > 0 && { _graderDurationMs: graderDurationMs }
|
|
6375
|
+
};
|
|
6376
|
+
});
|
|
6377
|
+
}
|
|
6378
|
+
function handleRuns(c3, { searchDir }) {
|
|
6379
|
+
const metas = listResultFiles(searchDir);
|
|
6380
|
+
return c3.json({
|
|
6381
|
+
runs: metas.map((m) => {
|
|
6382
|
+
let target;
|
|
6383
|
+
let experiment;
|
|
6384
|
+
try {
|
|
6385
|
+
const records = loadLightweightResults(m.path);
|
|
6386
|
+
if (records.length > 0) {
|
|
6387
|
+
target = records[0].target;
|
|
6388
|
+
experiment = records[0].experiment;
|
|
6389
|
+
}
|
|
6390
|
+
} catch {
|
|
6391
|
+
}
|
|
6392
|
+
return {
|
|
6393
|
+
filename: m.filename,
|
|
6394
|
+
path: m.path,
|
|
6395
|
+
timestamp: m.timestamp,
|
|
6396
|
+
test_count: m.testCount,
|
|
6397
|
+
pass_rate: m.passRate,
|
|
6398
|
+
avg_score: m.avgScore,
|
|
6399
|
+
size_bytes: m.sizeBytes,
|
|
6400
|
+
...target && { target },
|
|
6401
|
+
...experiment && { experiment }
|
|
6402
|
+
};
|
|
6403
|
+
})
|
|
6404
|
+
});
|
|
6405
|
+
}
|
|
6406
|
+
function handleRunDetail(c3, { searchDir }) {
|
|
6407
|
+
const filename = c3.req.param("filename");
|
|
6408
|
+
const meta = listResultFiles(searchDir).find((m) => m.filename === filename);
|
|
6409
|
+
if (!meta) return c3.json({ error: "Run not found" }, 404);
|
|
6410
|
+
try {
|
|
6411
|
+
const loaded = patchTestIds(loadManifestResults(meta.path));
|
|
6412
|
+
return c3.json({ results: stripHeavyFields(loaded), source: meta.filename });
|
|
6413
|
+
} catch {
|
|
6414
|
+
return c3.json({ error: "Failed to load run" }, 500);
|
|
6415
|
+
}
|
|
6416
|
+
}
|
|
6417
|
+
function handleRunDatasets(c3, { searchDir, agentvDir }) {
|
|
6418
|
+
const filename = c3.req.param("filename");
|
|
6419
|
+
const meta = listResultFiles(searchDir).find((m) => m.filename === filename);
|
|
6420
|
+
if (!meta) return c3.json({ error: "Run not found" }, 404);
|
|
6421
|
+
try {
|
|
6422
|
+
const loaded = patchTestIds(loadManifestResults(meta.path));
|
|
6423
|
+
const { pass_threshold } = loadStudioConfig(agentvDir);
|
|
6424
|
+
const datasetMap = /* @__PURE__ */ new Map();
|
|
6425
|
+
for (const r of loaded) {
|
|
6426
|
+
const ds = r.dataset ?? r.target ?? "default";
|
|
6427
|
+
const entry = datasetMap.get(ds) ?? { total: 0, passed: 0, scoreSum: 0 };
|
|
6428
|
+
entry.total++;
|
|
6429
|
+
if (r.score >= pass_threshold) entry.passed++;
|
|
6430
|
+
entry.scoreSum += r.score;
|
|
6431
|
+
datasetMap.set(ds, entry);
|
|
6432
|
+
}
|
|
6433
|
+
const datasets = [...datasetMap.entries()].map(([name, entry]) => ({
|
|
6434
|
+
name,
|
|
6435
|
+
total: entry.total,
|
|
6436
|
+
passed: entry.passed,
|
|
6437
|
+
failed: entry.total - entry.passed,
|
|
6438
|
+
avg_score: entry.total > 0 ? entry.scoreSum / entry.total : 0
|
|
6439
|
+
}));
|
|
6440
|
+
return c3.json({ datasets });
|
|
6441
|
+
} catch {
|
|
6442
|
+
return c3.json({ error: "Failed to load datasets" }, 500);
|
|
6443
|
+
}
|
|
6444
|
+
}
|
|
6445
|
+
function handleRunCategories(c3, { searchDir, agentvDir }) {
|
|
6446
|
+
const filename = c3.req.param("filename");
|
|
6447
|
+
const meta = listResultFiles(searchDir).find((m) => m.filename === filename);
|
|
6448
|
+
if (!meta) return c3.json({ error: "Run not found" }, 404);
|
|
6449
|
+
try {
|
|
6450
|
+
const loaded = patchTestIds(loadManifestResults(meta.path));
|
|
6451
|
+
const { pass_threshold } = loadStudioConfig(agentvDir);
|
|
6452
|
+
const categoryMap = /* @__PURE__ */ new Map();
|
|
6453
|
+
for (const r of loaded) {
|
|
6454
|
+
const cat = r.category ?? DEFAULT_CATEGORY;
|
|
6455
|
+
const entry = categoryMap.get(cat) ?? {
|
|
6456
|
+
total: 0,
|
|
6457
|
+
passed: 0,
|
|
6458
|
+
scoreSum: 0,
|
|
6459
|
+
datasets: /* @__PURE__ */ new Set()
|
|
6460
|
+
};
|
|
6461
|
+
entry.total++;
|
|
6462
|
+
if (r.score >= pass_threshold) entry.passed++;
|
|
6463
|
+
entry.scoreSum += r.score;
|
|
6464
|
+
entry.datasets.add(r.dataset ?? r.target ?? "default");
|
|
6465
|
+
categoryMap.set(cat, entry);
|
|
6466
|
+
}
|
|
6467
|
+
const categories = [...categoryMap.entries()].map(([name, entry]) => ({
|
|
6468
|
+
name,
|
|
6469
|
+
total: entry.total,
|
|
6470
|
+
passed: entry.passed,
|
|
6471
|
+
failed: entry.total - entry.passed,
|
|
6472
|
+
avg_score: entry.total > 0 ? entry.scoreSum / entry.total : 0,
|
|
6473
|
+
dataset_count: entry.datasets.size
|
|
6474
|
+
}));
|
|
6475
|
+
return c3.json({ categories });
|
|
6476
|
+
} catch {
|
|
6477
|
+
return c3.json({ error: "Failed to load categories" }, 500);
|
|
6478
|
+
}
|
|
6479
|
+
}
|
|
6480
|
+
function handleCategoryDatasets(c3, { searchDir, agentvDir }) {
|
|
6481
|
+
const filename = c3.req.param("filename");
|
|
6482
|
+
const category = decodeURIComponent(c3.req.param("category") ?? "");
|
|
6483
|
+
const meta = listResultFiles(searchDir).find((m) => m.filename === filename);
|
|
6484
|
+
if (!meta) return c3.json({ error: "Run not found" }, 404);
|
|
6485
|
+
try {
|
|
6486
|
+
const loaded = patchTestIds(loadManifestResults(meta.path));
|
|
6487
|
+
const { pass_threshold } = loadStudioConfig(agentvDir);
|
|
6488
|
+
const filtered = loaded.filter((r) => (r.category ?? DEFAULT_CATEGORY) === category);
|
|
6489
|
+
const datasetMap = /* @__PURE__ */ new Map();
|
|
6490
|
+
for (const r of filtered) {
|
|
6491
|
+
const ds = r.dataset ?? r.target ?? "default";
|
|
6492
|
+
const entry = datasetMap.get(ds) ?? { total: 0, passed: 0, scoreSum: 0 };
|
|
6493
|
+
entry.total++;
|
|
6494
|
+
if (r.score >= pass_threshold) entry.passed++;
|
|
6495
|
+
entry.scoreSum += r.score;
|
|
6496
|
+
datasetMap.set(ds, entry);
|
|
6497
|
+
}
|
|
6498
|
+
const datasets = [...datasetMap.entries()].map(([name, entry]) => ({
|
|
6499
|
+
name,
|
|
6500
|
+
total: entry.total,
|
|
6501
|
+
passed: entry.passed,
|
|
6502
|
+
failed: entry.total - entry.passed,
|
|
6503
|
+
avg_score: entry.total > 0 ? entry.scoreSum / entry.total : 0
|
|
6504
|
+
}));
|
|
6505
|
+
return c3.json({ datasets });
|
|
6506
|
+
} catch {
|
|
6507
|
+
return c3.json({ error: "Failed to load datasets" }, 500);
|
|
6508
|
+
}
|
|
6509
|
+
}
|
|
6510
|
+
function handleEvalDetail(c3, { searchDir }) {
|
|
6511
|
+
const filename = c3.req.param("filename");
|
|
6512
|
+
const evalId = c3.req.param("evalId");
|
|
6513
|
+
const meta = listResultFiles(searchDir).find((m) => m.filename === filename);
|
|
6514
|
+
if (!meta) return c3.json({ error: "Run not found" }, 404);
|
|
6515
|
+
try {
|
|
6516
|
+
const loaded = patchTestIds(loadManifestResults(meta.path));
|
|
6517
|
+
const result = loaded.find((r) => r.testId === evalId);
|
|
6518
|
+
if (!result) return c3.json({ error: "Eval not found" }, 404);
|
|
6519
|
+
return c3.json({ eval: result });
|
|
6520
|
+
} catch {
|
|
6521
|
+
return c3.json({ error: "Failed to load eval" }, 500);
|
|
6522
|
+
}
|
|
6523
|
+
}
|
|
6524
|
+
function handleEvalFiles(c3, { searchDir }) {
|
|
6525
|
+
const filename = c3.req.param("filename");
|
|
6526
|
+
const evalId = c3.req.param("evalId");
|
|
6527
|
+
const meta = listResultFiles(searchDir).find((m) => m.filename === filename);
|
|
6528
|
+
if (!meta) return c3.json({ error: "Run not found" }, 404);
|
|
6529
|
+
try {
|
|
6530
|
+
const content = readFileSync9(meta.path, "utf8");
|
|
6531
|
+
const records = parseResultManifest(content);
|
|
6532
|
+
const record = records.find((r) => (r.test_id ?? r.eval_id) === evalId);
|
|
6533
|
+
if (!record) return c3.json({ error: "Eval not found" }, 404);
|
|
6534
|
+
const baseDir = path10.dirname(meta.path);
|
|
6535
|
+
const knownPaths = [
|
|
6536
|
+
record.grading_path,
|
|
6537
|
+
record.timing_path,
|
|
6538
|
+
record.input_path,
|
|
6539
|
+
record.output_path,
|
|
6540
|
+
record.response_path
|
|
6541
|
+
].filter((p) => !!p);
|
|
6542
|
+
if (knownPaths.length === 0) return c3.json({ files: [] });
|
|
6543
|
+
const artifactDirs = knownPaths.map((p) => path10.dirname(p));
|
|
6544
|
+
let commonDir = artifactDirs[0];
|
|
6545
|
+
for (const dir of artifactDirs) {
|
|
6546
|
+
while (!dir.startsWith(commonDir)) {
|
|
6547
|
+
commonDir = path10.dirname(commonDir);
|
|
6548
|
+
}
|
|
6549
|
+
}
|
|
6550
|
+
const artifactAbsDir = path10.join(baseDir, commonDir);
|
|
6551
|
+
const files = buildFileTree(artifactAbsDir, baseDir);
|
|
6552
|
+
return c3.json({ files });
|
|
6553
|
+
} catch {
|
|
6554
|
+
return c3.json({ error: "Failed to load file tree" }, 500);
|
|
6555
|
+
}
|
|
6556
|
+
}
|
|
6557
|
+
function handleEvalFileContent(c3, { searchDir }) {
|
|
6558
|
+
const filename = c3.req.param("filename");
|
|
6559
|
+
const evalId = c3.req.param("evalId");
|
|
6560
|
+
const meta = listResultFiles(searchDir).find((m) => m.filename === filename);
|
|
6561
|
+
if (!meta) return c3.json({ error: "Run not found" }, 404);
|
|
6562
|
+
const marker = `/runs/${filename}/evals/${evalId}/files/`;
|
|
6563
|
+
const markerIdx = c3.req.path.indexOf(marker);
|
|
6564
|
+
const filePath = markerIdx >= 0 ? c3.req.path.slice(markerIdx + marker.length) : "";
|
|
6565
|
+
if (!filePath) return c3.json({ error: "No file path specified" }, 400);
|
|
6566
|
+
const baseDir = path10.dirname(meta.path);
|
|
6567
|
+
const absolutePath = path10.resolve(baseDir, filePath);
|
|
6568
|
+
if (!absolutePath.startsWith(path10.resolve(baseDir) + path10.sep) && absolutePath !== path10.resolve(baseDir)) {
|
|
6569
|
+
return c3.json({ error: "Path traversal not allowed" }, 403);
|
|
6570
|
+
}
|
|
6571
|
+
if (!existsSync8(absolutePath) || !statSync4(absolutePath).isFile()) {
|
|
6572
|
+
return c3.json({ error: "File not found" }, 404);
|
|
6573
|
+
}
|
|
6574
|
+
try {
|
|
6575
|
+
const fileContent = readFileSync9(absolutePath, "utf8");
|
|
6576
|
+
const language = inferLanguage(absolutePath);
|
|
6577
|
+
return c3.json({ content: fileContent, language });
|
|
6578
|
+
} catch {
|
|
6579
|
+
return c3.json({ error: "Failed to read file" }, 500);
|
|
6580
|
+
}
|
|
6581
|
+
}
|
|
6582
|
+
function handleExperiments(c3, { searchDir, agentvDir }) {
|
|
6583
|
+
const metas = listResultFiles(searchDir);
|
|
6584
|
+
const { pass_threshold } = loadStudioConfig(agentvDir);
|
|
6585
|
+
const experimentMap = /* @__PURE__ */ new Map();
|
|
6586
|
+
for (const m of metas) {
|
|
6587
|
+
try {
|
|
6588
|
+
const records = loadLightweightResults(m.path);
|
|
6589
|
+
for (const r of records) {
|
|
6590
|
+
const experiment = r.experiment ?? "default";
|
|
6591
|
+
const entry = experimentMap.get(experiment) ?? {
|
|
6592
|
+
targets: /* @__PURE__ */ new Set(),
|
|
6593
|
+
runFilenames: /* @__PURE__ */ new Set(),
|
|
6594
|
+
evalCount: 0,
|
|
6595
|
+
passedCount: 0,
|
|
6596
|
+
lastTimestamp: ""
|
|
6597
|
+
};
|
|
6598
|
+
entry.runFilenames.add(m.filename);
|
|
6599
|
+
if (r.target) entry.targets.add(r.target);
|
|
6600
|
+
entry.evalCount++;
|
|
6601
|
+
if (r.score >= pass_threshold) entry.passedCount++;
|
|
6602
|
+
if (r.timestamp && r.timestamp > entry.lastTimestamp) {
|
|
6603
|
+
entry.lastTimestamp = r.timestamp;
|
|
6604
|
+
}
|
|
6605
|
+
experimentMap.set(experiment, entry);
|
|
6606
|
+
}
|
|
6607
|
+
} catch {
|
|
6608
|
+
}
|
|
6609
|
+
}
|
|
6610
|
+
const experiments = [...experimentMap.entries()].map(([name, entry]) => ({
|
|
6611
|
+
name,
|
|
6612
|
+
run_count: entry.runFilenames.size,
|
|
6613
|
+
target_count: entry.targets.size,
|
|
6614
|
+
eval_count: entry.evalCount,
|
|
6615
|
+
passed_count: entry.passedCount,
|
|
6616
|
+
pass_rate: entry.evalCount > 0 ? entry.passedCount / entry.evalCount : 0,
|
|
6617
|
+
last_run: entry.lastTimestamp || null
|
|
6618
|
+
}));
|
|
6619
|
+
return c3.json({ experiments });
|
|
6620
|
+
}
|
|
6621
|
+
function handleTargets(c3, { searchDir, agentvDir }) {
|
|
6622
|
+
const metas = listResultFiles(searchDir);
|
|
6623
|
+
const { pass_threshold } = loadStudioConfig(agentvDir);
|
|
6624
|
+
const targetMap = /* @__PURE__ */ new Map();
|
|
6625
|
+
for (const m of metas) {
|
|
6626
|
+
try {
|
|
6627
|
+
const records = loadLightweightResults(m.path);
|
|
6628
|
+
for (const r of records) {
|
|
6629
|
+
const target = r.target ?? "default";
|
|
6630
|
+
const entry = targetMap.get(target) ?? {
|
|
6631
|
+
experiments: /* @__PURE__ */ new Set(),
|
|
6632
|
+
runFilenames: /* @__PURE__ */ new Set(),
|
|
6633
|
+
evalCount: 0,
|
|
6634
|
+
passedCount: 0
|
|
6635
|
+
};
|
|
6636
|
+
entry.runFilenames.add(m.filename);
|
|
6637
|
+
if (r.experiment) entry.experiments.add(r.experiment);
|
|
6638
|
+
entry.evalCount++;
|
|
6639
|
+
if (r.score >= pass_threshold) entry.passedCount++;
|
|
6640
|
+
targetMap.set(target, entry);
|
|
6641
|
+
}
|
|
6642
|
+
} catch {
|
|
6643
|
+
}
|
|
6644
|
+
}
|
|
6645
|
+
const targets = [...targetMap.entries()].map(([name, entry]) => ({
|
|
6646
|
+
name,
|
|
6647
|
+
run_count: entry.runFilenames.size,
|
|
6648
|
+
experiment_count: entry.experiments.size,
|
|
6649
|
+
eval_count: entry.evalCount,
|
|
6650
|
+
passed_count: entry.passedCount,
|
|
6651
|
+
pass_rate: entry.evalCount > 0 ? entry.passedCount / entry.evalCount : 0
|
|
6652
|
+
}));
|
|
6653
|
+
return c3.json({ targets });
|
|
6654
|
+
}
|
|
6655
|
+
function handleConfig(c3, { agentvDir }) {
|
|
6656
|
+
return c3.json(loadStudioConfig(agentvDir));
|
|
6657
|
+
}
|
|
6658
|
+
function handleFeedbackRead(c3, { searchDir }) {
|
|
6659
|
+
const resultsDir = path10.join(searchDir, ".agentv", "results");
|
|
6660
|
+
return c3.json(readFeedback(existsSync8(resultsDir) ? resultsDir : searchDir));
|
|
6661
|
+
}
|
|
6292
6662
|
function createApp(results, resultDir, cwd, sourceFile, options) {
|
|
6293
6663
|
const searchDir = cwd ?? resultDir;
|
|
6294
6664
|
const agentvDir = path10.join(searchDir, ".agentv");
|
|
6665
|
+
const defaultCtx = { searchDir, agentvDir };
|
|
6295
6666
|
const app2 = new Hono();
|
|
6296
|
-
|
|
6667
|
+
function withProject(c3, handler) {
|
|
6668
|
+
const project = getProject(c3.req.param("projectId") ?? "");
|
|
6669
|
+
if (!project || !existsSync8(project.path)) {
|
|
6670
|
+
return c3.json({ error: "Project not found" }, 404);
|
|
6671
|
+
}
|
|
6672
|
+
return handler(c3, {
|
|
6673
|
+
searchDir: project.path,
|
|
6674
|
+
agentvDir: path10.join(project.path, ".agentv")
|
|
6675
|
+
});
|
|
6676
|
+
}
|
|
6297
6677
|
app2.post("/api/config", async (c3) => {
|
|
6298
6678
|
try {
|
|
6299
6679
|
const body = await c3.req.json();
|
|
@@ -6308,60 +6688,100 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
|
|
|
6308
6688
|
return c3.json({ error: "Failed to save config" }, 500);
|
|
6309
6689
|
}
|
|
6310
6690
|
});
|
|
6311
|
-
|
|
6312
|
-
|
|
6313
|
-
|
|
6691
|
+
function projectEntryToWire(entry) {
|
|
6692
|
+
return {
|
|
6693
|
+
id: entry.id,
|
|
6694
|
+
name: entry.name,
|
|
6695
|
+
path: entry.path,
|
|
6696
|
+
added_at: entry.addedAt,
|
|
6697
|
+
last_opened_at: entry.lastOpenedAt
|
|
6698
|
+
};
|
|
6314
6699
|
}
|
|
6315
|
-
app2.get("/", (c3) => {
|
|
6316
|
-
const
|
|
6317
|
-
|
|
6318
|
-
|
|
6319
|
-
|
|
6320
|
-
|
|
6321
|
-
|
|
6322
|
-
|
|
6323
|
-
|
|
6324
|
-
|
|
6325
|
-
|
|
6326
|
-
|
|
6327
|
-
|
|
6328
|
-
try {
|
|
6329
|
-
const records = loadLightweightResults(m.path);
|
|
6330
|
-
if (records.length > 0) {
|
|
6331
|
-
target = records[0].target;
|
|
6332
|
-
experiment = records[0].experiment;
|
|
6333
|
-
}
|
|
6334
|
-
} catch {
|
|
6700
|
+
app2.get("/api/projects", (c3) => {
|
|
6701
|
+
const registry = loadProjectRegistry();
|
|
6702
|
+
const projects = registry.projects.map((p) => {
|
|
6703
|
+
let runCount = 0;
|
|
6704
|
+
let passRate = 0;
|
|
6705
|
+
let lastRun = null;
|
|
6706
|
+
try {
|
|
6707
|
+
const metas = listResultFiles(p.path);
|
|
6708
|
+
runCount = metas.length;
|
|
6709
|
+
if (metas.length > 0) {
|
|
6710
|
+
const totalPassRate = metas.reduce((sum, m) => sum + m.passRate, 0);
|
|
6711
|
+
passRate = totalPassRate / metas.length;
|
|
6712
|
+
lastRun = metas[0].timestamp;
|
|
6335
6713
|
}
|
|
6336
|
-
|
|
6337
|
-
|
|
6338
|
-
|
|
6339
|
-
|
|
6340
|
-
|
|
6341
|
-
|
|
6342
|
-
|
|
6343
|
-
|
|
6344
|
-
...target && { target },
|
|
6345
|
-
...experiment && { experiment }
|
|
6346
|
-
};
|
|
6347
|
-
})
|
|
6714
|
+
} catch {
|
|
6715
|
+
}
|
|
6716
|
+
return {
|
|
6717
|
+
...projectEntryToWire(p),
|
|
6718
|
+
run_count: runCount,
|
|
6719
|
+
pass_rate: passRate,
|
|
6720
|
+
last_run: lastRun
|
|
6721
|
+
};
|
|
6348
6722
|
});
|
|
6723
|
+
return c3.json({ projects });
|
|
6349
6724
|
});
|
|
6350
|
-
app2.
|
|
6351
|
-
|
|
6352
|
-
|
|
6353
|
-
|
|
6354
|
-
|
|
6355
|
-
return c3.json(
|
|
6725
|
+
app2.post("/api/projects", async (c3) => {
|
|
6726
|
+
try {
|
|
6727
|
+
const body = await c3.req.json();
|
|
6728
|
+
if (!body.path) return c3.json({ error: "Missing path" }, 400);
|
|
6729
|
+
const entry = addProject(body.path);
|
|
6730
|
+
return c3.json(projectEntryToWire(entry), 201);
|
|
6731
|
+
} catch (err2) {
|
|
6732
|
+
return c3.json({ error: err2.message }, 400);
|
|
6733
|
+
}
|
|
6734
|
+
});
|
|
6735
|
+
app2.delete("/api/projects/:projectId", (c3) => {
|
|
6736
|
+
const removed = removeProject(c3.req.param("projectId") ?? "");
|
|
6737
|
+
if (!removed) return c3.json({ error: "Project not found" }, 404);
|
|
6738
|
+
return c3.json({ ok: true });
|
|
6739
|
+
});
|
|
6740
|
+
app2.get("/api/projects/:projectId/summary", (c3) => {
|
|
6741
|
+
const project = getProject(c3.req.param("projectId") ?? "");
|
|
6742
|
+
if (!project) return c3.json({ error: "Project not found" }, 404);
|
|
6743
|
+
try {
|
|
6744
|
+
const metas = listResultFiles(project.path);
|
|
6745
|
+
const runCount = metas.length;
|
|
6746
|
+
const passRate = runCount > 0 ? metas.reduce((s, m) => s + m.passRate, 0) / runCount : 0;
|
|
6747
|
+
const lastRun = metas.length > 0 ? metas[0].timestamp : null;
|
|
6748
|
+
return c3.json({
|
|
6749
|
+
id: project.id,
|
|
6750
|
+
name: project.name,
|
|
6751
|
+
path: project.path,
|
|
6752
|
+
run_count: runCount,
|
|
6753
|
+
pass_rate: passRate,
|
|
6754
|
+
last_run: lastRun
|
|
6755
|
+
});
|
|
6756
|
+
} catch {
|
|
6757
|
+
return c3.json({ error: "Failed to read project" }, 500);
|
|
6356
6758
|
}
|
|
6759
|
+
});
|
|
6760
|
+
app2.post("/api/projects/discover", async (c3) => {
|
|
6357
6761
|
try {
|
|
6358
|
-
const
|
|
6359
|
-
|
|
6360
|
-
|
|
6762
|
+
const body = await c3.req.json();
|
|
6763
|
+
if (!body.path) return c3.json({ error: "Missing path" }, 400);
|
|
6764
|
+
const discovered = discoverProjects(body.path);
|
|
6765
|
+
const registered = discovered.map((p) => projectEntryToWire(addProject(p)));
|
|
6766
|
+
return c3.json({ discovered: registered });
|
|
6361
6767
|
} catch (err2) {
|
|
6362
|
-
return c3.json({ error:
|
|
6768
|
+
return c3.json({ error: err2.message }, 400);
|
|
6363
6769
|
}
|
|
6364
6770
|
});
|
|
6771
|
+
app2.get("/api/config", (c3) => handleConfig(c3, defaultCtx));
|
|
6772
|
+
app2.get("/api/runs", (c3) => handleRuns(c3, defaultCtx));
|
|
6773
|
+
app2.get("/api/runs/:filename", (c3) => handleRunDetail(c3, defaultCtx));
|
|
6774
|
+
app2.get("/api/runs/:filename/datasets", (c3) => handleRunDatasets(c3, defaultCtx));
|
|
6775
|
+
app2.get("/api/runs/:filename/categories", (c3) => handleRunCategories(c3, defaultCtx));
|
|
6776
|
+
app2.get(
|
|
6777
|
+
"/api/runs/:filename/categories/:category/datasets",
|
|
6778
|
+
(c3) => handleCategoryDatasets(c3, defaultCtx)
|
|
6779
|
+
);
|
|
6780
|
+
app2.get("/api/runs/:filename/evals/:evalId", (c3) => handleEvalDetail(c3, defaultCtx));
|
|
6781
|
+
app2.get("/api/runs/:filename/evals/:evalId/files", (c3) => handleEvalFiles(c3, defaultCtx));
|
|
6782
|
+
app2.get("/api/runs/:filename/evals/:evalId/files/*", (c3) => handleEvalFileContent(c3, defaultCtx));
|
|
6783
|
+
app2.get("/api/experiments", (c3) => handleExperiments(c3, defaultCtx));
|
|
6784
|
+
app2.get("/api/targets", (c3) => handleTargets(c3, defaultCtx));
|
|
6365
6785
|
app2.get("/api/feedback", (c3) => {
|
|
6366
6786
|
const data = readFeedback(resultDir);
|
|
6367
6787
|
return c3.json(data);
|
|
@@ -6404,127 +6824,6 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
|
|
|
6404
6824
|
writeFeedback(resultDir, existing);
|
|
6405
6825
|
return c3.json(existing);
|
|
6406
6826
|
});
|
|
6407
|
-
app2.get("/api/runs/:filename/datasets", (c3) => {
|
|
6408
|
-
const filename = c3.req.param("filename");
|
|
6409
|
-
const metas = listResultFiles(searchDir);
|
|
6410
|
-
const meta = metas.find((m) => m.filename === filename);
|
|
6411
|
-
if (!meta) {
|
|
6412
|
-
return c3.json({ error: "Run not found" }, 404);
|
|
6413
|
-
}
|
|
6414
|
-
try {
|
|
6415
|
-
const loaded = patchTestIds(loadManifestResults(meta.path));
|
|
6416
|
-
const { pass_threshold } = loadStudioConfig(agentvDir);
|
|
6417
|
-
const datasetMap = /* @__PURE__ */ new Map();
|
|
6418
|
-
for (const r of loaded) {
|
|
6419
|
-
const ds = r.dataset ?? r.target ?? "default";
|
|
6420
|
-
const entry = datasetMap.get(ds) ?? { total: 0, passed: 0, scoreSum: 0 };
|
|
6421
|
-
entry.total++;
|
|
6422
|
-
if (r.score >= pass_threshold) entry.passed++;
|
|
6423
|
-
entry.scoreSum += r.score;
|
|
6424
|
-
datasetMap.set(ds, entry);
|
|
6425
|
-
}
|
|
6426
|
-
const datasets = [...datasetMap.entries()].map(([name, entry]) => ({
|
|
6427
|
-
name,
|
|
6428
|
-
total: entry.total,
|
|
6429
|
-
passed: entry.passed,
|
|
6430
|
-
failed: entry.total - entry.passed,
|
|
6431
|
-
avg_score: entry.total > 0 ? entry.scoreSum / entry.total : 0
|
|
6432
|
-
}));
|
|
6433
|
-
return c3.json({ datasets });
|
|
6434
|
-
} catch {
|
|
6435
|
-
return c3.json({ error: "Failed to load datasets" }, 500);
|
|
6436
|
-
}
|
|
6437
|
-
});
|
|
6438
|
-
app2.get("/api/runs/:filename/categories", (c3) => {
|
|
6439
|
-
const filename = c3.req.param("filename");
|
|
6440
|
-
const metas = listResultFiles(searchDir);
|
|
6441
|
-
const meta = metas.find((m) => m.filename === filename);
|
|
6442
|
-
if (!meta) {
|
|
6443
|
-
return c3.json({ error: "Run not found" }, 404);
|
|
6444
|
-
}
|
|
6445
|
-
try {
|
|
6446
|
-
const loaded = patchTestIds(loadManifestResults(meta.path));
|
|
6447
|
-
const { pass_threshold } = loadStudioConfig(agentvDir);
|
|
6448
|
-
const categoryMap = /* @__PURE__ */ new Map();
|
|
6449
|
-
for (const r of loaded) {
|
|
6450
|
-
const cat = r.category ?? DEFAULT_CATEGORY;
|
|
6451
|
-
const entry = categoryMap.get(cat) ?? {
|
|
6452
|
-
total: 0,
|
|
6453
|
-
passed: 0,
|
|
6454
|
-
scoreSum: 0,
|
|
6455
|
-
datasets: /* @__PURE__ */ new Set()
|
|
6456
|
-
};
|
|
6457
|
-
entry.total++;
|
|
6458
|
-
if (r.score >= pass_threshold) entry.passed++;
|
|
6459
|
-
entry.scoreSum += r.score;
|
|
6460
|
-
entry.datasets.add(r.dataset ?? r.target ?? "default");
|
|
6461
|
-
categoryMap.set(cat, entry);
|
|
6462
|
-
}
|
|
6463
|
-
const categories = [...categoryMap.entries()].map(([name, entry]) => ({
|
|
6464
|
-
name,
|
|
6465
|
-
total: entry.total,
|
|
6466
|
-
passed: entry.passed,
|
|
6467
|
-
failed: entry.total - entry.passed,
|
|
6468
|
-
avg_score: entry.total > 0 ? entry.scoreSum / entry.total : 0,
|
|
6469
|
-
dataset_count: entry.datasets.size
|
|
6470
|
-
}));
|
|
6471
|
-
return c3.json({ categories });
|
|
6472
|
-
} catch {
|
|
6473
|
-
return c3.json({ error: "Failed to load categories" }, 500);
|
|
6474
|
-
}
|
|
6475
|
-
});
|
|
6476
|
-
app2.get("/api/runs/:filename/categories/:category/datasets", (c3) => {
|
|
6477
|
-
const filename = c3.req.param("filename");
|
|
6478
|
-
const category = decodeURIComponent(c3.req.param("category"));
|
|
6479
|
-
const metas = listResultFiles(searchDir);
|
|
6480
|
-
const meta = metas.find((m) => m.filename === filename);
|
|
6481
|
-
if (!meta) {
|
|
6482
|
-
return c3.json({ error: "Run not found" }, 404);
|
|
6483
|
-
}
|
|
6484
|
-
try {
|
|
6485
|
-
const loaded = patchTestIds(loadManifestResults(meta.path));
|
|
6486
|
-
const { pass_threshold } = loadStudioConfig(agentvDir);
|
|
6487
|
-
const filtered = loaded.filter((r) => (r.category ?? DEFAULT_CATEGORY) === category);
|
|
6488
|
-
const datasetMap = /* @__PURE__ */ new Map();
|
|
6489
|
-
for (const r of filtered) {
|
|
6490
|
-
const ds = r.dataset ?? r.target ?? "default";
|
|
6491
|
-
const entry = datasetMap.get(ds) ?? { total: 0, passed: 0, scoreSum: 0 };
|
|
6492
|
-
entry.total++;
|
|
6493
|
-
if (r.score >= pass_threshold) entry.passed++;
|
|
6494
|
-
entry.scoreSum += r.score;
|
|
6495
|
-
datasetMap.set(ds, entry);
|
|
6496
|
-
}
|
|
6497
|
-
const datasets = [...datasetMap.entries()].map(([name, entry]) => ({
|
|
6498
|
-
name,
|
|
6499
|
-
total: entry.total,
|
|
6500
|
-
passed: entry.passed,
|
|
6501
|
-
failed: entry.total - entry.passed,
|
|
6502
|
-
avg_score: entry.total > 0 ? entry.scoreSum / entry.total : 0
|
|
6503
|
-
}));
|
|
6504
|
-
return c3.json({ datasets });
|
|
6505
|
-
} catch {
|
|
6506
|
-
return c3.json({ error: "Failed to load datasets" }, 500);
|
|
6507
|
-
}
|
|
6508
|
-
});
|
|
6509
|
-
app2.get("/api/runs/:filename/evals/:evalId", (c3) => {
|
|
6510
|
-
const filename = c3.req.param("filename");
|
|
6511
|
-
const evalId = c3.req.param("evalId");
|
|
6512
|
-
const metas = listResultFiles(searchDir);
|
|
6513
|
-
const meta = metas.find((m) => m.filename === filename);
|
|
6514
|
-
if (!meta) {
|
|
6515
|
-
return c3.json({ error: "Run not found" }, 404);
|
|
6516
|
-
}
|
|
6517
|
-
try {
|
|
6518
|
-
const loaded = patchTestIds(loadManifestResults(meta.path));
|
|
6519
|
-
const result = loaded.find((r) => r.testId === evalId);
|
|
6520
|
-
if (!result) {
|
|
6521
|
-
return c3.json({ error: "Eval not found" }, 404);
|
|
6522
|
-
}
|
|
6523
|
-
return c3.json({ eval: result });
|
|
6524
|
-
} catch {
|
|
6525
|
-
return c3.json({ error: "Failed to load eval" }, 500);
|
|
6526
|
-
}
|
|
6527
|
-
});
|
|
6528
6827
|
app2.get("/api/index", (c3) => {
|
|
6529
6828
|
const metas = listResultFiles(searchDir);
|
|
6530
6829
|
const entries2 = metas.map((m) => {
|
|
@@ -6545,204 +6844,49 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
|
|
|
6545
6844
|
});
|
|
6546
6845
|
return c3.json({ entries: entries2 });
|
|
6547
6846
|
});
|
|
6548
|
-
|
|
6549
|
-
|
|
6550
|
-
|
|
6551
|
-
|
|
6552
|
-
|
|
6553
|
-
|
|
6554
|
-
|
|
6555
|
-
|
|
6556
|
-
|
|
6557
|
-
|
|
6558
|
-
|
|
6559
|
-
|
|
6560
|
-
|
|
6561
|
-
|
|
6562
|
-
|
|
6563
|
-
|
|
6564
|
-
|
|
6565
|
-
|
|
6566
|
-
|
|
6567
|
-
|
|
6568
|
-
|
|
6569
|
-
|
|
6570
|
-
|
|
6571
|
-
|
|
6572
|
-
|
|
6573
|
-
|
|
6574
|
-
|
|
6575
|
-
|
|
6576
|
-
|
|
6577
|
-
|
|
6578
|
-
|
|
6579
|
-
|
|
6580
|
-
|
|
6581
|
-
".yml": "yaml",
|
|
6582
|
-
".log": "plaintext",
|
|
6583
|
-
".txt": "plaintext",
|
|
6584
|
-
".py": "python",
|
|
6585
|
-
".sh": "shell",
|
|
6586
|
-
".bash": "shell",
|
|
6587
|
-
".css": "css",
|
|
6588
|
-
".html": "html",
|
|
6589
|
-
".xml": "xml",
|
|
6590
|
-
".svg": "xml",
|
|
6591
|
-
".toml": "toml",
|
|
6592
|
-
".diff": "diff",
|
|
6593
|
-
".patch": "diff"
|
|
6594
|
-
};
|
|
6595
|
-
return langMap[ext] ?? "plaintext";
|
|
6847
|
+
app2.get("/api/projects/:projectId/config", (c3) => withProject(c3, handleConfig));
|
|
6848
|
+
app2.get("/api/projects/:projectId/runs", (c3) => withProject(c3, handleRuns));
|
|
6849
|
+
app2.get("/api/projects/:projectId/runs/:filename", (c3) => withProject(c3, handleRunDetail));
|
|
6850
|
+
app2.get(
|
|
6851
|
+
"/api/projects/:projectId/runs/:filename/datasets",
|
|
6852
|
+
(c3) => withProject(c3, handleRunDatasets)
|
|
6853
|
+
);
|
|
6854
|
+
app2.get(
|
|
6855
|
+
"/api/projects/:projectId/runs/:filename/categories",
|
|
6856
|
+
(c3) => withProject(c3, handleRunCategories)
|
|
6857
|
+
);
|
|
6858
|
+
app2.get(
|
|
6859
|
+
"/api/projects/:projectId/runs/:filename/categories/:category/datasets",
|
|
6860
|
+
(c3) => withProject(c3, handleCategoryDatasets)
|
|
6861
|
+
);
|
|
6862
|
+
app2.get(
|
|
6863
|
+
"/api/projects/:projectId/runs/:filename/evals/:evalId",
|
|
6864
|
+
(c3) => withProject(c3, handleEvalDetail)
|
|
6865
|
+
);
|
|
6866
|
+
app2.get(
|
|
6867
|
+
"/api/projects/:projectId/runs/:filename/evals/:evalId/files",
|
|
6868
|
+
(c3) => withProject(c3, handleEvalFiles)
|
|
6869
|
+
);
|
|
6870
|
+
app2.get(
|
|
6871
|
+
"/api/projects/:projectId/runs/:filename/evals/:evalId/files/*",
|
|
6872
|
+
(c3) => withProject(c3, handleEvalFileContent)
|
|
6873
|
+
);
|
|
6874
|
+
app2.get("/api/projects/:projectId/experiments", (c3) => withProject(c3, handleExperiments));
|
|
6875
|
+
app2.get("/api/projects/:projectId/targets", (c3) => withProject(c3, handleTargets));
|
|
6876
|
+
app2.get("/api/projects/:projectId/feedback", (c3) => withProject(c3, handleFeedbackRead));
|
|
6877
|
+
const studioDistPath = options?.studioDir ?? resolveStudioDistDir();
|
|
6878
|
+
if (!studioDistPath || !existsSync8(path10.join(studioDistPath, "index.html"))) {
|
|
6879
|
+
throw new Error('Studio dist not found. Run "bun run build" in apps/studio/ to build the SPA.');
|
|
6596
6880
|
}
|
|
6597
|
-
app2.get("/
|
|
6598
|
-
const
|
|
6599
|
-
|
|
6600
|
-
|
|
6601
|
-
const meta = metas.find((m) => m.filename === filename);
|
|
6602
|
-
if (!meta) {
|
|
6603
|
-
return c3.json({ error: "Run not found" }, 404);
|
|
6604
|
-
}
|
|
6605
|
-
try {
|
|
6606
|
-
const content = readFileSync9(meta.path, "utf8");
|
|
6607
|
-
const records = parseResultManifest(content);
|
|
6608
|
-
const record = records.find((r) => (r.test_id ?? r.eval_id) === evalId);
|
|
6609
|
-
if (!record) {
|
|
6610
|
-
return c3.json({ error: "Eval not found" }, 404);
|
|
6611
|
-
}
|
|
6612
|
-
const baseDir = path10.dirname(meta.path);
|
|
6613
|
-
const knownPaths = [
|
|
6614
|
-
record.grading_path,
|
|
6615
|
-
record.timing_path,
|
|
6616
|
-
record.input_path,
|
|
6617
|
-
record.output_path,
|
|
6618
|
-
record.response_path
|
|
6619
|
-
].filter((p) => !!p);
|
|
6620
|
-
if (knownPaths.length === 0) {
|
|
6621
|
-
return c3.json({ files: [] });
|
|
6622
|
-
}
|
|
6623
|
-
const artifactDirs = knownPaths.map((p) => path10.dirname(p));
|
|
6624
|
-
let commonDir = artifactDirs[0];
|
|
6625
|
-
for (const dir of artifactDirs) {
|
|
6626
|
-
while (!dir.startsWith(commonDir)) {
|
|
6627
|
-
commonDir = path10.dirname(commonDir);
|
|
6628
|
-
}
|
|
6629
|
-
}
|
|
6630
|
-
const artifactAbsDir = path10.join(baseDir, commonDir);
|
|
6631
|
-
const files = buildFileTree(artifactAbsDir, baseDir);
|
|
6632
|
-
return c3.json({ files });
|
|
6633
|
-
} catch {
|
|
6634
|
-
return c3.json({ error: "Failed to load file tree" }, 500);
|
|
6635
|
-
}
|
|
6636
|
-
});
|
|
6637
|
-
app2.get("/api/runs/:filename/evals/:evalId/files/*", (c3) => {
|
|
6638
|
-
const filename = c3.req.param("filename");
|
|
6639
|
-
const evalId = c3.req.param("evalId");
|
|
6640
|
-
const metas = listResultFiles(searchDir);
|
|
6641
|
-
const meta = metas.find((m) => m.filename === filename);
|
|
6642
|
-
if (!meta) {
|
|
6643
|
-
return c3.json({ error: "Run not found" }, 404);
|
|
6644
|
-
}
|
|
6645
|
-
const requestPath = c3.req.path;
|
|
6646
|
-
const prefix = `/api/runs/${filename}/evals/${evalId}/files/`;
|
|
6647
|
-
const filePath = requestPath.slice(prefix.length);
|
|
6648
|
-
if (!filePath) {
|
|
6649
|
-
return c3.json({ error: "No file path specified" }, 400);
|
|
6650
|
-
}
|
|
6651
|
-
const baseDir = path10.dirname(meta.path);
|
|
6652
|
-
const absolutePath = path10.resolve(baseDir, filePath);
|
|
6653
|
-
if (!absolutePath.startsWith(path10.resolve(baseDir) + path10.sep) && absolutePath !== path10.resolve(baseDir)) {
|
|
6654
|
-
return c3.json({ error: "Path traversal not allowed" }, 403);
|
|
6655
|
-
}
|
|
6656
|
-
if (!existsSync8(absolutePath) || !statSync4(absolutePath).isFile()) {
|
|
6657
|
-
return c3.json({ error: "File not found" }, 404);
|
|
6658
|
-
}
|
|
6659
|
-
try {
|
|
6660
|
-
const fileContent = readFileSync9(absolutePath, "utf8");
|
|
6661
|
-
const language = inferLanguage(absolutePath);
|
|
6662
|
-
return c3.json({ content: fileContent, language });
|
|
6663
|
-
} catch {
|
|
6664
|
-
return c3.json({ error: "Failed to read file" }, 500);
|
|
6665
|
-
}
|
|
6666
|
-
});
|
|
6667
|
-
app2.get("/api/experiments", (c3) => {
|
|
6668
|
-
const metas = listResultFiles(searchDir);
|
|
6669
|
-
const { pass_threshold } = loadStudioConfig(agentvDir);
|
|
6670
|
-
const experimentMap = /* @__PURE__ */ new Map();
|
|
6671
|
-
for (const m of metas) {
|
|
6672
|
-
try {
|
|
6673
|
-
const records = loadLightweightResults(m.path);
|
|
6674
|
-
for (const r of records) {
|
|
6675
|
-
const experiment = r.experiment ?? "default";
|
|
6676
|
-
const entry = experimentMap.get(experiment) ?? {
|
|
6677
|
-
targets: /* @__PURE__ */ new Set(),
|
|
6678
|
-
runFilenames: /* @__PURE__ */ new Set(),
|
|
6679
|
-
evalCount: 0,
|
|
6680
|
-
passedCount: 0,
|
|
6681
|
-
lastTimestamp: ""
|
|
6682
|
-
};
|
|
6683
|
-
entry.runFilenames.add(m.filename);
|
|
6684
|
-
if (r.target) entry.targets.add(r.target);
|
|
6685
|
-
entry.evalCount++;
|
|
6686
|
-
if (r.score >= pass_threshold) entry.passedCount++;
|
|
6687
|
-
if (r.timestamp && r.timestamp > entry.lastTimestamp) {
|
|
6688
|
-
entry.lastTimestamp = r.timestamp;
|
|
6689
|
-
}
|
|
6690
|
-
experimentMap.set(experiment, entry);
|
|
6691
|
-
}
|
|
6692
|
-
} catch {
|
|
6693
|
-
}
|
|
6694
|
-
}
|
|
6695
|
-
const experiments = [...experimentMap.entries()].map(([name, entry]) => ({
|
|
6696
|
-
name,
|
|
6697
|
-
run_count: entry.runFilenames.size,
|
|
6698
|
-
target_count: entry.targets.size,
|
|
6699
|
-
eval_count: entry.evalCount,
|
|
6700
|
-
passed_count: entry.passedCount,
|
|
6701
|
-
pass_rate: entry.evalCount > 0 ? entry.passedCount / entry.evalCount : 0,
|
|
6702
|
-
last_run: entry.lastTimestamp || null
|
|
6703
|
-
}));
|
|
6704
|
-
return c3.json({ experiments });
|
|
6705
|
-
});
|
|
6706
|
-
app2.get("/api/targets", (c3) => {
|
|
6707
|
-
const metas = listResultFiles(searchDir);
|
|
6708
|
-
const { pass_threshold } = loadStudioConfig(agentvDir);
|
|
6709
|
-
const targetMap = /* @__PURE__ */ new Map();
|
|
6710
|
-
for (const m of metas) {
|
|
6711
|
-
try {
|
|
6712
|
-
const records = loadLightweightResults(m.path);
|
|
6713
|
-
for (const r of records) {
|
|
6714
|
-
const target = r.target ?? "default";
|
|
6715
|
-
const entry = targetMap.get(target) ?? {
|
|
6716
|
-
experiments: /* @__PURE__ */ new Set(),
|
|
6717
|
-
runFilenames: /* @__PURE__ */ new Set(),
|
|
6718
|
-
evalCount: 0,
|
|
6719
|
-
passedCount: 0
|
|
6720
|
-
};
|
|
6721
|
-
entry.runFilenames.add(m.filename);
|
|
6722
|
-
if (r.experiment) entry.experiments.add(r.experiment);
|
|
6723
|
-
entry.evalCount++;
|
|
6724
|
-
if (r.score >= pass_threshold) entry.passedCount++;
|
|
6725
|
-
targetMap.set(target, entry);
|
|
6726
|
-
}
|
|
6727
|
-
} catch {
|
|
6728
|
-
}
|
|
6729
|
-
}
|
|
6730
|
-
const targets = [...targetMap.entries()].map(([name, entry]) => ({
|
|
6731
|
-
name,
|
|
6732
|
-
run_count: entry.runFilenames.size,
|
|
6733
|
-
experiment_count: entry.experiments.size,
|
|
6734
|
-
eval_count: entry.evalCount,
|
|
6735
|
-
passed_count: entry.passedCount,
|
|
6736
|
-
pass_rate: entry.evalCount > 0 ? entry.passedCount / entry.evalCount : 0
|
|
6737
|
-
}));
|
|
6738
|
-
return c3.json({ targets });
|
|
6881
|
+
app2.get("/", (c3) => {
|
|
6882
|
+
const indexPath = path10.join(studioDistPath, "index.html");
|
|
6883
|
+
if (existsSync8(indexPath)) return c3.html(readFileSync9(indexPath, "utf8"));
|
|
6884
|
+
return c3.notFound();
|
|
6739
6885
|
});
|
|
6740
6886
|
app2.get("/assets/*", (c3) => {
|
|
6741
6887
|
const assetPath = c3.req.path;
|
|
6742
6888
|
const filePath = path10.join(studioDistPath, assetPath);
|
|
6743
|
-
if (!existsSync8(filePath))
|
|
6744
|
-
return c3.notFound();
|
|
6745
|
-
}
|
|
6889
|
+
if (!existsSync8(filePath)) return c3.notFound();
|
|
6746
6890
|
const content = readFileSync9(filePath);
|
|
6747
6891
|
const ext = path10.extname(filePath);
|
|
6748
6892
|
const mimeTypes = {
|
|
@@ -6764,13 +6908,9 @@ function createApp(results, resultDir, cwd, sourceFile, options) {
|
|
|
6764
6908
|
});
|
|
6765
6909
|
});
|
|
6766
6910
|
app2.get("*", (c3) => {
|
|
6767
|
-
if (c3.req.path.startsWith("/api/")) {
|
|
6768
|
-
return c3.json({ error: "Not found" }, 404);
|
|
6769
|
-
}
|
|
6911
|
+
if (c3.req.path.startsWith("/api/")) return c3.json({ error: "Not found" }, 404);
|
|
6770
6912
|
const indexPath = path10.join(studioDistPath, "index.html");
|
|
6771
|
-
if (existsSync8(indexPath))
|
|
6772
|
-
return c3.html(readFileSync9(indexPath, "utf8"));
|
|
6773
|
-
}
|
|
6913
|
+
if (existsSync8(indexPath)) return c3.html(readFileSync9(indexPath, "utf8"));
|
|
6774
6914
|
return c3.notFound();
|
|
6775
6915
|
});
|
|
6776
6916
|
return app2;
|
|
@@ -6794,18 +6934,6 @@ function resolveStudioDistDir() {
|
|
|
6794
6934
|
}
|
|
6795
6935
|
return void 0;
|
|
6796
6936
|
}
|
|
6797
|
-
function stripHeavyFields(results) {
|
|
6798
|
-
return results.map((r) => {
|
|
6799
|
-
const { requests, trace, ...rest } = r;
|
|
6800
|
-
const toolCalls = trace?.toolCalls && Object.keys(trace.toolCalls).length > 0 ? trace.toolCalls : void 0;
|
|
6801
|
-
const graderDurationMs = (r.scores ?? []).reduce((sum, s) => sum + (s.durationMs ?? 0), 0);
|
|
6802
|
-
return {
|
|
6803
|
-
...rest,
|
|
6804
|
-
...toolCalls && { _toolCalls: toolCalls },
|
|
6805
|
-
...graderDurationMs > 0 && { _graderDurationMs: graderDurationMs }
|
|
6806
|
-
};
|
|
6807
|
-
});
|
|
6808
|
-
}
|
|
6809
6937
|
var resultsServeCommand = command({
|
|
6810
6938
|
name: "studio",
|
|
6811
6939
|
description: "Start AgentV Studio \u2014 a local dashboard for reviewing evaluation results",
|
|
@@ -6826,11 +6954,66 @@ var resultsServeCommand = command({
|
|
|
6826
6954
|
long: "dir",
|
|
6827
6955
|
short: "d",
|
|
6828
6956
|
description: "Working directory (default: current directory)"
|
|
6957
|
+
}),
|
|
6958
|
+
multi: flag({
|
|
6959
|
+
long: "multi",
|
|
6960
|
+
description: "Launch in multi-project dashboard mode"
|
|
6961
|
+
}),
|
|
6962
|
+
add: option({
|
|
6963
|
+
type: optional(string),
|
|
6964
|
+
long: "add",
|
|
6965
|
+
description: "Register a project by path"
|
|
6966
|
+
}),
|
|
6967
|
+
remove: option({
|
|
6968
|
+
type: optional(string),
|
|
6969
|
+
long: "remove",
|
|
6970
|
+
description: "Unregister a project by ID"
|
|
6971
|
+
}),
|
|
6972
|
+
discover: option({
|
|
6973
|
+
type: optional(string),
|
|
6974
|
+
long: "discover",
|
|
6975
|
+
description: "Scan a directory tree for repos with .agentv/"
|
|
6829
6976
|
})
|
|
6830
6977
|
},
|
|
6831
|
-
handler: async ({ source, port, dir }) => {
|
|
6978
|
+
handler: async ({ source, port, dir, multi, add, remove, discover }) => {
|
|
6832
6979
|
const cwd = dir ?? process.cwd();
|
|
6833
6980
|
const listenPort = port ?? (process.env.PORT ? Number(process.env.PORT) : 3117);
|
|
6981
|
+
if (add) {
|
|
6982
|
+
try {
|
|
6983
|
+
const entry = addProject(add);
|
|
6984
|
+
console.log(`Registered project: ${entry.name} (${entry.id}) at ${entry.path}`);
|
|
6985
|
+
} catch (err2) {
|
|
6986
|
+
console.error(`Error: ${err2.message}`);
|
|
6987
|
+
process.exit(1);
|
|
6988
|
+
}
|
|
6989
|
+
return;
|
|
6990
|
+
}
|
|
6991
|
+
if (remove) {
|
|
6992
|
+
const removed = removeProject(remove);
|
|
6993
|
+
if (removed) {
|
|
6994
|
+
console.log(`Unregistered project: ${remove}`);
|
|
6995
|
+
} else {
|
|
6996
|
+
console.error(`Project not found: ${remove}`);
|
|
6997
|
+
process.exit(1);
|
|
6998
|
+
}
|
|
6999
|
+
return;
|
|
7000
|
+
}
|
|
7001
|
+
if (discover) {
|
|
7002
|
+
const discovered = discoverProjects(discover);
|
|
7003
|
+
if (discovered.length === 0) {
|
|
7004
|
+
console.log(`No projects with .agentv/ found under ${discover}`);
|
|
7005
|
+
return;
|
|
7006
|
+
}
|
|
7007
|
+
for (const p of discovered) {
|
|
7008
|
+
const entry = addProject(p);
|
|
7009
|
+
console.log(`Registered: ${entry.name} (${entry.id}) at ${entry.path}`);
|
|
7010
|
+
}
|
|
7011
|
+
console.log(`
|
|
7012
|
+
Discovered ${discovered.length} project(s).`);
|
|
7013
|
+
return;
|
|
7014
|
+
}
|
|
7015
|
+
const registry = loadProjectRegistry();
|
|
7016
|
+
const isMultiProject = multi || registry.projects.length > 0;
|
|
6834
7017
|
try {
|
|
6835
7018
|
let results = [];
|
|
6836
7019
|
let sourceFile;
|
|
@@ -6858,16 +7041,16 @@ var resultsServeCommand = command({
|
|
|
6858
7041
|
}
|
|
6859
7042
|
const resultDir = sourceFile ? path10.dirname(path10.resolve(sourceFile)) : cwd;
|
|
6860
7043
|
const app2 = createApp(results, resultDir, cwd, sourceFile);
|
|
6861
|
-
if (
|
|
7044
|
+
if (isMultiProject) {
|
|
7045
|
+
console.log(`Multi-project mode: ${registry.projects.length} project(s) registered`);
|
|
7046
|
+
} else if (results.length > 0 && sourceFile) {
|
|
6862
7047
|
console.log(`Serving ${results.length} result(s) from ${sourceFile}`);
|
|
6863
7048
|
} else {
|
|
6864
7049
|
console.log("No results found. Dashboard will show an empty state.");
|
|
6865
7050
|
console.log("Run an evaluation to see results: agentv eval <eval-file>");
|
|
6866
7051
|
}
|
|
6867
7052
|
console.log(`Dashboard: http://localhost:${listenPort}`);
|
|
6868
|
-
console.log(`
|
|
6869
|
-
console.log(`Result picker API: http://localhost:${listenPort}/api/runs`);
|
|
6870
|
-
console.log(`Feedback file: ${feedbackPath(resultDir)}`);
|
|
7053
|
+
console.log(`Projects API: http://localhost:${listenPort}/api/projects`);
|
|
6871
7054
|
console.log("Press Ctrl+C to stop");
|
|
6872
7055
|
const { serve: startServer } = await import("@hono/node-server");
|
|
6873
7056
|
startServer({
|
|
@@ -8042,7 +8225,7 @@ function isYamlFile(filePath) {
|
|
|
8042
8225
|
}
|
|
8043
8226
|
|
|
8044
8227
|
// src/commands/validate/index.ts
|
|
8045
|
-
async function runValidateCommand(paths) {
|
|
8228
|
+
async function runValidateCommand(paths, maxWarnings) {
|
|
8046
8229
|
if (paths.length === 0) {
|
|
8047
8230
|
console.error("Error: No paths specified. Usage: agentv validate <paths...>");
|
|
8048
8231
|
process.exit(1);
|
|
@@ -8053,6 +8236,18 @@ async function runValidateCommand(paths) {
|
|
|
8053
8236
|
if (summary.invalidFiles > 0) {
|
|
8054
8237
|
process.exit(1);
|
|
8055
8238
|
}
|
|
8239
|
+
if (maxWarnings !== void 0) {
|
|
8240
|
+
const warningCount = summary.results.reduce(
|
|
8241
|
+
(count, r) => count + r.errors.filter((e) => e.severity === "warning").length,
|
|
8242
|
+
0
|
|
8243
|
+
);
|
|
8244
|
+
if (warningCount > maxWarnings) {
|
|
8245
|
+
console.error(
|
|
8246
|
+
`Found ${warningCount} warning${warningCount === 1 ? "" : "s"} (max allowed: ${maxWarnings})`
|
|
8247
|
+
);
|
|
8248
|
+
process.exit(1);
|
|
8249
|
+
}
|
|
8250
|
+
}
|
|
8056
8251
|
}
|
|
8057
8252
|
var validateCommand = command({
|
|
8058
8253
|
name: "validate",
|
|
@@ -8062,11 +8257,16 @@ var validateCommand = command({
|
|
|
8062
8257
|
type: string,
|
|
8063
8258
|
displayName: "paths",
|
|
8064
8259
|
description: "Files or directories to validate"
|
|
8260
|
+
}),
|
|
8261
|
+
maxWarnings: option({
|
|
8262
|
+
type: optional(number),
|
|
8263
|
+
long: "max-warnings",
|
|
8264
|
+
description: "Maximum number of warnings allowed before failing (e.g., --max-warnings 0)"
|
|
8065
8265
|
})
|
|
8066
8266
|
},
|
|
8067
|
-
handler: async ({ paths }) => {
|
|
8267
|
+
handler: async ({ paths, maxWarnings }) => {
|
|
8068
8268
|
try {
|
|
8069
|
-
await runValidateCommand(paths);
|
|
8269
|
+
await runValidateCommand(paths, maxWarnings);
|
|
8070
8270
|
} catch (error) {
|
|
8071
8271
|
console.error(`Error: ${error.message}`);
|
|
8072
8272
|
process.exit(1);
|
|
@@ -8414,4 +8614,4 @@ export {
|
|
|
8414
8614
|
preprocessArgv,
|
|
8415
8615
|
runCli
|
|
8416
8616
|
};
|
|
8417
|
-
//# sourceMappingURL=chunk-
|
|
8617
|
+
//# sourceMappingURL=chunk-YORCRL4G.js.map
|