@ls-stack/agent-eval 0.5.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-C5CJ1sX6.mjs → app-TjV5nDMM.mjs} +5 -5
- package/dist/apps/web/dist/assets/index-ClE28i5w.css +1 -0
- package/dist/apps/web/dist/assets/index-gGumCEnD.js +112 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +39 -6
- package/dist/cli-BTtgQLjB.mjs +1285 -0
- package/dist/index.d.mts +1072 -829
- package/dist/index.mjs +4 -3
- package/dist/runChild.d.mts +1 -0
- package/dist/runChild.mjs +107 -0
- package/dist/{cli-C5FL7C4G.mjs → runOrchestration-HaMahl6b.mjs} +1216 -1697
- package/dist/{runner-Cdlvk56X.mjs → runner-CBDZos0Z.mjs} +1 -1
- package/dist/{runner-K2bN8KRS.mjs → runner-DGVoOyJt.mjs} +2 -2
- package/dist/src-Bt5Fz9HS.mjs +3 -0
- package/package.json +3 -2
- package/dist/apps/web/dist/assets/index-CBvHVkE7.js +0 -109
- package/dist/apps/web/dist/assets/index-Dd7I28ts.css +0 -1
- package/dist/src-gqm1z1Nu.mjs +0 -2
|
@@ -0,0 +1,1285 @@
|
|
|
1
|
+
import { D as deriveScopedSummaryFromCases, E as getEvalDisplayStatus, Kt as getEvalRegistry, T as getEvalTitle, _ as loadEvalModule, a as loadPersistedRunSnapshot, b as normalizeScoreDef, c as persistCaseDetail, d as recomputePersistedCaseStatus, f as runTouchesEval, g as setLatestRunInfoMap, h as getTargetEvalIds, i as getLatestRunInfos, j as runSummarySchema, l as persistRunState, m as buildEvalSummary, n as generateRunId, o as loadPersistedRunSnapshots, p as resolveArtifactPath, r as getLastRunStatuses, s as nextShortIdFromSnapshots, u as recomputeEvalStatusesInRuns, v as loadConfig, x as createFsCacheStore, y as buildDeclaredColumnDefs } from "./runOrchestration-HaMahl6b.mjs";
|
|
2
|
+
import { createHash } from "node:crypto";
|
|
3
|
+
import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
|
|
4
|
+
import { dirname, join, relative, resolve } from "node:path";
|
|
5
|
+
import { watch } from "chokidar";
|
|
6
|
+
import { glob } from "glob";
|
|
7
|
+
import { existsSync } from "node:fs";
|
|
8
|
+
import { resultify } from "t-result";
|
|
9
|
+
import { fileURLToPath } from "node:url";
|
|
10
|
+
import { spawn, spawnSync } from "node:child_process";
|
|
11
|
+
//#region ../runner/src/chartValidation.ts
|
|
12
|
+
/**
 * Decide whether a column-backed chart metric may be kept.
 *
 * The metric is rejected, and a warning is appended to `warnings`, when its
 * `key` is not present in `columnsByKey`, or when it aggregates with
 * `"passThresholdRate"` while the column is not a score carrying a numeric
 * `passThreshold`.
 *
 * @param metric - Metric entry; its `key` and `aggregate` are read.
 * @param columnsByKey - Map from column key to column definition.
 * @param evalId - Eval id used as the prefix of warning messages.
 * @param warnings - Array collecting warning strings (mutated).
 * @returns `true` when the metric is valid, `false` otherwise.
 */
function isValidColumnMetric(metric, columnsByKey, evalId, warnings) {
  const column = columnsByKey.get(metric.key);
  if (!column) {
    warnings.push(`[${evalId}] chart metric references unknown column "${metric.key}" — dropped`);
    return false;
  }
  const wantsPassRate = metric.aggregate === "passThresholdRate";
  const supportsPassRate = column.isScore === true && typeof column.passThreshold === "number";
  if (wantsPassRate && !supportsPassRate) {
    warnings.push(`[${evalId}] chart metric "${metric.key}" uses "passThresholdRate" but the column is not a score with passThreshold — dropped`);
    return false;
  }
  return true;
}
|
|
26
|
+
/**
 * Decide whether a column-backed chart tooltip extra may be kept.
 *
 * Mirrors the metric check: the extra must reference a column present in
 * `columnsByKey`, and `"passThresholdRate"` is only accepted for score
 * columns that carry a numeric `passThreshold`. Each rejection appends a
 * warning to `warnings`.
 *
 * @param extra - Tooltip extra entry; its `key` and `aggregate` are read.
 * @param columnsByKey - Map from column key to column definition.
 * @param evalId - Eval id used as the prefix of warning messages.
 * @param warnings - Array collecting warning strings (mutated).
 * @returns `true` when the extra is valid, `false` otherwise.
 */
function isValidTooltipExtra(extra, columnsByKey, evalId, warnings) {
  const reject = (text) => {
    warnings.push(text);
    return false;
  };
  const column = columnsByKey.get(extra.key);
  if (!column) {
    return reject(`[${evalId}] chart tooltip extra references unknown column "${extra.key}" — dropped`);
  }
  if (extra.aggregate !== "passThresholdRate") return true;
  const isThresholdedScore = column.isScore === true && typeof column.passThreshold === "number";
  if (isThresholdedScore) return true;
  return reject(`[${evalId}] chart tooltip extra "${extra.key}" uses "passThresholdRate" but the column is not a score with passThreshold — dropped`);
}
|
|
40
|
+
/**
 * Produce a copy of `chart` that only keeps valid metrics and tooltip extras.
 *
 * Entries whose `source` is `"builtin"` are always kept; every other entry
 * is checked against the declared columns. When no metric survives, a
 * warning is recorded and `null` is returned. The returned chart always has
 * a `tooltipExtras` property, set to `undefined` when nothing remains.
 *
 * @param chart - Authored chart config with `metrics` and optional `tooltipExtras`.
 * @param columnsByKey - Map from column key to column definition.
 * @param evalId - Eval id used as the prefix of warning messages.
 * @param warnings - Array collecting warning strings (mutated).
 * @returns The sanitized chart, or `null` when the chart must be dropped.
 */
function sanitizeChart(chart, columnsByKey, evalId, warnings) {
  const keepMetric = (metric) => metric.source === "builtin" || isValidColumnMetric(metric, columnsByKey, evalId, warnings);
  const keepExtra = (extra) => extra.source === "builtin" || isValidTooltipExtra(extra, columnsByKey, evalId, warnings);

  const metrics = chart.metrics.filter(keepMetric);
  if (metrics.length === 0) {
    warnings.push(`[${evalId}] chart had no valid metrics after validation — chart dropped`);
    return null;
  }

  const survivingExtras = chart.tooltipExtras?.filter(keepExtra);
  const tooltipExtras = survivingExtras !== void 0 && survivingExtras.length > 0 ? survivingExtras : void 0;
  return {
    ...chart,
    metrics,
    tooltipExtras
  };
}
|
|
59
|
+
/**
 * Check an authored `charts` config against the columns an eval declares
 * and return only what is safe to render.
 *
 * Metrics and tooltip extras that point at unknown columns or misuse
 * `passThresholdRate` are removed; a chart left without any valid metric is
 * removed entirely. When no chart remains, or none was authored, `charts`
 * is `undefined` in the result.
 *
 * @param params - `{ charts, columnDefs, evalId }`.
 * @returns `{ charts, warnings }` where `warnings` lists every dropped item.
 */
function validateCharts(params) {
  const { charts, columnDefs, evalId } = params;
  const warnings = [];
  if (!charts || charts.length === 0) {
    return { charts: void 0, warnings };
  }

  const columnsByKey = new Map();
  columnDefs.forEach((def) => {
    columnsByKey.set(def.key, def);
  });

  // sanitizeChart yields null for charts that must be dropped.
  const sanitized = Array.from(charts, (chart) => sanitizeChart(chart, columnsByKey, evalId, warnings)).filter((result) => result);
  return {
    charts: sanitized.length > 0 ? sanitized : void 0,
    warnings
  };
}
|
|
84
|
+
//#endregion
|
|
85
|
+
//#region ../runner/src/discovery.ts
|
|
86
|
+
const evalIdMatchRegex = /\bid\s*:\s*['"]([^'"]+)['"]/;
|
|
87
|
+
const evalTitleMatchRegex = /\btitle\s*:\s*['"]([^'"]+)['"]/;
|
|
88
|
+
/**
 * Scan source text for `defineEval` occurrences and collect the `id` (and
 * `title`, when present) found in the object literal that follows each one.
 *
 * This is a text scan, not a parse: occurrences whose object literal cannot
 * be extracted are stepped over, and objects without a matching `id` are
 * ignored.
 *
 * @param filePath - Path recorded on every returned entry.
 * @param content - Source text to scan.
 * @returns Array of `{ filePath, id, title? }` in order of appearance.
 */
function parseEvalMetas(filePath, content) {
  const found = [];
  let from = 0;
  for (;;) {
    const callAt = content.indexOf("defineEval", from);
    if (callAt === -1) return found;

    const extracted = extractDefineEvalObject(content, callAt);
    if (!extracted) {
      // Step past this occurrence of the identifier and keep scanning.
      from = callAt + "defineEval".length;
      continue;
    }
    from = extracted.nextIndex;

    const id = evalIdMatchRegex.exec(extracted.objectText)?.[1];
    if (id === void 0) continue;
    const title = evalTitleMatchRegex.exec(extracted.objectText)?.[1];
    found.push(title === void 0 ? { filePath, id } : { filePath, id, title });
  }
}
|
|
113
|
+
/**
 * Extract the text of the first `{ ... }` block that follows the first `(`
 * at or after `defineEvalIndex`.
 *
 * Braces are balanced with a small character scanner that ignores anything
 * inside `//` comments, block comments, and single-quoted, double-quoted or
 * backtick-quoted strings (honouring backslash escapes inside strings).
 *
 * @param content - Source text being scanned.
 * @param defineEvalIndex - Index at which the search for `(` starts.
 * @returns `{ nextIndex, objectText }`, where `nextIndex` is the index just
 *   past the closing brace; `undefined` when no `(`/`{` is found or the
 *   braces never balance.
 */
function extractDefineEvalObject(content, defineEvalIndex) {
  const parenAt = content.indexOf("(", defineEvalIndex);
  if (parenAt === -1) return void 0;
  const braceAt = content.indexOf("{", parenAt);
  if (braceAt === -1) return void 0;

  // Scanner modes: "code", "lineComment", "blockComment", "string".
  let mode = "code";
  let openQuote;
  let pendingEscape = false;
  let braceDepth = 0;

  for (let cursor = braceAt; cursor < content.length; cursor += 1) {
    const ch = content[cursor];
    const lookahead = content[cursor + 1];
    switch (mode) {
      case "lineComment":
        if (ch === "\n") mode = "code";
        break;
      case "blockComment":
        if (ch === "*" && lookahead === "/") {
          mode = "code";
          cursor += 1; // consume the "/" of the terminator
        }
        break;
      case "string":
        if (pendingEscape) pendingEscape = false;
        else if (ch === "\\") pendingEscape = true;
        else if (ch === openQuote) mode = "code";
        break;
      default:
        if (ch === "/" && lookahead === "/") {
          mode = "lineComment";
          cursor += 1;
        } else if (ch === "/" && lookahead === "*") {
          mode = "blockComment";
          cursor += 1;
        } else if (ch === "\"" || ch === "'" || ch === "`") {
          openQuote = ch;
          mode = "string";
        } else if (ch === "{") {
          braceDepth += 1;
        } else if (ch === "}") {
          braceDepth -= 1;
          if (braceDepth === 0) {
            return {
              nextIndex: cursor + 1,
              objectText: content.slice(braceAt, cursor + 1)
            };
          }
        }
    }
  }
  return void 0;
}
|
|
176
|
+
//#endregion
|
|
177
|
+
//#region ../runner/src/gitState.ts
|
|
178
|
+
/**
 * Run `git` synchronously in `workspaceRoot` and capture its standard output.
 *
 * stdin and stderr are ignored; only stdout is piped back.
 *
 * @param workspaceRoot - Directory used as the working directory for git.
 * @param args - Argument list passed to the `git` executable.
 * @returns `{ status, stdout }`: `status` is the exit code as reported by
 *   `spawnSync` (`null` when the process could not be started), and `stdout`
 *   is the trimmed output, or `""` when no output was captured.
 */
function runGitCommand(workspaceRoot, args) {
  const result = spawnSync("git", args, {
    cwd: workspaceRoot,
    encoding: "utf8",
    stdio: [
      "ignore",
      "pipe",
      "ignore"
    ]
  });
  // When the process cannot be spawned (git not installed, missing cwd),
  // spawnSync reports `error` and leaves `stdout` null; trimming it
  // unguarded would throw instead of signalling "git unavailable".
  const stdout = typeof result.stdout === "string" ? result.stdout.trim() : "";
  return {
    status: result.status,
    stdout
  };
}
|
|
193
|
+
/**
 * Report the HEAD commit SHA of the git work tree containing `workspaceRoot`.
 * `commitSha` is `null` when the directory is not inside a work tree or the
 * commit cannot be resolved.
 */
function readGitWorktreeState(workspaceRoot) {
  const probe = runGitCommand(workspaceRoot, ["rev-parse", "--is-inside-work-tree"]);
  const isInsideWorktree = probe.status === 0 && probe.stdout === "true";
  if (!isInsideWorktree) return { commitSha: null };

  const head = runGitCommand(workspaceRoot, ["rev-parse", "HEAD"]);
  if (head.status !== 0) return { commitSha: null };
  return { commitSha: head.stdout };
}
|
|
200
|
+
//#endregion
|
|
201
|
+
//#region ../runner/src/runChildProtocol.ts
|
|
202
|
+
/**
 * Shape check for IPC messages coming from the run child process.
 *
 * Accepts objects whose string `type` is `"event"` (with an `event`
 * property), `"case.finished"` (with `caseDetail` and `caseRow`), or
 * `"done"` (with `evals`). Only the presence of those properties is checked.
 *
 * @param value - Arbitrary value received over IPC.
 * @returns `true` when `value` has one of the recognised message shapes.
 */
function isRunChildMessage(value) {
  if (value === null || typeof value !== "object") return false;
  if (!("type" in value)) return false;
  switch (value.type) {
    case "event":
      return "event" in value;
    case "case.finished":
      return "caseDetail" in value && "caseRow" in value;
    case "done":
      return "evals" in value;
    default:
      return false;
  }
}
|
|
209
|
+
//#endregion
|
|
210
|
+
//#region ../runner/src/runChildManager.ts
|
|
211
|
+
/**
 * Spawn the run child process for a run and wire its lifecycle into `runState`.
 *
 * The child is started with the current Node executable, the filtered exec
 * arguments, the resolved child entrypoint and `params.contextPath`. It
 * shares the parent's environment, inherits stdout/stderr, and talks to the
 * parent over an IPC channel. Recognised IPC messages are forwarded to
 * `handleRunChildMessage`.
 *
 * If the child exits, or emits an `'error'` (for example because it could
 * not be spawned), while the run is still `"running"` and no terminal event
 * was received, the run is marked as errored.
 *
 * @param params - `{ runState, contextPath, managerContext }`.
 */
function startRunChild(params) {
  const { runState, managerContext } = params;
  const child = spawn(process.execPath, [
    ...getRunChildExecArgv(),
    resolveRunChildEntrypoint(),
    params.contextPath
  ], {
    cwd: managerContext.workspaceRoot,
    env: process.env,
    stdio: [
      "ignore",
      "inherit",
      "inherit",
      "ipc"
    ]
  });
  runState.childProcess = child;

  // Mark the run as errored unless it already reached a terminal state.
  const failRun = (reason) => {
    if (runState.manifest.status !== "running" || runState.childTerminalReceived) return;
    // markRunErrored persists state; a rejection here must not surface as an
    // unhandled rejection in the parent process.
    markRunErrored(runState, reason, managerContext).catch((error) => {
      console.error(`Failed to record error for run ${String(runState.manifest.id)}:`, error);
    });
  };

  child.on("message", (message) => {
    if (!isRunChildMessage(message)) return;
    handleRunChildMessage({
      runState,
      message,
      managerContext
    });
  });
  // Without an 'error' listener a spawn failure is thrown as an uncaught
  // exception; 'exit' may not fire afterwards, so fail the run here.
  child.on("error", (error) => {
    // No pid means the process never started, so there is nothing to kill later.
    if (child.pid === void 0 && runState.childProcess === child) runState.childProcess = void 0;
    failRun(`Run child failed: ${error instanceof Error ? error.message : String(error)}`);
  });
  child.once("exit", (code, signal) => {
    if (runState.childProcess === child) runState.childProcess = void 0;
    const reason = signal !== null ? `Run child exited with signal ${signal}` : `Run child exited with code ${String(code)}`;
    failRun(reason);
  });
}
|
|
242
|
+
/**
 * Build the exec arguments to pass on to the run child from
 * `process.execArgv`, leaving out inline-script options.
 *
 * Dropped: `--eval`/`-e`/`--print`/`-p`/`--input-type` together with the
 * argument that follows them, and the `--eval=`, `--print=` and
 * `--input-type=` forms. Everything else is kept in order.
 *
 * @returns A new array of exec arguments.
 */
function getRunChildExecArgv() {
  const flagsTakingNextArg = new Set(["--eval", "-e", "--print", "-p", "--input-type"]);
  const inlineValuePrefixes = ["--eval=", "--print=", "--input-type="];
  const source = [...process.execArgv];
  const kept = [];
  for (let position = 0; position < source.length; position += 1) {
    const arg = source[position];
    if (flagsTakingNextArg.has(arg)) {
      position += 1; // also skip the flag's value
      continue;
    }
    if (inlineValuePrefixes.some((prefix) => arg.startsWith(prefix))) continue;
    kept.push(arg);
  }
  return kept;
}
|
|
263
|
+
/**
 * Detach the child process from `runState` and ask it to terminate.
 *
 * `SIGKILL` is sent first; when that call reports failure, `kill()` is
 * called again with its default signal. Nothing is sent when there is no
 * child or the child is already flagged as killed.
 *
 * @param runState - Run state whose `childProcess` is cleared.
 */
function killRunChild(runState) {
  const { childProcess } = runState;
  runState.childProcess = void 0;
  if (childProcess === void 0) return;
  if (childProcess.killed) return;
  const wasSignalled = childProcess.kill("SIGKILL");
  if (!wasSignalled) childProcess.kill();
}
|
|
269
|
+
/**
 * Locate the run child entrypoint next to this module.
 *
 * Looks for `runChild.ts`, `runChild.mjs` and `runChild.js`, in that order,
 * in the directory of the current module and returns the first that exists.
 *
 * @returns Absolute path of the entrypoint file.
 * @throws Error when none of the candidate files exists.
 */
function resolveRunChildEntrypoint() {
  const moduleDir = dirname(fileURLToPath(import.meta.url));
  const entrypoint = ["runChild.ts", "runChild.mjs", "runChild.js"]
    .map((fileName) => join(moduleDir, fileName))
    .find((candidate) => existsSync(candidate));
  if (entrypoint === void 0) throw new Error("Unable to locate the Agent Evals run child entrypoint.");
  return entrypoint;
}
|
|
281
|
+
/**
 * Dispatch one validated IPC message from the run child.
 *
 * - `"case.finished"`: while the run is `"running"`, stores the case and
 *   re-emits it as a `case.finished` run event; otherwise ignored.
 * - `"done"`: copies the child's eval metadata onto the known evals and
 *   emits a discovery event.
 * - anything else: treated as an event message and handed to
 *   `handleRunChildEvent`.
 *
 * @param params - `{ runState, message, managerContext }`.
 */
function handleRunChildMessage(params) {
  const { runState, message, managerContext } = params;
  switch (message.type) {
    case "case.finished": {
      if (runState.manifest.status !== "running") return;
      upsertFinishedCase(runState, message.caseDetail, message.caseRow);
      const finishedEvent = {
        type: "case.finished",
        runId: runState.manifest.id,
        timestamp: new Date().toISOString(),
        payload: message.caseRow
      };
      managerContext.emitEvent(runState, finishedEvent);
      return;
    }
    case "done": {
      applyChildEvalMetas(managerContext.evals, message.evals);
      managerContext.emitDiscoveryEvent();
      return;
    }
    default:
      handleRunChildEvent(runState, message.event, managerContext);
  }
}
|
|
301
|
+
/**
 * Record a finished case on the run state.
 *
 * The row replaces an existing row with the same `evalId`, `caseId` and
 * `trial`, or is appended when there is none. The detail is stored in
 * `runState.caseDetails` under `caseDetail.caseId`.
 *
 * @param runState - Run state holding `cases` and `caseDetails` (mutated).
 * @param caseDetail - Detail record for the case.
 * @param caseRow - Row record for the case.
 */
function upsertFinishedCase(runState, caseDetail, caseRow) {
  const isSameCase = (row) => row.evalId === caseRow.evalId && row.caseId === caseRow.caseId && row.trial === caseRow.trial;
  const slot = runState.cases.findIndex(isSameCase);
  if (slot >= 0) runState.cases[slot] = caseRow;
  else runState.cases.push(caseRow);
  runState.caseDetails.set(caseDetail.caseId, caseDetail);
}
|
|
307
|
+
/**
 * Copy the metadata reported by the run child onto the matching known evals.
 *
 * For each child entry whose `id` exists in `evals`, overwrites
 * `columnDefs`, `caseCount`, `stats`, `charts` and `sourceFingerprint`.
 * Entries with unknown ids are ignored.
 *
 * @param evals - Map from eval id to eval metadata (entries are mutated).
 * @param childMetas - Iterable of metadata records sent by the child.
 */
function applyChildEvalMetas(evals, childMetas) {
  for (const { id, columnDefs, caseCount, stats, charts, sourceFingerprint } of childMetas) {
    const target = evals.get(id);
    if (target === void 0) continue;
    Object.assign(target, { columnDefs, caseCount, stats, charts, sourceFingerprint });
  }
}
|
|
318
|
+
/**
 * Apply one run event forwarded by the child to the parent's run state.
 *
 * Events are ignored once the run is no longer `"running"`. Terminal events
 * (`run.finished` / `run.error`) flag the state as terminal, detach the
 * child handle and hand off to `markRunTerminalFromChild`, which emits the
 * event itself. A `run.summary` event updates `runState.summary` when its
 * payload parses. All non-terminal events are re-emitted unchanged.
 *
 * @param runState - Run state to update (mutated).
 * @param event - Event object sent by the child; `type` and `payload` are read.
 * @param managerContext - Provides `emitEvent`.
 */
function handleRunChildEvent(runState, event, managerContext) {
  if (runState.manifest.status !== "running") return;

  const isTerminal = event.type === "run.finished" || event.type === "run.error";
  if (isTerminal) {
    runState.childTerminalReceived = true;
    runState.childProcess = void 0;
    markRunTerminalFromChild(runState, event, managerContext);
    return;
  }

  if (event.type === "run.summary") {
    const parsed = runSummarySchema.safeParse(event.payload);
    if (parsed.success) runState.summary = parsed.data;
  }
  managerContext.emitEvent(runState, event);
}
|
|
340
|
+
/**
 * Pull a human-readable message out of a `run.error` payload.
 *
 * @param payload - Arbitrary payload value.
 * @returns `payload.message` when the payload is an object with a string
 *   `message`; otherwise the generic "Run child ended with an error" text.
 */
function getRunErrorMessage(payload) {
  const fallback = "Run child ended with an error";
  if (payload === null || typeof payload !== "object") return fallback;
  if (!("message" in payload)) return fallback;
  return typeof payload.message === "string" ? payload.message : fallback;
}
|
|
344
|
+
/**
 * Move a run into the `"error"` state with the given message.
 *
 * Updates the manifest (status, end time) and summary (status, error
 * message), persists the run state, then emits a `run.error` event followed
 * by a discovery event.
 *
 * @param runState - Run state to update (mutated).
 * @param message - Error text stored on the summary and sent in the event.
 * @param managerContext - Provides `emitEvent` and `emitDiscoveryEvent`.
 */
async function markRunErrored(runState, message, managerContext) {
  const { manifest, summary } = runState;
  manifest.status = "error";
  manifest.endedAt = new Date().toISOString();
  summary.status = "error";
  summary.errorMessage = message;

  await persistRunState(runState);

  const errorEvent = {
    type: "run.error",
    runId: runState.manifest.id,
    timestamp: new Date().toISOString(),
    payload: { message }
  };
  managerContext.emitEvent(runState, errorEvent);
  managerContext.emitDiscoveryEvent();
}
|
|
358
|
+
/**
 * Finalise a run after the child reported `run.finished` or `run.error`.
 *
 * The persisted snapshot in `runState.runDir` is preferred: when one loads,
 * its manifest, summary, cases and case details replace the in-memory ones.
 * Without a snapshot the state is derived from the event: `run.finished`
 * marks the manifest `"completed"` and adopts the payload as summary when
 * it parses; any other event marks manifest and summary as `"error"`. The
 * event is then re-emitted, followed by a discovery event.
 *
 * @param runState - Run state to update (mutated).
 * @param event - Terminal event sent by the child.
 * @param managerContext - Provides `emitEvent` and `emitDiscoveryEvent`.
 */
async function markRunTerminalFromChild(runState, event, managerContext) {
  const snapshot = await loadPersistedRunSnapshot(runState.runDir);
  if (snapshot !== null) {
    const { manifest, summary, cases, caseDetails } = snapshot;
    Object.assign(runState, { manifest, summary, cases, caseDetails });
  } else {
    const finished = event.type === "run.finished";
    runState.manifest.status = finished ? "completed" : "error";
    runState.manifest.endedAt = new Date().toISOString();
    if (finished) {
      const parsed = runSummarySchema.safeParse(event.payload);
      if (parsed.success) runState.summary = parsed.data;
    } else {
      runState.summary.status = "error";
      runState.summary.errorMessage = getRunErrorMessage(event.payload);
    }
  }
  managerContext.emitEvent(runState, event);
  managerContext.emitDiscoveryEvent();
}
|
|
379
|
+
//#endregion
|
|
380
|
+
//#region ../runner/src/runner.ts
|
|
381
|
+
/** Characters whose presence makes a path segment count as a glob pattern. */
const globMagicCharacters = new Set("*?[]{}()!+@");

/**
 * Tell whether `value` contains at least one glob magic character.
 *
 * @param value - Path segment to inspect.
 * @returns `true` when any character of `value` is in `globMagicCharacters`.
 */
function hasGlobMagic(value) {
  return [...value].some((char) => globMagicCharacters.has(char));
}
|
|
398
|
+
/**
 * Compute the directory to watch for one include pattern.
 *
 * The pattern is split on `/` (backslashes are treated as `/`), and the
 * watch root is the literal path prefix before the first segment that
 * contains glob magic:
 * - no glob segment: the directory of the resolved pattern;
 * - relative pattern whose first segment is a glob: the workspace root;
 * - otherwise: the literal prefix, resolved against the workspace root.
 *
 * A pattern starting with `/` keeps its absolute prefix rather than being
 * re-rooted under the workspace.
 *
 * @param params - `{ pattern, workspaceRoot }`.
 * @returns Path of the directory to watch.
 */
function getWatchRootForIncludePattern(params) {
  const normalizedPattern = params.pattern.replaceAll("\\", "/");
  const segments = normalizedPattern.split("/").filter((part) => part !== "");
  const firstGlobSegmentIndex = segments.findIndex(hasGlobMagic);
  if (firstGlobSegmentIndex === -1) return dirname(resolve(params.workspaceRoot, params.pattern));

  const literalPrefix = segments.slice(0, firstGlobSegmentIndex).join("/");
  // Dropping empty segments also drops the leading "/" of an absolute
  // pattern; restore it so the prefix is not resolved under the workspace.
  if (normalizedPattern.startsWith("/")) return resolve(params.workspaceRoot, `/${literalPrefix}`);
  if (firstGlobSegmentIndex === 0) return params.workspaceRoot;
  return resolve(params.workspaceRoot, literalPrefix);
}
|
|
405
|
+
/**
 * Compute the de-duplicated set of directories to watch for a list of
 * include patterns.
 *
 * @param params - `{ patterns, workspaceRoot }`.
 * @returns One watch root per distinct result, in first-seen order; just
 *   `[workspaceRoot]` when there are no patterns.
 */
function getWatchRootsForIncludePatterns(params) {
  const uniqueRoots = new Set(Array.from(params.patterns, (pattern) => getWatchRootForIncludePattern({
    pattern,
    workspaceRoot: params.workspaceRoot
  })));
  return uniqueRoots.size > 0 ? [...uniqueRoots] : [params.workspaceRoot];
}
|
|
414
|
+
/** Create an in-memory eval runner bound to the current workspace config. */
|
|
415
|
+
function createRunner({ watchForChanges = true } = {}) {
|
|
416
|
+
let config;
|
|
417
|
+
let workspaceRoot;
|
|
418
|
+
let localStateDir;
|
|
419
|
+
let cacheStore;
|
|
420
|
+
const evals = /* @__PURE__ */ new Map();
|
|
421
|
+
const runs = /* @__PURE__ */ new Map();
|
|
422
|
+
const lastRunStatusMap = /* @__PURE__ */ new Map();
|
|
423
|
+
const latestRunInfoMap = /* @__PURE__ */ new Map();
|
|
424
|
+
const discoveryListeners = /* @__PURE__ */ new Set();
|
|
425
|
+
let nextShortIdNum = 0;
|
|
426
|
+
let discoveryWatcher;
|
|
427
|
+
let discoveryRefreshTimer;
|
|
428
|
+
function toWorkspaceRelativePath(filePath) {
|
|
429
|
+
return relative(workspaceRoot, filePath).replaceAll("\\", "/");
|
|
430
|
+
}
|
|
431
|
+
function getSortedEvalMetas() {
|
|
432
|
+
return [...evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath));
|
|
433
|
+
}
|
|
434
|
+
function getSourceFingerprint(source) {
|
|
435
|
+
return createHash("sha256").update(source).digest("hex");
|
|
436
|
+
}
|
|
437
|
+
const runner = {
|
|
438
|
+
async init() {
|
|
439
|
+
config = await loadConfig();
|
|
440
|
+
workspaceRoot = config.workspaceRoot ?? process.cwd();
|
|
441
|
+
localStateDir = resolve(workspaceRoot, ".agent-evals");
|
|
442
|
+
await mkdir(localStateDir, { recursive: true });
|
|
443
|
+
await mkdir(join(localStateDir, "runs"), { recursive: true });
|
|
444
|
+
cacheStore = createFsCacheStore({
|
|
445
|
+
workspaceRoot,
|
|
446
|
+
dir: config.cache?.dir,
|
|
447
|
+
maxEntriesPerEval: config.cache?.maxEntriesPerEval
|
|
448
|
+
});
|
|
449
|
+
await loadPersistedRuns();
|
|
450
|
+
await runner.refreshDiscovery();
|
|
451
|
+
if (watchForChanges) await setupWatcher();
|
|
452
|
+
},
|
|
453
|
+
async listCache() {
|
|
454
|
+
return cacheStore.list();
|
|
455
|
+
},
|
|
456
|
+
async clearCache(filter) {
|
|
457
|
+
await cacheStore.clear(filter);
|
|
458
|
+
},
|
|
459
|
+
async recomputeStatusesForEval(evalId) {
|
|
460
|
+
const evalMeta = evals.get(evalId);
|
|
461
|
+
if (!evalMeta) return { updatedRuns: 0 };
|
|
462
|
+
const registry = getEvalRegistry();
|
|
463
|
+
await loadEvalModule(evalMeta.sourceFilePath, evalMeta.sourceFingerprint ?? void 0);
|
|
464
|
+
const entry = registry.get(evalId);
|
|
465
|
+
if (!entry) return { updatedRuns: 0 };
|
|
466
|
+
const scoreThresholds = /* @__PURE__ */ new Map();
|
|
467
|
+
entry.use((evalDef) => {
|
|
468
|
+
for (const [key, def] of Object.entries(evalDef.scores ?? {})) {
|
|
469
|
+
const threshold = normalizeScoreDef(def).passThreshold;
|
|
470
|
+
if (threshold !== void 0) scoreThresholds.set(key, threshold);
|
|
471
|
+
}
|
|
472
|
+
for (const [key, def] of Object.entries(evalDef.manualScores ?? {})) if (def.passThreshold !== void 0) scoreThresholds.set(key, def.passThreshold);
|
|
473
|
+
});
|
|
474
|
+
const updatedRuns = await recomputeEvalStatusesInRuns({
|
|
475
|
+
runs: runs.values(),
|
|
476
|
+
evalId,
|
|
477
|
+
evalExists: evals.has(evalId),
|
|
478
|
+
scoreThresholds,
|
|
479
|
+
persistCaseDetail
|
|
480
|
+
});
|
|
481
|
+
emitDiscoveryEvent();
|
|
482
|
+
return { updatedRuns };
|
|
483
|
+
},
|
|
484
|
+
async cleanRunsForEval(evalId) {
|
|
485
|
+
let deletedRuns = 0;
|
|
486
|
+
for (const [runId, run] of [...runs]) {
|
|
487
|
+
if (!runTouchesEval({
|
|
488
|
+
target: run.manifest.target,
|
|
489
|
+
caseRows: run.cases,
|
|
490
|
+
evalId,
|
|
491
|
+
evalExists: evals.has(evalId)
|
|
492
|
+
})) continue;
|
|
493
|
+
if (run.manifest.status === "running") continue;
|
|
494
|
+
runs.delete(runId);
|
|
495
|
+
await rm(run.runDir, {
|
|
496
|
+
recursive: true,
|
|
497
|
+
force: true
|
|
498
|
+
});
|
|
499
|
+
deletedRuns += 1;
|
|
500
|
+
}
|
|
501
|
+
emitDiscoveryEvent();
|
|
502
|
+
return { deletedRuns };
|
|
503
|
+
},
|
|
504
|
+
async updateManualScore({ runId, caseId, scoreKey, value }) {
|
|
505
|
+
const run = runs.get(runId);
|
|
506
|
+
if (!run) return {
|
|
507
|
+
updated: false,
|
|
508
|
+
reason: "Run not found"
|
|
509
|
+
};
|
|
510
|
+
if (run.manifest.status === "running") return {
|
|
511
|
+
updated: false,
|
|
512
|
+
reason: "Run is still running"
|
|
513
|
+
};
|
|
514
|
+
const caseRow = run.cases.find((row) => row.caseId === caseId);
|
|
515
|
+
if (!caseRow) return {
|
|
516
|
+
updated: false,
|
|
517
|
+
reason: "Case not found"
|
|
518
|
+
};
|
|
519
|
+
const evalMeta = evals.get(caseRow.evalId);
|
|
520
|
+
if (!evalMeta) return {
|
|
521
|
+
updated: false,
|
|
522
|
+
reason: "Eval not found"
|
|
523
|
+
};
|
|
524
|
+
if (evalMeta.columnDefs.find((def) => def.key === scoreKey)?.isManualScore !== true) return {
|
|
525
|
+
updated: false,
|
|
526
|
+
reason: "Manual score not found"
|
|
527
|
+
};
|
|
528
|
+
const caseDetail = run.caseDetails.get(caseId);
|
|
529
|
+
if (!caseDetail) return {
|
|
530
|
+
updated: false,
|
|
531
|
+
reason: "Case detail not found"
|
|
532
|
+
};
|
|
533
|
+
caseRow.columns[scoreKey] = value;
|
|
534
|
+
caseDetail.columns[scoreKey] = value;
|
|
535
|
+
const scoreThresholds = /* @__PURE__ */ new Map();
|
|
536
|
+
for (const def of evalMeta.columnDefs) {
|
|
537
|
+
if (def.isScore !== true || def.passThreshold === void 0) continue;
|
|
538
|
+
scoreThresholds.set(def.key, def.passThreshold);
|
|
539
|
+
}
|
|
540
|
+
const nextStatus = recomputePersistedCaseStatus(caseRow, caseDetail, scoreThresholds);
|
|
541
|
+
caseRow.status = nextStatus;
|
|
542
|
+
caseDetail.status = nextStatus;
|
|
543
|
+
const derivedSummary = deriveScopedSummaryFromCases({ caseRows: run.cases });
|
|
544
|
+
run.summary.totalCases = derivedSummary.totalCases;
|
|
545
|
+
run.summary.passedCases = derivedSummary.passedCases;
|
|
546
|
+
run.summary.failedCases = derivedSummary.failedCases;
|
|
547
|
+
run.summary.errorCases = derivedSummary.errorCases;
|
|
548
|
+
run.summary.cancelledCases = derivedSummary.cancelledCases;
|
|
549
|
+
run.summary.totalDurationMs = derivedSummary.totalDurationMs;
|
|
550
|
+
await persistCaseDetail(run.runDir, caseDetail);
|
|
551
|
+
await persistRunState(run);
|
|
552
|
+
emitDiscoveryEvent();
|
|
553
|
+
return {
|
|
554
|
+
updated: true,
|
|
555
|
+
run: {
|
|
556
|
+
manifest: run.manifest,
|
|
557
|
+
summary: run.summary,
|
|
558
|
+
cases: run.cases
|
|
559
|
+
},
|
|
560
|
+
caseDetail
|
|
561
|
+
};
|
|
562
|
+
},
|
|
563
|
+
async deleteRun(runId) {
|
|
564
|
+
const run = runs.get(runId);
|
|
565
|
+
if (!run) return { deleted: false };
|
|
566
|
+
if (run.manifest.status === "running") return { deleted: false };
|
|
567
|
+
runs.delete(runId);
|
|
568
|
+
await rm(run.runDir, {
|
|
569
|
+
recursive: true,
|
|
570
|
+
force: true
|
|
571
|
+
});
|
|
572
|
+
emitDiscoveryEvent();
|
|
573
|
+
return { deleted: true };
|
|
574
|
+
},
|
|
575
|
+
getEvals() {
|
|
576
|
+
const gitState = readGitWorktreeState(workspaceRoot);
|
|
577
|
+
const result = [];
|
|
578
|
+
for (const meta of getSortedEvalMetas()) result.push(buildEvalSummary({
|
|
579
|
+
meta,
|
|
580
|
+
config,
|
|
581
|
+
gitState,
|
|
582
|
+
latestRun: latestRunInfoMap.get(meta.id),
|
|
583
|
+
lastRunStatus: lastRunStatusMap.get(meta.id) ?? null
|
|
584
|
+
}));
|
|
585
|
+
return result;
|
|
586
|
+
},
|
|
587
|
+
getEval(id) {
|
|
588
|
+
const meta = evals.get(id);
|
|
589
|
+
if (!meta) return void 0;
|
|
590
|
+
return buildEvalSummary({
|
|
591
|
+
meta,
|
|
592
|
+
config,
|
|
593
|
+
gitState: readGitWorktreeState(workspaceRoot),
|
|
594
|
+
latestRun: latestRunInfoMap.get(meta.id),
|
|
595
|
+
lastRunStatus: lastRunStatusMap.get(meta.id) ?? null
|
|
596
|
+
});
|
|
597
|
+
},
|
|
598
|
+
async refreshDiscovery() {
|
|
599
|
+
const patterns = config.include;
|
|
600
|
+
const discovered = [];
|
|
601
|
+
for (const pattern of patterns) {
|
|
602
|
+
const files = await glob(pattern, {
|
|
603
|
+
cwd: workspaceRoot,
|
|
604
|
+
absolute: true
|
|
605
|
+
});
|
|
606
|
+
discovered.push(...files);
|
|
607
|
+
}
|
|
608
|
+
evals.clear();
|
|
609
|
+
for (const filePath of discovered) try {
|
|
610
|
+
const content = await readFile(filePath, "utf-8");
|
|
611
|
+
const discoveredMetas = parseEvalMetas(filePath, content);
|
|
612
|
+
const sourceFingerprint = getSourceFingerprint(content);
|
|
613
|
+
const registry = getEvalRegistry();
|
|
614
|
+
try {
|
|
615
|
+
await loadEvalModule(filePath, sourceFingerprint);
|
|
616
|
+
} catch {}
|
|
617
|
+
for (const meta of discoveredMetas) {
|
|
618
|
+
const discoveredEntry = registry.get(meta.id);
|
|
619
|
+
const title = meta.title;
|
|
620
|
+
let columnDefs = buildDeclaredColumnDefs(void 0, void 0, void 0);
|
|
621
|
+
let stats;
|
|
622
|
+
let charts;
|
|
623
|
+
discoveredEntry?.use((evalDef) => {
|
|
624
|
+
columnDefs = buildDeclaredColumnDefs(evalDef.columns, evalDef.scores, evalDef.manualScores);
|
|
625
|
+
stats = evalDef.stats;
|
|
626
|
+
const validated = validateCharts({
|
|
627
|
+
charts: evalDef.charts,
|
|
628
|
+
columnDefs,
|
|
629
|
+
evalId: meta.id
|
|
630
|
+
});
|
|
631
|
+
for (const warning of validated.warnings) console.warn(warning);
|
|
632
|
+
charts = validated.charts;
|
|
633
|
+
});
|
|
634
|
+
evals.set(meta.id, {
|
|
635
|
+
id: meta.id,
|
|
636
|
+
title,
|
|
637
|
+
filePath: toWorkspaceRelativePath(meta.filePath),
|
|
638
|
+
sourceFilePath: meta.filePath,
|
|
639
|
+
sourceFingerprint,
|
|
640
|
+
columnDefs,
|
|
641
|
+
caseCount: null,
|
|
642
|
+
stats,
|
|
643
|
+
charts
|
|
644
|
+
});
|
|
645
|
+
}
|
|
646
|
+
} catch {}
|
|
647
|
+
emitDiscoveryEvent();
|
|
648
|
+
},
|
|
649
|
+
async startRun(request) {
|
|
650
|
+
const runId = generateRunId();
|
|
651
|
+
const shortId = `r${String(nextShortIdNum++)}`;
|
|
652
|
+
const now = (/* @__PURE__ */ new Date()).toISOString();
|
|
653
|
+
const cacheMode = request.cache?.mode ?? "use";
|
|
654
|
+
const runDir = join(localStateDir, "runs", runId);
|
|
655
|
+
const manifest = {
|
|
656
|
+
id: runId,
|
|
657
|
+
shortId,
|
|
658
|
+
status: "running",
|
|
659
|
+
startedAt: now,
|
|
660
|
+
endedAt: null,
|
|
661
|
+
commitSha: readGitWorktreeState(workspaceRoot).commitSha,
|
|
662
|
+
evalSourceFingerprints: {},
|
|
663
|
+
target: request.target,
|
|
664
|
+
trials: request.trials,
|
|
665
|
+
trialSelection: config.trialSelection ?? "lowestScore",
|
|
666
|
+
cacheMode
|
|
667
|
+
};
|
|
668
|
+
const summary = {
|
|
669
|
+
runId,
|
|
670
|
+
status: "running",
|
|
671
|
+
totalCases: 0,
|
|
672
|
+
passedCases: 0,
|
|
673
|
+
failedCases: 0,
|
|
674
|
+
errorCases: 0,
|
|
675
|
+
cancelledCases: 0,
|
|
676
|
+
totalDurationMs: null,
|
|
677
|
+
errorMessage: null
|
|
678
|
+
};
|
|
679
|
+
const runState = {
|
|
680
|
+
runDir,
|
|
681
|
+
manifest,
|
|
682
|
+
summary,
|
|
683
|
+
cases: [],
|
|
684
|
+
caseDetails: /* @__PURE__ */ new Map(),
|
|
685
|
+
listeners: /* @__PURE__ */ new Set(),
|
|
686
|
+
childProcess: void 0,
|
|
687
|
+
childTerminalReceived: false
|
|
688
|
+
};
|
|
689
|
+
runs.set(runId, runState);
|
|
690
|
+
setLatestRunInfoMap({
|
|
691
|
+
latestRunInfoMap,
|
|
692
|
+
evalIds: getTargetEvalIds({
|
|
693
|
+
request,
|
|
694
|
+
sortedEvalIds: getSortedEvalMetas().map((meta) => meta.id),
|
|
695
|
+
knownEvalIds: new Set(evals.keys())
|
|
696
|
+
}),
|
|
697
|
+
info: {
|
|
698
|
+
status: "running",
|
|
699
|
+
startedAt: now,
|
|
700
|
+
commitSha: manifest.commitSha ?? null,
|
|
701
|
+
evalSourceFingerprint: null
|
|
702
|
+
}
|
|
703
|
+
});
|
|
704
|
+
await mkdir(runDir, { recursive: true });
|
|
705
|
+
await mkdir(join(runDir, "traces"), { recursive: true });
|
|
706
|
+
await mkdir(join(runDir, "artifacts"), { recursive: true });
|
|
707
|
+
await mkdir(join(runDir, "case-details"), { recursive: true });
|
|
708
|
+
await writeFile(join(runDir, "run.json"), JSON.stringify(manifest, null, 2));
|
|
709
|
+
const childContext = {
|
|
710
|
+
request,
|
|
711
|
+
workspaceRoot,
|
|
712
|
+
runDir,
|
|
713
|
+
manifest,
|
|
714
|
+
summary,
|
|
715
|
+
evals: getSortedEvalMetas()
|
|
716
|
+
};
|
|
717
|
+
await writeFile(join(runDir, "run-child-context.json"), JSON.stringify(childContext, null, 2));
|
|
718
|
+
startRunChild({
|
|
719
|
+
runState,
|
|
720
|
+
contextPath: join(runDir, "run-child-context.json"),
|
|
721
|
+
managerContext: {
|
|
722
|
+
workspaceRoot,
|
|
723
|
+
evals,
|
|
724
|
+
emitEvent,
|
|
725
|
+
emitDiscoveryEvent
|
|
726
|
+
}
|
|
727
|
+
});
|
|
728
|
+
return {
|
|
729
|
+
manifest,
|
|
730
|
+
summary,
|
|
731
|
+
cases: []
|
|
732
|
+
};
|
|
733
|
+
},
|
|
734
|
+
getRuns() {
|
|
735
|
+
return [...runs.values()].map((r) => r.manifest);
|
|
736
|
+
},
|
|
737
|
+
getRun(id) {
|
|
738
|
+
const run = runs.get(id);
|
|
739
|
+
if (!run) return void 0;
|
|
740
|
+
return {
|
|
741
|
+
manifest: run.manifest,
|
|
742
|
+
summary: run.summary,
|
|
743
|
+
cases: run.cases
|
|
744
|
+
};
|
|
745
|
+
},
|
|
746
|
+
async cancelRun(id) {
|
|
747
|
+
const run = runs.get(id);
|
|
748
|
+
if (!run) return;
|
|
749
|
+
if (run.manifest.status !== "running") return;
|
|
750
|
+
const endedAt = /* @__PURE__ */ new Date();
|
|
751
|
+
run.manifest.status = "cancelled";
|
|
752
|
+
run.manifest.endedAt = endedAt.toISOString();
|
|
753
|
+
run.summary.status = "cancelled";
|
|
754
|
+
const derivedSummary = deriveScopedSummaryFromCases({
|
|
755
|
+
caseRows: run.cases,
|
|
756
|
+
lifecycleStatus: "cancelled"
|
|
757
|
+
});
|
|
758
|
+
run.summary.totalCases = derivedSummary.totalCases;
|
|
759
|
+
run.summary.passedCases = derivedSummary.passedCases;
|
|
760
|
+
run.summary.failedCases = derivedSummary.failedCases;
|
|
761
|
+
run.summary.errorCases = derivedSummary.errorCases;
|
|
762
|
+
run.summary.cancelledCases = derivedSummary.cancelledCases;
|
|
763
|
+
run.summary.totalDurationMs = endedAt.getTime() - new Date(run.manifest.startedAt).getTime();
|
|
764
|
+
killRunChild(run);
|
|
765
|
+
await persistRunState(run);
|
|
766
|
+
emitEvent(run, {
|
|
767
|
+
type: "run.cancelled",
|
|
768
|
+
runId: id,
|
|
769
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
770
|
+
payload: run.summary
|
|
771
|
+
});
|
|
772
|
+
emitDiscoveryEvent();
|
|
773
|
+
},
|
|
774
|
+
getCaseDetail(runId, caseId) {
|
|
775
|
+
const run = runs.get(runId);
|
|
776
|
+
if (!run) return void 0;
|
|
777
|
+
return run.caseDetails.get(caseId);
|
|
778
|
+
},
|
|
779
|
+
subscribe(runId, listener) {
|
|
780
|
+
const run = runs.get(runId);
|
|
781
|
+
if (!run) return () => {};
|
|
782
|
+
run.listeners.add(listener);
|
|
783
|
+
return () => {
|
|
784
|
+
run.listeners.delete(listener);
|
|
785
|
+
};
|
|
786
|
+
},
|
|
787
|
+
subscribeDiscovery(listener) {
|
|
788
|
+
discoveryListeners.add(listener);
|
|
789
|
+
return () => {
|
|
790
|
+
discoveryListeners.delete(listener);
|
|
791
|
+
};
|
|
792
|
+
},
|
|
793
|
+
async close() {
|
|
794
|
+
if (discoveryRefreshTimer !== void 0) {
|
|
795
|
+
clearTimeout(discoveryRefreshTimer);
|
|
796
|
+
discoveryRefreshTimer = void 0;
|
|
797
|
+
}
|
|
798
|
+
const watcher = discoveryWatcher;
|
|
799
|
+
if (watcher === void 0) return;
|
|
800
|
+
discoveryWatcher = void 0;
|
|
801
|
+
await watcher.close();
|
|
802
|
+
},
|
|
803
|
+
getWorkspaceRoot() {
|
|
804
|
+
return workspaceRoot;
|
|
805
|
+
},
|
|
806
|
+
getArtifactPath(artifactId_) {
|
|
807
|
+
return resolveArtifactPath(join(localStateDir, "runs"), artifactId_);
|
|
808
|
+
}
|
|
809
|
+
};
|
|
810
|
+
/**
 * Start watching the roots derived from `config.include` and schedule a
 * discovery refresh (coalesced through a 50 ms timer) on every file or
 * directory add/change/remove event. Resolves once the watcher emits "ready".
 */
async function setupWatcher() {
  const watchRoots = getWatchRootsForIncludePatterns({
    patterns: config.include,
    workspaceRoot
  });
  const watcher = watch(watchRoots, {
    ignoreInitial: true,
    persistent: true
  });
  discoveryWatcher = watcher;

  // Restart the timer on every event so a burst of changes triggers one refresh.
  const scheduleRefresh = () => {
    if (discoveryRefreshTimer !== void 0) clearTimeout(discoveryRefreshTimer);
    discoveryRefreshTimer = setTimeout(() => {
      discoveryRefreshTimer = void 0;
      runner.refreshDiscovery();
    }, 50);
  };

  const watchedEvents = ["change", "add", "unlink", "addDir", "unlinkDir"];
  for (const eventName of watchedEvents) watcher.on(eventName, scheduleRefresh);

  await new Promise((ready) => {
    watcher.once("ready", ready);
  });
}
|
|
835
|
+
/**
 * Rebuild `lastRunStatusMap` and `latestRunInfoMap` from the tracked runs and
 * known evals, then send a "discovery.updated" event carrying
 * `runner.getEvals()` to every discovery listener.
 */
function emitDiscoveryEvent() {
  // Each helper receives its own fresh iterators over the runs and evals.
  const snapshotInputs = () => ({
    runs: runs.values(),
    knownEvals: evals.values()
  });
  const freshStatuses = getLastRunStatuses(snapshotInputs());
  const freshInfos = getLatestRunInfos(snapshotInputs());

  lastRunStatusMap.clear();
  for (const [evalId, status] of freshStatuses) {
    lastRunStatusMap.set(evalId, status);
  }
  latestRunInfoMap.clear();
  for (const [evalId, info] of freshInfos) {
    latestRunInfoMap.set(evalId, info);
  }

  const timestamp = (/* @__PURE__ */ new Date()).toISOString();
  const discoveryEvent = {
    type: "discovery.updated",
    timestamp,
    payload: runner.getEvals()
  };
  for (const notify of discoveryListeners) notify(discoveryEvent);
}
|
|
855
|
+
/**
 * Deliver an event to every listener of a run.
 *
 * @param runState Run state whose `listeners` set is notified.
 * @param event Event object passed to each listener.
 */
function emitEvent(runState, event) {
  runState.listeners.forEach((notify) => {
    try {
      notify(event);
    } catch {
      // Best effort: an exception from one listener is ignored so the rest still run.
    }
  });
}
|
|
860
|
+
/**
 * Replace the in-memory `runs` map with the snapshots loaded from the local
 * state directory and reset the short-id counter from those snapshots.
 * Each restored run starts with no listeners and no child process.
 */
async function loadPersistedRuns() {
  runs.clear();
  const snapshots = await loadPersistedRunSnapshots(localStateDir);
  nextShortIdNum = nextShortIdFromSnapshots(snapshots);
  for (const snapshot of snapshots) {
    const restored = {
      ...snapshot,
      listeners: /* @__PURE__ */ new Set(),
      childProcess: void 0,
      childTerminalReceived: false
    };
    runs.set(snapshot.manifest.id, restored);
  }
}
|
|
871
|
+
return runner;
|
|
872
|
+
}
|
|
873
|
+
//#endregion
|
|
874
|
+
//#region src/cli.ts
|
|
875
|
+
/**
 * Parse raw CLI arguments into a normalized options object.
 *
 * `--no-env` may appear anywhere and only toggles `loadEnv`. The first
 * remaining token selects the command; `cache` additionally accepts a `list`
 * or `clear` subcommand. Unknown flags are ignored.
 *
 * Invalid flag values never replace a default: `--trials` must be an integer
 * >= 1, `--port` an integer in 0..65535, and `--cache` one of
 * use|bypass|refresh. The value token is still consumed in every case.
 * Ids passed to `--eval` / `--case` are comma-separated; surrounding
 * whitespace is trimmed and empty segments are dropped.
 *
 * @param argv Raw command-line arguments excluding the executable name.
 * @returns The parsed options (command, subcommand, help state, targets, flags).
 */
function parseArgs(argv) {
  // "a, b,," -> ["a", "b"]: padded or empty ids could never match a real id.
  const splitIds = (raw) => raw.split(",").map((id) => id.trim()).filter((id) => id.length > 0);
  // Returns the parsed integer, or undefined when it is not an integer within [min, max].
  const toIntegerInRange = (raw, min, max) => {
    const value = Number(raw);
    return Number.isInteger(value) && value >= min && value <= max ? value : void 0;
  };

  const normalizedArgv = argv.filter((arg) => arg !== "--no-env");
  const args = {
    command: "help",
    subcommand: void 0,
    showHelp: false,
    helpTopic: "global",
    unknownHelpTarget: void 0,
    evalIds: [],
    caseIds: [],
    trials: 1,
    json: false,
    port: 4100,
    cacheMode: "use",
    clearCache: false,
    all: false,
    // Env loading stays on unless at least one "--no-env" was filtered out above.
    loadEnv: normalizedArgv.length === argv.length
  };
  const command = normalizedArgv[0];
  if (command === "--help" || command === "-h") {
    args.showHelp = true;
    return args;
  }
  if (isCliCommand(command)) {
    args.command = command;
    args.helpTopic = command === "help" ? "global" : command;
  } else if (command !== void 0 && !command.startsWith("-")) args.unknownHelpTarget = command;
  let cursor = 1;
  if (args.command === "cache") {
    const sub = normalizedArgv[cursor];
    if (sub === "list" || sub === "clear") {
      args.subcommand = sub;
      args.helpTopic = `cache ${sub}`;
      cursor++;
    } else if (sub !== void 0 && !sub.startsWith("-")) args.unknownHelpTarget = `cache ${sub}`;
  }
  for (let i = cursor; i < normalizedArgv.length; i++) {
    const arg = normalizedArgv[i];
    const next = normalizedArgv[i + 1];
    if (arg === "--help" || arg === "-h") args.showHelp = true;
    else if (arg === "--eval" && next) {
      args.evalIds.push(...splitIds(next));
      i++;
    } else if (arg === "--case" && next) {
      args.caseIds.push(...splitIds(next));
      i++;
    } else if (arg === "--trials" && next) {
      args.trials = toIntegerInRange(next, 1, Number.MAX_SAFE_INTEGER) ?? args.trials;
      i++;
    } else if (arg === "--json") args.json = true;
    else if (arg === "--port" && next) {
      args.port = toIntegerInRange(next, 0, 65535) ?? args.port;
      i++;
    } else if (arg === "--cache" && next) {
      if (next === "use" || next === "bypass" || next === "refresh") args.cacheMode = next;
      i++;
    } else if (arg === "--no-cache") args.cacheMode = "bypass";
    else if (arg === "--refresh-cache") args.cacheMode = "refresh";
    else if (arg === "--clear-cache") args.clearCache = true;
    else if (arg === "--all") args.all = true;
  }
  return args;
}
|
|
938
|
+
/**
 * Entry point of the Agent Evals CLI for the current workspace.
 *
 * Parses the arguments, loads the workspace `.env` unless disabled, handles
 * help requests, and otherwise dispatches to the selected command. Exits with
 * code 1 when the `.env` file cannot be loaded or help is requested for an
 * unknown target.
 *
 * @param argv Raw command-line arguments excluding the executable name.
 */
async function runCli(argv) {
  const args = parseArgs(argv);

  const envReady = !args.loadEnv || loadWorkspaceEnv();
  if (!envReady) {
    process.exit(1);
    return;
  }

  if (args.showHelp) {
    if (args.unknownHelpTarget === void 0) {
      printHelp(args.helpTopic);
      return;
    }
    console.error(`No help found for "${args.unknownHelpTarget}".`);
    process.exit(1);
    return;
  }

  const { command } = args;
  if (command === "app") await commandApp(args);
  else if (command === "list") await commandList(args);
  else if (command === "run") await commandRun(args);
  else if (command === "cache") await commandCache(args);
  else printHelp(args.helpTopic);
}
|
|
976
|
+
/**
 * Tell whether a token names one of the CLI's top-level commands.
 *
 * @param command Candidate token (may be `undefined`).
 * @returns `true` for "app", "list", "run", "cache" or "help".
 */
function isCliCommand(command) {
  const knownCommands = ["app", "list", "run", "cache", "help"];
  return knownCommands.includes(command);
}
|
|
979
|
+
/**
 * Load `.env` from the current working directory into the process environment.
 *
 * @returns `true` when there is no `.env` or it loaded; `false` (after logging
 * the failure to stderr) when loading threw.
 */
function loadWorkspaceEnv() {
  const envPath = resolve(process.cwd(), ".env");
  if (existsSync(envPath)) {
    const { error } = resultify(() => {
      process.loadEnvFile(envPath);
    });
    if (error) {
      console.error(`Failed to load .env at ${envPath}: ${error.message}`);
      return false;
    }
  }
  return true;
}
|
|
991
|
+
// Directory that contains this module file.
const currentDir = dirname(fileURLToPath(import.meta.url));
// Three directory levels above this module; probed for the repo's web workspace.
const repoRoot = resolve(currentDir, "..", "..", "..");
// Executable name used to spawn pnpm; Windows uses "pnpm.cmd".
const pnpmCommand = process.platform !== "win32" ? "pnpm" : "pnpm.cmd";
|
|
994
|
+
/**
 * Check whether `apps/web/package.json` exists under `repoRoot`.
 *
 * @returns `true` when that manifest file is present.
 */
function hasRepoWebWorkspace() {
  const webManifestPath = resolve(repoRoot, "apps/web/package.json");
  return existsSync(webManifestPath);
}
|
|
997
|
+
/**
 * Build the web UI with pnpm when the repo's web workspace is present.
 *
 * Does nothing when `hasRepoWebWorkspace()` is false. Otherwise spawns
 * `pnpm --filter @agent-evals/web build` in `repoRoot` with inherited stdio and
 * rejects when the process fails to start, is stopped by a signal, or exits
 * with a non-zero code.
 */
async function ensureWebUiIsBuilt() {
  if (!hasRepoWebWorkspace()) return;
  console.info("Preparing web UI...");

  const buildArgs = ["--filter", "@agent-evals/web", "build"];
  const spawnOptions = { cwd: repoRoot, stdio: "inherit" };

  await new Promise((resolvePromise, rejectPromise) => {
    const build = spawn(pnpmCommand, buildArgs, spawnOptions);
    build.once("error", (error) => {
      rejectPromise(error);
    });
    build.once("exit", (code, signal) => {
      let failure = null;
      if (signal) failure = /* @__PURE__ */ new Error(`Web UI build stopped with signal ${signal}.`);
      else if (code !== 0) failure = /* @__PURE__ */ new Error(`Web UI build failed with exit code ${String(code)}.`);

      if (failure) rejectPromise(failure);
      else resolvePromise();
    });
  });
}
|
|
1025
|
+
/**
 * Check that a dynamically imported module exposes an `app` object with a
 * callable `fetch`.
 *
 * @param mod Value returned by a dynamic `import()`.
 * @returns `true` only when `mod.app` is a non-null object whose `fetch` is a function.
 */
function isHonoAppModule(mod) {
  const isNonNullObject = (value) => typeof value === "object" && value !== null;
  if (!isNonNullObject(mod) || !("app" in mod)) return false;
  const candidate = mod.app;
  return isNonNullObject(candidate) && "fetch" in candidate && typeof candidate.fetch === "function";
}
|
|
1030
|
+
/**
 * Check that a dynamically imported module exposes a callable `initRunner`.
 *
 * @param mod Value returned by a dynamic `import()`.
 * @returns `true` only when `mod` is a non-null object whose `initRunner` is a function.
 */
function isServerRunnerModule(mod) {
  const exposesInitRunner = typeof mod === "object" && mod !== null && "initRunner" in mod;
  return exposesInitRunner && typeof mod.initRunner === "function";
}
|
|
1034
|
+
/**
 * `app` command: build the web UI when running from the repo, load the server
 * app and runner modules, initialize the runner and start the HTTP server.
 *
 * @param args Parsed CLI options; only `args.port` is read here.
 * @throws Error when either dynamically imported module lacks the expected export.
 */
async function commandApp(args) {
  await ensureWebUiIsBuilt();
  const { serve } = await import("@hono/node-server");

  // Point AGENT_EVALS_WEB_DIST at the bundled assets, when present, before the app module loads.
  const bundledWebDist = resolve(currentDir, "apps/web/dist");
  const hasBundledWebDist = existsSync(bundledWebDist);
  if (hasBundledWebDist) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;

  const appModule = await import("./app-TjV5nDMM.mjs");
  const runnerModule = await import("./runner-CBDZos0Z.mjs");
  if (!isHonoAppModule(appModule)) {
    throw new Error("Server app module is invalid");
  }
  if (!isServerRunnerModule(runnerModule)) {
    throw new Error("Server runner module is invalid");
  }

  await runnerModule.initRunner();
  const { port } = args;
  console.info(`Agent Evals app: http://localhost:${String(port)}`);
  serve({
    fetch: appModule.app.fetch,
    port
  });
}
|
|
1050
|
+
/**
 * `list` command: print every discovered eval with its id, file, display
 * status (omitted while "pending") and case count (omitted when `null`).
 *
 * @param args_ Parsed CLI options (unused).
 */
async function commandList(args_) {
  const runner = createRunner({ watchForChanges: false });
  await runner.init();

  const discovered = runner.getEvals();
  if (discovered.length === 0) {
    console.info("No eval files found.");
    return;
  }

  console.info("Discovered evals:\n");
  for (const ev of discovered) {
    const { freshnessStatus, stale, outdated, lastRunStatus } = ev;
    const displayStatus = getEvalDisplayStatus({
      freshnessStatus,
      stale,
      outdated,
      lastRunStatus
    });
    const title = getEvalTitle(ev);

    const lines = [` ${title}`, ` id: ${ev.id}`, ` file: ${ev.filePath}`];
    if (displayStatus !== "pending") lines.push(` status: ${displayStatus}`);
    if (ev.caseCount !== null) lines.push(` cases: ${String(ev.caseCount)}`);
    lines.push("");
    for (const line of lines) console.info(line);
  }
}
|
|
1075
|
+
/**
 * `run` command: start a run for the selected target, wait for it to reach a
 * terminal state and print its summary (JSON with `--json`, text otherwise).
 *
 * The process exits with code 1 when the run can no longer be found, when any
 * case failed or errored, or when the run itself ended as "error" or
 * "cancelled". The last condition matters for automation: a run that aborts
 * before recording a failing case must not report success.
 *
 * @param args Parsed CLI options (`caseIds`, `evalIds`, `trials`, `cacheMode`,
 * `clearCache`, `json`).
 */
async function commandRun(args) {
  const runner = createRunner({ watchForChanges: false });
  await runner.init();
  if (args.clearCache) {
    await runner.clearCache();
    if (!args.json) {
      console.info("Cleared cache before run.");
      console.info("");
    }
  }
  // Case ids take precedence; eval ids then only narrow the case selection.
  const target = args.caseIds.length > 0 ? {
    mode: "caseIds",
    caseIds: args.caseIds,
    evalIds: args.evalIds.length > 0 ? args.evalIds : void 0
  } : args.evalIds.length > 0 ? {
    mode: "evalIds",
    evalIds: args.evalIds
  } : { mode: "all" };
  const run = await runner.startRun({
    target,
    trials: args.trials,
    cache: { mode: args.cacheMode }
  });
  if (!args.json) {
    console.info(`Run started: ${run.manifest.id}`);
    console.info(`Trials: ${String(args.trials)}`);
    if (args.cacheMode !== "use") console.info(`Cache mode: ${args.cacheMode}`);
    console.info("");
  }
  await waitForRunCompletion(runner, run.manifest.id);
  const finalRun = runner.getRun(run.manifest.id);
  if (!finalRun) {
    process.exit(1);
    return;
  }
  const { summary, manifest } = finalRun;
  if (args.json) console.info(JSON.stringify(summary, null, 2));
  else {
    console.info("--- Run Summary ---");
    console.info(`Status: ${summary.status}`);
    console.info(`Total: ${String(summary.totalCases)}`);
    console.info(`Passed: ${String(summary.passedCases)}`);
    console.info(`Failed: ${String(summary.failedCases)}`);
    console.info(`Errors: ${String(summary.errorCases)}`);
    if (summary.totalCases > 0) console.info(`Pass Rate: ${String(summary.passedCases)}/${String(summary.totalCases)}`);
    if (summary.totalDurationMs !== null) console.info(`Duration: ${(summary.totalDurationMs / 1e3).toFixed(1)}s`);
    // Show why a run failed; the JSON output already carries this field.
    if (summary.errorMessage) console.info(`Message: ${summary.errorMessage}`);
  }
  const endedAbnormally = [summary.status, manifest?.status].some((status) => status === "error" || status === "cancelled");
  if (endedAbnormally || summary.failedCases > 0 || summary.errorCases > 0) process.exit(1);
}
|
|
1124
|
+
/**
 * `cache` command: list cached operation entries, or clear them.
 *
 * - `cache` / `cache list`: print the entries (JSON with `--json`).
 * - `cache clear --eval <id>`: clear every entry whose namespace starts with
 *   `<id>__` for any of the given eval ids.
 * - `cache clear --all`: clear everything.
 * - `cache clear` without either flag: refuse, report on stderr, exit with code 1.
 *
 * @param args Parsed CLI options (`subcommand`, `evalIds`, `all`, `json`, `helpTopic`).
 */
async function commandCache(args) {
  const runner = createRunner({ watchForChanges: false });
  await runner.init();
  if (args.subcommand === "list" || args.subcommand === void 0) {
    const entries = await runner.listCache();
    if (args.json) {
      console.info(JSON.stringify(entries, null, 2));
      return;
    }
    if (entries.length === 0) {
      console.info("No cache entries.");
      return;
    }
    console.info(`Cache entries (${String(entries.length)}):\n`);
    for (const entry of entries) {
      console.info(` ${entry.namespace}`);
      console.info(` key: ${entry.key}`);
      const operationLabel = entry.operationType === "span" ? `${entry.operationName} (span ${entry.spanKind ?? "unknown"})` : `${entry.operationName} (value)`;
      console.info(` operation: ${operationLabel}`);
      console.info(` stored: ${entry.storedAt}`);
      console.info(` size: ${String(entry.sizeBytes)} bytes`);
      console.info("");
    }
    return;
  }
  if (args.subcommand === "clear") {
    if (args.evalIds.length > 0) {
      // List once and match all prefixes in one pass: the listing does not depend
      // on the eval id, and an entry matching several ids is cleared only once.
      const prefixes = args.evalIds.map((evalId) => `${evalId}__`);
      const entries = await runner.listCache();
      const matching = entries.filter((entry) => prefixes.some((prefix) => entry.namespace.startsWith(prefix)));
      for (const entry of matching) await runner.clearCache({
        namespace: entry.namespace,
        key: entry.key
      });
      console.info(`Cleared cache entries for: ${args.evalIds.join(", ")}`);
      return;
    }
    if (args.all) {
      await runner.clearCache();
      console.info("Cleared all cache entries.");
      return;
    }
    // This is a failure path (exit code 1), so the message goes to stderr.
    console.error("Refusing to clear cache without --eval <id> or --all. Use one of these flags to confirm.");
    process.exit(1);
    return;
  }
  printHelp(args.helpTopic);
}
|
|
1174
|
+
/**
 * Poll a runner every 200 ms until the given run is finished.
 *
 * @param runner Object exposing `getRun(id)`.
 * @param runId Identifier of the run to wait for.
 * @returns A promise that resolves once the run is missing or its manifest
 * status is "completed", "cancelled" or "error".
 */
async function waitForRunCompletion(runner, runId) {
  const terminalStatuses = new Set(["completed", "cancelled", "error"]);
  return new Promise((resolvePromise) => {
    // The first poll happens synchronously; later ones are timer-driven.
    (function poll() {
      const current = runner.getRun(runId);
      const finished = !current || terminalStatuses.has(current.manifest.status);
      if (finished) {
        resolvePromise();
        return;
      }
      setTimeout(poll, 200);
    })();
  });
}
|
|
1187
|
+
/**
 * Print usage information for a help topic.
 *
 * @param topic One of "app", "list", "run", "cache", "cache list",
 * "cache clear"; any other value (including the default "global") prints the
 * global overview.
 */
function printHelp(topic = "global") {
  let text;
  switch (topic) {
    case "app":
      text = `
agent-evals app - Start server with UI

Usage:
agent-evals app [flags]

Flags:
--port <n> Server port (default: 4100)
--no-env Disable automatic .env loading
--help, -h Show this help
`;
      break;
    case "list":
      text = `
agent-evals list - List discovered evals

Usage:
agent-evals list [flags]

Flags:
--no-env Disable automatic .env loading
--help, -h Show this help
`;
      break;
    case "run":
      text = `
agent-evals run - Run evals

Usage:
agent-evals run [flags]

Flags:
--eval <id> Run specific eval(s) (comma-separated)
--case <id> Run specific case(s) (comma-separated)
--trials <n> Number of trials per case
--inspect[=host:port] Run with the Node.js inspector enabled
--inspect-brk[=host:port] Enable inspector and pause before startup
--json Output run summary as JSON
--cache <use|bypass|refresh> Cache mode for this run (default: use)
--no-cache Shortcut for --cache bypass
--refresh-cache Shortcut for --cache refresh
--clear-cache Clear the cache before starting the run
--no-env Disable automatic .env loading
--help, -h Show this help
`;
      break;
    // All cache topics share one help page.
    case "cache":
    case "cache list":
    case "cache clear":
      text = `
agent-evals cache - Manage cached operation entries

Usage:
agent-evals cache list [flags]
agent-evals cache clear --eval <id>
agent-evals cache clear --all

Flags:
--eval <id> Clear entries for specific eval(s) (comma-separated)
--all Confirm clearing every cached entry
--json Output cache listing as JSON
--no-env Disable automatic .env loading
--help, -h Show this help
`;
      break;
    default:
      text = `
agent-evals - LLM/Agent eval runner

Commands:
app Start server with UI
list List discovered evals
run Run evals
cache list List cached operation entries
cache clear --eval <id> Clear cache entries for one eval
cache clear --all Clear every cached entry
help Show this help

Options:
--eval <id> Run specific eval(s) (comma-separated)
--case <id> Run specific case(s) (comma-separated)
--trials <n> Number of trials per case
--inspect[=host:port] Run with the Node.js inspector enabled
--inspect-brk[=host:port] Enable inspector and pause before startup
--json Output results as JSON
--port <n> Server port (default: 4100)
--cache <use|bypass|refresh> Cache mode for this run (default: use)
--no-cache Shortcut for --cache bypass
--refresh-cache Shortcut for --cache refresh
--clear-cache Clear the cache before starting the run
--no-env Disable automatic .env loading
--help, -h Show help
`;
      break;
  }
  console.info(text);
}
|
|
1284
|
+
//#endregion
|
|
1285
|
+
export { createRunner as n, runCli as t };
|