@ls-stack/agent-eval 0.6.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1285 @@
1
+ import { D as deriveScopedSummaryFromCases, E as getEvalDisplayStatus, Kt as getEvalRegistry, T as getEvalTitle, _ as loadEvalModule, a as loadPersistedRunSnapshot, b as normalizeScoreDef, c as persistCaseDetail, d as recomputePersistedCaseStatus, f as runTouchesEval, g as setLatestRunInfoMap, h as getTargetEvalIds, i as getLatestRunInfos, j as runSummarySchema, l as persistRunState, m as buildEvalSummary, n as generateRunId, o as loadPersistedRunSnapshots, p as resolveArtifactPath, r as getLastRunStatuses, s as nextShortIdFromSnapshots, u as recomputeEvalStatusesInRuns, v as loadConfig, x as createFsCacheStore, y as buildDeclaredColumnDefs } from "./runOrchestration-HaMahl6b.mjs";
2
+ import { createHash } from "node:crypto";
3
+ import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
4
+ import { dirname, join, relative, resolve } from "node:path";
5
+ import { watch } from "chokidar";
6
+ import { glob } from "glob";
7
+ import { existsSync } from "node:fs";
8
+ import { resultify } from "t-result";
9
+ import { fileURLToPath } from "node:url";
10
+ import { spawn, spawnSync } from "node:child_process";
11
+ //#region ../runner/src/chartValidation.ts
12
/**
 * Decide whether a column-backed chart metric may be kept.
 *
 * A metric is rejected (and a warning appended to `warnings`) when its `key`
 * is not present in `columnsByKey`, or when it asks for the
 * `"passThresholdRate"` aggregate on a column that is not a score with a
 * numeric `passThreshold`.
 *
 * @param metric Chart metric; `key` and `aggregate` are read.
 * @param columnsByKey Map from column key to its column definition.
 * @param evalId Eval id used as the warning prefix.
 * @param warnings Array that collects warning strings (mutated).
 * @returns `true` when the metric is valid, otherwise `false`.
 */
function isValidColumnMetric(metric, columnsByKey, evalId, warnings) {
  const { key, aggregate } = metric;
  const column = columnsByKey.get(key);
  if (!column) {
    warnings.push(`[${evalId}] chart metric references unknown column "${key}" — dropped`);
    return false;
  }
  const wantsThresholdRate = aggregate === "passThresholdRate";
  const isThresholdScore = column.isScore === true && typeof column.passThreshold === "number";
  if (wantsThresholdRate && !isThresholdScore) {
    warnings.push(`[${evalId}] chart metric "${key}" uses "passThresholdRate" but the column is not a score with passThreshold — dropped`);
    return false;
  }
  return true;
}
26
/**
 * Decide whether a column-backed tooltip extra may be kept.
 *
 * Appends a warning to `warnings` and returns `false` when the extra's `key`
 * is unknown, or when it uses `"passThresholdRate"` against a column that is
 * not a score carrying a numeric `passThreshold`.
 *
 * @param extra Tooltip extra; `key` and `aggregate` are read.
 * @param columnsByKey Map from column key to its column definition.
 * @param evalId Eval id used as the warning prefix.
 * @param warnings Array that collects warning strings (mutated).
 * @returns `true` when the extra is valid, otherwise `false`.
 */
function isValidTooltipExtra(extra, columnsByKey, evalId, warnings) {
  const reject = (warning) => {
    warnings.push(warning);
    return false;
  };
  const column = columnsByKey.get(extra.key);
  if (!column) {
    return reject(`[${evalId}] chart tooltip extra references unknown column "${extra.key}" — dropped`);
  }
  if (extra.aggregate !== "passThresholdRate") return true;
  if (column.isScore === true && typeof column.passThreshold === "number") return true;
  return reject(`[${evalId}] chart tooltip extra "${extra.key}" uses "passThresholdRate" but the column is not a score with passThreshold — dropped`);
}
40
/**
 * Strip invalid metrics and tooltip extras from a single chart.
 *
 * Entries whose `source` is `"builtin"` are always kept; the others are
 * checked against the declared columns. When no metric survives, a warning is
 * recorded and `null` is returned (tooltip extras are not inspected in that
 * case). An empty or missing list of tooltip extras is normalized to
 * `undefined`.
 *
 * @param chart Authored chart config.
 * @param columnsByKey Map from column key to its column definition.
 * @param evalId Eval id used as the warning prefix.
 * @param warnings Array that collects warning strings (mutated).
 * @returns A copy of the chart with filtered lists, or `null` if dropped.
 */
function sanitizeChart(chart, columnsByKey, evalId, warnings) {
  const keptMetrics = chart.metrics.filter(
    (metric) => metric.source === "builtin" || isValidColumnMetric(metric, columnsByKey, evalId, warnings)
  );
  if (keptMetrics.length === 0) {
    warnings.push(`[${evalId}] chart had no valid metrics after validation — chart dropped`);
    return null;
  }

  const keptExtras = chart.tooltipExtras?.filter(
    (extra) => extra.source === "builtin" || isValidTooltipExtra(extra, columnsByKey, evalId, warnings)
  );
  const hasExtras = (keptExtras?.length ?? 0) > 0;

  return {
    ...chart,
    metrics: keptMetrics,
    tooltipExtras: hasExtras ? keptExtras : undefined
  };
}
59
/**
 * Validate and sanitize an authored `charts` config against the eval's
 * declared columns.
 *
 * Metrics and tooltip extras that reference unknown columns or misuse
 * `passThresholdRate` are dropped, as is any chart left without a valid
 * metric. `charts` is `undefined` in the result when nothing valid remains,
 * so the UI falls back to rendering no chart (matching the opt-in default).
 *
 * @param params `{ charts, columnDefs, evalId }`.
 * @returns `{ charts, warnings }` where `warnings` lists everything dropped.
 */
function validateCharts(params) {
  const { charts, columnDefs, evalId } = params;
  const warnings = [];
  if (!charts || charts.length === 0) {
    return { charts: undefined, warnings };
  }

  const columnsByKey = new Map();
  for (const def of columnDefs) columnsByKey.set(def.key, def);

  const kept = [];
  for (const chart of charts) {
    const cleaned = sanitizeChart(chart, columnsByKey, evalId, warnings);
    if (cleaned) kept.push(cleaned);
  }

  if (kept.length === 0) {
    return { charts: undefined, warnings };
  }
  return { charts: kept, warnings };
}
84
+ //#endregion
85
+ //#region ../runner/src/discovery.ts
86
// First `id: '...'` / `title: '...'` property (single or double quoted)
// found inside a `defineEval` object literal.
const evalIdMatchRegex = /\bid\s*:\s*['"]([^'"]+)['"]/;
const evalTitleMatchRegex = /\btitle\s*:\s*['"]([^'"]+)['"]/;

/**
 * Statically scan a source file for `defineEval` occurrences and collect the
 * `id` (and optional `title`) found in each extracted object literal.
 *
 * This is a textual heuristic: the module is not evaluated here.
 *
 * @param filePath Path recorded on every returned meta.
 * @param content Source text of the file.
 * @returns Array of `{ filePath, id, title? }`, in source order.
 */
function parseEvalMetas(filePath, content) {
  const marker = "defineEval";
  const found = [];
  let cursor = 0;

  while (cursor < content.length) {
    const markerAt = content.indexOf(marker, cursor);
    if (markerAt === -1) break;

    const extracted = extractDefineEvalObject(content, markerAt);
    if (!extracted) {
      // No balanced object after this occurrence: step past the marker text.
      cursor = markerAt + marker.length;
      continue;
    }
    cursor = extracted.nextIndex;

    const idMatch = evalIdMatchRegex.exec(extracted.objectText);
    if (idMatch === null) continue;

    const meta = { filePath, id: idMatch[1] };
    const titleMatch = evalTitleMatchRegex.exec(extracted.objectText);
    if (titleMatch !== null) meta.title = titleMatch[1];
    found.push(meta);
  }

  return found;
}
113
/**
 * Extract the first brace-balanced `{ ... }` span that follows the `(` after
 * a `defineEval` occurrence.
 *
 * Braces inside `//` comments, block comments and quoted strings (`"`, `'`
 * or backtick, with backslash escapes) are ignored while balancing.
 *
 * @param content Source text being scanned.
 * @param defineEvalIndex Index in `content` where `defineEval` starts.
 * @returns `{ nextIndex, objectText }` for the balanced span, or `undefined`
 *   when there is no `(`/`{` or the braces never balance.
 */
function extractDefineEvalObject(content, defineEvalIndex) {
  const parenAt = content.indexOf("(", defineEvalIndex);
  if (parenAt === -1) return undefined;
  const braceAt = content.indexOf("{", parenAt);
  if (braceAt === -1) return undefined;

  // Scanner mode: "code", "lineComment", "blockComment" or "string".
  let mode = "code";
  let openQuote = "";
  let nesting = 0;
  let pos = braceAt;

  while (pos < content.length) {
    const ch = content[pos];
    const pair = content.slice(pos, pos + 2);

    if (mode === "lineComment") {
      if (ch === "\n") mode = "code";
      pos += 1;
    } else if (mode === "blockComment") {
      if (pair === "*/") {
        mode = "code";
        pos += 2;
      } else {
        pos += 1;
      }
    } else if (mode === "string") {
      if (ch === "\\") {
        // Skip the backslash together with the character it escapes.
        pos += 2;
      } else {
        if (ch === openQuote) mode = "code";
        pos += 1;
      }
    } else if (pair === "//") {
      mode = "lineComment";
      pos += 2;
    } else if (pair === "/*") {
      mode = "blockComment";
      pos += 2;
    } else if (ch === "\"" || ch === "'" || ch === "`") {
      mode = "string";
      openQuote = ch;
      pos += 1;
    } else if (ch === "{") {
      nesting += 1;
      pos += 1;
    } else if (ch === "}") {
      nesting -= 1;
      if (nesting === 0) {
        return {
          nextIndex: pos + 1,
          objectText: content.slice(braceAt, pos + 1)
        };
      }
      pos += 1;
    } else {
      pos += 1;
    }
  }

  return undefined;
}
176
+ //#endregion
177
+ //#region ../runner/src/gitState.ts
178
/**
 * Run `git` synchronously in the workspace and capture its trimmed stdout.
 *
 * stdin and stderr are ignored. When the process cannot be spawned at all
 * (for example git is not installed or `workspaceRoot` does not exist),
 * `spawnSync` reports `stdout` as `null`; that case yields an empty string
 * instead of throwing, so callers only need to look at `status`.
 *
 * @param workspaceRoot Directory used as the command's cwd.
 * @param args Arguments passed to `git`.
 * @returns `{ status, stdout }`; `status` is the exit code as reported by
 *   `spawnSync` (not `0` when the command could not run).
 */
function runGitCommand(workspaceRoot, args) {
  const result = spawnSync("git", args, {
    cwd: workspaceRoot,
    encoding: "utf8",
    stdio: ["ignore", "pipe", "ignore"]
  });
  return {
    status: result.status,
    // `stdout` is null when spawning failed; treat that as "no output".
    stdout: (result.stdout ?? "").trim()
  };
}
193
/**
 * Read the current git commit for the workspace, if available.
 *
 * @param workspaceRoot Directory in which git is invoked.
 * @returns `{ commitSha }`, where `commitSha` is `null` when the directory
 *   is not inside a work tree or `HEAD` cannot be resolved.
 */
function readGitWorktreeState(workspaceRoot) {
  const probe = runGitCommand(workspaceRoot, ["rev-parse", "--is-inside-work-tree"]);
  const isInsideWorktree = probe.status === 0 && probe.stdout === "true";
  if (!isInsideWorktree) return { commitSha: null };

  const head = runGitCommand(workspaceRoot, ["rev-parse", "HEAD"]);
  if (head.status !== 0) return { commitSha: null };
  return { commitSha: head.stdout };
}
200
+ //#endregion
201
+ //#region ../runner/src/runChildProtocol.ts
202
/**
 * Structural check for messages received from the run child over IPC.
 *
 * Accepts objects whose `type` is `"event"` (with an `event` key),
 * `"case.finished"` (with `caseDetail` and `caseRow` keys) or `"done"`
 * (with an `evals` key). Only key presence is checked, not value shape.
 *
 * @param value Arbitrary received value.
 * @returns `true` when `value` has one of the recognized shapes.
 */
function isRunChildMessage(value) {
  if (value === null || typeof value !== "object") return false;
  if (!("type" in value)) return false;

  switch (value.type) {
    case "event":
      return "event" in value;
    case "case.finished":
      return "caseDetail" in value && "caseRow" in value;
    case "done":
      return "evals" in value;
    default:
      return false;
  }
}
209
+ //#endregion
210
+ //#region ../runner/src/runChildManager.ts
211
/**
 * Spawn the run child process for a run and wire its lifecycle to `runState`.
 *
 * The child is started with the current Node binary, the filtered exec
 * arguments, the resolved run-child entrypoint and the context file path.
 * stdout/stderr are inherited and an IPC channel is opened for messages.
 *
 * Lifecycle handling:
 * - valid IPC messages are forwarded to `handleRunChildMessage`;
 * - if the child cannot be spawned, or exits while the run is still
 *   "running" and no terminal event was received, the run is marked errored.
 *
 * @param params `{ runState, contextPath, managerContext }`.
 */
function startRunChild(params) {
  const { runState, contextPath, managerContext } = params;
  const child = spawn(process.execPath, [
    ...getRunChildExecArgv(),
    resolveRunChildEntrypoint(),
    contextPath
  ], {
    cwd: managerContext.workspaceRoot,
    env: process.env,
    stdio: ["ignore", "inherit", "inherit", "ipc"]
  });
  runState.childProcess = child;

  // Forget the handle only if it still refers to this child.
  const releaseChild = () => {
    if (runState.childProcess === child) runState.childProcess = undefined;
  };

  // Mark the run errored unless it already reached a terminal state.
  // `markRunErrored` flips the status synchronously, so a later call is a no-op.
  const failRunIfStillActive = (reason) => {
    if (runState.manifest.status !== "running" || runState.childTerminalReceived) return;
    markRunErrored(runState, reason, managerContext).catch((error) => {
      console.error(`Failed to record error for run ${runState.manifest.id}:`, error);
    });
  };

  child.on("message", (message) => {
    if (!isRunChildMessage(message)) return;
    handleRunChildMessage({ runState, message, managerContext });
  });

  // Without an 'error' listener a spawn failure is thrown as an uncaught
  // exception, and 'exit' may never fire, leaving the run stuck "running".
  child.on("error", (error) => {
    if (child.pid !== undefined) {
      // The child did start; this is a kill/IPC failure. The 'exit' handler
      // still owns the run's lifecycle, so only report it.
      console.error(`Run child error for run ${runState.manifest.id}:`, error);
      return;
    }
    releaseChild();
    failRunIfStillActive(`Run child failed to start: ${error.message}`);
  });

  child.once("exit", (code, signal) => {
    releaseChild();
    const reason = signal !== null
      ? `Run child exited with signal ${signal}`
      : `Run child exited with code ${String(code)}`;
    failRunIfStillActive(reason);
  });
}
242
/**
 * Build the exec arguments forwarded to the run child from
 * `process.execArgv`, leaving out inline-script options.
 *
 * Dropped: `--eval`/`-e`/`--print`/`-p`/`--input-type` together with the
 * argument that follows them, and the `--eval=`, `--print=` and
 * `--input-type=` forms. Everything else is kept in order.
 *
 * @returns A new array of exec arguments.
 */
function getRunChildExecArgv() {
  const flagsTakingNextArg = new Set(["--eval", "-e", "--print", "-p", "--input-type"]);
  const inlineValuePrefixes = ["--eval=", "--print=", "--input-type="];
  const source = process.execArgv;
  const kept = [];

  for (let i = 0; i < source.length; i++) {
    const arg = source[i];
    if (flagsTakingNextArg.has(arg)) {
      // Also skip the value that belongs to this flag.
      i++;
      continue;
    }
    if (inlineValuePrefixes.some((prefix) => arg.startsWith(prefix))) continue;
    kept.push(arg);
  }

  return kept;
}
263
/**
 * Detach the child process from `runState` and terminate it.
 *
 * The handle is always cleared. If a child exists and is not already flagged
 * `killed`, `SIGKILL` is sent; when that call returns `false`, `kill()` is
 * retried with the default signal.
 *
 * @param runState Run state holding `childProcess`.
 */
function killRunChild(runState) {
  const { childProcess } = runState;
  runState.childProcess = undefined;

  if (childProcess === undefined) return;
  if (childProcess.killed) return;

  const delivered = childProcess.kill("SIGKILL");
  if (!delivered) childProcess.kill();
}
269
/**
 * Locate the run child entrypoint next to this module.
 *
 * Looks for `runChild.ts`, `runChild.mjs` and `runChild.js` (in that order)
 * in the directory of the current module and returns the first that exists.
 *
 * @returns Absolute path of the entrypoint file.
 * @throws {Error} When none of the candidate files exists.
 */
function resolveRunChildEntrypoint() {
  const moduleDir = dirname(fileURLToPath(import.meta.url));
  const entrypoint = ["runChild.ts", "runChild.mjs", "runChild.js"]
    .map((fileName) => join(moduleDir, fileName))
    .find((candidate) => existsSync(candidate));

  if (entrypoint === undefined) {
    throw new Error("Unable to locate the Agent Evals run child entrypoint.");
  }
  return entrypoint;
}
281
/**
 * Dispatch a validated IPC message from the run child.
 *
 * - `"case.finished"`: while the run is still "running", stores the case and
 *   emits a `case.finished` run event carrying the case row.
 * - `"done"`: copies the child's eval metadata onto the known evals and
 *   emits a discovery event.
 * - anything else: treated as an event message and handed to
 *   `handleRunChildEvent`.
 *
 * @param params `{ runState, message, managerContext }`.
 */
function handleRunChildMessage(params) {
  const { runState, message, managerContext } = params;

  switch (message.type) {
    case "case.finished": {
      if (runState.manifest.status !== "running") return;
      upsertFinishedCase(runState, message.caseDetail, message.caseRow);
      managerContext.emitEvent(runState, {
        type: "case.finished",
        runId: runState.manifest.id,
        timestamp: new Date().toISOString(),
        payload: message.caseRow
      });
      return;
    }
    case "done": {
      applyChildEvalMetas(managerContext.evals, message.evals);
      managerContext.emitDiscoveryEvent();
      return;
    }
    default:
      handleRunChildEvent(runState, message.event, managerContext);
  }
}
301
/**
 * Record a finished case on the run state.
 *
 * The row replaces an existing row with the same `evalId`, `caseId` and
 * `trial`, or is appended when there is none. The detail is stored in
 * `caseDetails` under `caseDetail.caseId`.
 *
 * @param runState Run state with `cases` (array) and `caseDetails` (Map).
 * @param caseDetail Detail record for the case.
 * @param caseRow Row record for the case.
 */
function upsertFinishedCase(runState, caseDetail, caseRow) {
  const { cases, caseDetails } = runState;
  const isSameTrial = (row) =>
    row.evalId === caseRow.evalId &&
    row.caseId === caseRow.caseId &&
    row.trial === caseRow.trial;

  const position = cases.findIndex(isSameTrial);
  if (position >= 0) {
    cases[position] = caseRow;
  } else {
    cases.push(caseRow);
  }
  caseDetails.set(caseDetail.caseId, caseDetail);
}
307
/**
 * Copy the metadata reported by the run child onto the known eval metas.
 *
 * For every child meta whose `id` is present in `evals`, the fields
 * `columnDefs`, `caseCount`, `stats`, `charts` and `sourceFingerprint` are
 * overwritten in place. Unknown ids are ignored.
 *
 * @param evals Map from eval id to its (mutable) meta object.
 * @param childMetas Iterable of metas reported by the child.
 */
function applyChildEvalMetas(evals, childMetas) {
  for (const { id, columnDefs, caseCount, stats, charts, sourceFingerprint } of childMetas) {
    const known = evals.get(id);
    if (known === undefined) continue;
    Object.assign(known, { columnDefs, caseCount, stats, charts, sourceFingerprint });
  }
}
318
/**
 * Apply a run event reported by the child while the run is still "running".
 *
 * - `run.finished` / `run.error`: flags that a terminal event was received,
 *   clears the child handle and delegates to `markRunTerminalFromChild`
 *   (which emits the event itself).
 * - `run.summary`: stores the payload as the run summary when it passes
 *   `runSummarySchema`, then emits the event.
 * - any other event is emitted unchanged.
 *
 * @param runState Run state being updated.
 * @param event Event reported by the child.
 * @param managerContext Provides `emitEvent` / `emitDiscoveryEvent`.
 */
function handleRunChildEvent(runState, event, managerContext) {
  if (runState.manifest.status !== "running") return;

  const isTerminal = event.type === "run.finished" || event.type === "run.error";
  if (isTerminal) {
    runState.childTerminalReceived = true;
    runState.childProcess = undefined;
    markRunTerminalFromChild(runState, event, managerContext);
    return;
  }

  if (event.type === "run.summary") {
    const parsed = runSummarySchema.safeParse(event.payload);
    if (parsed.success) runState.summary = parsed.data;
  }
  managerContext.emitEvent(runState, event);
}
340
/**
 * Pull a human-readable message out of a `run.error` payload.
 *
 * @param payload Arbitrary event payload.
 * @returns `payload.message` when it is a string, otherwise the generic
 *   "Run child ended with an error" text.
 */
function getRunErrorMessage(payload) {
  const fallback = "Run child ended with an error";
  if (payload === null || typeof payload !== "object") return fallback;
  if (!("message" in payload)) return fallback;
  return typeof payload.message === "string" ? payload.message : fallback;
}
344
/**
 * Mark a run as errored, persist it and notify listeners.
 *
 * The manifest and summary are updated synchronously (before the first
 * `await`), so callers that guard on `manifest.status === "running"` cannot
 * trigger this twice for the same run.
 *
 * Listeners are notified even when persisting fails: the in-memory state
 * already says "error", so subscribers must not be left waiting on a run
 * that will never progress. The persistence error still rejects the
 * returned promise.
 *
 * @param runState Run state to update.
 * @param message Error message stored on the summary and sent to listeners.
 * @param managerContext Provides `emitEvent` / `emitDiscoveryEvent`.
 * @returns Promise that settles once persistence and notification are done.
 */
async function markRunErrored(runState, message, managerContext) {
  runState.manifest.status = "error";
  runState.manifest.endedAt = new Date().toISOString();
  runState.summary.status = "error";
  runState.summary.errorMessage = message;
  try {
    await persistRunState(runState);
  } finally {
    managerContext.emitEvent(runState, {
      type: "run.error",
      runId: runState.manifest.id,
      timestamp: new Date().toISOString(),
      payload: { message }
    });
    managerContext.emitDiscoveryEvent();
  }
}
358
/**
 * Finalize a run after the child reported `run.finished` or `run.error`.
 *
 * The persisted snapshot in `runState.runDir` is preferred: when one is
 * loaded, its manifest, summary, cases and case details replace the
 * in-memory ones. Otherwise the state is derived from the event: a finished
 * run becomes "completed" (taking the payload as summary when it passes
 * `runSummarySchema`), anything else becomes "error" with the message taken
 * from the payload. The event and a discovery event are then emitted.
 *
 * @param runState Run state to finalize.
 * @param event Terminal event reported by the child.
 * @param managerContext Provides `emitEvent` / `emitDiscoveryEvent`.
 */
async function markRunTerminalFromChild(runState, event, managerContext) {
  const snapshot = await loadPersistedRunSnapshot(runState.runDir);

  if (snapshot !== null) {
    const { manifest, summary, cases, caseDetails } = snapshot;
    Object.assign(runState, { manifest, summary, cases, caseDetails });
  } else {
    const finished = event.type === "run.finished";
    runState.manifest.status = finished ? "completed" : "error";
    runState.manifest.endedAt = new Date().toISOString();
    if (finished) {
      const parsed = runSummarySchema.safeParse(event.payload);
      if (parsed.success) runState.summary = parsed.data;
    } else {
      runState.summary.status = "error";
      runState.summary.errorMessage = getRunErrorMessage(event.payload);
    }
  }

  managerContext.emitEvent(runState, event);
  managerContext.emitDiscoveryEvent();
}
379
+ //#endregion
380
+ //#region ../runner/src/runner.ts
381
// Characters that mark a path segment as a glob expression.
const globMagicCharacters = new Set([
  "*",
  "?",
  "[",
  "]",
  "{",
  "}",
  "(",
  ")",
  "!",
  "+",
  "@"
]);

/**
 * Tell whether a string contains any glob magic character.
 *
 * @param value Path segment (or any string) to inspect.
 * @returns `true` when at least one magic character is present.
 */
function hasGlobMagic(value) {
  for (const char of value) if (globMagicCharacters.has(char)) return true;
  return false;
}

/**
 * Compute the directory to watch for one include pattern: the longest
 * leading part of the pattern that contains no glob magic.
 *
 * - A pattern without any glob segment is treated as a file path; its
 *   directory is returned.
 * - A relative pattern whose first segment is already a glob yields the
 *   workspace root unchanged.
 * - A pattern starting with `/` keeps its absolute prefix instead of being
 *   re-rooted under the workspace (splitting into segments would otherwise
 *   drop the leading slash).
 *
 * @param params `{ pattern, workspaceRoot }`.
 * @returns Directory path to hand to the file watcher.
 */
function getWatchRootForIncludePattern(params) {
  const { pattern, workspaceRoot } = params;
  const normalized = pattern.replaceAll("\\", "/");
  const segments = normalized.split("/").filter((part) => part !== "");
  const firstGlobSegmentIndex = segments.findIndex((segment) => hasGlobMagic(segment));

  if (firstGlobSegmentIndex === -1) return dirname(resolve(workspaceRoot, pattern));

  const isRooted = normalized.startsWith("/");
  if (firstGlobSegmentIndex === 0 && !isRooted) return workspaceRoot;

  const staticPrefix = segments.slice(0, firstGlobSegmentIndex).join("/");
  // Re-attach the leading slash so `resolve` treats the prefix as absolute.
  return resolve(workspaceRoot, isRooted ? `/${staticPrefix}` : staticPrefix);
}
405
/**
 * Compute the de-duplicated set of directories to watch for a list of
 * include patterns.
 *
 * @param params `{ patterns, workspaceRoot }`.
 * @returns One watch root per distinct result, in first-seen order; just
 *   the workspace root when there are no patterns.
 */
function getWatchRootsForIncludePatterns(params) {
  const { patterns, workspaceRoot } = params;
  const uniqueRoots = new Set();
  for (const pattern of patterns) {
    uniqueRoots.add(getWatchRootForIncludePattern({ pattern, workspaceRoot }));
  }
  return uniqueRoots.size > 0 ? [...uniqueRoots] : [workspaceRoot];
}
414
/**
 * Create an in-memory eval runner bound to the current workspace config.
 *
 * The returned object keeps discovered evals and runs in closure-held maps.
 * `init()` must be awaited before any other method is used: it loads the
 * config, prepares the `.agent-evals` state directory, loads persisted runs,
 * discovers evals and (optionally) starts a file watcher.
 *
 * @param options `{ watchForChanges }` — when `true` (default), `init()`
 *   starts a watcher that re-runs discovery on file changes.
 * @returns The runner object.
 */
function createRunner({ watchForChanges = true } = {}) {
  let config;
  let workspaceRoot;
  let localStateDir;
  let cacheStore;
  const evals = new Map();
  const runs = new Map();
  const lastRunStatusMap = new Map();
  const latestRunInfoMap = new Map();
  const discoveryListeners = new Set();
  let nextShortIdNum = 0;
  let discoveryWatcher;
  let discoveryRefreshTimer;

  /** Convert an absolute path to a workspace-relative, `/`-separated path. */
  function toWorkspaceRelativePath(filePath) {
    return relative(workspaceRoot, filePath).replaceAll("\\", "/");
  }

  /** Known eval metas ordered by their workspace-relative file path. */
  function getSortedEvalMetas() {
    return [...evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath));
  }

  /** SHA-256 hex digest of a source file's content. */
  function getSourceFingerprint(source) {
    return createHash("sha256").update(source).digest("hex");
  }

  const runner = {
    /** Load config, prepare state directories, load runs and discover evals. */
    async init() {
      config = await loadConfig();
      workspaceRoot = config.workspaceRoot ?? process.cwd();
      localStateDir = resolve(workspaceRoot, ".agent-evals");
      await mkdir(localStateDir, { recursive: true });
      await mkdir(join(localStateDir, "runs"), { recursive: true });
      cacheStore = createFsCacheStore({
        workspaceRoot,
        dir: config.cache?.dir,
        maxEntriesPerEval: config.cache?.maxEntriesPerEval
      });
      await loadPersistedRuns();
      await runner.refreshDiscovery();
      if (watchForChanges) await setupWatcher();
    },

    /** List the entries of the cache store. */
    async listCache() {
      return cacheStore.list();
    },

    /** Clear cache entries matching `filter`. */
    async clearCache(filter) {
      await cacheStore.clear(filter);
    },

    /**
     * Reload an eval's module and recompute case statuses of its runs from
     * the currently declared score thresholds.
     *
     * @returns `{ updatedRuns }` (0 when the eval is unknown or unregistered).
     */
    async recomputeStatusesForEval(evalId) {
      const evalMeta = evals.get(evalId);
      if (!evalMeta) return { updatedRuns: 0 };
      const registry = getEvalRegistry();
      await loadEvalModule(evalMeta.sourceFilePath, evalMeta.sourceFingerprint ?? undefined);
      const entry = registry.get(evalId);
      if (!entry) return { updatedRuns: 0 };

      const scoreThresholds = new Map();
      entry.use((evalDef) => {
        for (const [key, def] of Object.entries(evalDef.scores ?? {})) {
          const threshold = normalizeScoreDef(def).passThreshold;
          if (threshold !== undefined) scoreThresholds.set(key, threshold);
        }
        for (const [key, def] of Object.entries(evalDef.manualScores ?? {})) {
          if (def.passThreshold !== undefined) scoreThresholds.set(key, def.passThreshold);
        }
      });

      const updatedRuns = await recomputeEvalStatusesInRuns({
        runs: runs.values(),
        evalId,
        evalExists: evals.has(evalId),
        scoreThresholds,
        persistCaseDetail
      });
      emitDiscoveryEvent();
      return { updatedRuns };
    },

    /**
     * Delete every non-running run that touches the given eval, both from
     * memory and from disk.
     *
     * @returns `{ deletedRuns }`.
     */
    async cleanRunsForEval(evalId) {
      let deletedRuns = 0;
      // Iterate over a copy because entries are deleted while looping.
      for (const [runId, run] of [...runs]) {
        const touchesEval = runTouchesEval({
          target: run.manifest.target,
          caseRows: run.cases,
          evalId,
          evalExists: evals.has(evalId)
        });
        if (!touchesEval) continue;
        if (run.manifest.status === "running") continue;
        runs.delete(runId);
        await rm(run.runDir, { recursive: true, force: true });
        deletedRuns += 1;
      }
      emitDiscoveryEvent();
      return { deletedRuns };
    },

    /**
     * Set a manual score on a case of a finished run, recompute the case
     * status and the run summary counters, and persist both.
     *
     * @returns `{ updated: false, reason }` when a precondition fails,
     *   otherwise `{ updated: true, run, caseDetail }`.
     */
    async updateManualScore({ runId, caseId, scoreKey, value }) {
      const run = runs.get(runId);
      if (!run) return { updated: false, reason: "Run not found" };
      if (run.manifest.status === "running") return { updated: false, reason: "Run is still running" };
      const caseRow = run.cases.find((row) => row.caseId === caseId);
      if (!caseRow) return { updated: false, reason: "Case not found" };
      const evalMeta = evals.get(caseRow.evalId);
      if (!evalMeta) return { updated: false, reason: "Eval not found" };
      if (evalMeta.columnDefs.find((def) => def.key === scoreKey)?.isManualScore !== true) {
        return { updated: false, reason: "Manual score not found" };
      }
      const caseDetail = run.caseDetails.get(caseId);
      if (!caseDetail) return { updated: false, reason: "Case detail not found" };

      caseRow.columns[scoreKey] = value;
      caseDetail.columns[scoreKey] = value;

      const scoreThresholds = new Map();
      for (const def of evalMeta.columnDefs) {
        if (def.isScore !== true || def.passThreshold === undefined) continue;
        scoreThresholds.set(def.key, def.passThreshold);
      }
      const nextStatus = recomputePersistedCaseStatus(caseRow, caseDetail, scoreThresholds);
      caseRow.status = nextStatus;
      caseDetail.status = nextStatus;

      const derivedSummary = deriveScopedSummaryFromCases({ caseRows: run.cases });
      run.summary.totalCases = derivedSummary.totalCases;
      run.summary.passedCases = derivedSummary.passedCases;
      run.summary.failedCases = derivedSummary.failedCases;
      run.summary.errorCases = derivedSummary.errorCases;
      run.summary.cancelledCases = derivedSummary.cancelledCases;
      run.summary.totalDurationMs = derivedSummary.totalDurationMs;

      await persistCaseDetail(run.runDir, caseDetail);
      await persistRunState(run);
      emitDiscoveryEvent();
      return {
        updated: true,
        run: {
          manifest: run.manifest,
          summary: run.summary,
          cases: run.cases
        },
        caseDetail
      };
    },

    /** Delete a non-running run from memory and disk. */
    async deleteRun(runId) {
      const run = runs.get(runId);
      if (!run) return { deleted: false };
      if (run.manifest.status === "running") return { deleted: false };
      runs.delete(runId);
      await rm(run.runDir, { recursive: true, force: true });
      emitDiscoveryEvent();
      return { deleted: true };
    },

    /** Summaries of all known evals, ordered by file path. */
    getEvals() {
      const gitState = readGitWorktreeState(workspaceRoot);
      const result = [];
      for (const meta of getSortedEvalMetas()) {
        result.push(buildEvalSummary({
          meta,
          config,
          gitState,
          latestRun: latestRunInfoMap.get(meta.id),
          lastRunStatus: lastRunStatusMap.get(meta.id) ?? null
        }));
      }
      return result;
    },

    /** Summary of one eval, or `undefined` when the id is unknown. */
    getEval(id) {
      const meta = evals.get(id);
      if (!meta) return undefined;
      return buildEvalSummary({
        meta,
        config,
        gitState: readGitWorktreeState(workspaceRoot),
        latestRun: latestRunInfoMap.get(meta.id),
        lastRunStatus: lastRunStatusMap.get(meta.id) ?? null
      });
    },

    /**
     * Re-scan the include patterns and rebuild the known eval metas.
     *
     * The new metas are collected in a local map and swapped into `evals`
     * in one synchronous step, so code running between the awaits below
     * never observes a half-filled map. Files matched by more than one
     * pattern are processed once.
     */
    async refreshDiscovery() {
      const discovered = new Set();
      for (const pattern of config.include) {
        const files = await glob(pattern, {
          cwd: workspaceRoot,
          absolute: true
        });
        for (const file of files) discovered.add(file);
      }

      const nextEvals = new Map();
      for (const filePath of discovered) {
        try {
          const content = await readFile(filePath, "utf-8");
          const discoveredMetas = parseEvalMetas(filePath, content);
          const sourceFingerprint = getSourceFingerprint(content);
          const registry = getEvalRegistry();
          try {
            await loadEvalModule(filePath, sourceFingerprint);
          } catch {
            // Best effort: a module that fails to load is still listed from
            // the static parse, just without declared columns/stats/charts.
          }
          for (const meta of discoveredMetas) {
            const discoveredEntry = registry.get(meta.id);
            const title = meta.title;
            let columnDefs = buildDeclaredColumnDefs(undefined, undefined, undefined);
            let stats;
            let charts;
            discoveredEntry?.use((evalDef) => {
              columnDefs = buildDeclaredColumnDefs(evalDef.columns, evalDef.scores, evalDef.manualScores);
              stats = evalDef.stats;
              const validated = validateCharts({
                charts: evalDef.charts,
                columnDefs,
                evalId: meta.id
              });
              for (const warning of validated.warnings) console.warn(warning);
              charts = validated.charts;
            });
            nextEvals.set(meta.id, {
              id: meta.id,
              title,
              filePath: toWorkspaceRelativePath(meta.filePath),
              sourceFilePath: meta.filePath,
              sourceFingerprint,
              columnDefs,
              caseCount: null,
              stats,
              charts
            });
          }
        } catch {
          // Best effort: an unreadable or unparsable file is skipped so the
          // remaining files are still discovered.
        }
      }

      // `evals` is shared by reference with run children, so it is updated
      // in place rather than replaced.
      evals.clear();
      for (const [id, meta] of nextEvals) evals.set(id, meta);
      emitDiscoveryEvent();
    },

    /**
     * Register a new run, write its manifest and child context to disk and
     * spawn the run child.
     *
     * @returns `{ manifest, summary, cases: [] }` for the started run.
     */
    async startRun(request) {
      const runId = generateRunId();
      const shortId = `r${String(nextShortIdNum++)}`;
      const now = new Date().toISOString();
      const cacheMode = request.cache?.mode ?? "use";
      const runDir = join(localStateDir, "runs", runId);
      const manifest = {
        id: runId,
        shortId,
        status: "running",
        startedAt: now,
        endedAt: null,
        commitSha: readGitWorktreeState(workspaceRoot).commitSha,
        evalSourceFingerprints: {},
        target: request.target,
        trials: request.trials,
        trialSelection: config.trialSelection ?? "lowestScore",
        cacheMode
      };
      const summary = {
        runId,
        status: "running",
        totalCases: 0,
        passedCases: 0,
        failedCases: 0,
        errorCases: 0,
        cancelledCases: 0,
        totalDurationMs: null,
        errorMessage: null
      };
      const runState = {
        runDir,
        manifest,
        summary,
        cases: [],
        caseDetails: new Map(),
        listeners: new Set(),
        childProcess: undefined,
        childTerminalReceived: false
      };
      runs.set(runId, runState);
      setLatestRunInfoMap({
        latestRunInfoMap,
        evalIds: getTargetEvalIds({
          request,
          sortedEvalIds: getSortedEvalMetas().map((meta) => meta.id),
          knownEvalIds: new Set(evals.keys())
        }),
        info: {
          status: "running",
          startedAt: now,
          commitSha: manifest.commitSha ?? null,
          evalSourceFingerprint: null
        }
      });

      await mkdir(runDir, { recursive: true });
      await mkdir(join(runDir, "traces"), { recursive: true });
      await mkdir(join(runDir, "artifacts"), { recursive: true });
      await mkdir(join(runDir, "case-details"), { recursive: true });
      await writeFile(join(runDir, "run.json"), JSON.stringify(manifest, null, 2));

      const childContext = {
        request,
        workspaceRoot,
        runDir,
        manifest,
        summary,
        evals: getSortedEvalMetas()
      };
      await writeFile(join(runDir, "run-child-context.json"), JSON.stringify(childContext, null, 2));

      startRunChild({
        runState,
        contextPath: join(runDir, "run-child-context.json"),
        managerContext: {
          workspaceRoot,
          evals,
          emitEvent,
          emitDiscoveryEvent
        }
      });
      return {
        manifest,
        summary,
        cases: []
      };
    },

    /** Manifests of all known runs. */
    getRuns() {
      return [...runs.values()].map((r) => r.manifest);
    },

    /** Manifest, summary and case rows of one run, or `undefined`. */
    getRun(id) {
      const run = runs.get(id);
      if (!run) return undefined;
      return {
        manifest: run.manifest,
        summary: run.summary,
        cases: run.cases
      };
    },

    /**
     * Cancel a running run: mark it cancelled, recompute its counters, kill
     * the child, persist the state and notify listeners.
     */
    async cancelRun(id) {
      const run = runs.get(id);
      if (!run) return;
      if (run.manifest.status !== "running") return;
      const endedAt = new Date();
      run.manifest.status = "cancelled";
      run.manifest.endedAt = endedAt.toISOString();
      run.summary.status = "cancelled";
      const derivedSummary = deriveScopedSummaryFromCases({
        caseRows: run.cases,
        lifecycleStatus: "cancelled"
      });
      run.summary.totalCases = derivedSummary.totalCases;
      run.summary.passedCases = derivedSummary.passedCases;
      run.summary.failedCases = derivedSummary.failedCases;
      run.summary.errorCases = derivedSummary.errorCases;
      run.summary.cancelledCases = derivedSummary.cancelledCases;
      // Duration is wall-clock time from start to cancellation.
      run.summary.totalDurationMs = endedAt.getTime() - new Date(run.manifest.startedAt).getTime();
      killRunChild(run);
      await persistRunState(run);
      emitEvent(run, {
        type: "run.cancelled",
        runId: id,
        timestamp: new Date().toISOString(),
        payload: run.summary
      });
      emitDiscoveryEvent();
    },

    /** Stored detail of one case of a run, or `undefined`. */
    getCaseDetail(runId, caseId) {
      const run = runs.get(runId);
      if (!run) return undefined;
      return run.caseDetails.get(caseId);
    },

    /**
     * Subscribe to the events of one run.
     *
     * @returns An unsubscribe function (a no-op when the run is unknown).
     */
    subscribe(runId, listener) {
      const run = runs.get(runId);
      if (!run) return () => {};
      run.listeners.add(listener);
      return () => {
        run.listeners.delete(listener);
      };
    },

    /** Subscribe to discovery updates; returns an unsubscribe function. */
    subscribeDiscovery(listener) {
      discoveryListeners.add(listener);
      return () => {
        discoveryListeners.delete(listener);
      };
    },

    /** Cancel any pending discovery refresh and close the file watcher. */
    async close() {
      if (discoveryRefreshTimer !== undefined) {
        clearTimeout(discoveryRefreshTimer);
        discoveryRefreshTimer = undefined;
      }
      const watcher = discoveryWatcher;
      if (watcher === undefined) return;
      discoveryWatcher = undefined;
      await watcher.close();
    },

    /** Workspace root resolved by `init()`. */
    getWorkspaceRoot() {
      return workspaceRoot;
    },

    /** Resolve an artifact id to a path under the runs directory. */
    getArtifactPath(artifactId_) {
      return resolveArtifactPath(join(localStateDir, "runs"), artifactId_);
    }
  };

  /**
   * Start the file watcher on the include patterns' roots and wait until it
   * is ready. File and directory changes schedule a debounced (50 ms)
   * discovery refresh.
   */
  async function setupWatcher() {
    const watcher = watch(getWatchRootsForIncludePatterns({
      patterns: config.include,
      workspaceRoot
    }), {
      ignoreInitial: true,
      persistent: true
    });
    discoveryWatcher = watcher;

    const scheduleRefresh = () => {
      if (discoveryRefreshTimer !== undefined) clearTimeout(discoveryRefreshTimer);
      discoveryRefreshTimer = setTimeout(() => {
        discoveryRefreshTimer = undefined;
        // The timer callback cannot await; report failures instead of
        // leaving an unhandled rejection.
        runner.refreshDiscovery().catch((error) => {
          console.error("Failed to refresh eval discovery:", error);
        });
      }, 50);
    };

    watcher.on("change", scheduleRefresh);
    watcher.on("add", scheduleRefresh);
    watcher.on("unlink", scheduleRefresh);
    watcher.on("addDir", scheduleRefresh);
    watcher.on("unlinkDir", scheduleRefresh);
    // An 'error' event without a listener would be thrown by the emitter.
    watcher.on("error", (error) => {
      console.error("Eval discovery watcher error:", error);
    });

    await new Promise((ready) => {
      watcher.once("ready", ready);
    });
  }

  /**
   * Recompute the per-eval last-run status and latest-run info maps, then
   * notify discovery listeners with the current eval summaries.
   */
  function emitDiscoveryEvent() {
    const lastRunStatuses = getLastRunStatuses({
      runs: runs.values(),
      knownEvals: evals.values()
    });
    const latestRunInfos = getLatestRunInfos({
      runs: runs.values(),
      knownEvals: evals.values()
    });
    lastRunStatusMap.clear();
    for (const [evalId, status] of lastRunStatuses) lastRunStatusMap.set(evalId, status);
    latestRunInfoMap.clear();
    for (const [evalId, info] of latestRunInfos) latestRunInfoMap.set(evalId, info);

    const event = {
      type: "discovery.updated",
      timestamp: new Date().toISOString(),
      payload: runner.getEvals()
    };
    for (const listener of discoveryListeners) {
      try {
        listener(event);
      } catch {
        // Same policy as `emitEvent`: one failing listener must not stop
        // delivery to the others or fail the operation that triggered it.
      }
    }
  }

  /** Deliver a run event to every listener of the run, isolating failures. */
  function emitEvent(runState, event) {
    for (const listener of runState.listeners) {
      try {
        listener(event);
      } catch {
        // Listener failures are deliberately ignored.
      }
    }
  }

  /** Replace the in-memory runs with the snapshots persisted on disk. */
  async function loadPersistedRuns() {
    runs.clear();
    const persistedRuns = await loadPersistedRunSnapshots(localStateDir);
    nextShortIdNum = nextShortIdFromSnapshots(persistedRuns);
    for (const persistedRun of persistedRuns) {
      runs.set(persistedRun.manifest.id, {
        ...persistedRun,
        listeners: new Set(),
        childProcess: undefined,
        childTerminalReceived: false
      });
    }
  }

  return runner;
}
873
+ //#endregion
874
+ //#region src/cli.ts
875
/**
 * Parse raw CLI arguments into a normalized options object.
 *
 * - `--no-env` may appear anywhere; it is stripped before positional parsing
 *   and turns `loadEnv` off.
 * - The first remaining argument selects the command; `cache` additionally
 *   accepts a `list` / `clear` subcommand.
 * - Unknown positional commands are recorded in `unknownHelpTarget`.
 * - Flags with an invalid value keep their default instead of storing garbage
 *   (this is what `--cache` already did): `--trials` must be an integer >= 1,
 *   `--port` an integer in 0..65535.
 * - `--eval` / `--case` accept comma-separated ids; surrounding whitespace and
 *   empty segments (e.g. from a trailing comma) are dropped.
 *
 * @param argv Raw command-line arguments excluding the executable name.
 * @returns The parsed options object.
 */
function parseArgs(argv) {
	const normalizedArgv = argv.filter((arg) => arg !== "--no-env");
	const args = {
		command: "help",
		subcommand: void 0,
		showHelp: false,
		helpTopic: "global",
		unknownHelpTarget: void 0,
		evalIds: [],
		caseIds: [],
		trials: 1,
		json: false,
		port: 4100,
		cacheMode: "use",
		clearCache: false,
		all: false,
		loadEnv: normalizedArgv.length === argv.length
	};
	// Split a comma-separated id list, discarding blank segments.
	const splitIds = (raw) => raw.split(",").map((id) => id.trim()).filter((id) => id !== "");
	// Return the integer value of `raw` when it lies in [min, max], else undefined.
	const parseBoundedInteger = (raw, min, max) => {
		if (raw.trim() === "") return void 0;
		const value = Number(raw);
		return Number.isInteger(value) && value >= min && value <= max ? value : void 0;
	};
	const command = normalizedArgv[0];
	if (command === "--help" || command === "-h") {
		args.showHelp = true;
		return args;
	}
	if (isCliCommand(command)) {
		args.command = command;
		args.helpTopic = command === "help" ? "global" : command;
	} else if (command !== void 0 && !command.startsWith("-")) args.unknownHelpTarget = command;
	let cursor = 1;
	if (args.command === "cache") {
		const sub = normalizedArgv[cursor];
		if (sub === "list" || sub === "clear") {
			args.subcommand = sub;
			args.helpTopic = `cache ${sub}`;
			cursor++;
		} else if (sub !== void 0 && !sub.startsWith("-")) args.unknownHelpTarget = `cache ${sub}`;
	}
	for (let i = cursor; i < normalizedArgv.length; i++) {
		const arg = normalizedArgv[i];
		const next = normalizedArgv[i + 1];
		if (arg === "--help" || arg === "-h") args.showHelp = true;
		else if (arg === "--eval" && next) {
			args.evalIds.push(...splitIds(next));
			i++;
		} else if (arg === "--case" && next) {
			args.caseIds.push(...splitIds(next));
			i++;
		} else if (arg === "--trials" && next) {
			args.trials = parseBoundedInteger(next, 1, Number.MAX_SAFE_INTEGER) ?? args.trials;
			i++;
		} else if (arg === "--json") args.json = true;
		else if (arg === "--port" && next) {
			args.port = parseBoundedInteger(next, 0, 65535) ?? args.port;
			i++;
		} else if (arg === "--cache" && next) {
			if (next === "use" || next === "bypass" || next === "refresh") args.cacheMode = next;
			i++;
		} else if (arg === "--no-cache") args.cacheMode = "bypass";
		else if (arg === "--refresh-cache") args.cacheMode = "refresh";
		else if (arg === "--clear-cache") args.clearCache = true;
		else if (arg === "--all") args.all = true;
	}
	return args;
}
938
/**
 * Entry point of the Agent Evals CLI, operating on the current workspace.
 *
 * Parses the arguments, optionally loads the workspace `.env`, then either
 * prints help or dispatches to the selected command handler. Commands without
 * a handler fall back to printing help.
 *
 * @param argv Raw command-line arguments excluding the executable name.
 */
async function runCli(argv) {
	const args = parseArgs(argv);
	const envReady = !args.loadEnv || loadWorkspaceEnv();
	if (!envReady) {
		process.exit(1);
		return;
	}
	if (args.showHelp) {
		if (args.unknownHelpTarget === void 0) {
			printHelp(args.helpTopic);
			return;
		}
		console.error(`No help found for "${args.unknownHelpTarget}".`);
		process.exit(1);
		return;
	}
	const handlers = new Map([
		["app", commandApp],
		["list", commandList],
		["run", commandRun],
		["cache", commandCache]
	]);
	const handler = handlers.get(args.command);
	if (handler === void 0) {
		printHelp(args.helpTopic);
		return;
	}
	await handler(args);
}
976
/**
 * Tell whether `command` is one of the top-level CLI commands
 * (`app`, `list`, `run`, `cache`, `help`).
 */
function isCliCommand(command) {
	const knownCommands = new Set(["app", "list", "run", "cache", "help"]);
	return knownCommands.has(command);
}
979
/**
 * Load `.env` from the current working directory into the process
 * environment, if such a file exists.
 *
 * @returns `true` when there is no `.env` or it loaded successfully; `false`
 * (after logging the failure to stderr) when loading threw.
 */
function loadWorkspaceEnv() {
	const envPath = resolve(process.cwd(), ".env");
	if (existsSync(envPath)) {
		const loadResult = resultify(() => {
			process.loadEnvFile(envPath);
		});
		if (loadResult.error) {
			console.error(`Failed to load .env at ${envPath}: ${loadResult.error.message}`);
			return false;
		}
	}
	return true;
}
991
// Directory containing this module file.
const currentDir = dirname(fileURLToPath(import.meta.url));
// Three directory levels above this module; used to look for the repo's web workspace.
const repoRoot = resolve(currentDir, "..", "..", "..");
// Executable name used to invoke pnpm; win32 uses the `.cmd` name.
const pnpmCommand = process.platform !== "win32" ? "pnpm" : "pnpm.cmd";
994
/**
 * Report whether `apps/web/package.json` exists under `repoRoot`.
 */
function hasRepoWebWorkspace() {
	const webManifestPath = resolve(repoRoot, "apps/web/package.json");
	return existsSync(webManifestPath);
}
997
/**
 * Build the web UI with pnpm when the repo's web workspace is present.
 *
 * Does nothing when `hasRepoWebWorkspace()` is false. Otherwise runs
 * `pnpm --filter @agent-evals/web build` in `repoRoot` with inherited stdio.
 *
 * @returns A promise that resolves when the build exits with code 0.
 * @throws Rejects when the process cannot be spawned, is stopped by a signal,
 * or exits with a non-zero code.
 */
async function ensureWebUiIsBuilt() {
	if (!hasRepoWebWorkspace()) return;
	console.info("Preparing web UI...");
	const buildArgs = [
		"--filter",
		"@agent-evals/web",
		"build"
	];
	await new Promise((resolvePromise, rejectPromise) => {
		// On Windows `pnpm.cmd` is a batch script, and current Node.js releases
		// refuse to spawn `.cmd`/`.bat` files without a shell (EINVAL). Run it
		// through the shell there, as one command string so no argument array is
		// combined with `shell: true`. All parts are fixed constants, so there
		// is nothing to escape.
		const child = process.platform === "win32" ? spawn([pnpmCommand, ...buildArgs].join(" "), {
			cwd: repoRoot,
			stdio: "inherit",
			shell: true
		}) : spawn(pnpmCommand, buildArgs, {
			cwd: repoRoot,
			stdio: "inherit"
		});
		child.once("error", (error) => {
			rejectPromise(error);
		});
		child.once("exit", (code, signal) => {
			if (signal) {
				rejectPromise(/* @__PURE__ */ new Error(`Web UI build stopped with signal ${signal}.`));
				return;
			}
			if (code !== 0) {
				rejectPromise(/* @__PURE__ */ new Error(`Web UI build failed with exit code ${String(code)}.`));
				return;
			}
			resolvePromise();
		});
	});
}
1025
/**
 * Check that `mod` is an object with an `app` property that is itself a
 * non-null object exposing a `fetch` function.
 */
function isHonoAppModule(mod) {
	if (mod === null || typeof mod !== "object") return false;
	if (!("app" in mod)) return false;
	const candidate = mod.app;
	if (candidate === null || typeof candidate !== "object") return false;
	return "fetch" in candidate && typeof candidate.fetch === "function";
}
1030
/**
 * Check that `mod` is a non-null object exposing an `initRunner` function.
 */
function isServerRunnerModule(mod) {
	const isObject = typeof mod === "object" && mod !== null;
	return isObject && "initRunner" in mod && typeof mod.initRunner === "function";
}
1034
/**
 * `app` command: make sure the web UI is built, load the server modules,
 * initialize the runner and start serving on `args.port`.
 *
 * @throws When the loaded app or runner module does not have the expected shape.
 */
async function commandApp(args) {
	await ensureWebUiIsBuilt();
	const nodeServer = await import("@hono/node-server");
	// Advertise the bundled UI location before the app module is imported
	// (presumably the app module reads this variable — confirm there).
	const bundledWebDist = resolve(currentDir, "apps/web/dist");
	if (existsSync(bundledWebDist)) {
		process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
	}
	const appModule = await import("./app-ZFLdu8-r.mjs");
	const runnerModule = await import("./runner--XPZ5D7N.mjs");
	if (!isHonoAppModule(appModule)) {
		throw new Error("Server app module is invalid");
	}
	if (!isServerRunnerModule(runnerModule)) {
		throw new Error("Server runner module is invalid");
	}
	await runnerModule.initRunner();
	const { port } = args;
	console.info(`Agent Evals app: http://localhost:${String(port)}`);
	nodeServer.serve({
		fetch: appModule.app.fetch,
		port
	});
}
1050
/**
 * `list` command: discover evals (without file watching) and print each
 * one's title, id, file, non-pending status and case count.
 */
async function commandList(args_) {
	const runner = createRunner({ watchForChanges: false });
	await runner.init();
	const discovered = runner.getEvals();
	if (discovered.length === 0) {
		console.info("No eval files found.");
		return;
	}
	console.info("Discovered evals:\n");
	for (const entry of discovered) {
		const { freshnessStatus, stale, outdated, lastRunStatus } = entry;
		const displayStatus = getEvalDisplayStatus({
			freshnessStatus,
			stale,
			outdated,
			lastRunStatus
		});
		const title = getEvalTitle(entry);
		console.info(` ${title}`);
		console.info(` id: ${entry.id}`);
		console.info(` file: ${entry.filePath}`);
		// "pending" is not printed; a null case count is omitted as well.
		if (displayStatus !== "pending") {
			console.info(` status: ${displayStatus}`);
		}
		if (entry.caseCount !== null) {
			console.info(` cases: ${String(entry.caseCount)}`);
		}
		console.info("");
	}
}
1075
/**
 * `run` command: start a run for the selected target, wait until it reaches a
 * terminal state, print the summary (JSON or text) and set the exit code.
 *
 * Target selection: case ids (optionally narrowed by eval ids) take priority,
 * then eval ids, otherwise everything.
 *
 * Exits with code 1 when the run can no longer be found, when the run itself
 * ended as `error` or `cancelled`, or when any case failed or errored. Without
 * the run-status check, a run that aborted before recording any failing case
 * would be reported as success.
 */
async function commandRun(args) {
	const runner = createRunner({ watchForChanges: false });
	await runner.init();
	if (args.clearCache) {
		await runner.clearCache();
		if (!args.json) {
			console.info("Cleared cache before run.");
			console.info("");
		}
	}
	let target;
	if (args.caseIds.length > 0) target = {
		mode: "caseIds",
		caseIds: args.caseIds,
		evalIds: args.evalIds.length > 0 ? args.evalIds : void 0
	};
	else if (args.evalIds.length > 0) target = {
		mode: "evalIds",
		evalIds: args.evalIds
	};
	else target = { mode: "all" };
	const run = await runner.startRun({
		target,
		trials: args.trials,
		cache: { mode: args.cacheMode }
	});
	if (!args.json) {
		console.info(`Run started: ${run.manifest.id}`);
		console.info(`Trials: ${String(args.trials)}`);
		if (args.cacheMode !== "use") console.info(`Cache mode: ${args.cacheMode}`);
		console.info("");
	}
	await waitForRunCompletion(runner, run.manifest.id);
	const finalRun = runner.getRun(run.manifest.id);
	if (!finalRun) {
		process.exit(1);
		return;
	}
	const { summary } = finalRun;
	if (args.json) console.info(JSON.stringify(summary, null, 2));
	else {
		console.info("--- Run Summary ---");
		console.info(`Status: ${summary.status}`);
		console.info(`Total: ${String(summary.totalCases)}`);
		console.info(`Passed: ${String(summary.passedCases)}`);
		console.info(`Failed: ${String(summary.failedCases)}`);
		console.info(`Errors: ${String(summary.errorCases)}`);
		if (summary.totalCases > 0) console.info(`Pass Rate: ${String(summary.passedCases)}/${String(summary.totalCases)}`);
		if (summary.totalDurationMs !== null) console.info(`Duration: ${(summary.totalDurationMs / 1e3).toFixed(1)}s`);
	}
	// A run that ended abnormally is a failure even with zero failed/errored cases.
	const runStatus = finalRun.manifest.status;
	const endedAbnormally = runStatus === "error" || runStatus === "cancelled";
	if (endedAbnormally || summary.failedCases > 0 || summary.errorCases > 0) process.exit(1);
}
1124
/**
 * `cache` command.
 *
 * - `cache list` (or no subcommand): print the cache entries, as JSON when
 *   `--json` is set.
 * - `cache clear --eval <id>`: remove every entry whose namespace starts with
 *   `<id>__` for any of the given eval ids. The cache is listed once and each
 *   matching entry is cleared once, even when several ids match it.
 * - `cache clear --all`: remove every entry.
 * - `cache clear` without either flag refuses and exits with code 1.
 */
async function commandCache(args) {
	const runner = createRunner({ watchForChanges: false });
	await runner.init();
	if (args.subcommand === "list" || args.subcommand === void 0) {
		const entries = await runner.listCache();
		if (args.json) {
			console.info(JSON.stringify(entries, null, 2));
			return;
		}
		if (entries.length === 0) {
			console.info("No cache entries.");
			return;
		}
		console.info(`Cache entries (${String(entries.length)}):\n`);
		for (const entry of entries) {
			console.info(` ${entry.namespace}`);
			console.info(` key: ${entry.key}`);
			const operationLabel = entry.operationType === "span" ? `${entry.operationName} (span ${entry.spanKind ?? "unknown"})` : `${entry.operationName} (value)`;
			console.info(` operation: ${operationLabel}`);
			console.info(` stored: ${entry.storedAt}`);
			console.info(` size: ${String(entry.sizeBytes)} bytes`);
			console.info("");
		}
		return;
	}
	if (args.subcommand === "clear") {
		if (args.evalIds.length > 0) {
			// List once up front instead of once per eval id.
			const prefixes = args.evalIds.map((evalId) => `${evalId}__`);
			const entries = await runner.listCache();
			const matching = entries.filter((entry) => prefixes.some((prefix) => entry.namespace.startsWith(prefix)));
			for (const entry of matching) await runner.clearCache({
				namespace: entry.namespace,
				key: entry.key
			});
			console.info(`Cleared cache entries for: ${args.evalIds.join(", ")}`);
			return;
		}
		if (args.all) {
			await runner.clearCache();
			console.info("Cleared all cache entries.");
			return;
		}
		console.info("Refusing to clear cache without --eval <id> or --all. Use one of these flags to confirm.");
		process.exit(1);
		return;
	}
	printHelp(args.helpTopic);
}
1174
/**
 * Poll `runner.getRun(runId)` until the run is gone or its manifest status is
 * `completed`, `cancelled` or `error`.
 *
 * The first check happens immediately; later checks are spaced by
 * `pollIntervalMs`. If `getRun` throws on any poll, the returned promise
 * rejects with that error instead of staying pending forever.
 *
 * @param runner Object exposing `getRun(runId)`.
 * @param runId Identifier of the run to wait for.
 * @param pollIntervalMs Delay between polls in milliseconds (default 200).
 * @returns A promise that resolves with no value once the run is terminal or missing.
 */
async function waitForRunCompletion(runner, runId, pollIntervalMs = 200) {
	const terminalStatuses = new Set(["completed", "cancelled", "error"]);
	for (;;) {
		const run = runner.getRun(runId);
		if (!run || terminalStatuses.has(run.manifest.status)) return;
		await new Promise((wake) => {
			setTimeout(wake, pollIntervalMs);
		});
	}
}
1187
+ function printHelp(topic = "global") { // Print usage text for the given help topic via console.info; topics not matched below get the global overview.
1188
+ if (topic === "app") { // Help page for the "app" command.
1189
+ console.info(`
1190
+ agent-evals app - Start server with UI
1191
+
1192
+ Usage:
1193
+ agent-evals app [flags]
1194
+
1195
+ Flags:
1196
+ --port <n> Server port (default: 4100)
1197
+ --no-env Disable automatic .env loading
1198
+ --help, -h Show this help
1199
+ `);
1200
+ return;
1201
+ }
1202
+ if (topic === "list") { // Help page for the "list" command.
1203
+ console.info(`
1204
+ agent-evals list - List discovered evals
1205
+
1206
+ Usage:
1207
+ agent-evals list [flags]
1208
+
1209
+ Flags:
1210
+ --no-env Disable automatic .env loading
1211
+ --help, -h Show this help
1212
+ `);
1213
+ return;
1214
+ }
1215
+ if (topic === "run") { // Help page for the "run" command.
1216
+ console.info(`
1217
+ agent-evals run - Run evals
1218
+
1219
+ Usage:
1220
+ agent-evals run [flags]
1221
+
1222
+ Flags:
1223
+ --eval <id> Run specific eval(s) (comma-separated)
1224
+ --case <id> Run specific case(s) (comma-separated)
1225
+ --trials <n> Number of trials per case
1226
+ --inspect[=host:port] Run with the Node.js inspector enabled
1227
+ --inspect-brk[=host:port] Enable inspector and pause before startup
1228
+ --json Output run summary as JSON
1229
+ --cache <use|bypass|refresh> Cache mode for this run (default: use)
1230
+ --no-cache Shortcut for --cache bypass
1231
+ --refresh-cache Shortcut for --cache refresh
1232
+ --clear-cache Clear the cache before starting the run
1233
+ --no-env Disable automatic .env loading
1234
+ --help, -h Show this help
1235
+ `);
1236
+ return;
1237
+ }
1238
+ if (topic === "cache" || topic === "cache list" || topic === "cache clear") { // All three cache topics share one help page.
1239
+ console.info(`
1240
+ agent-evals cache - Manage cached operation entries
1241
+
1242
+ Usage:
1243
+ agent-evals cache list [flags]
1244
+ agent-evals cache clear --eval <id>
1245
+ agent-evals cache clear --all
1246
+
1247
+ Flags:
1248
+ --eval <id> Clear entries for specific eval(s) (comma-separated)
1249
+ --all Confirm clearing every cached entry
1250
+ --json Output cache listing as JSON
1251
+ --no-env Disable automatic .env loading
1252
+ --help, -h Show this help
1253
+ `);
1254
+ return;
1255
+ } // No topic matched above: fall through to the global overview.
1256
+ console.info(`
1257
+ agent-evals - LLM/Agent eval runner
1258
+
1259
+ Commands:
1260
+ app Start server with UI
1261
+ list List discovered evals
1262
+ run Run evals
1263
+ cache list List cached operation entries
1264
+ cache clear --eval <id> Clear cache entries for one eval
1265
+ cache clear --all Clear every cached entry
1266
+ help Show this help
1267
+
1268
+ Options:
1269
+ --eval <id> Run specific eval(s) (comma-separated)
1270
+ --case <id> Run specific case(s) (comma-separated)
1271
+ --trials <n> Number of trials per case
1272
+ --inspect[=host:port] Run with the Node.js inspector enabled
1273
+ --inspect-brk[=host:port] Enable inspector and pause before startup
1274
+ --json Output results as JSON
1275
+ --port <n> Server port (default: 4100)
1276
+ --cache <use|bypass|refresh> Cache mode for this run (default: use)
1277
+ --no-cache Shortcut for --cache bypass
1278
+ --refresh-cache Shortcut for --cache refresh
1279
+ --clear-cache Clear the cache before starting the run
1280
+ --no-env Disable automatic .env loading
1281
+ --help, -h Show help
1282
+ `);
1283
+ }
1284
+ //#endregion
1285
+ export { createRunner as n, runCli as t };