@ls-stack/agent-eval 0.27.1 → 0.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,25 +1,149 @@
1
- import { B as runSummarySchema, C as validateCharts, F as getEvalDisplayStatus, I as deriveScopedSummaryFromCases, P as getEvalTitle, S as normalizeScoreDef, Wn as getEvalRegistry, _ as loadEvalModule, a as getLastRunStatuses, b as loadConfig, c as loadPersistedRunSnapshots, ct as getCaseRowEvalKey, d as persistRunState, f as recomputeEvalStatusesInRuns, g as deriveEvalFreshness, h as resolveArtifactPath, i as generateRunId, l as nextShortIdFromSnapshots, m as runTouchesEval, n as getTargetEvalKeys, nt as resolveLlmCallsConfig, o as getLatestRunInfos, ot as buildEvalKey, p as recomputePersistedCaseStatus, s as loadPersistedRunSnapshot, st as getCaseRowCaseKey, tt as resolveApiCallsConfig, u as persistCaseDetail, v as parseEvalDiscovery, w as createFsCacheStore, x as buildDeclaredColumnDefs, y as resolveEvalDefaultConfig } from "./runOrchestration-FEvBwwJI.mjs";
2
- import { createHash } from "node:crypto";
3
- import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
4
- import { dirname, join, relative, resolve } from "node:path";
1
+ import { B as getEvalDisplayStatus, C as loadConfig, D as createFsCacheStore, E as validateCharts, G as runSummarySchema, L as applyDerivedCallAttributes, S as resolveEvalDefaultConfig, T as normalizeScoreDef, V as deriveScopedSummaryFromCases, _ as buildManualInputDescriptor, _t as getCaseRowEvalKey, a as getLastRunStatuses, b as loadEvalModule, c as loadPersistedRunSnapshots, d as persistRunState, dt as resolveLlmCallsConfig, f as recomputeEvalStatusesInRuns, g as resolveArtifactPath, gt as getCaseRowCaseKey, h as resolveTracePresentation, ht as buildEvalKey, i as generateRunId, l as nextShortIdFromSnapshots, m as runTouchesEval, n as getTargetEvalKeys, o as getLatestRunInfos, p as recomputePersistedCaseStatus, s as loadPersistedRunSnapshot, u as persistCaseDetail, ur as getEvalRegistry, ut as resolveApiCallsConfig, v as parseManualInputValues, w as buildDeclaredColumnDefs, x as parseEvalDiscovery, y as deriveEvalFreshness, z as getEvalTitle } from "./runOrchestration-CIARrLs6.mjs";
2
+ import { createHash, randomUUID } from "node:crypto";
3
+ import { copyFile, mkdir, readFile, rm, writeFile } from "node:fs/promises";
4
+ import { basename, dirname, extname, isAbsolute, join, relative, resolve, sep } from "node:path";
5
5
  import { watch } from "chokidar";
6
6
  import { glob } from "glob";
7
7
  import { existsSync } from "node:fs";
8
8
  import { resultify } from "t-result";
9
9
  import { fileURLToPath } from "node:url";
10
10
  import { spawn, spawnSync } from "node:child_process";
11
+ //#region ../runner/src/configReload.ts
12
/**
 * Coordinates idle-only reloads for `agent-evals.config.ts` in app mode.
 *
 * The controller watches `agent-evals.config.ts` in `process.cwd()`. When the
 * file changes it reloads runner state right away if no run is active;
 * otherwise it records the change as "pending" and performs the reload the
 * next time `reloadIfPendingAndIdle` is called with zero active runs. Every
 * transition made through `setState` is broadcast as a `config.reload` event.
 *
 * Reload failures (for example a config file saved with a syntax error) are
 * logged instead of propagated, the status leaves "reloading", and the config
 * watcher is re-created so the next edit can trigger another attempt.
 *
 * @param deps.getActiveRunCount Returns the number of currently active runs.
 * @param deps.closeRunnerWatchers Closes the runner's own watchers before a reload.
 * @param deps.loadRunnerState Reloads config and runner state.
 *   NOTE(review): presumably also re-creates the watchers — confirm against the caller.
 * @param deps.emitToDiscoveryListeners Delivers an event object to discovery listeners.
 * @returns An object exposing `close`, `currentState`, `reloadIfPendingAndIdle`
 *   and `setupWatcher`.
 */
function createConfigReloadController({ getActiveRunCount, closeRunnerWatchers, loadRunnerState, emitToDiscoveryListeners }) {
	let watcher;
	let reloadTimer;
	let reloadPromise;
	let state = {
		status: "idle",
		activeRunCount: 0,
		lastChangedAt: null,
		lastReloadedAt: null
	};
	/** Returns the stored state with a freshly read active-run count. */
	function currentState() {
		return {
			...state,
			activeRunCount: getActiveRunCount()
		};
	}
	/** Broadcasts the current state as a `config.reload` event. */
	function emitReloadEvent() {
		emitToDiscoveryListeners({
			type: "config.reload",
			timestamp: new Date().toISOString(),
			payload: currentState()
		});
	}
	/** Merges `patch` into the state, refreshes the run count, and notifies listeners. */
	function setState(patch) {
		state = {
			...state,
			...patch,
			activeRunCount: getActiveRunCount()
		};
		emitReloadEvent();
	}
	/** Cancels any scheduled reload and closes the config watcher, if one exists. */
	async function close() {
		if (reloadTimer !== void 0) {
			clearTimeout(reloadTimer);
			reloadTimer = void 0;
		}
		const watcherToClose = watcher;
		watcher = void 0;
		if (watcherToClose !== void 0) await watcherToClose.close();
	}
	/**
	 * Performs one reload. Never rejects because of a failing reload step: the
	 * error is logged and the controller is returned to a usable state.
	 */
	async function reloadConfigNow(changedAt) {
		const hadWatcher = watcher !== void 0;
		setState({
			status: "reloading",
			lastChangedAt: changedAt
		});
		let succeeded = true;
		try {
			await close();
			await closeRunnerWatchers();
			await loadRunnerState();
		} catch (error) {
			succeeded = false;
			console.error("Failed to reload agent-evals.config.ts:", error);
			// The config watcher was closed above; without it a later fix to the
			// config file would never be noticed, so bring it back.
			if (hadWatcher && watcher === void 0) try {
				await setupWatcher();
			} catch (watchError) {
				console.error("Failed to restore the agent-evals.config.ts watcher:", watchError);
			}
		}
		const completion = succeeded ? { lastReloadedAt: new Date().toISOString() } : {};
		// A change recorded while this reload was in flight flipped the status to
		// "pending"; keep it (and its newer timestamp) so that change is not lost.
		if (state.status === "pending") setState(completion);
		else setState({
			status: "idle",
			lastChangedAt: changedAt,
			...completion
		});
	}
	/** Starts a reload, or queues behind the one already in flight. */
	async function reloadConfig(changedAt) {
		if (reloadPromise !== void 0) {
			setState({
				status: "pending",
				lastChangedAt: changedAt
			});
			await reloadPromise;
			await reloadIfPendingAndIdle();
			return;
		}
		reloadPromise = reloadConfigNow(changedAt);
		try {
			await reloadPromise;
		} finally {
			reloadPromise = void 0;
		}
	}
	/** Reacts to a config file event: reload now when idle, otherwise mark pending. */
	async function handleConfigChanged() {
		const changedAt = new Date().toISOString();
		if (getActiveRunCount() > 0) {
			setState({
				status: "pending",
				lastChangedAt: changedAt
			});
			return;
		}
		await reloadConfig(changedAt);
	}
	/** Runs a deferred reload once no run is active; does nothing unless one is pending. */
	async function reloadIfPendingAndIdle() {
		if (state.status !== "pending") return;
		if (getActiveRunCount() > 0) {
			// Refresh the stored run count without emitting an event.
			state = currentState();
			return;
		}
		await reloadConfig(state.lastChangedAt ?? new Date().toISOString());
	}
	/** Starts watching the config file and resolves once the watcher reports "ready". */
	async function setupWatcher() {
		const nextWatcher = watch(resolve(process.cwd(), "agent-evals.config.ts"), {
			awaitWriteFinish: {
				stabilityThreshold: 100,
				pollInterval: 20
			},
			ignoreInitial: true,
			persistent: true
		});
		watcher = nextWatcher;
		// Collapse bursts of file events into a single reload 50ms after the last one.
		const scheduleReload = () => {
			if (reloadTimer !== void 0) clearTimeout(reloadTimer);
			reloadTimer = setTimeout(() => {
				reloadTimer = void 0;
				// Fire-and-forget: make sure a failure cannot surface as an unhandled rejection.
				handleConfigChanged().catch((error) => {
					console.error("Failed to handle agent-evals.config.ts change:", error);
				});
			}, 50);
		};
		nextWatcher.on("change", scheduleReload);
		nextWatcher.on("add", scheduleReload);
		nextWatcher.on("unlink", scheduleReload);
		await new Promise((ready) => {
			nextWatcher.once("ready", ready);
		});
	}
	return {
		close,
		currentState,
		reloadIfPendingAndIdle,
		setupWatcher
	};
}
134
+ //#endregion
11
135
  //#region ../runner/src/evalSummaries.ts
12
136
  /** Build the API/UI summary payload for one discovered eval. */
13
137
  function buildEvalSummary(params) {
14
138
  const { meta, config, gitState, latestRun, lastRunStatus } = params;
15
- const { sourceFingerprint, ...summaryMeta } = meta;
139
+ const { sourceFingerprint, manualInputDescriptor, requiresManualInput, ...summaryMeta } = meta;
16
140
  const freshness = deriveEvalFreshness({
17
141
  latestRun,
18
142
  gitState,
19
143
  currentEvalSourceFingerprint: sourceFingerprint,
20
144
  staleAfterDays: config.staleAfterDays ?? 14
21
145
  });
22
- return {
146
+ const summary = {
23
147
  ...summaryMeta,
24
148
  stale: freshness.stale,
25
149
  outdated: freshness.outdated,
@@ -29,6 +153,8 @@ function buildEvalSummary(params) {
29
153
  currentCommitSha: gitState.commitSha,
30
154
  lastRunStatus
31
155
  };
156
+ if (manualInputDescriptor && requiresManualInput) summary.manualInput = manualInputDescriptor;
157
+ return summary;
32
158
  }
33
159
  /** Write one latest-run snapshot to each targeted eval id. */
34
160
  function setLatestRunInfoMap(params) {
@@ -60,6 +186,343 @@ function readGitWorktreeState(workspaceRoot) {
60
186
  return { commitSha: commitResult.status === 0 ? commitResult.stdout : null };
61
187
  }
62
188
  //#endregion
189
+ //#region ../runner/src/manualInput/discovery.ts
190
/**
 * Classify an eval's `manualInput` declaration during discovery.
 *
 * Returns `{ kind: "none" }` when the eval declares no manual input, an
 * error-severity `{ kind: "issue" }` when it also declares `cases` or when its
 * schema cannot be turned into a descriptor, and otherwise `{ kind: "ok" }`
 * carrying the descriptor together with the authored config.
 *
 * NOTE(review): both issue variants are reported with type
 * "manual-input-with-cases", including the unsupported-schema one — confirm
 * whether a dedicated issue type was intended.
 */
function resolveManualInputDiscovery(params) {
	const { evalDef, evalId, relativeFilePath } = params;
	const manualInputConfig = evalDef.manualInput;
	if (!manualInputConfig) return { kind: "none" };
	// Both rejection paths share everything except the message.
	const reject = (message) => ({
		kind: "issue",
		issue: {
			type: "manual-input-with-cases",
			severity: "error",
			filePath: relativeFilePath,
			evalId,
			message
		}
	});
	if (evalDef.cases !== void 0) {
		return reject(`Eval "${evalId}" in ${relativeFilePath} declares both "cases" and "manualInput". Remove one of them.`);
	}
	const descriptorResult = buildManualInputDescriptor(manualInputConfig);
	if (descriptorResult.error) {
		return reject(`Eval "${evalId}" in ${relativeFilePath} has an unsupported manualInput schema: ${descriptorResult.error.message}`);
	}
	return {
		kind: "ok",
		requiresManualInput: true,
		descriptor: descriptorResult.value,
		config: manualInputConfig
	};
}
226
+ //#endregion
227
+ //#region ../runner/src/manualInput/files.ts
228
+ const stagedUploadDir = ".agent-evals/manual-input-uploads"; // Staging directory for uploaded manual-input files; resolved against the workspace root by the staging helpers.
229
+ const mimeTypeByExtension = { // MIME type lookup keyed by lower-case file extension (leading dot included); consulted by inferMimeType when no MIME type is supplied.
230
+ ".gif": "image/gif",
231
+ ".jpeg": "image/jpeg",
232
+ ".jpg": "image/jpeg",
233
+ ".json": "application/json",
234
+ ".md": "text/markdown",
235
+ ".pdf": "application/pdf",
236
+ ".png": "image/png",
237
+ ".svg": "image/svg+xml",
238
+ ".txt": "text/plain",
239
+ ".webp": "image/webp"
240
+ };
241
/**
 * Express `params.filePath` relative to `params.workspaceRoot`, using forward
 * slashes regardless of the platform separator.
 */
function toWorkspaceRelativePath(params) {
	const { workspaceRoot, filePath } = params;
	const platformRelative = relative(workspaceRoot, filePath);
	return platformRelative.split("\\").join("/");
}
244
/**
 * Report whether `params.filePath` is `params.workspaceRoot` itself or lies
 * somewhere beneath it.
 *
 * The relative path escapes the workspace only when it is exactly `..`, when
 * its first segment is `..`, or when it is absolute (which `path.relative` can
 * return on Windows for a different drive). Entries whose names merely begin
 * with two dots, such as `..cache/file`, are inside the workspace.
 *
 * @param params.workspaceRoot Directory that acts as the boundary.
 * @param params.filePath Path to test.
 * @returns `true` when the path does not leave the workspace.
 */
function isInsideWorkspace(params) {
	const rel = relative(params.workspaceRoot, params.filePath);
	if (rel === "") return true;
	if (isAbsolute(rel)) return false;
	// Compare whole segments: a plain prefix test would also reject "..name".
	return rel !== ".." && !rel.startsWith(`..${sep}`);
}
248
/**
 * Reduce `value` to characters safe for a single path segment: surrounding
 * whitespace is trimmed and every run of characters other than letters,
 * digits, `.`, `_` and `-` becomes one `-`. An empty result falls back to
 * "file".
 */
function sanitizeSegment(value) {
	const cleaned = value.trim().replace(/[^A-Za-z0-9._-]+/g, "-");
	if (cleaned.length === 0) return "file";
	return cleaned;
}
252
/**
 * Sanitize a file name while keeping only its final extension: after
 * `sanitizeSegment`, every dot in the stem is replaced by `-` and the
 * extension is re-attached. Names without an extension are returned as
 * sanitized.
 */
function sanitizeFileName(value) {
	const safeName = sanitizeSegment(value);
	const extension = extname(safeName);
	if (extension.length === 0) return safeName;
	const stem = safeName.slice(0, safeName.length - extension.length);
	return stem.split(".").join("-") + extension;
}
258
/**
 * Pick a MIME type: the trimmed `params.mimeType` when it is non-empty,
 * otherwise the `mimeTypeByExtension` entry for the lower-cased extension of
 * `params.name`, otherwise the empty string.
 */
function inferMimeType(params) {
	const declared = params.mimeType?.trim();
	if (declared) return declared;
	const extension = extname(params.name).toLowerCase();
	const fallback = mimeTypeByExtension[extension];
	return fallback ?? "";
}
263
/** Return the SHA-256 digest of `bytes` as a lower-case hex string. */
function hashBytes(bytes) {
	const hasher = createHash("sha256");
	hasher.update(bytes);
	return hasher.digest("hex");
}
266
/** True for non-null, non-array values whose `typeof` is "object". */
function isRecord$1(value) {
	if (value === null || Array.isArray(value)) return false;
	return typeof value === "object";
}
269
/**
 * Check whether `value` has the shape of a manual-input file reference: a
 * plain object with string `name`, `mimeType`, `sha256` and `path` fields and
 * a numeric `sizeBytes`.
 */
function isManualInputFileValue(value) {
	if (typeof value !== "object" || value === null || Array.isArray(value)) return false;
	const expectedFieldTypes = [
		["name", "string"],
		["mimeType", "string"],
		["sizeBytes", "number"],
		["sha256", "string"],
		["path", "string"]
	];
	return expectedFieldTypes.every(([field, expectedType]) => typeof value[field] === expectedType);
}
273
/**
 * Report whether a workspace-relative `path` points at the staging directory
 * (`stagedUploadDir`) or at something inside it.
 *
 * The decision is made on the normalized location rather than on the raw
 * string prefix. A value such as
 * `.agent-evals/manual-input-uploads/../../src/index.ts` starts with the
 * staging prefix but resolves outside the staging area; because the caller
 * deletes "staged" source files after copying them, treating it as staged
 * would remove an unrelated workspace file.
 *
 * @param path Workspace-relative path taken from a manual-input file value.
 * @returns `true` only when the normalized path stays within the staging directory.
 */
function isStagedManualInputPath(path) {
	// Staged uploads are always recorded workspace-relative.
	if (isAbsolute(path)) return false;
	const rel = relative(stagedUploadDir, path);
	if (rel === "") return true;
	return !isAbsolute(rel) && rel !== ".." && !rel.startsWith(`..${sep}`);
}
276
/**
 * Write uploaded manual-input bytes into the workspace staging directory and
 * return the JSON-safe metadata (`name`, `mimeType`, `sizeBytes`, `sha256`,
 * workspace-relative `path`) that describes the staged file.
 *
 * The staged file name combines a base-36 timestamp, a random UUID, the first
 * 12 hex characters of the content hash and the sanitized file name.
 */
async function stageManualInputFile({ workspaceRoot, bytes, name, mimeType }) {
	const safeName = sanitizeFileName(name || "uploaded-file");
	const digest = hashBytes(bytes);
	const stagingDir = resolve(workspaceRoot, stagedUploadDir);
	await mkdir(stagingDir, { recursive: true });
	const uniquePrefix = `${Date.now().toString(36)}-${randomUUID()}`;
	const stagedName = [
		uniquePrefix,
		digest.slice(0, 12),
		safeName
	].join("__");
	const stagedPath = join(stagingDir, stagedName);
	await writeFile(stagedPath, bytes);
	const resolvedMimeType = inferMimeType({
		mimeType,
		name: safeName
	});
	const relativePath = toWorkspaceRelativePath({
		workspaceRoot,
		filePath: stagedPath
	});
	return {
		name: name || safeName,
		mimeType: resolvedMimeType,
		sizeBytes: bytes.byteLength,
		sha256: digest,
		path: relativePath
	};
}
301
/**
 * Stage a file identified by a CLI-supplied path. Relative paths are resolved
 * against `workspaceRoot`; the display name defaults to the file's base name.
 */
async function stageManualInputFileFromPath({ workspaceRoot, path, name, mimeType }) {
	let sourcePath;
	if (isAbsolute(path)) sourcePath = resolve(path);
	else sourcePath = resolve(workspaceRoot, path);
	const contents = await readFile(sourcePath);
	const displayName = name ?? basename(sourcePath);
	return await stageManualInputFile({
		workspaceRoot,
		bytes: new Uint8Array(contents),
		name: displayName,
		mimeType: inferMimeType({
			mimeType,
			name: displayName
		})
	});
}
316
/**
 * Copy one manual-input file reference into the run's `artifacts` directory
 * and return updated metadata pointing at the copy.
 *
 * Throws when the referenced path resolves outside the workspace. Size and
 * hash in the result are recomputed from the bytes actually read. When the
 * original path is a staged upload, the staged source is removed afterwards
 * on a best-effort basis.
 */
async function materializeOneManualInputFile(params) {
	const { workspaceRoot, runId, runDir, value } = params;
	const sourcePath = resolve(workspaceRoot, value.path);
	const staysInside = isInsideWorkspace({
		workspaceRoot,
		filePath: sourcePath
	});
	if (!staysInside) throw new Error(`Manual input file path escapes workspace: ${value.path}`);
	const bytes = new Uint8Array(await readFile(sourcePath));
	const sha256 = hashBytes(bytes);
	const fileName = sanitizeFileName(value.name || basename(sourcePath));
	const artifactId = `${sanitizeSegment(runId)}__manual-input__${sha256.slice(0, 12)}__${fileName}`;
	const artifactsDir = join(runDir, "artifacts");
	const targetPath = join(artifactsDir, artifactId);
	await mkdir(artifactsDir, { recursive: true });
	if (sourcePath !== targetPath) await copyFile(sourcePath, targetPath);
	// Staged uploads are temporary; failure to delete one is ignored.
	if (isStagedManualInputPath(value.path)) await resultify(() => rm(sourcePath, { force: true }));
	const mimeType = inferMimeType({
		mimeType: value.mimeType,
		name: value.name || fileName
	});
	const path = toWorkspaceRelativePath({
		workspaceRoot,
		filePath: targetPath
	});
	return {
		name: value.name,
		mimeType,
		sizeBytes: bytes.byteLength,
		sha256,
		path
	};
}
349
/**
 * Recursively walk `params.value`, replacing every manual-input file
 * reference with its materialized artifact metadata. Arrays and plain objects
 * are rebuilt with their entries processed concurrently; any other value is
 * returned unchanged.
 */
async function materializeUnknownValue(params) {
	const { workspaceRoot, runId, runDir, value } = params;
	// Recurse with the same run context and a different value.
	const descend = (child) => materializeUnknownValue({
		workspaceRoot,
		runId,
		runDir,
		value: child
	});
	if (isManualInputFileValue(value)) {
		return await materializeOneManualInputFile({
			workspaceRoot,
			runId,
			runDir,
			value
		});
	}
	if (Array.isArray(value)) {
		return await Promise.all(value.map(async (entry) => await descend(entry)));
	}
	if (!isRecord$1(value)) return value;
	const pairs = await Promise.all(Object.entries(value).map(async ([key, child]) => [key, await descend(child)]));
	return Object.fromEntries(pairs);
}
375
/**
 * Materialize every manual-input file reference found inside `value` into the
 * run's artifact directory.
 *
 * Never throws for a materialization failure: the outcome is reported as
 * `{ error: string, value: null }` or `{ error: null, value }`.
 */
async function materializeManualInputFiles({ workspaceRoot, runId, runDir, value }) {
	const outcome = await resultify(() => materializeUnknownValue({
		workspaceRoot,
		runId,
		runDir,
		value
	}));
	return outcome.error ? {
		error: outcome.error.message,
		value: null
	} : {
		error: null,
		value: outcome.value
	};
}
395
/**
 * Delete the whole manual-input staging directory under `workspaceRoot`.
 * Best effort: a failing removal is ignored.
 */
async function cleanupStagedManualInputFiles(workspaceRoot) {
	const stagingDir = resolve(workspaceRoot, stagedUploadDir);
	const removeOptions = {
		force: true,
		recursive: true
	};
	await resultify(() => rm(stagingDir, removeOptions));
}
402
+ //#endregion
403
+ //#region ../runner/src/manualInput/validation.ts
404
/**
 * Decide whether `evalMeta` is selected by a run target. A missing or empty
 * `evalKeys` / `evalIds` list places no restriction; when both lists are
 * populated the eval must appear in both.
 */
function evalIsTargeted(evalMeta, target) {
	const passesFilter = (allowed, candidate) => !allowed || allowed.length === 0 || allowed.includes(candidate);
	return passesFilter(target.evalKeys, evalMeta.key) && passesFilter(target.evalIds, evalMeta.id);
}
413
/**
 * Check the `manualInputs` map of a run request against every targeted eval
 * that requires manual input.
 *
 * For each such eval the request must carry an entry under the eval key, a
 * manual-input config must be known for that key, and the entry must parse
 * against the config. Works only on the data passed in `params`.
 *
 * @returns `{ ok: true, parsed }` with parsed values keyed by eval key, or
 *   `{ ok: false, failures }` listing every eval that failed and why.
 */
function validateManualInputsForRequest(params) {
	const { evalMetas, manualInputConfigs, request } = params;
	const failures = [];
	const parsed = {};
	const recordFailure = (evalMeta, reason, issues) => {
		failures.push({
			evalKey: evalMeta.key,
			evalId: evalMeta.id,
			reason,
			issues
		});
	};
	for (const evalMeta of evalMetas) {
		const needsInput = evalMeta.requiresManualInput && evalIsTargeted(evalMeta, request.target);
		if (!needsInput) continue;
		const evalKey = evalMeta.key;
		const rawValue = request.manualInputs?.[evalKey];
		if (rawValue === void 0) {
			recordFailure(evalMeta, "missing", [{
				path: "",
				message: `manualInputs is missing an entry for "${evalKey}"`
			}]);
			continue;
		}
		const config = manualInputConfigs.get(evalKey);
		if (!config) {
			recordFailure(evalMeta, "invalid", [{
				path: "",
				message: "manualInput schema is unavailable; reload the workspace and try again"
			}]);
			continue;
		}
		const result = parseManualInputValues(config, rawValue);
		if (result.error) recordFailure(evalMeta, "invalid", result.error.issues);
		else parsed[evalKey] = result.value;
	}
	return failures.length > 0 ? {
		ok: false,
		failures
	} : {
		ok: true,
		parsed
	};
}
474
+ //#endregion
475
+ //#region ../runner/src/recalculateDerivedAttributes.ts
476
/**
 * Choose the file id used for a case's persisted artifacts: the case key when
 * another row in the run shares this row's `caseId` under a different case
 * key, otherwise the plain `caseId`.
 */
function getCaseArtifactFileIdForExistingRun(runState, caseRow) {
	const caseKey = getCaseRowCaseKey(caseRow);
	const caseIdIsShared = runState.cases.some((other) => other.caseId === caseRow.caseId && getCaseRowCaseKey(other) !== caseKey);
	if (caseIdIsShared) return caseKey;
	return caseRow.caseId;
}
480
/**
 * Recompute the derived call attributes of one persisted case and write the
 * refreshed trace and case detail back to the run directory.
 *
 * Returns `{ updated: false, reason }` when the run is still running, the case
 * row cannot be found (by case key or `caseId`), or its detail is missing.
 * Otherwise the in-memory case detail is replaced, the trace JSON under
 * `traces/` is rewritten, `params.persistCaseDetail` is invoked, and
 * `{ updated: true, caseDetail }` is returned.
 */
async function recalculateDerivedAttributesForCase(params) {
	const { run, caseId } = params;
	const skipped = (reason) => ({
		updated: false,
		reason
	});
	if (run.manifest.status === "running") return skipped("Run is still running");
	const caseRow = run.cases.find((row) => getCaseRowCaseKey(row) === caseId || row.caseId === caseId);
	if (!caseRow) return skipped("Case not found");
	const caseKey = getCaseRowCaseKey(caseRow);
	const caseDetail = run.caseDetails.get(caseKey);
	if (!caseDetail) return skipped("Case detail not found");
	const derivedSpans = applyDerivedCallAttributes({
		spans: caseDetail.trace,
		llmCallsConfig: params.llmCallsConfig,
		apiCallsConfig: params.apiCallsConfig
	});
	// Defaults used when no registry entry supplies an eval definition.
	const presentation = {
		trace: derivedSpans,
		traceDisplay: caseDetail.traceDisplay
	};
	const evalMeta = params.evals.get(getCaseRowEvalKey(caseRow));
	const registryEntry = evalMeta === void 0 ? void 0 : getEvalRegistry().get(evalMeta.id);
	if (registryEntry !== void 0) {
		registryEntry.use((evalDef) => {
			const resolved = resolveTracePresentation(derivedSpans, params.traceDisplayConfig, evalDef.traceDisplay);
			presentation.trace = resolved.trace;
			presentation.traceDisplay = resolved.traceDisplay;
		});
	}
	const nextCaseDetail = {
		...caseDetail,
		trace: presentation.trace,
		traceDisplay: presentation.traceDisplay
	};
	run.caseDetails.set(caseKey, nextCaseDetail);
	const artifactFileId = getCaseArtifactFileIdForExistingRun(run, caseRow);
	const tracePath = join(run.runDir, "traces", `${encodeURIComponent(artifactFileId)}.json`);
	await writeFile(tracePath, JSON.stringify(nextCaseDetail.trace, null, 2));
	await params.persistCaseDetail(run.runDir, nextCaseDetail, artifactFileId);
	return {
		updated: true,
		caseDetail: nextCaseDetail
	};
}
525
+ //#endregion
63
526
  //#region ../runner/src/runChildProtocol.ts
64
527
  function isRunChildMessage(value) {
65
528
  if (typeof value !== "object" || value === null) return false;
@@ -251,7 +714,7 @@ async function markRunTerminalFromChild(runState, event, managerContext) {
251
714
  managerContext.emitDiscoveryEvent();
252
715
  }
253
716
  //#endregion
254
- //#region ../runner/src/runner.ts
717
+ //#region ../runner/src/watchRoots.ts
255
718
  const globMagicCharacters = new Set([
256
719
  "*",
257
720
  "?",
@@ -285,6 +748,11 @@ function getWatchRootsForIncludePatterns(params) {
285
748
  if (roots.size === 0) return [params.workspaceRoot];
286
749
  return [...roots];
287
750
  }
751
+ //#endregion
752
+ //#region ../runner/src/runner.ts
753
/** True when `value` is an object that is neither `null` nor an array. */
function isRecord(value) {
	const isObjectLike = typeof value === "object" && value !== null;
	return isObjectLike ? !Array.isArray(value) : false;
}
288
756
  /** Create an in-memory eval runner bound to the current workspace config. */
289
757
  function createRunner({ watchForChanges = true } = {}) {
290
758
  let config;
@@ -294,6 +762,7 @@ function createRunner({ watchForChanges = true } = {}) {
294
762
  let llmCallsConfig = resolveLlmCallsConfig(void 0);
295
763
  let apiCallsConfig = resolveApiCallsConfig(void 0);
296
764
  const evals = /* @__PURE__ */ new Map();
765
+ const manualInputConfigs = /* @__PURE__ */ new Map();
297
766
  let discoveryIssues = [];
298
767
  const runs = /* @__PURE__ */ new Map();
299
768
  const lastRunStatusMap = /* @__PURE__ */ new Map();
@@ -304,6 +773,12 @@ function createRunner({ watchForChanges = true } = {}) {
304
773
  let runHistoryWatcher;
305
774
  let discoveryRefreshTimer;
306
775
  let runHistoryRefreshTimer;
776
+ const configReload = createConfigReloadController({
777
+ getActiveRunCount,
778
+ closeRunnerWatchers: closeWatchers,
779
+ loadRunnerState,
780
+ emitToDiscoveryListeners
781
+ });
307
782
  function toWorkspaceRelativePath(filePath) {
308
783
  return relative(workspaceRoot, filePath).replaceAll("\\", "/");
309
784
  }
@@ -321,22 +796,7 @@ function createRunner({ watchForChanges = true } = {}) {
321
796
  }
322
797
  const runner = {
323
798
  async init() {
324
- config = await loadConfig();
325
- workspaceRoot = config.workspaceRoot ?? process.cwd();
326
- localStateDir = resolve(workspaceRoot, ".agent-evals");
327
- llmCallsConfig = resolveLlmCallsConfig(config.llmCalls);
328
- apiCallsConfig = resolveApiCallsConfig(config.apiCalls);
329
- await mkdir(localStateDir, { recursive: true });
330
- await mkdir(join(localStateDir, "runs"), { recursive: true });
331
- cacheStore = createFsCacheStore({
332
- workspaceRoot,
333
- dir: config.cache?.dir,
334
- maxEntriesPerNamespace: config.cache?.maxEntriesPerNamespace ?? config.cache?.maxEntriesPerEval,
335
- maxEntriesByNamespace: config.cache?.maxEntriesByNamespace
336
- });
337
- await loadPersistedRuns();
338
- await runner.refreshDiscovery();
339
- if (watchForChanges) await setupWatcher();
799
+ await loadRunnerState();
340
800
  },
341
801
  async listCache() {
342
802
  return cacheStore.list();
@@ -373,6 +833,22 @@ function createRunner({ watchForChanges = true } = {}) {
373
833
  emitDiscoveryEvent();
374
834
  return { updatedRuns };
375
835
  },
836
+ async recalculateDerivedAttributesForCase({ runId, caseId }) {
837
+ const run = runs.get(runId);
838
+ if (!run) return {
839
+ updated: false,
840
+ reason: "Run not found"
841
+ };
842
+ return recalculateDerivedAttributesForCase({
843
+ run,
844
+ caseId,
845
+ llmCallsConfig,
846
+ apiCallsConfig,
847
+ traceDisplayConfig: config.traceDisplay,
848
+ evals,
849
+ persistCaseDetail
850
+ });
851
+ },
376
852
  async cleanRunsForEval(evalKey) {
377
853
  const evalMeta = resolveEvalMeta(evalKey);
378
854
  let deletedRuns = 0;
@@ -466,6 +942,13 @@ function createRunner({ watchForChanges = true } = {}) {
466
942
  emitDiscoveryEvent();
467
943
  return { deleted: true };
468
944
  },
945
+ validateManualInputs(request) {
946
+ return validateManualInputsForRequest({
947
+ evalMetas: getSortedEvalMetas(),
948
+ manualInputConfigs,
949
+ request
950
+ });
951
+ },
469
952
  getEvals() {
470
953
  const gitState = readGitWorktreeState(workspaceRoot);
471
954
  const result = [];
@@ -492,6 +975,9 @@ function createRunner({ watchForChanges = true } = {}) {
492
975
  getDiscoveryIssues() {
493
976
  return discoveryIssues;
494
977
  },
978
+ getConfigReloadState() {
979
+ return configReload.currentState();
980
+ },
495
981
  async refreshDiscovery() {
496
982
  const patterns = config.include;
497
983
  const discovered = [];
@@ -503,6 +989,7 @@ function createRunner({ watchForChanges = true } = {}) {
503
989
  discovered.push(...files);
504
990
  }
505
991
  evals.clear();
992
+ manualInputConfigs.clear();
506
993
  discoveryIssues = [];
507
994
  for (const filePath of discovered) try {
508
995
  const content = await readFile(filePath, "utf-8");
@@ -526,9 +1013,14 @@ function createRunner({ watchForChanges = true } = {}) {
526
1013
  let columnDefs = buildDeclaredColumnDefs(void 0, void 0, void 0);
527
1014
  let stats;
528
1015
  let charts;
1016
+ let manualInputDescriptor;
1017
+ let requiresManualInput = false;
1018
+ const relativeFilePath = toWorkspaceRelativePath(meta.filePath);
529
1019
  discoveredEntry?.use((evalDef) => {
530
1020
  const defaultConfig = resolveEvalDefaultConfig({
531
1021
  evalDef,
1022
+ globalColumns: config.columns,
1023
+ globalStats: config.stats,
532
1024
  globalRemove: config.removeDefaultConfig
533
1025
  });
534
1026
  columnDefs = buildDeclaredColumnDefs(defaultConfig.columns, evalDef.scores, evalDef.manualScores);
@@ -540,8 +1032,25 @@ function createRunner({ watchForChanges = true } = {}) {
540
1032
  });
541
1033
  for (const warning of validated.warnings) console.warn(warning);
542
1034
  charts = validated.charts;
1035
+ const manualInputResult = resolveManualInputDiscovery({
1036
+ evalDef,
1037
+ evalId: meta.id,
1038
+ relativeFilePath
1039
+ });
1040
+ if (manualInputResult.kind === "issue") {
1041
+ discoveryIssues.push(manualInputResult.issue);
1042
+ requiresManualInput = true;
1043
+ return;
1044
+ }
1045
+ if (manualInputResult.kind === "ok") {
1046
+ requiresManualInput = manualInputResult.requiresManualInput;
1047
+ manualInputDescriptor = manualInputResult.descriptor;
1048
+ manualInputConfigs.set(buildEvalKey({
1049
+ filePath: relativeFilePath,
1050
+ evalId: meta.id
1051
+ }), manualInputResult.config);
1052
+ }
543
1053
  });
544
- const relativeFilePath = toWorkspaceRelativePath(meta.filePath);
545
1054
  const key = buildEvalKey({
546
1055
  filePath: relativeFilePath,
547
1056
  evalId: meta.id
@@ -556,7 +1065,9 @@ function createRunner({ watchForChanges = true } = {}) {
556
1065
  columnDefs,
557
1066
  caseCount: null,
558
1067
  stats,
559
- charts
1068
+ charts,
1069
+ manualInputDescriptor,
1070
+ requiresManualInput
560
1071
  });
561
1072
  }
562
1073
  } catch {}
@@ -602,11 +1113,27 @@ function createRunner({ watchForChanges = true } = {}) {
602
1113
  childProcess: void 0,
603
1114
  childTerminalReceived: false
604
1115
  };
1116
+ await mkdir(runDir, { recursive: true });
1117
+ await mkdir(join(runDir, "traces"), { recursive: true });
1118
+ await mkdir(join(runDir, "artifacts"), { recursive: true });
1119
+ await mkdir(join(runDir, "case-details"), { recursive: true });
1120
+ const materializedRequest = { ...request };
1121
+ if (request.manualInputs !== void 0) {
1122
+ const materialized = await materializeManualInputFiles({
1123
+ workspaceRoot,
1124
+ runId,
1125
+ runDir,
1126
+ value: request.manualInputs
1127
+ });
1128
+ if (materialized.error !== null) throw new Error(materialized.error);
1129
+ if (!isRecord(materialized.value)) throw new Error("Materialized manual inputs must be an object");
1130
+ materializedRequest.manualInputs = materialized.value;
1131
+ }
605
1132
  runs.set(runId, runState);
606
1133
  setLatestRunInfoMap({
607
1134
  latestRunInfoMap,
608
1135
  evalIds: getTargetEvalKeys({
609
- request,
1136
+ request: materializedRequest,
610
1137
  sortedEvals: getSortedEvalMetas()
611
1138
  }),
612
1139
  info: {
@@ -616,13 +1143,9 @@ function createRunner({ watchForChanges = true } = {}) {
616
1143
  evalSourceFingerprint: null
617
1144
  }
618
1145
  });
619
- await mkdir(runDir, { recursive: true });
620
- await mkdir(join(runDir, "traces"), { recursive: true });
621
- await mkdir(join(runDir, "artifacts"), { recursive: true });
622
- await mkdir(join(runDir, "case-details"), { recursive: true });
623
1146
  await writeFile(join(runDir, "run.json"), JSON.stringify(manifest, null, 2));
624
1147
  const childContext = {
625
- request,
1148
+ request: materializedRequest,
626
1149
  workspaceRoot,
627
1150
  runDir,
628
1151
  manifest,
@@ -705,18 +1228,7 @@ function createRunner({ watchForChanges = true } = {}) {
705
1228
  };
706
1229
  },
707
1230
  async close() {
708
- if (discoveryRefreshTimer !== void 0) {
709
- clearTimeout(discoveryRefreshTimer);
710
- discoveryRefreshTimer = void 0;
711
- }
712
- if (runHistoryRefreshTimer !== void 0) {
713
- clearTimeout(runHistoryRefreshTimer);
714
- runHistoryRefreshTimer = void 0;
715
- }
716
- const watchers = [discoveryWatcher, runHistoryWatcher].filter((watcher) => watcher !== void 0);
717
- discoveryWatcher = void 0;
718
- runHistoryWatcher = void 0;
719
- await Promise.all(watchers.map((watcher) => watcher.close()));
1231
+ await Promise.all([closeWatchers(), configReload.close()]);
720
1232
  },
721
1233
  getWorkspaceRoot() {
722
1234
  return workspaceRoot;
@@ -734,6 +1246,39 @@ function createRunner({ watchForChanges = true } = {}) {
734
1246
  return resolveArtifactPath(join(localStateDir, "runs"), artifactId_);
735
1247
  }
736
1248
  };
1249
+ async function loadRunnerState() {
1250
+ config = await loadConfig();
1251
+ workspaceRoot = config.workspaceRoot ?? process.cwd();
1252
+ localStateDir = resolve(workspaceRoot, ".agent-evals");
1253
+ llmCallsConfig = resolveLlmCallsConfig(config.llmCalls);
1254
+ apiCallsConfig = resolveApiCallsConfig(config.apiCalls);
1255
+ await mkdir(localStateDir, { recursive: true });
1256
+ await mkdir(join(localStateDir, "runs"), { recursive: true });
1257
+ await cleanupStagedManualInputFiles(workspaceRoot);
1258
+ cacheStore = createFsCacheStore({
1259
+ workspaceRoot,
1260
+ dir: config.cache?.dir,
1261
+ maxEntriesPerNamespace: config.cache?.maxEntriesPerNamespace ?? config.cache?.maxEntriesPerEval,
1262
+ maxEntriesByNamespace: config.cache?.maxEntriesByNamespace
1263
+ });
1264
+ await loadPersistedRuns();
1265
+ await runner.refreshDiscovery();
1266
+ if (watchForChanges) await setupWatcher();
1267
+ }
1268
+ async function closeWatchers() {
1269
+ if (discoveryRefreshTimer !== void 0) {
1270
+ clearTimeout(discoveryRefreshTimer);
1271
+ discoveryRefreshTimer = void 0;
1272
+ }
1273
+ if (runHistoryRefreshTimer !== void 0) {
1274
+ clearTimeout(runHistoryRefreshTimer);
1275
+ runHistoryRefreshTimer = void 0;
1276
+ }
1277
+ const watchers = [discoveryWatcher, runHistoryWatcher].filter((watcher) => watcher !== void 0);
1278
+ discoveryWatcher = void 0;
1279
+ runHistoryWatcher = void 0;
1280
+ await Promise.all(watchers.map((watcher) => watcher.close()));
1281
+ }
737
1282
  async function setupWatcher() {
738
1283
  const watcher = watch(getWatchRootsForIncludePatterns({
739
1284
  patterns: config.include,
@@ -758,7 +1303,7 @@ function createRunner({ watchForChanges = true } = {}) {
758
1303
  watcher.on("unlink", scheduleRefresh);
759
1304
  watcher.on("addDir", scheduleRefresh);
760
1305
  watcher.on("unlinkDir", scheduleRefresh);
761
- await setupRunHistoryWatcher();
1306
+ await Promise.all([setupRunHistoryWatcher(), configReload.setupWatcher()]);
762
1307
  await watcherReady;
763
1308
  }
764
1309
  async function setupRunHistoryWatcher() {
@@ -783,6 +1328,9 @@ function createRunner({ watchForChanges = true } = {}) {
783
1328
  watcher.once("ready", ready);
784
1329
  });
785
1330
  }
1331
+ function getActiveRunCount() {
1332
+ return [...runs.values()].filter((run) => run.manifest.status === "running").length;
1333
+ }
786
1334
  function emitDiscoveryEvent() {
787
1335
  const lastRunStatuses = getLastRunStatuses({
788
1336
  runs: runs.values(),
@@ -802,6 +1350,10 @@ function createRunner({ watchForChanges = true } = {}) {
802
1350
  payload: runner.getEvals()
803
1351
  };
804
1352
  for (const listener of discoveryListeners) listener(event);
1353
+ configReload.reloadIfPendingAndIdle();
1354
+ }
1355
+ function emitToDiscoveryListeners(event) {
1356
+ for (const listener of discoveryListeners) listener(event);
805
1357
  }
806
1358
  function emitEvent(runState, event) {
807
1359
  for (const listener of runState.listeners) try {
@@ -844,6 +1396,345 @@ function createRunner({ watchForChanges = true } = {}) {
844
1396
  return runner;
845
1397
  }
846
1398
  //#endregion
1399
+ //#region src/cliHelp.ts
1400
+ /** Render the help block for a given CLI topic to stdout via `console.info`. */
1401
+ function printHelp(topic = "global") {
1402
+ if (topic === "app") {
1403
+ console.info(`
1404
+ agent-evals app - Start server with UI
1405
+
1406
+ Usage:
1407
+ agent-evals app [flags]
1408
+
1409
+ Flags:
1410
+ --port <n> Server port (default: 4100)
1411
+ --no-env Disable automatic .env loading
1412
+ --help, -h Show this help
1413
+ `);
1414
+ return;
1415
+ }
1416
+ if (topic === "list") {
1417
+ console.info(`
1418
+ agent-evals list - List discovered evals
1419
+
1420
+ Usage:
1421
+ agent-evals list [flags]
1422
+
1423
+ Flags:
1424
+ --no-env Disable automatic .env loading
1425
+ --help, -h Show this help
1426
+ `);
1427
+ return;
1428
+ }
1429
+ if (topic === "run") {
1430
+ console.info(`
1431
+ agent-evals run - Run evals
1432
+
1433
+ Usage:
1434
+ agent-evals run [flags]
1435
+
1436
+ Flags:
1437
+ --eval <id> Run specific eval(s) (comma-separated)
1438
+ --file <path|glob> Run eval files matching path/glob (comma-separated)
1439
+ --case <id> Run case(s); combine with --file/--eval if ambiguous
1440
+ --trials <n> Number of trials per case
1441
+ --inspect[=host:port] Run with the Node.js inspector enabled
1442
+ --inspect-brk[=host:port] Enable inspector and pause before startup
1443
+ --json Output run summary as JSON
1444
+ --cache <use|bypass|refresh> Cache mode for this run (default: use)
1445
+ --no-cache Shortcut for --cache bypass
1446
+ --refresh-cache Shortcut for --cache refresh
1447
+ --clear-cache Clear the cache before starting the run
1448
+ --input <json> Manual input value for a single targeted eval
1449
+ that declares manualInput
1450
+ --input-file <path> JSON object keyed by eval key (or eval id) with
1451
+ manual input values for one or more targeted evals
1452
+ --no-env Disable automatic .env loading
1453
+ --help, -h Show this help
1454
+ `);
1455
+ return;
1456
+ }
1457
+ if (topic === "show-runs") {
1458
+ console.info(`
1459
+ agent-evals show-runs - Show saved run artifact file paths
1460
+
1461
+ Usage:
1462
+ agent-evals show-runs [<run-id>|latest] [--json]
1463
+
1464
+ Prints the run directory and stable artifact paths for run.json, summary.json,
1465
+ cases.jsonl, case detail JSON, and trace JSON files. Run ids can be full
1466
+ timestamp ids, short ids such as r0, or latest.
1467
+
1468
+ Flags:
1469
+ --json Output the file index as JSON
1470
+ --no-env Disable automatic .env loading
1471
+ --help, -h Show this help
1472
+ `);
1473
+ return;
1474
+ }
1475
+ if (topic === "cache" || topic === "cache list" || topic === "cache clear") {
1476
+ console.info(`
1477
+ agent-evals cache - Manage cached operation entries
1478
+
1479
+ Usage:
1480
+ agent-evals cache list [flags]
1481
+ agent-evals cache clear --eval <id>
1482
+ agent-evals cache clear --all
1483
+
1484
+ Flags:
1485
+ --eval <id> Clear entries for specific eval(s) (comma-separated)
1486
+ --all Confirm clearing every cached entry
1487
+ --json Output cache listing as JSON
1488
+ --no-env Disable automatic .env loading
1489
+ --help, -h Show this help
1490
+ `);
1491
+ return;
1492
+ }
1493
+ console.info(`
1494
+ agent-evals - LLM/Agent eval runner
1495
+
1496
+ Commands:
1497
+ app Start server with UI
1498
+ list List discovered evals
1499
+ run Run evals
1500
+ show-runs [id|latest] Show saved run artifact file paths
1501
+ cache list List cached operation entries
1502
+ cache clear --eval <id> Clear cache entries for one eval
1503
+ cache clear --all Clear every cached entry
1504
+ help Show this help
1505
+
1506
+ Options:
1507
+ --eval <id> Run specific eval(s) (comma-separated)
1508
+ --case <id> Run specific case(s) (comma-separated)
1509
+ --trials <n> Number of trials per case
1510
+ --inspect[=host:port] Run with the Node.js inspector enabled
1511
+ --inspect-brk[=host:port] Enable inspector and pause before startup
1512
+ --json Output results as JSON
1513
+ --port <n> Server port (default: 4100)
1514
+ --cache <use|bypass|refresh> Cache mode for this run (default: use)
1515
+ --no-cache Shortcut for --cache bypass
1516
+ --refresh-cache Shortcut for --cache refresh
1517
+ --clear-cache Clear the cache before starting the run
1518
+ --no-env Disable automatic .env loading
1519
+ --help, -h Show help
1520
+ `);
1521
+ }
1522
+ //#endregion
1523
+ //#region src/manualInputArgs.ts
1524
/** Return true for any non-null, non-array value whose `typeof` is `"object"`. */
function isPlainObject(value) {
	if (value === null || Array.isArray(value)) return false;
	return typeof value === "object";
}
1527
+ function isPathInputObject(value) {
1528
+ if (!isPlainObject(value)) return false;
1529
+ return typeof value.path === "string" && (value.name === void 0 || typeof value.name === "string") && (value.mimeType === void 0 || typeof value.mimeType === "string");
1530
+ }
1531
/**
 * Backslash-escape the regular-expression metacharacters
 * `| \ { } ( ) [ ] ^ $ + ? .` in `value`. `*` is left untouched.
 */
function escapeRegex$1(value) {
	const metacharacters = /[|\\{}()[\]^$+?.]/g;
	return value.replace(metacharacters, (matched) => "\\" + matched);
}
1534
+ function globToRegex$1(pattern) {
1535
+ const normalized = pattern.replaceAll("\\", "/");
1536
+ let regex = "^";
1537
+ for (let i = 0; i < normalized.length; i++) {
1538
+ const char = normalized[i];
1539
+ const next = normalized[i + 1];
1540
+ if (char === "*" && next === "*") {
1541
+ regex += ".*";
1542
+ i++;
1543
+ } else if (char === "*") regex += "[^/]*";
1544
+ else if (char === "?") regex += "[^/]";
1545
+ else regex += escapeRegex$1(char ?? "");
1546
+ }
1547
+ regex += "$";
1548
+ return new RegExp(regex);
1549
+ }
1550
+ function fileMatches$1(pattern, filePath) {
1551
+ const normalizedPattern = pattern.replaceAll("\\", "/");
1552
+ if (normalizedPattern === filePath) return true;
1553
+ return globToRegex$1(normalizedPattern).test(filePath);
1554
+ }
1555
+ function isManualInputEvalTargeted(params) {
1556
+ const { evalSummary, args } = params;
1557
+ const hasEvalIds = args.evalIds.length > 0;
1558
+ const hasFiles = args.files.length > 0;
1559
+ const hasCaseIds = args.caseIds.length > 0;
1560
+ if (hasEvalIds && !args.evalIds.includes(evalSummary.id)) return false;
1561
+ if (hasFiles) {
1562
+ if (!args.files.some((file) => fileMatches$1(file, evalSummary.filePath))) return false;
1563
+ }
1564
+ if (!hasEvalIds && !hasFiles) {
1565
+ if (hasCaseIds) return false;
1566
+ return true;
1567
+ }
1568
+ return true;
1569
+ }
1570
/**
 * Read the `--input-file` at `inputFilePath` as UTF-8 and parse it as JSON.
 *
 * Returns `{ error: null, value }` with the parsed JSON on success, or
 * `{ error, value: null }` with a message naming the path when reading or
 * parsing fails.
 */
async function readInputFileMap(inputFilePath) {
	const failure = (message) => ({
		error: message,
		value: null
	});
	const contents = await resultify(() => readFile(inputFilePath, "utf-8"));
	if (contents.error) return failure(`Failed to read --input-file at ${inputFilePath}: ${contents.error.message}`);
	const parsed = resultify(() => JSON.parse(contents.value));
	if (parsed.error) return failure(`Failed to parse --input-file at ${inputFilePath} as JSON: ${parsed.error.message}`);
	return {
		error: null,
		value: parsed.value
	};
}
1586
+ async function normalizeManualInputFileValue(params) {
1587
+ if (isManualInputFileValue(params.value)) return {
1588
+ error: null,
1589
+ value: params.value
1590
+ };
1591
+ if (!isPathInputObject(params.value)) return {
1592
+ error: null,
1593
+ value: params.value
1594
+ };
1595
+ const pathInput = params.value;
1596
+ const staged = await resultify(() => stageManualInputFileFromPath({
1597
+ workspaceRoot: params.workspaceRoot,
1598
+ path: pathInput.path,
1599
+ name: pathInput.name,
1600
+ mimeType: pathInput.mimeType
1601
+ }));
1602
+ if (staged.error) return {
1603
+ error: `Failed to stage file input "${params.fieldKey}" for eval "${params.evalId}": ${staged.error.message}`,
1604
+ value: null
1605
+ };
1606
+ return {
1607
+ error: null,
1608
+ value: staged.value
1609
+ };
1610
+ }
1611
+ async function normalizeManualInputValue(params) {
1612
+ const descriptor = params.evalSummary.manualInput;
1613
+ if (!descriptor || !isPlainObject(params.value)) return {
1614
+ error: null,
1615
+ value: params.value
1616
+ };
1617
+ const next = { ...params.value };
1618
+ for (const field of descriptor.fields) {
1619
+ if (field.kind !== "file") continue;
1620
+ const normalized = await normalizeManualInputFileValue({
1621
+ workspaceRoot: params.workspaceRoot,
1622
+ evalId: params.evalSummary.id,
1623
+ fieldKey: field.key,
1624
+ value: next[field.key]
1625
+ });
1626
+ if (normalized.error !== null) return {
1627
+ error: normalized.error,
1628
+ value: null
1629
+ };
1630
+ next[field.key] = normalized.value;
1631
+ }
1632
+ return {
1633
+ error: null,
1634
+ value: next
1635
+ };
1636
+ }
1637
+ /**
1638
+ * Resolve the `manualInputs` payload to send with `runner.startRun`.
1639
+ *
1640
+ * Inspects every discovered eval that declares `manualInput`, filters them to
1641
+ * the run target, and either returns the typed map (single eval via `--input`,
1642
+ * multiple via `--input-file`) or a structured error to display and exit on.
1643
+ */
1644
+ async function collectManualInputs(params) {
1645
+ const { runner, args } = params;
1646
+ const workspaceRoot = runner.getWorkspaceRoot();
1647
+ const targetedManualInputEvals = runner.getEvals().filter((evalSummary) => evalSummary.manualInput !== void 0).filter((evalSummary) => isManualInputEvalTargeted({
1648
+ evalSummary,
1649
+ args
1650
+ }));
1651
+ if (targetedManualInputEvals.length === 0) {
1652
+ if (args.inputJson !== void 0 || args.inputFilePath !== void 0) return {
1653
+ error: "--input/--input-file was provided but no targeted eval requires manual input.",
1654
+ value: null
1655
+ };
1656
+ return {
1657
+ error: null,
1658
+ value: void 0
1659
+ };
1660
+ }
1661
+ if (args.inputJson !== void 0 && args.inputFilePath !== void 0) return {
1662
+ error: "Cannot use --input and --input-file together; choose one.",
1663
+ value: null
1664
+ };
1665
+ if (args.inputJson !== void 0) {
1666
+ if (targetedManualInputEvals.length > 1) {
1667
+ const ids = targetedManualInputEvals.map((evalSummary) => evalSummary.id).join(", ");
1668
+ return {
1669
+ error: `--input only works for one targeted manual-input eval at a time; got ${String(targetedManualInputEvals.length)} (${ids}). Use --input-file with a JSON object keyed by eval key.`,
1670
+ value: null
1671
+ };
1672
+ }
1673
+ const parsedResult = resultify(() => JSON.parse(args.inputJson ?? ""));
1674
+ if (parsedResult.error) return {
1675
+ error: `Failed to parse --input as JSON: ${parsedResult.error.message}`,
1676
+ value: null
1677
+ };
1678
+ const [onlyEval] = targetedManualInputEvals;
1679
+ if (onlyEval === void 0) return {
1680
+ error: null,
1681
+ value: void 0
1682
+ };
1683
+ const normalized = await normalizeManualInputValue({
1684
+ workspaceRoot,
1685
+ evalSummary: onlyEval,
1686
+ value: parsedResult.value
1687
+ });
1688
+ if (normalized.error !== null) return {
1689
+ error: normalized.error,
1690
+ value: null
1691
+ };
1692
+ return {
1693
+ error: null,
1694
+ value: { [onlyEval.key]: normalized.value }
1695
+ };
1696
+ }
1697
+ if (args.inputFilePath !== void 0) {
1698
+ const fileResult = await readInputFileMap(args.inputFilePath);
1699
+ if (fileResult.error !== null) return {
1700
+ error: fileResult.error,
1701
+ value: null
1702
+ };
1703
+ if (!isPlainObject(fileResult.value)) return {
1704
+ error: `--input-file must contain a JSON object keyed by eval key (got ${typeof fileResult.value}).`,
1705
+ value: null
1706
+ };
1707
+ const map = {};
1708
+ for (const evalSummary of targetedManualInputEvals) {
1709
+ const byKey = fileResult.value[evalSummary.key];
1710
+ const byId = fileResult.value[evalSummary.id];
1711
+ const value = byKey !== void 0 ? byKey : byId;
1712
+ if (value === void 0) return {
1713
+ error: `--input-file is missing manual input for eval "${evalSummary.id}" (key "${evalSummary.key}").`,
1714
+ value: null
1715
+ };
1716
+ const normalized = await normalizeManualInputValue({
1717
+ workspaceRoot,
1718
+ evalSummary,
1719
+ value
1720
+ });
1721
+ if (normalized.error !== null) return {
1722
+ error: normalized.error,
1723
+ value: null
1724
+ };
1725
+ map[evalSummary.key] = normalized.value;
1726
+ }
1727
+ return {
1728
+ error: null,
1729
+ value: map
1730
+ };
1731
+ }
1732
+ return {
1733
+ error: `Eval(s) require manual input but no --input/--input-file was provided: ${targetedManualInputEvals.map((evalSummary) => evalSummary.id).join(", ")}`,
1734
+ value: null
1735
+ };
1736
+ }
1737
+ //#endregion
847
1738
  //#region src/cli.ts
848
1739
  function parseArgs(argv) {
849
1740
  const normalizedArgv = argv.filter((arg) => arg !== "--no-env");
@@ -863,7 +1754,9 @@ function parseArgs(argv) {
863
1754
  cacheMode: "use",
864
1755
  clearCache: false,
865
1756
  all: false,
866
- loadEnv: normalizedArgv.length === argv.length
1757
+ loadEnv: normalizedArgv.length === argv.length,
1758
+ inputJson: void 0,
1759
+ inputFilePath: void 0
867
1760
  };
868
1761
  const command = normalizedArgv[0];
869
1762
  if (command === "--help" || command === "-h") {
@@ -910,7 +1803,13 @@ function parseArgs(argv) {
910
1803
  } else if (arg === "--no-cache") args.cacheMode = "bypass";
911
1804
  else if (arg === "--refresh-cache") args.cacheMode = "refresh";
912
1805
  else if (arg === "--clear-cache") args.clearCache = true;
913
- else if (arg === "--all") args.all = true;
1806
+ else if (arg === "--input" && next !== void 0) {
1807
+ args.inputJson = next;
1808
+ i++;
1809
+ } else if (arg === "--input-file" && next !== void 0) {
1810
+ args.inputFilePath = next;
1811
+ i++;
1812
+ } else if (arg === "--all") args.all = true;
914
1813
  else if (!arg.startsWith("-")) args.positionals.push(arg);
915
1814
  }
916
1815
  return args;
@@ -1041,8 +1940,8 @@ async function commandApp(args) {
1041
1940
  const { serve } = await import("@hono/node-server");
1042
1941
  const bundledWebDist = resolve(currentDir, "apps/web/dist");
1043
1942
  if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
1044
- const appModule = await import("./app-CJj1yPPD.mjs");
1045
- const runnerModule = await import("./runner-KbDKLSU4.mjs");
1943
+ const appModule = await import("./app-D6-msfKP.mjs");
1944
+ const runnerModule = await import("./runner-Bq1f9B9d.mjs");
1046
1945
  if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
1047
1946
  if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
1048
1947
  await runnerModule.initRunner();
@@ -1113,10 +2012,26 @@ async function commandRun(args) {
1113
2012
  mode: "evalIds",
1114
2013
  files: args.files
1115
2014
  } : { mode: "all" };
2015
+ const manualInputsResult = await collectManualInputs({
2016
+ runner,
2017
+ args: {
2018
+ evalIds: args.evalIds,
2019
+ files: args.files,
2020
+ caseIds: args.caseIds,
2021
+ inputJson: args.inputJson,
2022
+ inputFilePath: args.inputFilePath
2023
+ }
2024
+ });
2025
+ if (manualInputsResult.error !== null) {
2026
+ console.error(manualInputsResult.error);
2027
+ process.exit(1);
2028
+ return;
2029
+ }
1116
2030
  const run = await runner.startRun({
1117
2031
  target,
1118
2032
  trials: args.trials,
1119
- cache: { mode: args.cacheMode }
2033
+ cache: { mode: args.cacheMode },
2034
+ manualInputs: manualInputsResult.value
1120
2035
  });
1121
2036
  if (!args.json) {
1122
2037
  console.info(`Run started: ${run.manifest.id}`);
@@ -1334,122 +2249,5 @@ async function waitForRunCompletion(runner, runId) {
1334
2249
  check();
1335
2250
  });
1336
2251
  }
1337
- function printHelp(topic = "global") {
1338
- if (topic === "app") {
1339
- console.info(`
1340
- agent-evals app - Start server with UI
1341
-
1342
- Usage:
1343
- agent-evals app [flags]
1344
-
1345
- Flags:
1346
- --port <n> Server port (default: 4100)
1347
- --no-env Disable automatic .env loading
1348
- --help, -h Show this help
1349
- `);
1350
- return;
1351
- }
1352
- if (topic === "list") {
1353
- console.info(`
1354
- agent-evals list - List discovered evals
1355
-
1356
- Usage:
1357
- agent-evals list [flags]
1358
-
1359
- Flags:
1360
- --no-env Disable automatic .env loading
1361
- --help, -h Show this help
1362
- `);
1363
- return;
1364
- }
1365
- if (topic === "run") {
1366
- console.info(`
1367
- agent-evals run - Run evals
1368
-
1369
- Usage:
1370
- agent-evals run [flags]
1371
-
1372
- Flags:
1373
- --eval <id> Run specific eval(s) (comma-separated)
1374
- --file <path|glob> Run eval files matching path/glob (comma-separated)
1375
- --case <id> Run case(s); combine with --file/--eval if ambiguous
1376
- --trials <n> Number of trials per case
1377
- --inspect[=host:port] Run with the Node.js inspector enabled
1378
- --inspect-brk[=host:port] Enable inspector and pause before startup
1379
- --json Output run summary as JSON
1380
- --cache <use|bypass|refresh> Cache mode for this run (default: use)
1381
- --no-cache Shortcut for --cache bypass
1382
- --refresh-cache Shortcut for --cache refresh
1383
- --clear-cache Clear the cache before starting the run
1384
- --no-env Disable automatic .env loading
1385
- --help, -h Show this help
1386
- `);
1387
- return;
1388
- }
1389
- if (topic === "show-runs") {
1390
- console.info(`
1391
- agent-evals show-runs - Show saved run artifact file paths
1392
-
1393
- Usage:
1394
- agent-evals show-runs [<run-id>|latest] [--json]
1395
-
1396
- Prints the run directory and stable artifact paths for run.json, summary.json,
1397
- cases.jsonl, case detail JSON, and trace JSON files. Run ids can be full
1398
- timestamp ids, short ids such as r0, or latest.
1399
-
1400
- Flags:
1401
- --json Output the file index as JSON
1402
- --no-env Disable automatic .env loading
1403
- --help, -h Show this help
1404
- `);
1405
- return;
1406
- }
1407
- if (topic === "cache" || topic === "cache list" || topic === "cache clear") {
1408
- console.info(`
1409
- agent-evals cache - Manage cached operation entries
1410
-
1411
- Usage:
1412
- agent-evals cache list [flags]
1413
- agent-evals cache clear --eval <id>
1414
- agent-evals cache clear --all
1415
-
1416
- Flags:
1417
- --eval <id> Clear entries for specific eval(s) (comma-separated)
1418
- --all Confirm clearing every cached entry
1419
- --json Output cache listing as JSON
1420
- --no-env Disable automatic .env loading
1421
- --help, -h Show this help
1422
- `);
1423
- return;
1424
- }
1425
- console.info(`
1426
- agent-evals - LLM/Agent eval runner
1427
-
1428
- Commands:
1429
- app Start server with UI
1430
- list List discovered evals
1431
- run Run evals
1432
- show-runs [id|latest] Show saved run artifact file paths
1433
- cache list List cached operation entries
1434
- cache clear --eval <id> Clear cache entries for one eval
1435
- cache clear --all Clear every cached entry
1436
- help Show this help
1437
-
1438
- Options:
1439
- --eval <id> Run specific eval(s) (comma-separated)
1440
- --case <id> Run specific case(s) (comma-separated)
1441
- --trials <n> Number of trials per case
1442
- --inspect[=host:port] Run with the Node.js inspector enabled
1443
- --inspect-brk[=host:port] Enable inspector and pause before startup
1444
- --json Output results as JSON
1445
- --port <n> Server port (default: 4100)
1446
- --cache <use|bypass|refresh> Cache mode for this run (default: use)
1447
- --no-cache Shortcut for --cache bypass
1448
- --refresh-cache Shortcut for --cache refresh
1449
- --clear-cache Clear the cache before starting the run
1450
- --no-env Disable automatic .env loading
1451
- --help, -h Show help
1452
- `);
1453
- }
1454
2252
  //#endregion
1455
- export { createRunner as n, runCli as t };
2253
+ export { materializeManualInputFiles as a, isManualInputFileValue as i, createRunner as n, stageManualInputFile as o, cleanupStagedManualInputFiles as r, stageManualInputFileFromPath as s, runCli as t };