@ls-stack/agent-eval 0.28.0 → 0.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,25 +1,149 @@
1
- import { C as normalizeScoreDef, F as getEvalTitle, I as getEvalDisplayStatus, L as deriveScopedSummaryFromCases, N as applyDerivedCallAttributes, S as buildDeclaredColumnDefs, T as createFsCacheStore, V as runSummarySchema, Yn as getEvalRegistry, _ as deriveEvalFreshness, a as getLastRunStatuses, b as resolveEvalDefaultConfig, c as loadPersistedRunSnapshots, d as persistRunState, dt as buildEvalKey, f as recomputeEvalStatusesInRuns, ft as getCaseRowCaseKey, g as resolveArtifactPath, h as resolveTracePresentation, i as generateRunId, l as nextShortIdFromSnapshots, m as runTouchesEval, n as getTargetEvalKeys, o as getLatestRunInfos, ot as resolveApiCallsConfig, p as recomputePersistedCaseStatus, pt as getCaseRowEvalKey, s as loadPersistedRunSnapshot, st as resolveLlmCallsConfig, u as persistCaseDetail, v as loadEvalModule, w as validateCharts, x as loadConfig, y as parseEvalDiscovery } from "./runOrchestration-ClWYWPen.mjs";
2
- import { createHash } from "node:crypto";
3
- import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
4
- import { dirname, join, relative, resolve } from "node:path";
1
+ import { B as getEvalDisplayStatus, C as loadConfig, D as createFsCacheStore, E as validateCharts, G as runSummarySchema, L as applyDerivedCallAttributes, S as resolveEvalDefaultConfig, T as normalizeScoreDef, V as deriveScopedSummaryFromCases, _ as buildManualInputDescriptor, _t as getCaseRowEvalKey, a as getLastRunStatuses, b as loadEvalModule, c as loadPersistedRunSnapshots, d as persistRunState, dt as resolveLlmCallsConfig, f as recomputeEvalStatusesInRuns, g as resolveArtifactPath, gt as getCaseRowCaseKey, h as resolveTracePresentation, ht as buildEvalKey, i as generateRunId, l as nextShortIdFromSnapshots, m as runTouchesEval, n as getTargetEvalKeys, o as getLatestRunInfos, p as recomputePersistedCaseStatus, s as loadPersistedRunSnapshot, u as persistCaseDetail, ur as getEvalRegistry, ut as resolveApiCallsConfig, v as parseManualInputValues, w as buildDeclaredColumnDefs, x as parseEvalDiscovery, y as deriveEvalFreshness, z as getEvalTitle } from "./runOrchestration-CIARrLs6.mjs";
2
+ import { createHash, randomUUID } from "node:crypto";
3
+ import { copyFile, mkdir, readFile, rm, writeFile } from "node:fs/promises";
4
+ import { basename, dirname, extname, isAbsolute, join, relative, resolve, sep } from "node:path";
5
5
  import { watch } from "chokidar";
6
6
  import { glob } from "glob";
7
7
  import { existsSync } from "node:fs";
8
8
  import { resultify } from "t-result";
9
9
  import { fileURLToPath } from "node:url";
10
10
  import { spawn, spawnSync } from "node:child_process";
11
+ //#region ../runner/src/configReload.ts
12
+ /** Coordinates idle-only reloads for `agent-evals.config.ts` in app mode. */
13
+ function createConfigReloadController({ getActiveRunCount, closeRunnerWatchers, loadRunnerState, emitToDiscoveryListeners }) {
14
+ let watcher;
15
+ let reloadTimer;
16
+ let reloadPromise;
17
+ let state = {
18
+ status: "idle",
19
+ activeRunCount: 0,
20
+ lastChangedAt: null,
21
+ lastReloadedAt: null
22
+ };
23
+ function currentState() {
24
+ return {
25
+ ...state,
26
+ activeRunCount: getActiveRunCount()
27
+ };
28
+ }
29
+ function emitReloadEvent() {
30
+ emitToDiscoveryListeners({
31
+ type: "config.reload",
32
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
33
+ payload: currentState()
34
+ });
35
+ }
36
+ function setState(patch) {
37
+ state = {
38
+ ...state,
39
+ ...patch,
40
+ activeRunCount: getActiveRunCount()
41
+ };
42
+ emitReloadEvent();
43
+ }
44
+ async function close() {
45
+ if (reloadTimer !== void 0) {
46
+ clearTimeout(reloadTimer);
47
+ reloadTimer = void 0;
48
+ }
49
+ const watcherToClose = watcher;
50
+ watcher = void 0;
51
+ if (watcherToClose !== void 0) await watcherToClose.close();
52
+ }
53
+ async function reloadConfigNow(changedAt) {
54
+ setState({
55
+ status: "reloading",
56
+ lastChangedAt: changedAt
57
+ });
58
+ await close();
59
+ await closeRunnerWatchers();
60
+ await loadRunnerState();
61
+ setState({
62
+ status: "idle",
63
+ lastChangedAt: changedAt,
64
+ lastReloadedAt: (/* @__PURE__ */ new Date()).toISOString()
65
+ });
66
+ }
67
+ async function reloadConfig(changedAt) {
68
+ if (reloadPromise !== void 0) {
69
+ setState({
70
+ status: "pending",
71
+ lastChangedAt: changedAt
72
+ });
73
+ await reloadPromise;
74
+ await reloadIfPendingAndIdle();
75
+ return;
76
+ }
77
+ reloadPromise = reloadConfigNow(changedAt);
78
+ try {
79
+ await reloadPromise;
80
+ } finally {
81
+ reloadPromise = void 0;
82
+ }
83
+ }
84
+ async function handleConfigChanged() {
85
+ const changedAt = (/* @__PURE__ */ new Date()).toISOString();
86
+ if (getActiveRunCount() > 0) {
87
+ setState({
88
+ status: "pending",
89
+ lastChangedAt: changedAt
90
+ });
91
+ return;
92
+ }
93
+ await reloadConfig(changedAt);
94
+ }
95
+ async function reloadIfPendingAndIdle() {
96
+ if (state.status !== "pending") return;
97
+ if (getActiveRunCount() > 0) {
98
+ state = currentState();
99
+ return;
100
+ }
101
+ await reloadConfig(state.lastChangedAt ?? (/* @__PURE__ */ new Date()).toISOString());
102
+ }
103
+ async function setupWatcher() {
104
+ const nextWatcher = watch(resolve(process.cwd(), "agent-evals.config.ts"), {
105
+ awaitWriteFinish: {
106
+ stabilityThreshold: 100,
107
+ pollInterval: 20
108
+ },
109
+ ignoreInitial: true,
110
+ persistent: true
111
+ });
112
+ watcher = nextWatcher;
113
+ const scheduleReload = () => {
114
+ if (reloadTimer !== void 0) clearTimeout(reloadTimer);
115
+ reloadTimer = setTimeout(() => {
116
+ reloadTimer = void 0;
117
+ handleConfigChanged();
118
+ }, 50);
119
+ };
120
+ nextWatcher.on("change", scheduleReload);
121
+ nextWatcher.on("add", scheduleReload);
122
+ nextWatcher.on("unlink", scheduleReload);
123
+ await new Promise((ready) => {
124
+ nextWatcher.once("ready", ready);
125
+ });
126
+ }
127
+ return {
128
+ close,
129
+ currentState,
130
+ reloadIfPendingAndIdle,
131
+ setupWatcher
132
+ };
133
+ }
134
+ //#endregion
11
135
  //#region ../runner/src/evalSummaries.ts
12
136
  /** Build the API/UI summary payload for one discovered eval. */
13
137
  function buildEvalSummary(params) {
14
138
  const { meta, config, gitState, latestRun, lastRunStatus } = params;
15
- const { sourceFingerprint, ...summaryMeta } = meta;
139
+ const { sourceFingerprint, manualInputDescriptor, requiresManualInput, ...summaryMeta } = meta;
16
140
  const freshness = deriveEvalFreshness({
17
141
  latestRun,
18
142
  gitState,
19
143
  currentEvalSourceFingerprint: sourceFingerprint,
20
144
  staleAfterDays: config.staleAfterDays ?? 14
21
145
  });
22
- return {
146
+ const summary = {
23
147
  ...summaryMeta,
24
148
  stale: freshness.stale,
25
149
  outdated: freshness.outdated,
@@ -29,6 +153,8 @@ function buildEvalSummary(params) {
29
153
  currentCommitSha: gitState.commitSha,
30
154
  lastRunStatus
31
155
  };
156
+ if (manualInputDescriptor && requiresManualInput) summary.manualInput = manualInputDescriptor;
157
+ return summary;
32
158
  }
33
159
  /** Write one latest-run snapshot to each targeted eval id. */
34
160
  function setLatestRunInfoMap(params) {
@@ -60,6 +186,292 @@ function readGitWorktreeState(workspaceRoot) {
60
186
  return { commitSha: commitResult.status === 0 ? commitResult.stdout : null };
61
187
  }
62
188
  //#endregion
189
+ //#region ../runner/src/manualInput/discovery.ts
190
+ /**
191
+ * Inspect an eval's `manualInput` config during discovery. Rejects evals that
192
+ * declare both `cases` and `manualInput` and evals whose schema cannot be
193
+ * walked into a wire descriptor.
194
+ */
195
+ function resolveManualInputDiscovery(params) {
196
+ const { evalDef, evalId, relativeFilePath } = params;
197
+ if (!evalDef.manualInput) return { kind: "none" };
198
+ if (evalDef.cases !== void 0) return {
199
+ kind: "issue",
200
+ issue: {
201
+ type: "manual-input-with-cases",
202
+ severity: "error",
203
+ filePath: relativeFilePath,
204
+ evalId,
205
+ message: `Eval "${evalId}" in ${relativeFilePath} declares both "cases" and "manualInput". Remove one of them.`
206
+ }
207
+ };
208
+ const descriptorResult = buildManualInputDescriptor(evalDef.manualInput);
209
+ if (descriptorResult.error) return {
210
+ kind: "issue",
211
+ issue: {
212
+ type: "manual-input-with-cases",
213
+ severity: "error",
214
+ filePath: relativeFilePath,
215
+ evalId,
216
+ message: `Eval "${evalId}" in ${relativeFilePath} has an unsupported manualInput schema: ${descriptorResult.error.message}`
217
+ }
218
+ };
219
+ return {
220
+ kind: "ok",
221
+ requiresManualInput: true,
222
+ descriptor: descriptorResult.value,
223
+ config: evalDef.manualInput
224
+ };
225
+ }
226
+ //#endregion
227
+ //#region ../runner/src/manualInput/files.ts
228
+ const stagedUploadDir = ".agent-evals/manual-input-uploads";
229
+ const mimeTypeByExtension = {
230
+ ".gif": "image/gif",
231
+ ".jpeg": "image/jpeg",
232
+ ".jpg": "image/jpeg",
233
+ ".json": "application/json",
234
+ ".md": "text/markdown",
235
+ ".pdf": "application/pdf",
236
+ ".png": "image/png",
237
+ ".svg": "image/svg+xml",
238
+ ".txt": "text/plain",
239
+ ".webp": "image/webp"
240
+ };
241
+ function toWorkspaceRelativePath(params) {
242
+ return relative(params.workspaceRoot, params.filePath).replaceAll("\\", "/");
243
+ }
244
+ function isInsideWorkspace(params) {
245
+ const rel = relative(params.workspaceRoot, params.filePath);
246
+ return rel === "" || !rel.startsWith("..") && !isAbsolute(rel);
247
+ }
248
+ function sanitizeSegment(value) {
249
+ const normalized = value.trim().replaceAll(/[^A-Za-z0-9._-]+/g, "-");
250
+ return normalized.length > 0 ? normalized : "file";
251
+ }
252
+ function sanitizeFileName(value) {
253
+ const normalized = sanitizeSegment(value);
254
+ const extension = extname(normalized);
255
+ if (extension.length === 0) return normalized;
256
+ return `${normalized.slice(0, -extension.length).replaceAll(".", "-")}${extension}`;
257
+ }
258
+ function inferMimeType(params) {
259
+ const normalized = params.mimeType?.trim();
260
+ if (normalized && normalized.length > 0) return normalized;
261
+ return mimeTypeByExtension[extname(params.name).toLowerCase()] ?? "";
262
+ }
263
+ function hashBytes(bytes) {
264
+ return createHash("sha256").update(bytes).digest("hex");
265
+ }
266
+ function isRecord$1(value) {
267
+ return typeof value === "object" && value !== null && !Array.isArray(value);
268
+ }
269
+ function isManualInputFileValue(value) {
270
+ if (!isRecord$1(value)) return false;
271
+ return typeof value.name === "string" && typeof value.mimeType === "string" && typeof value.sizeBytes === "number" && typeof value.sha256 === "string" && typeof value.path === "string";
272
+ }
273
+ function isStagedManualInputPath(path) {
274
+ return path === stagedUploadDir || path.startsWith(`${stagedUploadDir}/`) || path.startsWith(stagedUploadDir + sep);
275
+ }
276
+ /**
277
+ * Persist uploaded manual-input bytes in the workspace staging area and return
278
+ * the JSON-safe metadata used by manual-input schemas.
279
+ */
280
+ async function stageManualInputFile({ workspaceRoot, bytes, name, mimeType }) {
281
+ const fileName = sanitizeFileName(name || "uploaded-file");
282
+ const sha256 = hashBytes(bytes);
283
+ const dir = resolve(workspaceRoot, stagedUploadDir);
284
+ await mkdir(dir, { recursive: true });
285
+ const targetPath = join(dir, `${Date.now().toString(36)}-${randomUUID()}__${sha256.slice(0, 12)}__${fileName}`);
286
+ await writeFile(targetPath, bytes);
287
+ return {
288
+ name: name || fileName,
289
+ mimeType: inferMimeType({
290
+ mimeType,
291
+ name: fileName
292
+ }),
293
+ sizeBytes: bytes.byteLength,
294
+ sha256,
295
+ path: toWorkspaceRelativePath({
296
+ workspaceRoot,
297
+ filePath: targetPath
298
+ })
299
+ };
300
+ }
301
+ /**
302
+ * Read a file path supplied by the CLI and stage it as a manual-input file.
303
+ */
304
+ async function stageManualInputFileFromPath({ workspaceRoot, path, name, mimeType }) {
305
+ const sourcePath = isAbsolute(path) ? resolve(path) : resolve(workspaceRoot, path);
306
+ return await stageManualInputFile({
307
+ workspaceRoot,
308
+ bytes: new Uint8Array(await readFile(sourcePath)),
309
+ name: name ?? basename(sourcePath),
310
+ mimeType: inferMimeType({
311
+ mimeType,
312
+ name: name ?? basename(sourcePath)
313
+ })
314
+ });
315
+ }
316
+ async function materializeOneManualInputFile(params) {
317
+ const sourcePath = resolve(params.workspaceRoot, params.value.path);
318
+ if (!isInsideWorkspace({
319
+ workspaceRoot: params.workspaceRoot,
320
+ filePath: sourcePath
321
+ })) throw new Error(`Manual input file path escapes workspace: ${params.value.path}`);
322
+ const bytes = new Uint8Array(await readFile(sourcePath));
323
+ const sha256 = hashBytes(bytes);
324
+ const fileName = sanitizeFileName(params.value.name || basename(sourcePath));
325
+ const artifactId = [
326
+ sanitizeSegment(params.runId),
327
+ "manual-input",
328
+ sha256.slice(0, 12),
329
+ fileName
330
+ ].join("__");
331
+ const targetPath = join(params.runDir, "artifacts", artifactId);
332
+ await mkdir(join(params.runDir, "artifacts"), { recursive: true });
333
+ if (sourcePath !== targetPath) await copyFile(sourcePath, targetPath);
334
+ if (isStagedManualInputPath(params.value.path)) await resultify(() => rm(sourcePath, { force: true }));
335
+ return {
336
+ name: params.value.name,
337
+ mimeType: inferMimeType({
338
+ mimeType: params.value.mimeType,
339
+ name: params.value.name || fileName
340
+ }),
341
+ sizeBytes: bytes.byteLength,
342
+ sha256,
343
+ path: toWorkspaceRelativePath({
344
+ workspaceRoot: params.workspaceRoot,
345
+ filePath: targetPath
346
+ })
347
+ };
348
+ }
349
+ async function materializeUnknownValue(params) {
350
+ if (isManualInputFileValue(params.value)) return await materializeOneManualInputFile({
351
+ workspaceRoot: params.workspaceRoot,
352
+ runId: params.runId,
353
+ runDir: params.runDir,
354
+ value: params.value
355
+ });
356
+ if (Array.isArray(params.value)) return await Promise.all(params.value.map(async (entry) => await materializeUnknownValue({
357
+ workspaceRoot: params.workspaceRoot,
358
+ runId: params.runId,
359
+ runDir: params.runDir,
360
+ value: entry
361
+ })));
362
+ if (isRecord$1(params.value)) {
363
+ const entries = await Promise.all(Object.entries(params.value).map(async ([key, child]) => {
364
+ return [key, await materializeUnknownValue({
365
+ workspaceRoot: params.workspaceRoot,
366
+ runId: params.runId,
367
+ runDir: params.runDir,
368
+ value: child
369
+ })];
370
+ }));
371
+ return Object.fromEntries(entries);
372
+ }
373
+ return params.value;
374
+ }
375
+ /**
376
+ * Copy all manual-input file references inside a run request into the run's
377
+ * artifact directory and return a request-safe value with artifact paths.
378
+ */
379
+ async function materializeManualInputFiles({ workspaceRoot, runId, runDir, value }) {
380
+ const result = await resultify(() => materializeUnknownValue({
381
+ workspaceRoot,
382
+ runId,
383
+ runDir,
384
+ value
385
+ }));
386
+ if (result.error) return {
387
+ error: result.error.message,
388
+ value: null
389
+ };
390
+ return {
391
+ error: null,
392
+ value: result.value
393
+ };
394
+ }
395
+ /** Remove stale staged manual-input uploads from previous abandoned runs. */
396
+ async function cleanupStagedManualInputFiles(workspaceRoot) {
397
+ await resultify(() => rm(resolve(workspaceRoot, stagedUploadDir), {
398
+ force: true,
399
+ recursive: true
400
+ }));
401
+ }
402
+ //#endregion
403
+ //#region ../runner/src/manualInput/validation.ts
404
+ function evalIsTargeted(evalMeta, target) {
405
+ if (target.evalKeys && target.evalKeys.length > 0) {
406
+ if (!target.evalKeys.includes(evalMeta.key)) return false;
407
+ }
408
+ if (target.evalIds && target.evalIds.length > 0) {
409
+ if (!target.evalIds.includes(evalMeta.id)) return false;
410
+ }
411
+ return true;
412
+ }
413
+ /**
414
+ * Validate the `manualInputs` map carried by a `CreateRunRequest` against the
415
+ * authored Zod schemas of every targeted eval that requires manual input.
416
+ *
417
+ * Pure: takes captured discovery state (eval metas + schema configs) and the
418
+ * request, returns a structured result the server/CLI can format directly.
419
+ */
420
+ function validateManualInputsForRequest(params) {
421
+ const { evalMetas, manualInputConfigs, request } = params;
422
+ const failures = [];
423
+ const parsed = {};
424
+ for (const evalMeta of evalMetas) {
425
+ if (!evalMeta.requiresManualInput) continue;
426
+ if (!evalIsTargeted(evalMeta, request.target)) continue;
427
+ const rawValue = request.manualInputs?.[evalMeta.key];
428
+ if (rawValue === void 0) {
429
+ failures.push({
430
+ evalKey: evalMeta.key,
431
+ evalId: evalMeta.id,
432
+ reason: "missing",
433
+ issues: [{
434
+ path: "",
435
+ message: `manualInputs is missing an entry for "${evalMeta.key}"`
436
+ }]
437
+ });
438
+ continue;
439
+ }
440
+ const config = manualInputConfigs.get(evalMeta.key);
441
+ if (!config) {
442
+ failures.push({
443
+ evalKey: evalMeta.key,
444
+ evalId: evalMeta.id,
445
+ reason: "invalid",
446
+ issues: [{
447
+ path: "",
448
+ message: "manualInput schema is unavailable; reload the workspace and try again"
449
+ }]
450
+ });
451
+ continue;
452
+ }
453
+ const result = parseManualInputValues(config, rawValue);
454
+ if (result.error) {
455
+ failures.push({
456
+ evalKey: evalMeta.key,
457
+ evalId: evalMeta.id,
458
+ reason: "invalid",
459
+ issues: result.error.issues
460
+ });
461
+ continue;
462
+ }
463
+ parsed[evalMeta.key] = result.value;
464
+ }
465
+ if (failures.length > 0) return {
466
+ ok: false,
467
+ failures
468
+ };
469
+ return {
470
+ ok: true,
471
+ parsed
472
+ };
473
+ }
474
+ //#endregion
63
475
  //#region ../runner/src/recalculateDerivedAttributes.ts
64
476
  function getCaseArtifactFileIdForExistingRun(runState, caseRow) {
65
477
  const caseKey = getCaseRowCaseKey(caseRow);
@@ -338,6 +750,9 @@ function getWatchRootsForIncludePatterns(params) {
338
750
  }
339
751
  //#endregion
340
752
  //#region ../runner/src/runner.ts
753
+ function isRecord(value) {
754
+ return typeof value === "object" && value !== null && !Array.isArray(value);
755
+ }
341
756
  /** Create an in-memory eval runner bound to the current workspace config. */
342
757
  function createRunner({ watchForChanges = true } = {}) {
343
758
  let config;
@@ -347,6 +762,7 @@ function createRunner({ watchForChanges = true } = {}) {
347
762
  let llmCallsConfig = resolveLlmCallsConfig(void 0);
348
763
  let apiCallsConfig = resolveApiCallsConfig(void 0);
349
764
  const evals = /* @__PURE__ */ new Map();
765
+ const manualInputConfigs = /* @__PURE__ */ new Map();
350
766
  let discoveryIssues = [];
351
767
  const runs = /* @__PURE__ */ new Map();
352
768
  const lastRunStatusMap = /* @__PURE__ */ new Map();
@@ -357,6 +773,12 @@ function createRunner({ watchForChanges = true } = {}) {
357
773
  let runHistoryWatcher;
358
774
  let discoveryRefreshTimer;
359
775
  let runHistoryRefreshTimer;
776
+ const configReload = createConfigReloadController({
777
+ getActiveRunCount,
778
+ closeRunnerWatchers: closeWatchers,
779
+ loadRunnerState,
780
+ emitToDiscoveryListeners
781
+ });
360
782
  function toWorkspaceRelativePath(filePath) {
361
783
  return relative(workspaceRoot, filePath).replaceAll("\\", "/");
362
784
  }
@@ -374,22 +796,7 @@ function createRunner({ watchForChanges = true } = {}) {
374
796
  }
375
797
  const runner = {
376
798
  async init() {
377
- config = await loadConfig();
378
- workspaceRoot = config.workspaceRoot ?? process.cwd();
379
- localStateDir = resolve(workspaceRoot, ".agent-evals");
380
- llmCallsConfig = resolveLlmCallsConfig(config.llmCalls);
381
- apiCallsConfig = resolveApiCallsConfig(config.apiCalls);
382
- await mkdir(localStateDir, { recursive: true });
383
- await mkdir(join(localStateDir, "runs"), { recursive: true });
384
- cacheStore = createFsCacheStore({
385
- workspaceRoot,
386
- dir: config.cache?.dir,
387
- maxEntriesPerNamespace: config.cache?.maxEntriesPerNamespace ?? config.cache?.maxEntriesPerEval,
388
- maxEntriesByNamespace: config.cache?.maxEntriesByNamespace
389
- });
390
- await loadPersistedRuns();
391
- await runner.refreshDiscovery();
392
- if (watchForChanges) await setupWatcher();
799
+ await loadRunnerState();
393
800
  },
394
801
  async listCache() {
395
802
  return cacheStore.list();
@@ -535,6 +942,13 @@ function createRunner({ watchForChanges = true } = {}) {
535
942
  emitDiscoveryEvent();
536
943
  return { deleted: true };
537
944
  },
945
+ validateManualInputs(request) {
946
+ return validateManualInputsForRequest({
947
+ evalMetas: getSortedEvalMetas(),
948
+ manualInputConfigs,
949
+ request
950
+ });
951
+ },
538
952
  getEvals() {
539
953
  const gitState = readGitWorktreeState(workspaceRoot);
540
954
  const result = [];
@@ -561,6 +975,9 @@ function createRunner({ watchForChanges = true } = {}) {
561
975
  getDiscoveryIssues() {
562
976
  return discoveryIssues;
563
977
  },
978
+ getConfigReloadState() {
979
+ return configReload.currentState();
980
+ },
564
981
  async refreshDiscovery() {
565
982
  const patterns = config.include;
566
983
  const discovered = [];
@@ -572,6 +989,7 @@ function createRunner({ watchForChanges = true } = {}) {
572
989
  discovered.push(...files);
573
990
  }
574
991
  evals.clear();
992
+ manualInputConfigs.clear();
575
993
  discoveryIssues = [];
576
994
  for (const filePath of discovered) try {
577
995
  const content = await readFile(filePath, "utf-8");
@@ -595,6 +1013,9 @@ function createRunner({ watchForChanges = true } = {}) {
595
1013
  let columnDefs = buildDeclaredColumnDefs(void 0, void 0, void 0);
596
1014
  let stats;
597
1015
  let charts;
1016
+ let manualInputDescriptor;
1017
+ let requiresManualInput = false;
1018
+ const relativeFilePath = toWorkspaceRelativePath(meta.filePath);
598
1019
  discoveredEntry?.use((evalDef) => {
599
1020
  const defaultConfig = resolveEvalDefaultConfig({
600
1021
  evalDef,
@@ -611,8 +1032,25 @@ function createRunner({ watchForChanges = true } = {}) {
611
1032
  });
612
1033
  for (const warning of validated.warnings) console.warn(warning);
613
1034
  charts = validated.charts;
1035
+ const manualInputResult = resolveManualInputDiscovery({
1036
+ evalDef,
1037
+ evalId: meta.id,
1038
+ relativeFilePath
1039
+ });
1040
+ if (manualInputResult.kind === "issue") {
1041
+ discoveryIssues.push(manualInputResult.issue);
1042
+ requiresManualInput = true;
1043
+ return;
1044
+ }
1045
+ if (manualInputResult.kind === "ok") {
1046
+ requiresManualInput = manualInputResult.requiresManualInput;
1047
+ manualInputDescriptor = manualInputResult.descriptor;
1048
+ manualInputConfigs.set(buildEvalKey({
1049
+ filePath: relativeFilePath,
1050
+ evalId: meta.id
1051
+ }), manualInputResult.config);
1052
+ }
614
1053
  });
615
- const relativeFilePath = toWorkspaceRelativePath(meta.filePath);
616
1054
  const key = buildEvalKey({
617
1055
  filePath: relativeFilePath,
618
1056
  evalId: meta.id
@@ -627,7 +1065,9 @@ function createRunner({ watchForChanges = true } = {}) {
627
1065
  columnDefs,
628
1066
  caseCount: null,
629
1067
  stats,
630
- charts
1068
+ charts,
1069
+ manualInputDescriptor,
1070
+ requiresManualInput
631
1071
  });
632
1072
  }
633
1073
  } catch {}
@@ -673,11 +1113,27 @@ function createRunner({ watchForChanges = true } = {}) {
673
1113
  childProcess: void 0,
674
1114
  childTerminalReceived: false
675
1115
  };
1116
+ await mkdir(runDir, { recursive: true });
1117
+ await mkdir(join(runDir, "traces"), { recursive: true });
1118
+ await mkdir(join(runDir, "artifacts"), { recursive: true });
1119
+ await mkdir(join(runDir, "case-details"), { recursive: true });
1120
+ const materializedRequest = { ...request };
1121
+ if (request.manualInputs !== void 0) {
1122
+ const materialized = await materializeManualInputFiles({
1123
+ workspaceRoot,
1124
+ runId,
1125
+ runDir,
1126
+ value: request.manualInputs
1127
+ });
1128
+ if (materialized.error !== null) throw new Error(materialized.error);
1129
+ if (!isRecord(materialized.value)) throw new Error("Materialized manual inputs must be an object");
1130
+ materializedRequest.manualInputs = materialized.value;
1131
+ }
676
1132
  runs.set(runId, runState);
677
1133
  setLatestRunInfoMap({
678
1134
  latestRunInfoMap,
679
1135
  evalIds: getTargetEvalKeys({
680
- request,
1136
+ request: materializedRequest,
681
1137
  sortedEvals: getSortedEvalMetas()
682
1138
  }),
683
1139
  info: {
@@ -687,13 +1143,9 @@ function createRunner({ watchForChanges = true } = {}) {
687
1143
  evalSourceFingerprint: null
688
1144
  }
689
1145
  });
690
- await mkdir(runDir, { recursive: true });
691
- await mkdir(join(runDir, "traces"), { recursive: true });
692
- await mkdir(join(runDir, "artifacts"), { recursive: true });
693
- await mkdir(join(runDir, "case-details"), { recursive: true });
694
1146
  await writeFile(join(runDir, "run.json"), JSON.stringify(manifest, null, 2));
695
1147
  const childContext = {
696
- request,
1148
+ request: materializedRequest,
697
1149
  workspaceRoot,
698
1150
  runDir,
699
1151
  manifest,
@@ -776,18 +1228,7 @@ function createRunner({ watchForChanges = true } = {}) {
776
1228
  };
777
1229
  },
778
1230
  async close() {
779
- if (discoveryRefreshTimer !== void 0) {
780
- clearTimeout(discoveryRefreshTimer);
781
- discoveryRefreshTimer = void 0;
782
- }
783
- if (runHistoryRefreshTimer !== void 0) {
784
- clearTimeout(runHistoryRefreshTimer);
785
- runHistoryRefreshTimer = void 0;
786
- }
787
- const watchers = [discoveryWatcher, runHistoryWatcher].filter((watcher) => watcher !== void 0);
788
- discoveryWatcher = void 0;
789
- runHistoryWatcher = void 0;
790
- await Promise.all(watchers.map((watcher) => watcher.close()));
1231
+ await Promise.all([closeWatchers(), configReload.close()]);
791
1232
  },
792
1233
  getWorkspaceRoot() {
793
1234
  return workspaceRoot;
@@ -805,6 +1246,39 @@ function createRunner({ watchForChanges = true } = {}) {
805
1246
  return resolveArtifactPath(join(localStateDir, "runs"), artifactId_);
806
1247
  }
807
1248
  };
1249
+ async function loadRunnerState() {
1250
+ config = await loadConfig();
1251
+ workspaceRoot = config.workspaceRoot ?? process.cwd();
1252
+ localStateDir = resolve(workspaceRoot, ".agent-evals");
1253
+ llmCallsConfig = resolveLlmCallsConfig(config.llmCalls);
1254
+ apiCallsConfig = resolveApiCallsConfig(config.apiCalls);
1255
+ await mkdir(localStateDir, { recursive: true });
1256
+ await mkdir(join(localStateDir, "runs"), { recursive: true });
1257
+ await cleanupStagedManualInputFiles(workspaceRoot);
1258
+ cacheStore = createFsCacheStore({
1259
+ workspaceRoot,
1260
+ dir: config.cache?.dir,
1261
+ maxEntriesPerNamespace: config.cache?.maxEntriesPerNamespace ?? config.cache?.maxEntriesPerEval,
1262
+ maxEntriesByNamespace: config.cache?.maxEntriesByNamespace
1263
+ });
1264
+ await loadPersistedRuns();
1265
+ await runner.refreshDiscovery();
1266
+ if (watchForChanges) await setupWatcher();
1267
+ }
1268
+ async function closeWatchers() {
1269
+ if (discoveryRefreshTimer !== void 0) {
1270
+ clearTimeout(discoveryRefreshTimer);
1271
+ discoveryRefreshTimer = void 0;
1272
+ }
1273
+ if (runHistoryRefreshTimer !== void 0) {
1274
+ clearTimeout(runHistoryRefreshTimer);
1275
+ runHistoryRefreshTimer = void 0;
1276
+ }
1277
+ const watchers = [discoveryWatcher, runHistoryWatcher].filter((watcher) => watcher !== void 0);
1278
+ discoveryWatcher = void 0;
1279
+ runHistoryWatcher = void 0;
1280
+ await Promise.all(watchers.map((watcher) => watcher.close()));
1281
+ }
808
1282
  async function setupWatcher() {
809
1283
  const watcher = watch(getWatchRootsForIncludePatterns({
810
1284
  patterns: config.include,
@@ -829,7 +1303,7 @@ function createRunner({ watchForChanges = true } = {}) {
829
1303
  watcher.on("unlink", scheduleRefresh);
830
1304
  watcher.on("addDir", scheduleRefresh);
831
1305
  watcher.on("unlinkDir", scheduleRefresh);
832
- await setupRunHistoryWatcher();
1306
+ await Promise.all([setupRunHistoryWatcher(), configReload.setupWatcher()]);
833
1307
  await watcherReady;
834
1308
  }
835
1309
  async function setupRunHistoryWatcher() {
@@ -854,6 +1328,9 @@ function createRunner({ watchForChanges = true } = {}) {
854
1328
  watcher.once("ready", ready);
855
1329
  });
856
1330
  }
1331
+ function getActiveRunCount() {
1332
+ return [...runs.values()].filter((run) => run.manifest.status === "running").length;
1333
+ }
857
1334
  function emitDiscoveryEvent() {
858
1335
  const lastRunStatuses = getLastRunStatuses({
859
1336
  runs: runs.values(),
@@ -873,6 +1350,10 @@ function createRunner({ watchForChanges = true } = {}) {
873
1350
  payload: runner.getEvals()
874
1351
  };
875
1352
  for (const listener of discoveryListeners) listener(event);
1353
+ configReload.reloadIfPendingAndIdle();
1354
+ }
1355
+ function emitToDiscoveryListeners(event) {
1356
+ for (const listener of discoveryListeners) listener(event);
876
1357
  }
877
1358
  function emitEvent(runState, event) {
878
1359
  for (const listener of runState.listeners) try {
@@ -915,6 +1396,345 @@ function createRunner({ watchForChanges = true } = {}) {
915
1396
  return runner;
916
1397
  }
917
1398
  //#endregion
1399
+ //#region src/cliHelp.ts
1400
+ /** Render the help block for a given CLI topic to stdout via `console.info`. */
1401
+ function printHelp(topic = "global") {
1402
+ if (topic === "app") {
1403
+ console.info(`
1404
+ agent-evals app - Start server with UI
1405
+
1406
+ Usage:
1407
+ agent-evals app [flags]
1408
+
1409
+ Flags:
1410
+ --port <n> Server port (default: 4100)
1411
+ --no-env Disable automatic .env loading
1412
+ --help, -h Show this help
1413
+ `);
1414
+ return;
1415
+ }
1416
+ if (topic === "list") {
1417
+ console.info(`
1418
+ agent-evals list - List discovered evals
1419
+
1420
+ Usage:
1421
+ agent-evals list [flags]
1422
+
1423
+ Flags:
1424
+ --no-env Disable automatic .env loading
1425
+ --help, -h Show this help
1426
+ `);
1427
+ return;
1428
+ }
1429
+ if (topic === "run") {
1430
+ console.info(`
1431
+ agent-evals run - Run evals
1432
+
1433
+ Usage:
1434
+ agent-evals run [flags]
1435
+
1436
+ Flags:
1437
+ --eval <id> Run specific eval(s) (comma-separated)
1438
+ --file <path|glob> Run eval files matching path/glob (comma-separated)
1439
+ --case <id> Run case(s); combine with --file/--eval if ambiguous
1440
+ --trials <n> Number of trials per case
1441
+ --inspect[=host:port] Run with the Node.js inspector enabled
1442
+ --inspect-brk[=host:port] Enable inspector and pause before startup
1443
+ --json Output run summary as JSON
1444
+ --cache <use|bypass|refresh> Cache mode for this run (default: use)
1445
+ --no-cache Shortcut for --cache bypass
1446
+ --refresh-cache Shortcut for --cache refresh
1447
+ --clear-cache Clear the cache before starting the run
1448
+ --input <json> Manual input value for a single targeted eval
1449
+ that declares manualInput
1450
+ --input-file <path> JSON object keyed by eval key (or eval id) with
1451
+ manual input values for one or more targeted evals
1452
+ --no-env Disable automatic .env loading
1453
+ --help, -h Show this help
1454
+ `);
1455
+ return;
1456
+ }
1457
+ if (topic === "show-runs") {
1458
+ console.info(`
1459
+ agent-evals show-runs - Show saved run artifact file paths
1460
+
1461
+ Usage:
1462
+ agent-evals show-runs [<run-id>|latest] [--json]
1463
+
1464
+ Prints the run directory and stable artifact paths for run.json, summary.json,
1465
+ cases.jsonl, case detail JSON, and trace JSON files. Run ids can be full
1466
+ timestamp ids, short ids such as r0, or latest.
1467
+
1468
+ Flags:
1469
+ --json Output the file index as JSON
1470
+ --no-env Disable automatic .env loading
1471
+ --help, -h Show this help
1472
+ `);
1473
+ return;
1474
+ }
1475
+ if (topic === "cache" || topic === "cache list" || topic === "cache clear") {
1476
+ console.info(`
1477
+ agent-evals cache - Manage cached operation entries
1478
+
1479
+ Usage:
1480
+ agent-evals cache list [flags]
1481
+ agent-evals cache clear --eval <id>
1482
+ agent-evals cache clear --all
1483
+
1484
+ Flags:
1485
+ --eval <id> Clear entries for specific eval(s) (comma-separated)
1486
+ --all Confirm clearing every cached entry
1487
+ --json Output cache listing as JSON
1488
+ --no-env Disable automatic .env loading
1489
+ --help, -h Show this help
1490
+ `);
1491
+ return;
1492
+ }
1493
+ console.info(`
1494
+ agent-evals - LLM/Agent eval runner
1495
+
1496
+ Commands:
1497
+ app Start server with UI
1498
+ list List discovered evals
1499
+ run Run evals
1500
+ show-runs [id|latest] Show saved run artifact file paths
1501
+ cache list List cached operation entries
1502
+ cache clear --eval <id> Clear cache entries for one eval
1503
+ cache clear --all Clear every cached entry
1504
+ help Show this help
1505
+
1506
+ Options:
1507
+ --eval <id> Run specific eval(s) (comma-separated)
1508
+ --case <id> Run specific case(s) (comma-separated)
1509
+ --trials <n> Number of trials per case
1510
+ --inspect[=host:port] Run with the Node.js inspector enabled
1511
+ --inspect-brk[=host:port] Enable inspector and pause before startup
1512
+ --json Output results as JSON
1513
+ --port <n> Server port (default: 4100)
1514
+ --cache <use|bypass|refresh> Cache mode for this run (default: use)
1515
+ --no-cache Shortcut for --cache bypass
1516
+ --refresh-cache Shortcut for --cache refresh
1517
+ --clear-cache Clear the cache before starting the run
1518
+ --no-env Disable automatic .env loading
1519
+ --help, -h Show help
1520
+ `);
1521
+ }
1522
+ //#endregion
1523
+ //#region src/manualInputArgs.ts
1524
+ function isPlainObject(value) {
1525
+ return typeof value === "object" && value !== null && !Array.isArray(value);
1526
+ }
1527
+ function isPathInputObject(value) {
1528
+ if (!isPlainObject(value)) return false;
1529
+ return typeof value.path === "string" && (value.name === void 0 || typeof value.name === "string") && (value.mimeType === void 0 || typeof value.mimeType === "string");
1530
+ }
1531
+ function escapeRegex$1(value) {
1532
+ return value.replace(/[|\\{}()[\]^$+?.]/g, "\\$&");
1533
+ }
1534
+ function globToRegex$1(pattern) {
1535
+ const normalized = pattern.replaceAll("\\", "/");
1536
+ let regex = "^";
1537
+ for (let i = 0; i < normalized.length; i++) {
1538
+ const char = normalized[i];
1539
+ const next = normalized[i + 1];
1540
+ if (char === "*" && next === "*") {
1541
+ regex += ".*";
1542
+ i++;
1543
+ } else if (char === "*") regex += "[^/]*";
1544
+ else if (char === "?") regex += "[^/]";
1545
+ else regex += escapeRegex$1(char ?? "");
1546
+ }
1547
+ regex += "$";
1548
+ return new RegExp(regex);
1549
+ }
1550
+ function fileMatches$1(pattern, filePath) {
1551
+ const normalizedPattern = pattern.replaceAll("\\", "/");
1552
+ if (normalizedPattern === filePath) return true;
1553
+ return globToRegex$1(normalizedPattern).test(filePath);
1554
+ }
1555
+ function isManualInputEvalTargeted(params) {
1556
+ const { evalSummary, args } = params;
1557
+ const hasEvalIds = args.evalIds.length > 0;
1558
+ const hasFiles = args.files.length > 0;
1559
+ const hasCaseIds = args.caseIds.length > 0;
1560
+ if (hasEvalIds && !args.evalIds.includes(evalSummary.id)) return false;
1561
+ if (hasFiles) {
1562
+ if (!args.files.some((file) => fileMatches$1(file, evalSummary.filePath))) return false;
1563
+ }
1564
+ if (!hasEvalIds && !hasFiles) {
1565
+ if (hasCaseIds) return false;
1566
+ return true;
1567
+ }
1568
+ return true;
1569
+ }
1570
+ async function readInputFileMap(inputFilePath) {
1571
+ const readResult = await resultify(() => readFile(inputFilePath, "utf-8"));
1572
+ if (readResult.error) return {
1573
+ error: `Failed to read --input-file at ${inputFilePath}: ${readResult.error.message}`,
1574
+ value: null
1575
+ };
1576
+ const parseResult = resultify(() => JSON.parse(readResult.value));
1577
+ if (parseResult.error) return {
1578
+ error: `Failed to parse --input-file at ${inputFilePath} as JSON: ${parseResult.error.message}`,
1579
+ value: null
1580
+ };
1581
+ return {
1582
+ error: null,
1583
+ value: parseResult.value
1584
+ };
1585
+ }
1586
+ async function normalizeManualInputFileValue(params) {
1587
+ if (isManualInputFileValue(params.value)) return {
1588
+ error: null,
1589
+ value: params.value
1590
+ };
1591
+ if (!isPathInputObject(params.value)) return {
1592
+ error: null,
1593
+ value: params.value
1594
+ };
1595
+ const pathInput = params.value;
1596
+ const staged = await resultify(() => stageManualInputFileFromPath({
1597
+ workspaceRoot: params.workspaceRoot,
1598
+ path: pathInput.path,
1599
+ name: pathInput.name,
1600
+ mimeType: pathInput.mimeType
1601
+ }));
1602
+ if (staged.error) return {
1603
+ error: `Failed to stage file input "${params.fieldKey}" for eval "${params.evalId}": ${staged.error.message}`,
1604
+ value: null
1605
+ };
1606
+ return {
1607
+ error: null,
1608
+ value: staged.value
1609
+ };
1610
+ }
1611
+ async function normalizeManualInputValue(params) {
1612
+ const descriptor = params.evalSummary.manualInput;
1613
+ if (!descriptor || !isPlainObject(params.value)) return {
1614
+ error: null,
1615
+ value: params.value
1616
+ };
1617
+ const next = { ...params.value };
1618
+ for (const field of descriptor.fields) {
1619
+ if (field.kind !== "file") continue;
1620
+ const normalized = await normalizeManualInputFileValue({
1621
+ workspaceRoot: params.workspaceRoot,
1622
+ evalId: params.evalSummary.id,
1623
+ fieldKey: field.key,
1624
+ value: next[field.key]
1625
+ });
1626
+ if (normalized.error !== null) return {
1627
+ error: normalized.error,
1628
+ value: null
1629
+ };
1630
+ next[field.key] = normalized.value;
1631
+ }
1632
+ return {
1633
+ error: null,
1634
+ value: next
1635
+ };
1636
+ }
1637
+ /**
1638
+ * Resolve the `manualInputs` payload to send with `runner.startRun`.
1639
+ *
1640
+ * Inspects every discovered eval that declares `manualInput`, filters them to
1641
+ * the run target, and either returns the typed map (single eval via `--input`,
1642
+ * multiple via `--input-file`) or a structured error to display and exit on.
1643
+ */
1644
+ async function collectManualInputs(params) {
1645
+ const { runner, args } = params;
1646
+ const workspaceRoot = runner.getWorkspaceRoot();
1647
+ const targetedManualInputEvals = runner.getEvals().filter((evalSummary) => evalSummary.manualInput !== void 0).filter((evalSummary) => isManualInputEvalTargeted({
1648
+ evalSummary,
1649
+ args
1650
+ }));
1651
+ if (targetedManualInputEvals.length === 0) {
1652
+ if (args.inputJson !== void 0 || args.inputFilePath !== void 0) return {
1653
+ error: "--input/--input-file was provided but no targeted eval requires manual input.",
1654
+ value: null
1655
+ };
1656
+ return {
1657
+ error: null,
1658
+ value: void 0
1659
+ };
1660
+ }
1661
+ if (args.inputJson !== void 0 && args.inputFilePath !== void 0) return {
1662
+ error: "Cannot use --input and --input-file together; choose one.",
1663
+ value: null
1664
+ };
1665
+ if (args.inputJson !== void 0) {
1666
+ if (targetedManualInputEvals.length > 1) {
1667
+ const ids = targetedManualInputEvals.map((evalSummary) => evalSummary.id).join(", ");
1668
+ return {
1669
+ error: `--input only works for one targeted manual-input eval at a time; got ${String(targetedManualInputEvals.length)} (${ids}). Use --input-file with a JSON object keyed by eval key.`,
1670
+ value: null
1671
+ };
1672
+ }
1673
+ const parsedResult = resultify(() => JSON.parse(args.inputJson ?? ""));
1674
+ if (parsedResult.error) return {
1675
+ error: `Failed to parse --input as JSON: ${parsedResult.error.message}`,
1676
+ value: null
1677
+ };
1678
+ const [onlyEval] = targetedManualInputEvals;
1679
+ if (onlyEval === void 0) return {
1680
+ error: null,
1681
+ value: void 0
1682
+ };
1683
+ const normalized = await normalizeManualInputValue({
1684
+ workspaceRoot,
1685
+ evalSummary: onlyEval,
1686
+ value: parsedResult.value
1687
+ });
1688
+ if (normalized.error !== null) return {
1689
+ error: normalized.error,
1690
+ value: null
1691
+ };
1692
+ return {
1693
+ error: null,
1694
+ value: { [onlyEval.key]: normalized.value }
1695
+ };
1696
+ }
1697
+ if (args.inputFilePath !== void 0) {
1698
+ const fileResult = await readInputFileMap(args.inputFilePath);
1699
+ if (fileResult.error !== null) return {
1700
+ error: fileResult.error,
1701
+ value: null
1702
+ };
1703
+ if (!isPlainObject(fileResult.value)) return {
1704
+ error: `--input-file must contain a JSON object keyed by eval key (got ${typeof fileResult.value}).`,
1705
+ value: null
1706
+ };
1707
+ const map = {};
1708
+ for (const evalSummary of targetedManualInputEvals) {
1709
+ const byKey = fileResult.value[evalSummary.key];
1710
+ const byId = fileResult.value[evalSummary.id];
1711
+ const value = byKey !== void 0 ? byKey : byId;
1712
+ if (value === void 0) return {
1713
+ error: `--input-file is missing manual input for eval "${evalSummary.id}" (key "${evalSummary.key}").`,
1714
+ value: null
1715
+ };
1716
+ const normalized = await normalizeManualInputValue({
1717
+ workspaceRoot,
1718
+ evalSummary,
1719
+ value
1720
+ });
1721
+ if (normalized.error !== null) return {
1722
+ error: normalized.error,
1723
+ value: null
1724
+ };
1725
+ map[evalSummary.key] = normalized.value;
1726
+ }
1727
+ return {
1728
+ error: null,
1729
+ value: map
1730
+ };
1731
+ }
1732
+ return {
1733
+ error: `Eval(s) require manual input but no --input/--input-file was provided: ${targetedManualInputEvals.map((evalSummary) => evalSummary.id).join(", ")}`,
1734
+ value: null
1735
+ };
1736
+ }
1737
+ //#endregion
918
1738
  //#region src/cli.ts
919
1739
  function parseArgs(argv) {
920
1740
  const normalizedArgv = argv.filter((arg) => arg !== "--no-env");
@@ -934,7 +1754,9 @@ function parseArgs(argv) {
934
1754
  cacheMode: "use",
935
1755
  clearCache: false,
936
1756
  all: false,
937
- loadEnv: normalizedArgv.length === argv.length
1757
+ loadEnv: normalizedArgv.length === argv.length,
1758
+ inputJson: void 0,
1759
+ inputFilePath: void 0
938
1760
  };
939
1761
  const command = normalizedArgv[0];
940
1762
  if (command === "--help" || command === "-h") {
@@ -981,7 +1803,13 @@ function parseArgs(argv) {
981
1803
  } else if (arg === "--no-cache") args.cacheMode = "bypass";
982
1804
  else if (arg === "--refresh-cache") args.cacheMode = "refresh";
983
1805
  else if (arg === "--clear-cache") args.clearCache = true;
984
- else if (arg === "--all") args.all = true;
1806
+ else if (arg === "--input" && next !== void 0) {
1807
+ args.inputJson = next;
1808
+ i++;
1809
+ } else if (arg === "--input-file" && next !== void 0) {
1810
+ args.inputFilePath = next;
1811
+ i++;
1812
+ } else if (arg === "--all") args.all = true;
985
1813
  else if (!arg.startsWith("-")) args.positionals.push(arg);
986
1814
  }
987
1815
  return args;
@@ -1112,8 +1940,8 @@ async function commandApp(args) {
1112
1940
  const { serve } = await import("@hono/node-server");
1113
1941
  const bundledWebDist = resolve(currentDir, "apps/web/dist");
1114
1942
  if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
1115
- const appModule = await import("./app-mBbAN-Gt.mjs");
1116
- const runnerModule = await import("./runner-BQn_xf36.mjs");
1943
+ const appModule = await import("./app-D6-msfKP.mjs");
1944
+ const runnerModule = await import("./runner-Bq1f9B9d.mjs");
1117
1945
  if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
1118
1946
  if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
1119
1947
  await runnerModule.initRunner();
@@ -1184,10 +2012,26 @@ async function commandRun(args) {
1184
2012
  mode: "evalIds",
1185
2013
  files: args.files
1186
2014
  } : { mode: "all" };
2015
+ const manualInputsResult = await collectManualInputs({
2016
+ runner,
2017
+ args: {
2018
+ evalIds: args.evalIds,
2019
+ files: args.files,
2020
+ caseIds: args.caseIds,
2021
+ inputJson: args.inputJson,
2022
+ inputFilePath: args.inputFilePath
2023
+ }
2024
+ });
2025
+ if (manualInputsResult.error !== null) {
2026
+ console.error(manualInputsResult.error);
2027
+ process.exit(1);
2028
+ return;
2029
+ }
1187
2030
  const run = await runner.startRun({
1188
2031
  target,
1189
2032
  trials: args.trials,
1190
- cache: { mode: args.cacheMode }
2033
+ cache: { mode: args.cacheMode },
2034
+ manualInputs: manualInputsResult.value
1191
2035
  });
1192
2036
  if (!args.json) {
1193
2037
  console.info(`Run started: ${run.manifest.id}`);
@@ -1405,122 +2249,5 @@ async function waitForRunCompletion(runner, runId) {
1405
2249
  check();
1406
2250
  });
1407
2251
  }
1408
- function printHelp(topic = "global") {
1409
- if (topic === "app") {
1410
- console.info(`
1411
- agent-evals app - Start server with UI
1412
-
1413
- Usage:
1414
- agent-evals app [flags]
1415
-
1416
- Flags:
1417
- --port <n> Server port (default: 4100)
1418
- --no-env Disable automatic .env loading
1419
- --help, -h Show this help
1420
- `);
1421
- return;
1422
- }
1423
- if (topic === "list") {
1424
- console.info(`
1425
- agent-evals list - List discovered evals
1426
-
1427
- Usage:
1428
- agent-evals list [flags]
1429
-
1430
- Flags:
1431
- --no-env Disable automatic .env loading
1432
- --help, -h Show this help
1433
- `);
1434
- return;
1435
- }
1436
- if (topic === "run") {
1437
- console.info(`
1438
- agent-evals run - Run evals
1439
-
1440
- Usage:
1441
- agent-evals run [flags]
1442
-
1443
- Flags:
1444
- --eval <id> Run specific eval(s) (comma-separated)
1445
- --file <path|glob> Run eval files matching path/glob (comma-separated)
1446
- --case <id> Run case(s); combine with --file/--eval if ambiguous
1447
- --trials <n> Number of trials per case
1448
- --inspect[=host:port] Run with the Node.js inspector enabled
1449
- --inspect-brk[=host:port] Enable inspector and pause before startup
1450
- --json Output run summary as JSON
1451
- --cache <use|bypass|refresh> Cache mode for this run (default: use)
1452
- --no-cache Shortcut for --cache bypass
1453
- --refresh-cache Shortcut for --cache refresh
1454
- --clear-cache Clear the cache before starting the run
1455
- --no-env Disable automatic .env loading
1456
- --help, -h Show this help
1457
- `);
1458
- return;
1459
- }
1460
- if (topic === "show-runs") {
1461
- console.info(`
1462
- agent-evals show-runs - Show saved run artifact file paths
1463
-
1464
- Usage:
1465
- agent-evals show-runs [<run-id>|latest] [--json]
1466
-
1467
- Prints the run directory and stable artifact paths for run.json, summary.json,
1468
- cases.jsonl, case detail JSON, and trace JSON files. Run ids can be full
1469
- timestamp ids, short ids such as r0, or latest.
1470
-
1471
- Flags:
1472
- --json Output the file index as JSON
1473
- --no-env Disable automatic .env loading
1474
- --help, -h Show this help
1475
- `);
1476
- return;
1477
- }
1478
- if (topic === "cache" || topic === "cache list" || topic === "cache clear") {
1479
- console.info(`
1480
- agent-evals cache - Manage cached operation entries
1481
-
1482
- Usage:
1483
- agent-evals cache list [flags]
1484
- agent-evals cache clear --eval <id>
1485
- agent-evals cache clear --all
1486
-
1487
- Flags:
1488
- --eval <id> Clear entries for specific eval(s) (comma-separated)
1489
- --all Confirm clearing every cached entry
1490
- --json Output cache listing as JSON
1491
- --no-env Disable automatic .env loading
1492
- --help, -h Show this help
1493
- `);
1494
- return;
1495
- }
1496
- console.info(`
1497
- agent-evals - LLM/Agent eval runner
1498
-
1499
- Commands:
1500
- app Start server with UI
1501
- list List discovered evals
1502
- run Run evals
1503
- show-runs [id|latest] Show saved run artifact file paths
1504
- cache list List cached operation entries
1505
- cache clear --eval <id> Clear cache entries for one eval
1506
- cache clear --all Clear every cached entry
1507
- help Show this help
1508
-
1509
- Options:
1510
- --eval <id> Run specific eval(s) (comma-separated)
1511
- --case <id> Run specific case(s) (comma-separated)
1512
- --trials <n> Number of trials per case
1513
- --inspect[=host:port] Run with the Node.js inspector enabled
1514
- --inspect-brk[=host:port] Enable inspector and pause before startup
1515
- --json Output results as JSON
1516
- --port <n> Server port (default: 4100)
1517
- --cache <use|bypass|refresh> Cache mode for this run (default: use)
1518
- --no-cache Shortcut for --cache bypass
1519
- --refresh-cache Shortcut for --cache refresh
1520
- --clear-cache Clear the cache before starting the run
1521
- --no-env Disable automatic .env loading
1522
- --help, -h Show help
1523
- `);
1524
- }
1525
2252
  //#endregion
1526
- export { createRunner as n, runCli as t };
2253
+ export { materializeManualInputFiles as a, isManualInputFileValue as i, createRunner as n, stageManualInputFile as o, cleanupStagedManualInputFiles as r, stageManualInputFileFromPath as s, runCli as t };