@ls-stack/agent-eval 0.27.1 → 0.29.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-CJj1yPPD.mjs → app-D6-msfKP.mjs} +45 -6
- package/dist/apps/web/dist/assets/index-BCr6J8Uj.js +118 -0
- package/dist/apps/web/dist/assets/index-DjUTm3M-.css +1 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/{cli-Clf8xUFa.mjs → cli-CIc_gBNM.mjs} +965 -167
- package/dist/index.d.mts +5828 -3368
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +4 -2
- package/dist/{runOrchestration-FEvBwwJI.mjs → runOrchestration-CIARrLs6.mjs} +1046 -228
- package/dist/{runner-zqKwTlNj.mjs → runner-1F8MeY5V.mjs} +2 -2
- package/dist/{runner-KbDKLSU4.mjs → runner-Bq1f9B9d.mjs} +1 -1
- package/dist/src-CkWT1iSu.mjs +3 -0
- package/package.json +2 -29
- package/skills/agent-eval/SKILL.md +104 -20
- package/dist/apps/web/dist/assets/index-6YqV9t4k.js +0 -118
- package/dist/apps/web/dist/assets/index-C-OiMSQD.css +0 -1
- package/dist/bin.d.mts +0 -1
- package/dist/runChild.d.mts +0 -1
- package/dist/src-BBwT7_cy.mjs +0 -3
|
@@ -1,25 +1,149 @@
|
|
|
1
|
-
import { B as
|
|
2
|
-
import { createHash } from "node:crypto";
|
|
3
|
-
import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
|
|
4
|
-
import { dirname, join, relative, resolve } from "node:path";
|
|
1
|
+
import { B as getEvalDisplayStatus, C as loadConfig, D as createFsCacheStore, E as validateCharts, G as runSummarySchema, L as applyDerivedCallAttributes, S as resolveEvalDefaultConfig, T as normalizeScoreDef, V as deriveScopedSummaryFromCases, _ as buildManualInputDescriptor, _t as getCaseRowEvalKey, a as getLastRunStatuses, b as loadEvalModule, c as loadPersistedRunSnapshots, d as persistRunState, dt as resolveLlmCallsConfig, f as recomputeEvalStatusesInRuns, g as resolveArtifactPath, gt as getCaseRowCaseKey, h as resolveTracePresentation, ht as buildEvalKey, i as generateRunId, l as nextShortIdFromSnapshots, m as runTouchesEval, n as getTargetEvalKeys, o as getLatestRunInfos, p as recomputePersistedCaseStatus, s as loadPersistedRunSnapshot, u as persistCaseDetail, ur as getEvalRegistry, ut as resolveApiCallsConfig, v as parseManualInputValues, w as buildDeclaredColumnDefs, x as parseEvalDiscovery, y as deriveEvalFreshness, z as getEvalTitle } from "./runOrchestration-CIARrLs6.mjs";
|
|
2
|
+
import { createHash, randomUUID } from "node:crypto";
|
|
3
|
+
import { copyFile, mkdir, readFile, rm, writeFile } from "node:fs/promises";
|
|
4
|
+
import { basename, dirname, extname, isAbsolute, join, relative, resolve, sep } from "node:path";
|
|
5
5
|
import { watch } from "chokidar";
|
|
6
6
|
import { glob } from "glob";
|
|
7
7
|
import { existsSync } from "node:fs";
|
|
8
8
|
import { resultify } from "t-result";
|
|
9
9
|
import { fileURLToPath } from "node:url";
|
|
10
10
|
import { spawn, spawnSync } from "node:child_process";
|
|
11
|
+
//#region ../runner/src/configReload.ts
|
|
12
|
+
/**
 * Coordinates idle-only reloads for `agent-evals.config.ts` in app mode.
 *
 * A chokidar watcher on `<cwd>/agent-evals.config.ts` debounces change events
 * (50 ms). When a change fires while runs are active the reload is parked as
 * `"pending"`; otherwise the config watcher and the runner watchers are closed
 * and `loadRunnerState()` is awaited. Every state transition is broadcast via
 * `emitToDiscoveryListeners` as a `config.reload` event.
 *
 * @param {object} deps
 * @param {() => number} deps.getActiveRunCount - Number of runs currently in flight.
 * @param {() => Promise<void>} deps.closeRunnerWatchers - Closes the runner's own watchers.
 * @param {() => Promise<void>} deps.loadRunnerState - Rebuilds runner state from the config.
 * @param {(event: object) => void} deps.emitToDiscoveryListeners - Event sink.
 * @returns {{ close: Function, currentState: Function, reloadIfPendingAndIdle: Function, setupWatcher: Function }}
 */
function createConfigReloadController({ getActiveRunCount, closeRunnerWatchers, loadRunnerState, emitToDiscoveryListeners }) {
	let watcher;
	let reloadTimer;
	let reloadPromise;
	let state = {
		status: "idle",
		activeRunCount: 0,
		lastChangedAt: null,
		lastReloadedAt: null
	};
	/** Snapshot of the reload state with a freshly sampled active-run count. */
	function currentState() {
		return {
			...state,
			activeRunCount: getActiveRunCount()
		};
	}
	function emitReloadEvent() {
		emitToDiscoveryListeners({
			type: "config.reload",
			timestamp: new Date().toISOString(),
			payload: currentState()
		});
	}
	/** Merge a patch into the state and notify listeners. */
	function setState(patch) {
		state = {
			...state,
			...patch,
			activeRunCount: getActiveRunCount()
		};
		emitReloadEvent();
	}
	/** Cancel any debounced reload and close the config watcher. */
	async function close() {
		if (reloadTimer !== void 0) {
			clearTimeout(reloadTimer);
			reloadTimer = void 0;
		}
		const watcherToClose = watcher;
		watcher = void 0;
		if (watcherToClose !== void 0) await watcherToClose.close();
	}
	async function reloadConfigNow(changedAt) {
		const wasWatching = watcher !== void 0;
		setState({
			status: "reloading",
			lastChangedAt: changedAt
		});
		try {
			await close();
			await closeRunnerWatchers();
			await loadRunnerState();
		} catch (error) {
			// Never leave the status stuck at "reloading"; keep "pending" if a
			// newer change was queued while this attempt was in flight.
			setState({ status: state.status === "pending" ? "pending" : "idle" });
			// The config watcher was closed above. If the failed load did not
			// re-arm it, do so here so a corrected config file is still noticed.
			if (wasWatching && watcher === void 0) await setupWatcher();
			throw error;
		}
		// A change that arrived during the reload flipped the status to
		// "pending"; preserve it so the waiting caller triggers another reload
		// instead of the change being silently dropped.
		const changedDuringReload = state.status === "pending";
		setState({
			status: changedDuringReload ? "pending" : "idle",
			lastChangedAt: changedDuringReload ? state.lastChangedAt : changedAt,
			lastReloadedAt: new Date().toISOString()
		});
	}
	async function reloadConfig(changedAt) {
		if (reloadPromise !== void 0) {
			// A reload is already running: queue this change and retry after it.
			setState({
				status: "pending",
				lastChangedAt: changedAt
			});
			await reloadPromise;
			await reloadIfPendingAndIdle();
			return;
		}
		reloadPromise = reloadConfigNow(changedAt);
		try {
			await reloadPromise;
		} finally {
			reloadPromise = void 0;
		}
	}
	async function handleConfigChanged() {
		const changedAt = new Date().toISOString();
		if (getActiveRunCount() > 0) {
			setState({
				status: "pending",
				lastChangedAt: changedAt
			});
			return;
		}
		await reloadConfig(changedAt);
	}
	/** Run a parked reload once no runs are active; no-op otherwise. */
	async function reloadIfPendingAndIdle() {
		if (state.status !== "pending") return;
		if (getActiveRunCount() > 0) {
			state = currentState();
			return;
		}
		await reloadConfig(state.lastChangedAt ?? new Date().toISOString());
	}
	/** Start watching the config file; resolves once the watcher is ready. */
	async function setupWatcher() {
		const nextWatcher = watch(resolve(process.cwd(), "agent-evals.config.ts"), {
			awaitWriteFinish: {
				stabilityThreshold: 100,
				pollInterval: 20
			},
			ignoreInitial: true,
			persistent: true
		});
		watcher = nextWatcher;
		const scheduleReload = () => {
			if (reloadTimer !== void 0) clearTimeout(reloadTimer);
			reloadTimer = setTimeout(() => {
				reloadTimer = void 0;
				// The timer callback cannot await, so handle the rejection
				// explicitly rather than leaving a floating promise.
				handleConfigChanged().catch((error) => {
					console.error("Failed to reload agent-evals.config.ts", error);
				});
			}, 50);
		};
		nextWatcher.on("change", scheduleReload);
		nextWatcher.on("add", scheduleReload);
		nextWatcher.on("unlink", scheduleReload);
		await new Promise((ready) => {
			nextWatcher.once("ready", ready);
		});
	}
	return {
		close,
		currentState,
		reloadIfPendingAndIdle,
		setupWatcher
	};
}
|
|
134
|
+
//#endregion
|
|
11
135
|
//#region ../runner/src/evalSummaries.ts
|
|
12
136
|
/** Build the API/UI summary payload for one discovered eval. */
|
|
13
137
|
function buildEvalSummary(params) {
|
|
14
138
|
const { meta, config, gitState, latestRun, lastRunStatus } = params;
|
|
15
|
-
const { sourceFingerprint, ...summaryMeta } = meta;
|
|
139
|
+
const { sourceFingerprint, manualInputDescriptor, requiresManualInput, ...summaryMeta } = meta;
|
|
16
140
|
const freshness = deriveEvalFreshness({
|
|
17
141
|
latestRun,
|
|
18
142
|
gitState,
|
|
19
143
|
currentEvalSourceFingerprint: sourceFingerprint,
|
|
20
144
|
staleAfterDays: config.staleAfterDays ?? 14
|
|
21
145
|
});
|
|
22
|
-
|
|
146
|
+
const summary = {
|
|
23
147
|
...summaryMeta,
|
|
24
148
|
stale: freshness.stale,
|
|
25
149
|
outdated: freshness.outdated,
|
|
@@ -29,6 +153,8 @@ function buildEvalSummary(params) {
|
|
|
29
153
|
currentCommitSha: gitState.commitSha,
|
|
30
154
|
lastRunStatus
|
|
31
155
|
};
|
|
156
|
+
if (manualInputDescriptor && requiresManualInput) summary.manualInput = manualInputDescriptor;
|
|
157
|
+
return summary;
|
|
32
158
|
}
|
|
33
159
|
/** Write one latest-run snapshot to each targeted eval id. */
|
|
34
160
|
function setLatestRunInfoMap(params) {
|
|
@@ -60,6 +186,343 @@ function readGitWorktreeState(workspaceRoot) {
|
|
|
60
186
|
return { commitSha: commitResult.status === 0 ? commitResult.stdout : null };
|
|
61
187
|
}
|
|
62
188
|
//#endregion
|
|
189
|
+
//#region ../runner/src/manualInput/discovery.ts
|
|
190
|
+
/**
 * Classify an eval's `manualInput` declaration at discovery time.
 *
 * Returns `{ kind: "none" }` when the eval declares no manual input, an
 * `{ kind: "issue" }` result when it also declares `cases` or when
 * `buildManualInputDescriptor` reports an error for its schema, and otherwise
 * `{ kind: "ok" }` carrying the wire descriptor and the authored config.
 */
function resolveManualInputDiscovery(params) {
	const { evalDef, evalId, relativeFilePath } = params;
	const manualInput = evalDef.manualInput;
	if (!manualInput) return { kind: "none" };
	// NOTE(review): both failure modes are reported with the same issue type;
	// confirm whether the unsupported-schema case should have its own type.
	const asIssue = (message) => ({
		kind: "issue",
		issue: {
			type: "manual-input-with-cases",
			severity: "error",
			filePath: relativeFilePath,
			evalId,
			message
		}
	});
	if (evalDef.cases !== void 0) {
		return asIssue(`Eval "${evalId}" in ${relativeFilePath} declares both "cases" and "manualInput". Remove one of them.`);
	}
	const descriptorResult = buildManualInputDescriptor(manualInput);
	if (descriptorResult.error) {
		return asIssue(`Eval "${evalId}" in ${relativeFilePath} has an unsupported manualInput schema: ${descriptorResult.error.message}`);
	}
	return {
		kind: "ok",
		requiresManualInput: true,
		descriptor: descriptorResult.value,
		config: manualInput
	};
}
|
|
226
|
+
//#endregion
|
|
227
|
+
//#region ../runner/src/manualInput/files.ts
|
|
228
|
+
/** Workspace-relative directory where uploaded manual-input files are staged. */
const stagedUploadDir = ".agent-evals/manual-input-uploads";
/** Fallback MIME types keyed by lower-case file extension (leading dot included). */
const mimeTypeByExtension = {
	// Images
	".gif": "image/gif",
	".jpeg": "image/jpeg",
	".jpg": "image/jpeg",
	".png": "image/png",
	".svg": "image/svg+xml",
	".webp": "image/webp",
	// Documents and text
	".json": "application/json",
	".md": "text/markdown",
	".pdf": "application/pdf",
	".txt": "text/plain"
};
|
|
241
|
+
/** Express `filePath` relative to `workspaceRoot`, using forward slashes only. */
function toWorkspaceRelativePath(params) {
	const { workspaceRoot, filePath } = params;
	const nativeRelative = relative(workspaceRoot, filePath);
	return nativeRelative.split("\\").join("/");
}
|
|
244
|
+
/**
 * Report whether `filePath` is the workspace root itself or lies beneath it.
 *
 * The relative path escapes the workspace only when it is absolute (e.g. a
 * different drive) or when its first segment is exactly `..`. A plain
 * `startsWith("..")` test would also reject legitimate entries whose name
 * merely begins with two dots, such as `..cache/file`.
 *
 * @param {{ workspaceRoot: string, filePath: string }} params
 * @returns {boolean}
 */
function isInsideWorkspace(params) {
	const rel = relative(params.workspaceRoot, params.filePath);
	if (rel === "") return true;
	if (isAbsolute(rel)) return false;
	return rel !== ".." && !rel.startsWith(`..${sep}`);
}
|
|
248
|
+
/**
 * Trim `value` and collapse every run of characters outside `A-Za-z0-9._-`
 * into a single dash; an empty result becomes `"file"`.
 */
function sanitizeSegment(value) {
	const collapsed = value.trim().replace(/[^A-Za-z0-9._-]+/g, "-");
	return collapsed === "" ? "file" : collapsed;
}
|
|
252
|
+
/**
 * Sanitize a file name via `sanitizeSegment`, then replace every dot in the
 * stem with a dash so that only the final extension keeps its dot.
 */
function sanitizeFileName(value) {
	const safeName = sanitizeSegment(value);
	const ext = extname(safeName);
	if (ext === "") return safeName;
	const stem = safeName.slice(0, safeName.length - ext.length);
	return `${stem.split(".").join("-")}${ext}`;
}
|
|
258
|
+
/**
 * Pick a MIME type: the trimmed `params.mimeType` when it is non-empty,
 * otherwise the table entry for the lower-cased extension of `params.name`,
 * otherwise the empty string.
 */
function inferMimeType(params) {
	const declared = params.mimeType?.trim();
	if (declared) return declared;
	const extension = extname(params.name).toLowerCase();
	return mimeTypeByExtension[extension] ?? "";
}
|
|
263
|
+
/** Return the SHA-256 digest of `bytes` as a lower-case hex string. */
function hashBytes(bytes) {
	const hasher = createHash("sha256");
	hasher.update(bytes);
	return hasher.digest("hex");
}
|
|
266
|
+
/** True for non-null, non-array values whose `typeof` is `"object"`. */
function isRecord$1(value) {
	if (value === null || Array.isArray(value)) return false;
	return typeof value === "object";
}
|
|
269
|
+
/**
 * Structural check for manual-input file metadata: a plain (non-array) object
 * with string `name`, `mimeType`, `sha256`, `path` and a numeric `sizeBytes`.
 */
function isManualInputFileValue(value) {
	if (typeof value !== "object" || value === null || Array.isArray(value)) return false;
	const expectedTypes = [
		["name", "string"],
		["mimeType", "string"],
		["sizeBytes", "number"],
		["sha256", "string"],
		["path", "string"]
	];
	return expectedTypes.every(([field, type]) => typeof value[field] === type);
}
|
|
273
|
+
/**
 * Report whether a workspace-relative path points at the staging directory or
 * at something inside it.
 *
 * The path is normalized before comparison. A purely textual prefix check
 * would accept `<staged>/../../src/x`, and the materialization step removes
 * the source file whenever this predicate is true, so an un-normalized match
 * could delete files outside the staging area. Normalization also lets
 * `./<staged>/file` be recognised.
 *
 * @param {string} path - Workspace-relative path to test.
 * @param {string} [stagedDir=stagedUploadDir] - Staging directory, forward-slash separated.
 * @returns {boolean}
 */
function isStagedManualInputPath(path, stagedDir = stagedUploadDir) {
	// `join` with a single argument normalizes `.`/`..` segments; convert the
	// platform separator back to `/` so the comparison is platform-independent.
	const normalized = join(path).split(sep).join("/");
	return normalized === stagedDir || normalized.startsWith(`${stagedDir}/`);
}
|
|
276
|
+
/**
 * Write uploaded manual-input bytes into the workspace staging directory and
 * return the JSON-safe metadata (`name`, `mimeType`, `sizeBytes`, `sha256`,
 * workspace-relative `path`) describing the staged file.
 *
 * The staged file name is `<time36>-<uuid>__<sha256[0..12]>__<sanitized name>`.
 */
async function stageManualInputFile({ workspaceRoot, bytes, name, mimeType }) {
	const safeFileName = sanitizeFileName(name || "uploaded-file");
	const digest = hashBytes(bytes);
	const uploadDir = resolve(workspaceRoot, stagedUploadDir);
	await mkdir(uploadDir, { recursive: true });
	const uniquePrefix = `${Date.now().toString(36)}-${randomUUID()}`;
	const stagedPath = join(uploadDir, [uniquePrefix, digest.slice(0, 12), safeFileName].join("__"));
	await writeFile(stagedPath, bytes);
	const resolvedMimeType = inferMimeType({
		mimeType,
		name: safeFileName
	});
	const relativePath = toWorkspaceRelativePath({
		workspaceRoot,
		filePath: stagedPath
	});
	return {
		name: name || safeFileName,
		mimeType: resolvedMimeType,
		sizeBytes: bytes.byteLength,
		sha256: digest,
		path: relativePath
	};
}
|
|
301
|
+
/**
 * Read the file at `path` (absolute, or relative to `workspaceRoot`) and stage
 * its contents as a manual-input file. The display name defaults to the
 * source file's base name.
 */
async function stageManualInputFileFromPath({ workspaceRoot, path, name, mimeType }) {
	const sourcePath = isAbsolute(path) ? resolve(path) : resolve(workspaceRoot, path);
	const displayName = name ?? basename(sourcePath);
	const fileBuffer = await readFile(sourcePath);
	const staged = await stageManualInputFile({
		workspaceRoot,
		bytes: new Uint8Array(fileBuffer),
		name: displayName,
		mimeType: inferMimeType({
			mimeType,
			name: displayName
		})
	});
	return staged;
}
|
|
316
|
+
/**
 * Copy one manual-input file into `<runDir>/artifacts` and return its metadata
 * with `path` rewritten to the artifact location (workspace-relative).
 *
 * Throws when the resolved source path is outside the workspace. When the
 * original path is a staged upload, the staged source is removed afterwards
 * on a best-effort basis.
 */
async function materializeOneManualInputFile(params) {
	const { workspaceRoot, runId, runDir, value } = params;
	const sourcePath = resolve(workspaceRoot, value.path);
	const staysInWorkspace = isInsideWorkspace({
		workspaceRoot,
		filePath: sourcePath
	});
	if (!staysInWorkspace) throw new Error(`Manual input file path escapes workspace: ${value.path}`);
	const bytes = new Uint8Array(await readFile(sourcePath));
	const sha256 = hashBytes(bytes);
	const fileName = sanitizeFileName(value.name || basename(sourcePath));
	const artifactsDir = join(runDir, "artifacts");
	const artifactId = `${sanitizeSegment(runId)}__manual-input__${sha256.slice(0, 12)}__${fileName}`;
	const targetPath = join(artifactsDir, artifactId);
	await mkdir(artifactsDir, { recursive: true });
	if (sourcePath !== targetPath) await copyFile(sourcePath, targetPath);
	// Removal failures are ignored: the result of `resultify` is discarded.
	if (isStagedManualInputPath(value.path)) await resultify(() => rm(sourcePath, { force: true }));
	const resolvedMimeType = inferMimeType({
		mimeType: value.mimeType,
		name: value.name || fileName
	});
	return {
		name: value.name,
		mimeType: resolvedMimeType,
		sizeBytes: bytes.byteLength,
		sha256,
		path: toWorkspaceRelativePath({
			workspaceRoot,
			filePath: targetPath
		})
	};
}
|
|
349
|
+
/**
 * Recursively walk an arbitrary JSON-like value, materializing every
 * manual-input file reference found. Arrays and plain objects are rebuilt
 * with their children processed concurrently; any other value is returned
 * unchanged.
 */
async function materializeUnknownValue(params) {
	const { workspaceRoot, runId, runDir, value } = params;
	const shared = {
		workspaceRoot,
		runId,
		runDir
	};
	if (isManualInputFileValue(value)) {
		return await materializeOneManualInputFile({
			...shared,
			value
		});
	}
	const recurse = (child) => materializeUnknownValue({
		...shared,
		value: child
	});
	if (Array.isArray(value)) return await Promise.all(value.map((entry) => recurse(entry)));
	if (!isRecord$1(value)) return value;
	const pairs = await Promise.all(Object.entries(value).map(async ([key, child]) => [key, await recurse(child)]));
	return Object.fromEntries(pairs);
}
|
|
375
|
+
/**
 * Materialize every manual-input file reference inside `value` into the run's
 * artifact directory. Never throws: returns `{ error, value: null }` with the
 * failure message, or `{ error: null, value }` with the rewritten value.
 */
async function materializeManualInputFiles({ workspaceRoot, runId, runDir, value }) {
	const outcome = await resultify(() => materializeUnknownValue({
		workspaceRoot,
		runId,
		runDir,
		value
	}));
	return outcome.error ? {
		error: outcome.error.message,
		value: null
	} : {
		error: null,
		value: outcome.value
	};
}
|
|
395
|
+
/**
 * Delete the whole manual-input staging directory under `workspaceRoot`.
 * Best effort: any failure reported by `resultify` is discarded.
 */
async function cleanupStagedManualInputFiles(workspaceRoot) {
	const stagingDir = resolve(workspaceRoot, stagedUploadDir);
	const removalOptions = {
		force: true,
		recursive: true
	};
	await resultify(() => rm(stagingDir, removalOptions));
}
|
|
402
|
+
//#endregion
|
|
403
|
+
//#region ../runner/src/manualInput/validation.ts
|
|
404
|
+
/**
 * Decide whether `evalMeta` is selected by a run target. A non-empty
 * `evalKeys` list must contain the eval's key and a non-empty `evalIds` list
 * must contain its id; absent or empty lists impose no restriction.
 */
function evalIsTargeted(evalMeta, target) {
	const { evalKeys, evalIds } = target;
	const excludedByKey = evalKeys?.length > 0 && !evalKeys.includes(evalMeta.key);
	const excludedById = evalIds?.length > 0 && !evalIds.includes(evalMeta.id);
	return !(excludedByKey || excludedById);
}
|
|
413
|
+
/**
 * Check the `manualInputs` map of a run request against every targeted eval
 * that requires manual input.
 *
 * For each such eval the entry keyed by the eval key must exist, a schema
 * config must be registered in `manualInputConfigs`, and
 * `parseManualInputValues` must accept the raw value. Returns
 * `{ ok: false, failures }` if anything failed, otherwise
 * `{ ok: true, parsed }` with the parsed values keyed by eval key.
 */
function validateManualInputsForRequest(params) {
	const { evalMetas, manualInputConfigs, request } = params;
	const failures = [];
	const parsed = {};
	const recordFailure = (evalMeta, reason, issues) => {
		failures.push({
			evalKey: evalMeta.key,
			evalId: evalMeta.id,
			reason,
			issues
		});
	};
	for (const evalMeta of evalMetas) {
		const needsValidation = evalMeta.requiresManualInput && evalIsTargeted(evalMeta, request.target);
		if (!needsValidation) continue;
		const rawValue = request.manualInputs?.[evalMeta.key];
		if (rawValue === void 0) {
			recordFailure(evalMeta, "missing", [{
				path: "",
				message: `manualInputs is missing an entry for "${evalMeta.key}"`
			}]);
			continue;
		}
		const config = manualInputConfigs.get(evalMeta.key);
		if (!config) {
			recordFailure(evalMeta, "invalid", [{
				path: "",
				message: "manualInput schema is unavailable; reload the workspace and try again"
			}]);
			continue;
		}
		const result = parseManualInputValues(config, rawValue);
		if (result.error) recordFailure(evalMeta, "invalid", result.error.issues);
		else parsed[evalMeta.key] = result.value;
	}
	return failures.length > 0 ? {
		ok: false,
		failures
	} : {
		ok: true,
		parsed
	};
}
|
|
474
|
+
//#endregion
|
|
475
|
+
//#region ../runner/src/recalculateDerivedAttributes.ts
|
|
476
|
+
/**
 * Choose the artifact file id for a case row of an existing run: the row's
 * case key when another row in the run has the same `caseId` but a different
 * case key, otherwise the plain `caseId`.
 */
function getCaseArtifactFileIdForExistingRun(runState, caseRow) {
	const ownCaseKey = getCaseRowCaseKey(caseRow);
	const caseIdIsShared = runState.cases.some((other) => {
		return other.caseId === caseRow.caseId && getCaseRowCaseKey(other) !== ownCaseKey;
	});
	if (caseIdIsShared) return ownCaseKey;
	return caseRow.caseId;
}
|
|
480
|
+
/**
 * Re-apply derived call attributes (and, when the eval is still registered,
 * trace presentation) to one case of a finished run, then persist the result.
 *
 * Returns `{ updated: false, reason }` when the run is still running or the
 * case / its detail cannot be found. Otherwise it updates `run.caseDetails`,
 * rewrites the case's trace JSON under `<runDir>/traces`, calls
 * `params.persistCaseDetail`, and returns `{ updated: true, caseDetail }`.
 */
async function recalculateDerivedAttributesForCase(params) {
	const { run, caseId } = params;
	const skipped = (reason) => ({
		updated: false,
		reason
	});
	if (run.manifest.status === "running") return skipped("Run is still running");
	// `caseId` may be either the composite case key or the bare case id.
	const caseRow = run.cases.find((row) => getCaseRowCaseKey(row) === caseId || row.caseId === caseId);
	if (!caseRow) return skipped("Case not found");
	const caseKey = getCaseRowCaseKey(caseRow);
	const caseDetail = run.caseDetails.get(caseKey);
	if (!caseDetail) return skipped("Case detail not found");
	const derivedSpans = applyDerivedCallAttributes({
		spans: caseDetail.trace,
		llmCallsConfig: params.llmCallsConfig,
		apiCallsConfig: params.apiCallsConfig
	});
	// Defaults used when the eval is no longer discoverable or registered.
	let trace = derivedSpans;
	let traceDisplay = caseDetail.traceDisplay;
	const evalMeta = params.evals.get(getCaseRowEvalKey(caseRow));
	const registryEntry = evalMeta === void 0 ? void 0 : getEvalRegistry().get(evalMeta.id);
	if (registryEntry !== void 0) {
		registryEntry.use((evalDef) => {
			const presentation = resolveTracePresentation(derivedSpans, params.traceDisplayConfig, evalDef.traceDisplay);
			trace = presentation.trace;
			traceDisplay = presentation.traceDisplay;
		});
	}
	const nextCaseDetail = {
		...caseDetail,
		trace,
		traceDisplay
	};
	run.caseDetails.set(caseKey, nextCaseDetail);
	const artifactFileId = getCaseArtifactFileIdForExistingRun(run, caseRow);
	const tracePath = join(run.runDir, "traces", `${encodeURIComponent(artifactFileId)}.json`);
	await writeFile(tracePath, JSON.stringify(nextCaseDetail.trace, null, 2));
	await params.persistCaseDetail(run.runDir, nextCaseDetail, artifactFileId);
	return {
		updated: true,
		caseDetail: nextCaseDetail
	};
}
|
|
525
|
+
//#endregion
|
|
63
526
|
//#region ../runner/src/runChildProtocol.ts
|
|
64
527
|
function isRunChildMessage(value) {
|
|
65
528
|
if (typeof value !== "object" || value === null) return false;
|
|
@@ -251,7 +714,7 @@ async function markRunTerminalFromChild(runState, event, managerContext) {
|
|
|
251
714
|
managerContext.emitDiscoveryEvent();
|
|
252
715
|
}
|
|
253
716
|
//#endregion
|
|
254
|
-
//#region ../runner/src/
|
|
717
|
+
//#region ../runner/src/watchRoots.ts
|
|
255
718
|
const globMagicCharacters = new Set([
|
|
256
719
|
"*",
|
|
257
720
|
"?",
|
|
@@ -285,6 +748,11 @@ function getWatchRootsForIncludePatterns(params) {
|
|
|
285
748
|
if (roots.size === 0) return [params.workspaceRoot];
|
|
286
749
|
return [...roots];
|
|
287
750
|
}
|
|
751
|
+
//#endregion
|
|
752
|
+
//#region ../runner/src/runner.ts
|
|
753
|
+
/** True when `value` is a non-null object that is not an array. */
function isRecord(value) {
	const isObjectLike = typeof value === "object" && value !== null;
	return isObjectLike && !Array.isArray(value);
}
|
|
288
756
|
/** Create an in-memory eval runner bound to the current workspace config. */
|
|
289
757
|
function createRunner({ watchForChanges = true } = {}) {
|
|
290
758
|
let config;
|
|
@@ -294,6 +762,7 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
294
762
|
let llmCallsConfig = resolveLlmCallsConfig(void 0);
|
|
295
763
|
let apiCallsConfig = resolveApiCallsConfig(void 0);
|
|
296
764
|
const evals = /* @__PURE__ */ new Map();
|
|
765
|
+
const manualInputConfigs = /* @__PURE__ */ new Map();
|
|
297
766
|
let discoveryIssues = [];
|
|
298
767
|
const runs = /* @__PURE__ */ new Map();
|
|
299
768
|
const lastRunStatusMap = /* @__PURE__ */ new Map();
|
|
@@ -304,6 +773,12 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
304
773
|
let runHistoryWatcher;
|
|
305
774
|
let discoveryRefreshTimer;
|
|
306
775
|
let runHistoryRefreshTimer;
|
|
776
|
+
const configReload = createConfigReloadController({
|
|
777
|
+
getActiveRunCount,
|
|
778
|
+
closeRunnerWatchers: closeWatchers,
|
|
779
|
+
loadRunnerState,
|
|
780
|
+
emitToDiscoveryListeners
|
|
781
|
+
});
|
|
307
782
|
function toWorkspaceRelativePath(filePath) {
|
|
308
783
|
return relative(workspaceRoot, filePath).replaceAll("\\", "/");
|
|
309
784
|
}
|
|
@@ -321,22 +796,7 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
321
796
|
}
|
|
322
797
|
const runner = {
|
|
323
798
|
async init() {
|
|
324
|
-
|
|
325
|
-
workspaceRoot = config.workspaceRoot ?? process.cwd();
|
|
326
|
-
localStateDir = resolve(workspaceRoot, ".agent-evals");
|
|
327
|
-
llmCallsConfig = resolveLlmCallsConfig(config.llmCalls);
|
|
328
|
-
apiCallsConfig = resolveApiCallsConfig(config.apiCalls);
|
|
329
|
-
await mkdir(localStateDir, { recursive: true });
|
|
330
|
-
await mkdir(join(localStateDir, "runs"), { recursive: true });
|
|
331
|
-
cacheStore = createFsCacheStore({
|
|
332
|
-
workspaceRoot,
|
|
333
|
-
dir: config.cache?.dir,
|
|
334
|
-
maxEntriesPerNamespace: config.cache?.maxEntriesPerNamespace ?? config.cache?.maxEntriesPerEval,
|
|
335
|
-
maxEntriesByNamespace: config.cache?.maxEntriesByNamespace
|
|
336
|
-
});
|
|
337
|
-
await loadPersistedRuns();
|
|
338
|
-
await runner.refreshDiscovery();
|
|
339
|
-
if (watchForChanges) await setupWatcher();
|
|
799
|
+
await loadRunnerState();
|
|
340
800
|
},
|
|
341
801
|
async listCache() {
|
|
342
802
|
return cacheStore.list();
|
|
@@ -373,6 +833,22 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
373
833
|
emitDiscoveryEvent();
|
|
374
834
|
return { updatedRuns };
|
|
375
835
|
},
|
|
836
|
+
async recalculateDerivedAttributesForCase({ runId, caseId }) {
|
|
837
|
+
const run = runs.get(runId);
|
|
838
|
+
if (!run) return {
|
|
839
|
+
updated: false,
|
|
840
|
+
reason: "Run not found"
|
|
841
|
+
};
|
|
842
|
+
return recalculateDerivedAttributesForCase({
|
|
843
|
+
run,
|
|
844
|
+
caseId,
|
|
845
|
+
llmCallsConfig,
|
|
846
|
+
apiCallsConfig,
|
|
847
|
+
traceDisplayConfig: config.traceDisplay,
|
|
848
|
+
evals,
|
|
849
|
+
persistCaseDetail
|
|
850
|
+
});
|
|
851
|
+
},
|
|
376
852
|
async cleanRunsForEval(evalKey) {
|
|
377
853
|
const evalMeta = resolveEvalMeta(evalKey);
|
|
378
854
|
let deletedRuns = 0;
|
|
@@ -466,6 +942,13 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
466
942
|
emitDiscoveryEvent();
|
|
467
943
|
return { deleted: true };
|
|
468
944
|
},
|
|
945
|
+
validateManualInputs(request) {
|
|
946
|
+
return validateManualInputsForRequest({
|
|
947
|
+
evalMetas: getSortedEvalMetas(),
|
|
948
|
+
manualInputConfigs,
|
|
949
|
+
request
|
|
950
|
+
});
|
|
951
|
+
},
|
|
469
952
|
getEvals() {
|
|
470
953
|
const gitState = readGitWorktreeState(workspaceRoot);
|
|
471
954
|
const result = [];
|
|
@@ -492,6 +975,9 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
492
975
|
getDiscoveryIssues() {
|
|
493
976
|
return discoveryIssues;
|
|
494
977
|
},
|
|
978
|
+
getConfigReloadState() {
|
|
979
|
+
return configReload.currentState();
|
|
980
|
+
},
|
|
495
981
|
async refreshDiscovery() {
|
|
496
982
|
const patterns = config.include;
|
|
497
983
|
const discovered = [];
|
|
@@ -503,6 +989,7 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
503
989
|
discovered.push(...files);
|
|
504
990
|
}
|
|
505
991
|
evals.clear();
|
|
992
|
+
manualInputConfigs.clear();
|
|
506
993
|
discoveryIssues = [];
|
|
507
994
|
for (const filePath of discovered) try {
|
|
508
995
|
const content = await readFile(filePath, "utf-8");
|
|
@@ -526,9 +1013,14 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
526
1013
|
let columnDefs = buildDeclaredColumnDefs(void 0, void 0, void 0);
|
|
527
1014
|
let stats;
|
|
528
1015
|
let charts;
|
|
1016
|
+
let manualInputDescriptor;
|
|
1017
|
+
let requiresManualInput = false;
|
|
1018
|
+
const relativeFilePath = toWorkspaceRelativePath(meta.filePath);
|
|
529
1019
|
discoveredEntry?.use((evalDef) => {
|
|
530
1020
|
const defaultConfig = resolveEvalDefaultConfig({
|
|
531
1021
|
evalDef,
|
|
1022
|
+
globalColumns: config.columns,
|
|
1023
|
+
globalStats: config.stats,
|
|
532
1024
|
globalRemove: config.removeDefaultConfig
|
|
533
1025
|
});
|
|
534
1026
|
columnDefs = buildDeclaredColumnDefs(defaultConfig.columns, evalDef.scores, evalDef.manualScores);
|
|
@@ -540,8 +1032,25 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
540
1032
|
});
|
|
541
1033
|
for (const warning of validated.warnings) console.warn(warning);
|
|
542
1034
|
charts = validated.charts;
|
|
1035
|
+
const manualInputResult = resolveManualInputDiscovery({
|
|
1036
|
+
evalDef,
|
|
1037
|
+
evalId: meta.id,
|
|
1038
|
+
relativeFilePath
|
|
1039
|
+
});
|
|
1040
|
+
if (manualInputResult.kind === "issue") {
|
|
1041
|
+
discoveryIssues.push(manualInputResult.issue);
|
|
1042
|
+
requiresManualInput = true;
|
|
1043
|
+
return;
|
|
1044
|
+
}
|
|
1045
|
+
if (manualInputResult.kind === "ok") {
|
|
1046
|
+
requiresManualInput = manualInputResult.requiresManualInput;
|
|
1047
|
+
manualInputDescriptor = manualInputResult.descriptor;
|
|
1048
|
+
manualInputConfigs.set(buildEvalKey({
|
|
1049
|
+
filePath: relativeFilePath,
|
|
1050
|
+
evalId: meta.id
|
|
1051
|
+
}), manualInputResult.config);
|
|
1052
|
+
}
|
|
543
1053
|
});
|
|
544
|
-
const relativeFilePath = toWorkspaceRelativePath(meta.filePath);
|
|
545
1054
|
const key = buildEvalKey({
|
|
546
1055
|
filePath: relativeFilePath,
|
|
547
1056
|
evalId: meta.id
|
|
@@ -556,7 +1065,9 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
556
1065
|
columnDefs,
|
|
557
1066
|
caseCount: null,
|
|
558
1067
|
stats,
|
|
559
|
-
charts
|
|
1068
|
+
charts,
|
|
1069
|
+
manualInputDescriptor,
|
|
1070
|
+
requiresManualInput
|
|
560
1071
|
});
|
|
561
1072
|
}
|
|
562
1073
|
} catch {}
|
|
@@ -602,11 +1113,27 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
602
1113
|
childProcess: void 0,
|
|
603
1114
|
childTerminalReceived: false
|
|
604
1115
|
};
|
|
1116
|
+
await mkdir(runDir, { recursive: true });
|
|
1117
|
+
await mkdir(join(runDir, "traces"), { recursive: true });
|
|
1118
|
+
await mkdir(join(runDir, "artifacts"), { recursive: true });
|
|
1119
|
+
await mkdir(join(runDir, "case-details"), { recursive: true });
|
|
1120
|
+
const materializedRequest = { ...request };
|
|
1121
|
+
if (request.manualInputs !== void 0) {
|
|
1122
|
+
const materialized = await materializeManualInputFiles({
|
|
1123
|
+
workspaceRoot,
|
|
1124
|
+
runId,
|
|
1125
|
+
runDir,
|
|
1126
|
+
value: request.manualInputs
|
|
1127
|
+
});
|
|
1128
|
+
if (materialized.error !== null) throw new Error(materialized.error);
|
|
1129
|
+
if (!isRecord(materialized.value)) throw new Error("Materialized manual inputs must be an object");
|
|
1130
|
+
materializedRequest.manualInputs = materialized.value;
|
|
1131
|
+
}
|
|
605
1132
|
runs.set(runId, runState);
|
|
606
1133
|
setLatestRunInfoMap({
|
|
607
1134
|
latestRunInfoMap,
|
|
608
1135
|
evalIds: getTargetEvalKeys({
|
|
609
|
-
request,
|
|
1136
|
+
request: materializedRequest,
|
|
610
1137
|
sortedEvals: getSortedEvalMetas()
|
|
611
1138
|
}),
|
|
612
1139
|
info: {
|
|
@@ -616,13 +1143,9 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
616
1143
|
evalSourceFingerprint: null
|
|
617
1144
|
}
|
|
618
1145
|
});
|
|
619
|
-
await mkdir(runDir, { recursive: true });
|
|
620
|
-
await mkdir(join(runDir, "traces"), { recursive: true });
|
|
621
|
-
await mkdir(join(runDir, "artifacts"), { recursive: true });
|
|
622
|
-
await mkdir(join(runDir, "case-details"), { recursive: true });
|
|
623
1146
|
await writeFile(join(runDir, "run.json"), JSON.stringify(manifest, null, 2));
|
|
624
1147
|
const childContext = {
|
|
625
|
-
request,
|
|
1148
|
+
request: materializedRequest,
|
|
626
1149
|
workspaceRoot,
|
|
627
1150
|
runDir,
|
|
628
1151
|
manifest,
|
|
@@ -705,18 +1228,7 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
705
1228
|
};
|
|
706
1229
|
},
|
|
707
1230
|
async close() {
|
|
708
|
-
if (discoveryRefreshTimer !== void 0) {
|
|
709
|
-
clearTimeout(discoveryRefreshTimer);
|
|
710
|
-
discoveryRefreshTimer = void 0;
|
|
711
|
-
}
|
|
712
|
-
if (runHistoryRefreshTimer !== void 0) {
|
|
713
|
-
clearTimeout(runHistoryRefreshTimer);
|
|
714
|
-
runHistoryRefreshTimer = void 0;
|
|
715
|
-
}
|
|
716
|
-
const watchers = [discoveryWatcher, runHistoryWatcher].filter((watcher) => watcher !== void 0);
|
|
717
|
-
discoveryWatcher = void 0;
|
|
718
|
-
runHistoryWatcher = void 0;
|
|
719
|
-
await Promise.all(watchers.map((watcher) => watcher.close()));
|
|
1231
|
+
await Promise.all([closeWatchers(), configReload.close()]);
|
|
720
1232
|
},
|
|
721
1233
|
getWorkspaceRoot() {
|
|
722
1234
|
return workspaceRoot;
|
|
@@ -734,6 +1246,39 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
734
1246
|
return resolveArtifactPath(join(localStateDir, "runs"), artifactId_);
|
|
735
1247
|
}
|
|
736
1248
|
};
|
|
1249
|
+
async function loadRunnerState() {
|
|
1250
|
+
config = await loadConfig();
|
|
1251
|
+
workspaceRoot = config.workspaceRoot ?? process.cwd();
|
|
1252
|
+
localStateDir = resolve(workspaceRoot, ".agent-evals");
|
|
1253
|
+
llmCallsConfig = resolveLlmCallsConfig(config.llmCalls);
|
|
1254
|
+
apiCallsConfig = resolveApiCallsConfig(config.apiCalls);
|
|
1255
|
+
await mkdir(localStateDir, { recursive: true });
|
|
1256
|
+
await mkdir(join(localStateDir, "runs"), { recursive: true });
|
|
1257
|
+
await cleanupStagedManualInputFiles(workspaceRoot);
|
|
1258
|
+
cacheStore = createFsCacheStore({
|
|
1259
|
+
workspaceRoot,
|
|
1260
|
+
dir: config.cache?.dir,
|
|
1261
|
+
maxEntriesPerNamespace: config.cache?.maxEntriesPerNamespace ?? config.cache?.maxEntriesPerEval,
|
|
1262
|
+
maxEntriesByNamespace: config.cache?.maxEntriesByNamespace
|
|
1263
|
+
});
|
|
1264
|
+
await loadPersistedRuns();
|
|
1265
|
+
await runner.refreshDiscovery();
|
|
1266
|
+
if (watchForChanges) await setupWatcher();
|
|
1267
|
+
}
|
|
1268
|
+
async function closeWatchers() {
|
|
1269
|
+
if (discoveryRefreshTimer !== void 0) {
|
|
1270
|
+
clearTimeout(discoveryRefreshTimer);
|
|
1271
|
+
discoveryRefreshTimer = void 0;
|
|
1272
|
+
}
|
|
1273
|
+
if (runHistoryRefreshTimer !== void 0) {
|
|
1274
|
+
clearTimeout(runHistoryRefreshTimer);
|
|
1275
|
+
runHistoryRefreshTimer = void 0;
|
|
1276
|
+
}
|
|
1277
|
+
const watchers = [discoveryWatcher, runHistoryWatcher].filter((watcher) => watcher !== void 0);
|
|
1278
|
+
discoveryWatcher = void 0;
|
|
1279
|
+
runHistoryWatcher = void 0;
|
|
1280
|
+
await Promise.all(watchers.map((watcher) => watcher.close()));
|
|
1281
|
+
}
|
|
737
1282
|
async function setupWatcher() {
|
|
738
1283
|
const watcher = watch(getWatchRootsForIncludePatterns({
|
|
739
1284
|
patterns: config.include,
|
|
@@ -758,7 +1303,7 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
758
1303
|
watcher.on("unlink", scheduleRefresh);
|
|
759
1304
|
watcher.on("addDir", scheduleRefresh);
|
|
760
1305
|
watcher.on("unlinkDir", scheduleRefresh);
|
|
761
|
-
await setupRunHistoryWatcher();
|
|
1306
|
+
await Promise.all([setupRunHistoryWatcher(), configReload.setupWatcher()]);
|
|
762
1307
|
await watcherReady;
|
|
763
1308
|
}
|
|
764
1309
|
async function setupRunHistoryWatcher() {
|
|
@@ -783,6 +1328,9 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
783
1328
|
watcher.once("ready", ready);
|
|
784
1329
|
});
|
|
785
1330
|
}
|
|
1331
|
+
function getActiveRunCount() {
|
|
1332
|
+
return [...runs.values()].filter((run) => run.manifest.status === "running").length;
|
|
1333
|
+
}
|
|
786
1334
|
function emitDiscoveryEvent() {
|
|
787
1335
|
const lastRunStatuses = getLastRunStatuses({
|
|
788
1336
|
runs: runs.values(),
|
|
@@ -802,6 +1350,10 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
802
1350
|
payload: runner.getEvals()
|
|
803
1351
|
};
|
|
804
1352
|
for (const listener of discoveryListeners) listener(event);
|
|
1353
|
+
configReload.reloadIfPendingAndIdle();
|
|
1354
|
+
}
|
|
1355
|
+
function emitToDiscoveryListeners(event) {
|
|
1356
|
+
for (const listener of discoveryListeners) listener(event);
|
|
805
1357
|
}
|
|
806
1358
|
function emitEvent(runState, event) {
|
|
807
1359
|
for (const listener of runState.listeners) try {
|
|
@@ -844,6 +1396,345 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
844
1396
|
return runner;
|
|
845
1397
|
}
|
|
846
1398
|
//#endregion
|
|
1399
|
+
//#region src/cliHelp.ts
|
|
1400
|
+
/** Render the help block for a given CLI topic to stdout via `console.info`. */
|
|
1401
|
+
function printHelp(topic = "global") {
|
|
1402
|
+
if (topic === "app") {
|
|
1403
|
+
console.info(`
|
|
1404
|
+
agent-evals app - Start server with UI
|
|
1405
|
+
|
|
1406
|
+
Usage:
|
|
1407
|
+
agent-evals app [flags]
|
|
1408
|
+
|
|
1409
|
+
Flags:
|
|
1410
|
+
--port <n> Server port (default: 4100)
|
|
1411
|
+
--no-env Disable automatic .env loading
|
|
1412
|
+
--help, -h Show this help
|
|
1413
|
+
`);
|
|
1414
|
+
return;
|
|
1415
|
+
}
|
|
1416
|
+
if (topic === "list") {
|
|
1417
|
+
console.info(`
|
|
1418
|
+
agent-evals list - List discovered evals
|
|
1419
|
+
|
|
1420
|
+
Usage:
|
|
1421
|
+
agent-evals list [flags]
|
|
1422
|
+
|
|
1423
|
+
Flags:
|
|
1424
|
+
--no-env Disable automatic .env loading
|
|
1425
|
+
--help, -h Show this help
|
|
1426
|
+
`);
|
|
1427
|
+
return;
|
|
1428
|
+
}
|
|
1429
|
+
if (topic === "run") {
|
|
1430
|
+
console.info(`
|
|
1431
|
+
agent-evals run - Run evals
|
|
1432
|
+
|
|
1433
|
+
Usage:
|
|
1434
|
+
agent-evals run [flags]
|
|
1435
|
+
|
|
1436
|
+
Flags:
|
|
1437
|
+
--eval <id> Run specific eval(s) (comma-separated)
|
|
1438
|
+
--file <path|glob> Run eval files matching path/glob (comma-separated)
|
|
1439
|
+
--case <id> Run case(s); combine with --file/--eval if ambiguous
|
|
1440
|
+
--trials <n> Number of trials per case
|
|
1441
|
+
--inspect[=host:port] Run with the Node.js inspector enabled
|
|
1442
|
+
--inspect-brk[=host:port] Enable inspector and pause before startup
|
|
1443
|
+
--json Output run summary as JSON
|
|
1444
|
+
--cache <use|bypass|refresh> Cache mode for this run (default: use)
|
|
1445
|
+
--no-cache Shortcut for --cache bypass
|
|
1446
|
+
--refresh-cache Shortcut for --cache refresh
|
|
1447
|
+
--clear-cache Clear the cache before starting the run
|
|
1448
|
+
--input <json> Manual input value for a single targeted eval
|
|
1449
|
+
that declares manualInput
|
|
1450
|
+
--input-file <path> JSON object keyed by eval key (or eval id) with
|
|
1451
|
+
manual input values for one or more targeted evals
|
|
1452
|
+
--no-env Disable automatic .env loading
|
|
1453
|
+
--help, -h Show this help
|
|
1454
|
+
`);
|
|
1455
|
+
return;
|
|
1456
|
+
}
|
|
1457
|
+
if (topic === "show-runs") {
|
|
1458
|
+
console.info(`
|
|
1459
|
+
agent-evals show-runs - Show saved run artifact file paths
|
|
1460
|
+
|
|
1461
|
+
Usage:
|
|
1462
|
+
agent-evals show-runs [<run-id>|latest] [--json]
|
|
1463
|
+
|
|
1464
|
+
Prints the run directory and stable artifact paths for run.json, summary.json,
|
|
1465
|
+
cases.jsonl, case detail JSON, and trace JSON files. Run ids can be full
|
|
1466
|
+
timestamp ids, short ids such as r0, or latest.
|
|
1467
|
+
|
|
1468
|
+
Flags:
|
|
1469
|
+
--json Output the file index as JSON
|
|
1470
|
+
--no-env Disable automatic .env loading
|
|
1471
|
+
--help, -h Show this help
|
|
1472
|
+
`);
|
|
1473
|
+
return;
|
|
1474
|
+
}
|
|
1475
|
+
if (topic === "cache" || topic === "cache list" || topic === "cache clear") {
|
|
1476
|
+
console.info(`
|
|
1477
|
+
agent-evals cache - Manage cached operation entries
|
|
1478
|
+
|
|
1479
|
+
Usage:
|
|
1480
|
+
agent-evals cache list [flags]
|
|
1481
|
+
agent-evals cache clear --eval <id>
|
|
1482
|
+
agent-evals cache clear --all
|
|
1483
|
+
|
|
1484
|
+
Flags:
|
|
1485
|
+
--eval <id> Clear entries for specific eval(s) (comma-separated)
|
|
1486
|
+
--all Confirm clearing every cached entry
|
|
1487
|
+
--json Output cache listing as JSON
|
|
1488
|
+
--no-env Disable automatic .env loading
|
|
1489
|
+
--help, -h Show this help
|
|
1490
|
+
`);
|
|
1491
|
+
return;
|
|
1492
|
+
}
|
|
1493
|
+
console.info(`
|
|
1494
|
+
agent-evals - LLM/Agent eval runner
|
|
1495
|
+
|
|
1496
|
+
Commands:
|
|
1497
|
+
app Start server with UI
|
|
1498
|
+
list List discovered evals
|
|
1499
|
+
run Run evals
|
|
1500
|
+
show-runs [id|latest] Show saved run artifact file paths
|
|
1501
|
+
cache list List cached operation entries
|
|
1502
|
+
cache clear --eval <id> Clear cache entries for one eval
|
|
1503
|
+
cache clear --all Clear every cached entry
|
|
1504
|
+
help Show this help
|
|
1505
|
+
|
|
1506
|
+
Options:
|
|
1507
|
+
--eval <id> Run specific eval(s) (comma-separated)
|
|
1508
|
+
--case <id> Run specific case(s) (comma-separated)
|
|
1509
|
+
--trials <n> Number of trials per case
|
|
1510
|
+
--inspect[=host:port] Run with the Node.js inspector enabled
|
|
1511
|
+
--inspect-brk[=host:port] Enable inspector and pause before startup
|
|
1512
|
+
--json Output results as JSON
|
|
1513
|
+
--port <n> Server port (default: 4100)
|
|
1514
|
+
--cache <use|bypass|refresh> Cache mode for this run (default: use)
|
|
1515
|
+
--no-cache Shortcut for --cache bypass
|
|
1516
|
+
--refresh-cache Shortcut for --cache refresh
|
|
1517
|
+
--clear-cache Clear the cache before starting the run
|
|
1518
|
+
--no-env Disable automatic .env loading
|
|
1519
|
+
--help, -h Show help
|
|
1520
|
+
`);
|
|
1521
|
+
}
|
|
1522
|
+
//#endregion
|
|
1523
|
+
//#region src/manualInputArgs.ts
|
|
1524
|
+
function isPlainObject(value) {
|
|
1525
|
+
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
1526
|
+
}
|
|
1527
|
+
function isPathInputObject(value) {
|
|
1528
|
+
if (!isPlainObject(value)) return false;
|
|
1529
|
+
return typeof value.path === "string" && (value.name === void 0 || typeof value.name === "string") && (value.mimeType === void 0 || typeof value.mimeType === "string");
|
|
1530
|
+
}
|
|
1531
|
+
function escapeRegex$1(value) {
|
|
1532
|
+
return value.replace(/[|\\{}()[\]^$+?.]/g, "\\$&");
|
|
1533
|
+
}
|
|
1534
|
+
function globToRegex$1(pattern) {
|
|
1535
|
+
const normalized = pattern.replaceAll("\\", "/");
|
|
1536
|
+
let regex = "^";
|
|
1537
|
+
for (let i = 0; i < normalized.length; i++) {
|
|
1538
|
+
const char = normalized[i];
|
|
1539
|
+
const next = normalized[i + 1];
|
|
1540
|
+
if (char === "*" && next === "*") {
|
|
1541
|
+
regex += ".*";
|
|
1542
|
+
i++;
|
|
1543
|
+
} else if (char === "*") regex += "[^/]*";
|
|
1544
|
+
else if (char === "?") regex += "[^/]";
|
|
1545
|
+
else regex += escapeRegex$1(char ?? "");
|
|
1546
|
+
}
|
|
1547
|
+
regex += "$";
|
|
1548
|
+
return new RegExp(regex);
|
|
1549
|
+
}
|
|
1550
|
+
function fileMatches$1(pattern, filePath) {
|
|
1551
|
+
const normalizedPattern = pattern.replaceAll("\\", "/");
|
|
1552
|
+
if (normalizedPattern === filePath) return true;
|
|
1553
|
+
return globToRegex$1(normalizedPattern).test(filePath);
|
|
1554
|
+
}
|
|
1555
|
+
function isManualInputEvalTargeted(params) {
|
|
1556
|
+
const { evalSummary, args } = params;
|
|
1557
|
+
const hasEvalIds = args.evalIds.length > 0;
|
|
1558
|
+
const hasFiles = args.files.length > 0;
|
|
1559
|
+
const hasCaseIds = args.caseIds.length > 0;
|
|
1560
|
+
if (hasEvalIds && !args.evalIds.includes(evalSummary.id)) return false;
|
|
1561
|
+
if (hasFiles) {
|
|
1562
|
+
if (!args.files.some((file) => fileMatches$1(file, evalSummary.filePath))) return false;
|
|
1563
|
+
}
|
|
1564
|
+
if (!hasEvalIds && !hasFiles) {
|
|
1565
|
+
if (hasCaseIds) return false;
|
|
1566
|
+
return true;
|
|
1567
|
+
}
|
|
1568
|
+
return true;
|
|
1569
|
+
}
|
|
1570
|
+
async function readInputFileMap(inputFilePath) {
|
|
1571
|
+
const readResult = await resultify(() => readFile(inputFilePath, "utf-8"));
|
|
1572
|
+
if (readResult.error) return {
|
|
1573
|
+
error: `Failed to read --input-file at ${inputFilePath}: ${readResult.error.message}`,
|
|
1574
|
+
value: null
|
|
1575
|
+
};
|
|
1576
|
+
const parseResult = resultify(() => JSON.parse(readResult.value));
|
|
1577
|
+
if (parseResult.error) return {
|
|
1578
|
+
error: `Failed to parse --input-file at ${inputFilePath} as JSON: ${parseResult.error.message}`,
|
|
1579
|
+
value: null
|
|
1580
|
+
};
|
|
1581
|
+
return {
|
|
1582
|
+
error: null,
|
|
1583
|
+
value: parseResult.value
|
|
1584
|
+
};
|
|
1585
|
+
}
|
|
1586
|
+
async function normalizeManualInputFileValue(params) {
|
|
1587
|
+
if (isManualInputFileValue(params.value)) return {
|
|
1588
|
+
error: null,
|
|
1589
|
+
value: params.value
|
|
1590
|
+
};
|
|
1591
|
+
if (!isPathInputObject(params.value)) return {
|
|
1592
|
+
error: null,
|
|
1593
|
+
value: params.value
|
|
1594
|
+
};
|
|
1595
|
+
const pathInput = params.value;
|
|
1596
|
+
const staged = await resultify(() => stageManualInputFileFromPath({
|
|
1597
|
+
workspaceRoot: params.workspaceRoot,
|
|
1598
|
+
path: pathInput.path,
|
|
1599
|
+
name: pathInput.name,
|
|
1600
|
+
mimeType: pathInput.mimeType
|
|
1601
|
+
}));
|
|
1602
|
+
if (staged.error) return {
|
|
1603
|
+
error: `Failed to stage file input "${params.fieldKey}" for eval "${params.evalId}": ${staged.error.message}`,
|
|
1604
|
+
value: null
|
|
1605
|
+
};
|
|
1606
|
+
return {
|
|
1607
|
+
error: null,
|
|
1608
|
+
value: staged.value
|
|
1609
|
+
};
|
|
1610
|
+
}
|
|
1611
|
+
async function normalizeManualInputValue(params) {
|
|
1612
|
+
const descriptor = params.evalSummary.manualInput;
|
|
1613
|
+
if (!descriptor || !isPlainObject(params.value)) return {
|
|
1614
|
+
error: null,
|
|
1615
|
+
value: params.value
|
|
1616
|
+
};
|
|
1617
|
+
const next = { ...params.value };
|
|
1618
|
+
for (const field of descriptor.fields) {
|
|
1619
|
+
if (field.kind !== "file") continue;
|
|
1620
|
+
const normalized = await normalizeManualInputFileValue({
|
|
1621
|
+
workspaceRoot: params.workspaceRoot,
|
|
1622
|
+
evalId: params.evalSummary.id,
|
|
1623
|
+
fieldKey: field.key,
|
|
1624
|
+
value: next[field.key]
|
|
1625
|
+
});
|
|
1626
|
+
if (normalized.error !== null) return {
|
|
1627
|
+
error: normalized.error,
|
|
1628
|
+
value: null
|
|
1629
|
+
};
|
|
1630
|
+
next[field.key] = normalized.value;
|
|
1631
|
+
}
|
|
1632
|
+
return {
|
|
1633
|
+
error: null,
|
|
1634
|
+
value: next
|
|
1635
|
+
};
|
|
1636
|
+
}
|
|
1637
|
+
/**
|
|
1638
|
+
* Resolve the `manualInputs` payload to send with `runner.startRun`.
|
|
1639
|
+
*
|
|
1640
|
+
* Inspects every discovered eval that declares `manualInput`, filters them to
|
|
1641
|
+
* the run target, and either returns the typed map (single eval via `--input`,
|
|
1642
|
+
* multiple via `--input-file`) or a structured error to display and exit on.
|
|
1643
|
+
*/
|
|
1644
|
+
async function collectManualInputs(params) {
|
|
1645
|
+
const { runner, args } = params;
|
|
1646
|
+
const workspaceRoot = runner.getWorkspaceRoot();
|
|
1647
|
+
const targetedManualInputEvals = runner.getEvals().filter((evalSummary) => evalSummary.manualInput !== void 0).filter((evalSummary) => isManualInputEvalTargeted({
|
|
1648
|
+
evalSummary,
|
|
1649
|
+
args
|
|
1650
|
+
}));
|
|
1651
|
+
if (targetedManualInputEvals.length === 0) {
|
|
1652
|
+
if (args.inputJson !== void 0 || args.inputFilePath !== void 0) return {
|
|
1653
|
+
error: "--input/--input-file was provided but no targeted eval requires manual input.",
|
|
1654
|
+
value: null
|
|
1655
|
+
};
|
|
1656
|
+
return {
|
|
1657
|
+
error: null,
|
|
1658
|
+
value: void 0
|
|
1659
|
+
};
|
|
1660
|
+
}
|
|
1661
|
+
if (args.inputJson !== void 0 && args.inputFilePath !== void 0) return {
|
|
1662
|
+
error: "Cannot use --input and --input-file together; choose one.",
|
|
1663
|
+
value: null
|
|
1664
|
+
};
|
|
1665
|
+
if (args.inputJson !== void 0) {
|
|
1666
|
+
if (targetedManualInputEvals.length > 1) {
|
|
1667
|
+
const ids = targetedManualInputEvals.map((evalSummary) => evalSummary.id).join(", ");
|
|
1668
|
+
return {
|
|
1669
|
+
error: `--input only works for one targeted manual-input eval at a time; got ${String(targetedManualInputEvals.length)} (${ids}). Use --input-file with a JSON object keyed by eval key.`,
|
|
1670
|
+
value: null
|
|
1671
|
+
};
|
|
1672
|
+
}
|
|
1673
|
+
const parsedResult = resultify(() => JSON.parse(args.inputJson ?? ""));
|
|
1674
|
+
if (parsedResult.error) return {
|
|
1675
|
+
error: `Failed to parse --input as JSON: ${parsedResult.error.message}`,
|
|
1676
|
+
value: null
|
|
1677
|
+
};
|
|
1678
|
+
const [onlyEval] = targetedManualInputEvals;
|
|
1679
|
+
if (onlyEval === void 0) return {
|
|
1680
|
+
error: null,
|
|
1681
|
+
value: void 0
|
|
1682
|
+
};
|
|
1683
|
+
const normalized = await normalizeManualInputValue({
|
|
1684
|
+
workspaceRoot,
|
|
1685
|
+
evalSummary: onlyEval,
|
|
1686
|
+
value: parsedResult.value
|
|
1687
|
+
});
|
|
1688
|
+
if (normalized.error !== null) return {
|
|
1689
|
+
error: normalized.error,
|
|
1690
|
+
value: null
|
|
1691
|
+
};
|
|
1692
|
+
return {
|
|
1693
|
+
error: null,
|
|
1694
|
+
value: { [onlyEval.key]: normalized.value }
|
|
1695
|
+
};
|
|
1696
|
+
}
|
|
1697
|
+
if (args.inputFilePath !== void 0) {
|
|
1698
|
+
const fileResult = await readInputFileMap(args.inputFilePath);
|
|
1699
|
+
if (fileResult.error !== null) return {
|
|
1700
|
+
error: fileResult.error,
|
|
1701
|
+
value: null
|
|
1702
|
+
};
|
|
1703
|
+
if (!isPlainObject(fileResult.value)) return {
|
|
1704
|
+
error: `--input-file must contain a JSON object keyed by eval key (got ${typeof fileResult.value}).`,
|
|
1705
|
+
value: null
|
|
1706
|
+
};
|
|
1707
|
+
const map = {};
|
|
1708
|
+
for (const evalSummary of targetedManualInputEvals) {
|
|
1709
|
+
const byKey = fileResult.value[evalSummary.key];
|
|
1710
|
+
const byId = fileResult.value[evalSummary.id];
|
|
1711
|
+
const value = byKey !== void 0 ? byKey : byId;
|
|
1712
|
+
if (value === void 0) return {
|
|
1713
|
+
error: `--input-file is missing manual input for eval "${evalSummary.id}" (key "${evalSummary.key}").`,
|
|
1714
|
+
value: null
|
|
1715
|
+
};
|
|
1716
|
+
const normalized = await normalizeManualInputValue({
|
|
1717
|
+
workspaceRoot,
|
|
1718
|
+
evalSummary,
|
|
1719
|
+
value
|
|
1720
|
+
});
|
|
1721
|
+
if (normalized.error !== null) return {
|
|
1722
|
+
error: normalized.error,
|
|
1723
|
+
value: null
|
|
1724
|
+
};
|
|
1725
|
+
map[evalSummary.key] = normalized.value;
|
|
1726
|
+
}
|
|
1727
|
+
return {
|
|
1728
|
+
error: null,
|
|
1729
|
+
value: map
|
|
1730
|
+
};
|
|
1731
|
+
}
|
|
1732
|
+
return {
|
|
1733
|
+
error: `Eval(s) require manual input but no --input/--input-file was provided: ${targetedManualInputEvals.map((evalSummary) => evalSummary.id).join(", ")}`,
|
|
1734
|
+
value: null
|
|
1735
|
+
};
|
|
1736
|
+
}
|
|
1737
|
+
//#endregion
|
|
847
1738
|
//#region src/cli.ts
|
|
848
1739
|
function parseArgs(argv) {
|
|
849
1740
|
const normalizedArgv = argv.filter((arg) => arg !== "--no-env");
|
|
@@ -863,7 +1754,9 @@ function parseArgs(argv) {
|
|
|
863
1754
|
cacheMode: "use",
|
|
864
1755
|
clearCache: false,
|
|
865
1756
|
all: false,
|
|
866
|
-
loadEnv: normalizedArgv.length === argv.length
|
|
1757
|
+
loadEnv: normalizedArgv.length === argv.length,
|
|
1758
|
+
inputJson: void 0,
|
|
1759
|
+
inputFilePath: void 0
|
|
867
1760
|
};
|
|
868
1761
|
const command = normalizedArgv[0];
|
|
869
1762
|
if (command === "--help" || command === "-h") {
|
|
@@ -910,7 +1803,13 @@ function parseArgs(argv) {
|
|
|
910
1803
|
} else if (arg === "--no-cache") args.cacheMode = "bypass";
|
|
911
1804
|
else if (arg === "--refresh-cache") args.cacheMode = "refresh";
|
|
912
1805
|
else if (arg === "--clear-cache") args.clearCache = true;
|
|
913
|
-
else if (arg === "--all") args.all = true;
|
|
1806
|
+
else if (arg === "--input" && next !== void 0) {
|
|
1807
|
+
args.inputJson = next;
|
|
1808
|
+
i++;
|
|
1809
|
+
} else if (arg === "--input-file" && next !== void 0) {
|
|
1810
|
+
args.inputFilePath = next;
|
|
1811
|
+
i++;
|
|
1812
|
+
} else if (arg === "--all") args.all = true;
|
|
914
1813
|
else if (!arg.startsWith("-")) args.positionals.push(arg);
|
|
915
1814
|
}
|
|
916
1815
|
return args;
|
|
@@ -1041,8 +1940,8 @@ async function commandApp(args) {
|
|
|
1041
1940
|
const { serve } = await import("@hono/node-server");
|
|
1042
1941
|
const bundledWebDist = resolve(currentDir, "apps/web/dist");
|
|
1043
1942
|
if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
|
|
1044
|
-
const appModule = await import("./app-CJj1yPPD.mjs");
|
|
1045
|
-
const runnerModule = await import("./runner-KbDKLSU4.mjs");
|
|
1943
|
+
const appModule = await import("./app-D6-msfKP.mjs");
|
|
1944
|
+
const runnerModule = await import("./runner-Bq1f9B9d.mjs");
|
|
1046
1945
|
if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
|
|
1047
1946
|
if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
|
|
1048
1947
|
await runnerModule.initRunner();
|
|
@@ -1113,10 +2012,26 @@ async function commandRun(args) {
|
|
|
1113
2012
|
mode: "evalIds",
|
|
1114
2013
|
files: args.files
|
|
1115
2014
|
} : { mode: "all" };
|
|
2015
|
+
const manualInputsResult = await collectManualInputs({
|
|
2016
|
+
runner,
|
|
2017
|
+
args: {
|
|
2018
|
+
evalIds: args.evalIds,
|
|
2019
|
+
files: args.files,
|
|
2020
|
+
caseIds: args.caseIds,
|
|
2021
|
+
inputJson: args.inputJson,
|
|
2022
|
+
inputFilePath: args.inputFilePath
|
|
2023
|
+
}
|
|
2024
|
+
});
|
|
2025
|
+
if (manualInputsResult.error !== null) {
|
|
2026
|
+
console.error(manualInputsResult.error);
|
|
2027
|
+
process.exit(1);
|
|
2028
|
+
return;
|
|
2029
|
+
}
|
|
1116
2030
|
const run = await runner.startRun({
|
|
1117
2031
|
target,
|
|
1118
2032
|
trials: args.trials,
|
|
1119
|
-
cache: { mode: args.cacheMode }
|
|
2033
|
+
cache: { mode: args.cacheMode },
|
|
2034
|
+
manualInputs: manualInputsResult.value
|
|
1120
2035
|
});
|
|
1121
2036
|
if (!args.json) {
|
|
1122
2037
|
console.info(`Run started: ${run.manifest.id}`);
|
|
@@ -1334,122 +2249,5 @@ async function waitForRunCompletion(runner, runId) {
|
|
|
1334
2249
|
check();
|
|
1335
2250
|
});
|
|
1336
2251
|
}
|
|
1337
|
-
function printHelp(topic = "global") {
|
|
1338
|
-
if (topic === "app") {
|
|
1339
|
-
console.info(`
|
|
1340
|
-
agent-evals app - Start server with UI
|
|
1341
|
-
|
|
1342
|
-
Usage:
|
|
1343
|
-
agent-evals app [flags]
|
|
1344
|
-
|
|
1345
|
-
Flags:
|
|
1346
|
-
--port <n> Server port (default: 4100)
|
|
1347
|
-
--no-env Disable automatic .env loading
|
|
1348
|
-
--help, -h Show this help
|
|
1349
|
-
`);
|
|
1350
|
-
return;
|
|
1351
|
-
}
|
|
1352
|
-
if (topic === "list") {
|
|
1353
|
-
console.info(`
|
|
1354
|
-
agent-evals list - List discovered evals
|
|
1355
|
-
|
|
1356
|
-
Usage:
|
|
1357
|
-
agent-evals list [flags]
|
|
1358
|
-
|
|
1359
|
-
Flags:
|
|
1360
|
-
--no-env Disable automatic .env loading
|
|
1361
|
-
--help, -h Show this help
|
|
1362
|
-
`);
|
|
1363
|
-
return;
|
|
1364
|
-
}
|
|
1365
|
-
if (topic === "run") {
|
|
1366
|
-
console.info(`
|
|
1367
|
-
agent-evals run - Run evals
|
|
1368
|
-
|
|
1369
|
-
Usage:
|
|
1370
|
-
agent-evals run [flags]
|
|
1371
|
-
|
|
1372
|
-
Flags:
|
|
1373
|
-
--eval <id> Run specific eval(s) (comma-separated)
|
|
1374
|
-
--file <path|glob> Run eval files matching path/glob (comma-separated)
|
|
1375
|
-
--case <id> Run case(s); combine with --file/--eval if ambiguous
|
|
1376
|
-
--trials <n> Number of trials per case
|
|
1377
|
-
--inspect[=host:port] Run with the Node.js inspector enabled
|
|
1378
|
-
--inspect-brk[=host:port] Enable inspector and pause before startup
|
|
1379
|
-
--json Output run summary as JSON
|
|
1380
|
-
--cache <use|bypass|refresh> Cache mode for this run (default: use)
|
|
1381
|
-
--no-cache Shortcut for --cache bypass
|
|
1382
|
-
--refresh-cache Shortcut for --cache refresh
|
|
1383
|
-
--clear-cache Clear the cache before starting the run
|
|
1384
|
-
--no-env Disable automatic .env loading
|
|
1385
|
-
--help, -h Show this help
|
|
1386
|
-
`);
|
|
1387
|
-
return;
|
|
1388
|
-
}
|
|
1389
|
-
if (topic === "show-runs") {
|
|
1390
|
-
console.info(`
|
|
1391
|
-
agent-evals show-runs - Show saved run artifact file paths
|
|
1392
|
-
|
|
1393
|
-
Usage:
|
|
1394
|
-
agent-evals show-runs [<run-id>|latest] [--json]
|
|
1395
|
-
|
|
1396
|
-
Prints the run directory and stable artifact paths for run.json, summary.json,
|
|
1397
|
-
cases.jsonl, case detail JSON, and trace JSON files. Run ids can be full
|
|
1398
|
-
timestamp ids, short ids such as r0, or latest.
|
|
1399
|
-
|
|
1400
|
-
Flags:
|
|
1401
|
-
--json Output the file index as JSON
|
|
1402
|
-
--no-env Disable automatic .env loading
|
|
1403
|
-
--help, -h Show this help
|
|
1404
|
-
`);
|
|
1405
|
-
return;
|
|
1406
|
-
}
|
|
1407
|
-
if (topic === "cache" || topic === "cache list" || topic === "cache clear") {
|
|
1408
|
-
console.info(`
|
|
1409
|
-
agent-evals cache - Manage cached operation entries
|
|
1410
|
-
|
|
1411
|
-
Usage:
|
|
1412
|
-
agent-evals cache list [flags]
|
|
1413
|
-
agent-evals cache clear --eval <id>
|
|
1414
|
-
agent-evals cache clear --all
|
|
1415
|
-
|
|
1416
|
-
Flags:
|
|
1417
|
-
--eval <id> Clear entries for specific eval(s) (comma-separated)
|
|
1418
|
-
--all Confirm clearing every cached entry
|
|
1419
|
-
--json Output cache listing as JSON
|
|
1420
|
-
--no-env Disable automatic .env loading
|
|
1421
|
-
--help, -h Show this help
|
|
1422
|
-
`);
|
|
1423
|
-
return;
|
|
1424
|
-
}
|
|
1425
|
-
console.info(`
|
|
1426
|
-
agent-evals - LLM/Agent eval runner
|
|
1427
|
-
|
|
1428
|
-
Commands:
|
|
1429
|
-
app Start server with UI
|
|
1430
|
-
list List discovered evals
|
|
1431
|
-
run Run evals
|
|
1432
|
-
show-runs [id|latest] Show saved run artifact file paths
|
|
1433
|
-
cache list List cached operation entries
|
|
1434
|
-
cache clear --eval <id> Clear cache entries for one eval
|
|
1435
|
-
cache clear --all Clear every cached entry
|
|
1436
|
-
help Show this help
|
|
1437
|
-
|
|
1438
|
-
Options:
|
|
1439
|
-
--eval <id> Run specific eval(s) (comma-separated)
|
|
1440
|
-
--case <id> Run specific case(s) (comma-separated)
|
|
1441
|
-
--trials <n> Number of trials per case
|
|
1442
|
-
--inspect[=host:port] Run with the Node.js inspector enabled
|
|
1443
|
-
--inspect-brk[=host:port] Enable inspector and pause before startup
|
|
1444
|
-
--json Output results as JSON
|
|
1445
|
-
--port <n> Server port (default: 4100)
|
|
1446
|
-
--cache <use|bypass|refresh> Cache mode for this run (default: use)
|
|
1447
|
-
--no-cache Shortcut for --cache bypass
|
|
1448
|
-
--refresh-cache Shortcut for --cache refresh
|
|
1449
|
-
--clear-cache Clear the cache before starting the run
|
|
1450
|
-
--no-env Disable automatic .env loading
|
|
1451
|
-
--help, -h Show help
|
|
1452
|
-
`);
|
|
1453
|
-
}
|
|
1454
2252
|
//#endregion
|
|
1455
|
-
export { createRunner as n, runCli as t };
|
|
2253
|
+
export { materializeManualInputFiles as a, isManualInputFileValue as i, createRunner as n, stageManualInputFile as o, cleanupStagedManualInputFiles as r, stageManualInputFileFromPath as s, runCli as t };
|