@ls-stack/agent-eval 0.58.1 → 0.58.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-DhMIbjlE.mjs → app-BxD6aHbp.mjs} +52 -7
- package/dist/apps/web/dist/assets/index-BMWBZw_u.js +377 -0
- package/dist/apps/web/dist/assets/index-CHH7m5Cv.css +1 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/caseChild.mjs +2 -1
- package/dist/{cli-_g2qOMK6.mjs → cli-HBwXIJsg.mjs} +31 -5
- package/dist/index.d.mts +76 -17
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +2 -2
- package/dist/{runExecution-d42Lm0i5.mjs → runExecution-pHJ0_TzH.mjs} +125 -21
- package/dist/{runOrchestration-CvmFeOmT.mjs → runOrchestration-ngVXShH4.mjs} +73 -6
- package/dist/{runner-BKogjiYd.mjs → runner-BnZMGBla.mjs} +1 -1
- package/dist/{runner-MSr8sAWm.mjs → runner-D_pz2NON.mjs} +2 -2
- package/dist/{src-CdZsOn6y.mjs → src-AeXGBJ26.mjs} +2 -2
- package/package.json +3 -3
- package/skills/agent-eval/SKILL.md +18 -3
- package/dist/apps/web/dist/assets/index-Cz9p4l-t.js +0 -377
- package/dist/apps/web/dist/assets/index-DtARRwsS.css +0 -1
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import { o as stageManualInputFile } from "./cli-
|
|
3
|
-
import "./src-
|
|
4
|
-
import { t as getRunnerInstance } from "./runner-
|
|
1
|
+
import { Et as getCaseRowCaseKey, Ot as caseRowSchema, dt as getEvalTitle, nt as updateManualScoreRequestSchema, rt as extractCacheEntries, tt as createRunRequestSchema } from "./runExecution-pHJ0_TzH.mjs";
|
|
2
|
+
import { o as stageManualInputFile } from "./cli-HBwXIJsg.mjs";
|
|
3
|
+
import "./src-AeXGBJ26.mjs";
|
|
4
|
+
import { t as getRunnerInstance } from "./runner-D_pz2NON.mjs";
|
|
5
5
|
import { z } from "zod/v4";
|
|
6
6
|
import { readFile } from "node:fs/promises";
|
|
7
7
|
import { dirname, isAbsolute, join, relative, resolve, sep } from "node:path";
|
|
@@ -231,9 +231,24 @@ function logStartedAppRunEvals(params) {
|
|
|
231
231
|
const targetEvals = getRunTargetEvalSummaries(params.evals, params.target);
|
|
232
232
|
if (targetEvals.length === 0) return;
|
|
233
233
|
const label = targetEvals.length === 1 ? "eval" : "evals";
|
|
234
|
-
console.info(`[agent-evals]
|
|
234
|
+
console.info(`[agent-evals] Queued app run ${params.shortId} (${params.runId}) with ${String(targetEvals.length)} ${label}; concurrency ${String(params.concurrency)}:`);
|
|
235
235
|
for (const ev of targetEvals) console.info(` - ${getEvalTitle(ev)} (${ev.filePath}#${ev.id})`);
|
|
236
236
|
}
|
|
237
|
+
function getEvalSummaryLabel(evalsByKey, evalsById, evalKey, evalId) {
|
|
238
|
+
const summary = (evalKey === void 0 ? void 0 : evalsByKey.get(evalKey)) ?? evalsById.get(evalId);
|
|
239
|
+
if (summary === void 0) return evalId;
|
|
240
|
+
return `${getEvalTitle(summary)} (${summary.filePath}#${summary.id})`;
|
|
241
|
+
}
|
|
242
|
+
function getRunCaseLabel(caseId, caseKey) {
|
|
243
|
+
return caseKey === void 0 || caseKey === caseId ? caseId : `${caseId} [${caseKey}]`;
|
|
244
|
+
}
|
|
245
|
+
function formatCaseStartedLog(params) {
|
|
246
|
+
return [
|
|
247
|
+
`[agent-evals] Run ${params.shortId} started `,
|
|
248
|
+
`${String(params.activeCount)}/${String(params.concurrency)}: `,
|
|
249
|
+
`${params.evalLabel} / ${params.caseLabel}`
|
|
250
|
+
].join("");
|
|
251
|
+
}
|
|
237
252
|
function formatDurationMs(durationMs) {
|
|
238
253
|
if (durationMs === null) return "";
|
|
239
254
|
if (durationMs < 1e3) return ` in ${String(durationMs)}ms`;
|
|
@@ -247,8 +262,34 @@ function isTerminalRunEvent(eventType) {
|
|
|
247
262
|
return eventType === "run.finished" || eventType === "run.error" || eventType === "run.cancelled";
|
|
248
263
|
}
|
|
249
264
|
function subscribeToAppRunResultLog(params) {
|
|
265
|
+
const evalsByKey = new Map(params.evals.map((ev) => [ev.key, ev]));
|
|
266
|
+
const evalsById = new Map(params.evals.map((ev) => [ev.id, ev]));
|
|
267
|
+
const activeCases = /* @__PURE__ */ new Set();
|
|
268
|
+
const loggedStarts = /* @__PURE__ */ new Set();
|
|
250
269
|
let unsubscribe;
|
|
251
270
|
unsubscribe = params.runner.subscribe(params.runId, (event) => {
|
|
271
|
+
if (event.type === "case.started") {
|
|
272
|
+
const parsed = caseRowSchema.safeParse(event.payload);
|
|
273
|
+
if (!parsed.success) return;
|
|
274
|
+
const caseRow = parsed.data;
|
|
275
|
+
const caseKey = `${getCaseRowCaseKey(caseRow)}:${String(caseRow.trial)}`;
|
|
276
|
+
activeCases.add(caseKey);
|
|
277
|
+
if (loggedStarts.has(caseKey)) return;
|
|
278
|
+
loggedStarts.add(caseKey);
|
|
279
|
+
console.info(formatCaseStartedLog({
|
|
280
|
+
shortId: params.shortId,
|
|
281
|
+
activeCount: activeCases.size,
|
|
282
|
+
concurrency: params.concurrency,
|
|
283
|
+
evalLabel: getEvalSummaryLabel(evalsByKey, evalsById, caseRow.evalKey, caseRow.evalId),
|
|
284
|
+
caseLabel: getRunCaseLabel(caseRow.caseId, caseRow.caseKey)
|
|
285
|
+
}));
|
|
286
|
+
return;
|
|
287
|
+
}
|
|
288
|
+
if (event.type === "case.finished") {
|
|
289
|
+
const parsed = caseRowSchema.safeParse(event.payload);
|
|
290
|
+
if (parsed.success) activeCases.delete(`${getCaseRowCaseKey(parsed.data)}:${String(parsed.data.trial)}`);
|
|
291
|
+
return;
|
|
292
|
+
}
|
|
252
293
|
if (!isTerminalRunEvent(event.type)) return;
|
|
253
294
|
unsubscribe?.();
|
|
254
295
|
unsubscribe = void 0;
|
|
@@ -301,6 +342,7 @@ const runsRoutes = new Hono().get("/", (c) => {
|
|
|
301
342
|
failures: validation.failures
|
|
302
343
|
}, 400);
|
|
303
344
|
const evalsForTerminalLog = runner.getEvals();
|
|
345
|
+
const concurrency = runner.getConfiguredConcurrency();
|
|
304
346
|
const runResult = await resultify(() => runner.startRun(body));
|
|
305
347
|
if (runResult.error) return c.json({
|
|
306
348
|
error: "Failed to start run",
|
|
@@ -310,12 +352,15 @@ const runsRoutes = new Hono().get("/", (c) => {
|
|
|
310
352
|
runId: runResult.value.manifest.id,
|
|
311
353
|
shortId: runResult.value.manifest.shortId,
|
|
312
354
|
evals: evalsForTerminalLog,
|
|
313
|
-
target: body.target
|
|
355
|
+
target: body.target,
|
|
356
|
+
concurrency
|
|
314
357
|
});
|
|
315
358
|
subscribeToAppRunResultLog({
|
|
316
359
|
runner,
|
|
317
360
|
runId: runResult.value.manifest.id,
|
|
318
|
-
shortId: runResult.value.manifest.shortId
|
|
361
|
+
shortId: runResult.value.manifest.shortId,
|
|
362
|
+
evals: evalsForTerminalLog,
|
|
363
|
+
concurrency
|
|
319
364
|
});
|
|
320
365
|
return c.json(runResult.value, 201);
|
|
321
366
|
}).post("/actions/open-location", zValidator("json", openRunLocationRequestSchema), (c) => {
|