@ls-stack/agent-eval 0.58.0 → 0.58.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-L9GdY28I.mjs → app-BxD6aHbp.mjs} +52 -7
- package/dist/apps/web/dist/assets/index-BMWBZw_u.js +377 -0
- package/dist/apps/web/dist/assets/index-CHH7m5Cv.css +1 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/caseChild.mjs +2 -1
- package/dist/{cli-Cf37PZKi.mjs → cli-HBwXIJsg.mjs} +31 -5
- package/dist/index.d.mts +136 -80
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +2 -2
- package/dist/{runExecution-C4kAOhC1.mjs → runExecution-pHJ0_TzH.mjs} +188 -89
- package/dist/{runOrchestration-5xEiQxiS.mjs → runOrchestration-ngVXShH4.mjs} +73 -6
- package/dist/{runner-JIykMlve.mjs → runner-BnZMGBla.mjs} +1 -1
- package/dist/{runner-bjd_UB9i.mjs → runner-D_pz2NON.mjs} +2 -2
- package/dist/{src-303BocMW.mjs → src-AeXGBJ26.mjs} +2 -2
- package/package.json +1 -1
- package/skills/agent-eval/SKILL.md +18 -3
- package/dist/apps/web/dist/assets/index-Cz9p4l-t.js +0 -377
- package/dist/apps/web/dist/assets/index-DtARRwsS.css +0 -1
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { Et as
|
|
1
|
+
import { Dt as caseDetailSchema, Et as getCaseRowCaseKey, It as runWithEvalRegistry, J as runInEvalRuntimeScope, Ot as caseRowSchema, Z as runWithEvalClock, _t as matchesTagsFilter, bt as runManifestSchema, d as loadEvalModule, f as resolveEvalDefaultConfig, g as commitPendingCacheWrites, gt as dedupeEvalTags, ht as deriveStatusFromChildStatuses, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromCaseRows, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveScopedSummaryFromCases, t as filterEvalCases, u as runWithModuleIsolation, vt as validateEvalTagName, wt as buildCaseKey, xt as runSummarySchema, yt as validateTagsFilterExpression } from "./runExecution-pHJ0_TzH.mjs";
|
|
2
2
|
import { readFile, readdir, rm, writeFile } from "node:fs/promises";
|
|
3
3
|
import { dirname, join } from "node:path";
|
|
4
4
|
import { existsSync } from "node:fs";
|
|
@@ -1030,6 +1030,7 @@ async function executeQueuedCases(params) {
|
|
|
1030
1030
|
async function executeQueuedCase(params) {
|
|
1031
1031
|
const { queuedCase, globalTraceDisplay } = params;
|
|
1032
1032
|
const startTime = Date.now();
|
|
1033
|
+
await queuedCase.onStart?.();
|
|
1033
1034
|
const result = await queuedCase.execute({
|
|
1034
1035
|
globalTraceDisplay,
|
|
1035
1036
|
startTime
|
|
@@ -1249,6 +1250,32 @@ function buildRunErrorMessage(errors) {
|
|
|
1249
1250
|
return `[${entry.evalId}] ${messageLine}\n${details}`;
|
|
1250
1251
|
}).join("\n");
|
|
1251
1252
|
}
|
|
1253
|
+
function upsertCaseRow(caseRows, nextCaseRow) {
|
|
1254
|
+
const existingIndex = caseRows.findIndex((caseRow) => getCaseRowCaseKey(caseRow) === getCaseRowCaseKey(nextCaseRow) && caseRow.trial === nextCaseRow.trial);
|
|
1255
|
+
if (existingIndex === -1) {
|
|
1256
|
+
caseRows.push(nextCaseRow);
|
|
1257
|
+
return;
|
|
1258
|
+
}
|
|
1259
|
+
caseRows[existingIndex] = nextCaseRow;
|
|
1260
|
+
}
|
|
1261
|
+
function removeLiveCaseRows(caseRows, nextCaseRow) {
|
|
1262
|
+
const caseKey = getCaseRowCaseKey(nextCaseRow);
|
|
1263
|
+
for (let i = caseRows.length - 1; i >= 0; i--) {
|
|
1264
|
+
const caseRow = caseRows[i];
|
|
1265
|
+
if (caseRow === void 0) continue;
|
|
1266
|
+
if (getCaseRowCaseKey(caseRow) !== caseKey) continue;
|
|
1267
|
+
if (caseRow.status !== "pending" && caseRow.status !== "running") continue;
|
|
1268
|
+
caseRows.splice(i, 1);
|
|
1269
|
+
}
|
|
1270
|
+
}
|
|
1271
|
+
function emitCaseRowEvent(params) {
|
|
1272
|
+
params.emitEvent(params.runState, {
|
|
1273
|
+
type: params.type,
|
|
1274
|
+
runId: params.runState.manifest.id,
|
|
1275
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
1276
|
+
payload: params.caseRow
|
|
1277
|
+
});
|
|
1278
|
+
}
|
|
1252
1279
|
async function finalizePreparedCase(params) {
|
|
1253
1280
|
const { runState, runDir, preparedEval, preparedCase, onCaseFinished, emitEvent } = params;
|
|
1254
1281
|
if (preparedCase.finalized || preparedCase.trialResults.length === 0) return;
|
|
@@ -1263,7 +1290,8 @@ async function finalizePreparedCase(params) {
|
|
|
1263
1290
|
pendingWrites: winningTrial.pendingCacheWrites
|
|
1264
1291
|
});
|
|
1265
1292
|
const artifactFileId = getCaseArtifactFileId(runState, winningTrial.caseRow);
|
|
1266
|
-
runState.cases
|
|
1293
|
+
removeLiveCaseRows(runState.cases, winningTrial.caseRow);
|
|
1294
|
+
upsertCaseRow(runState.cases, winningTrial.caseRow);
|
|
1267
1295
|
runState.caseDetails.set(getCaseRowCaseKey(winningTrial.caseRow), winningTrial.caseDetail);
|
|
1268
1296
|
if (winningTrial.caseRow.status === "pass") runState.summary.passedCases++;
|
|
1269
1297
|
else if (winningTrial.caseRow.status === "error") runState.summary.errorCases++;
|
|
@@ -1271,11 +1299,11 @@ async function finalizePreparedCase(params) {
|
|
|
1271
1299
|
await writeFile(join(runDir, "traces", `${encodeURIComponent(artifactFileId)}.json`), JSON.stringify(winningTrial.caseDetail.trace, null, 2));
|
|
1272
1300
|
await persistCaseDetail(runDir, winningTrial.caseDetail, artifactFileId);
|
|
1273
1301
|
onCaseFinished?.(winningTrial.caseDetail, winningTrial.caseRow);
|
|
1274
|
-
|
|
1302
|
+
emitCaseRowEvent({
|
|
1303
|
+
runState,
|
|
1304
|
+
emitEvent,
|
|
1275
1305
|
type: "case.finished",
|
|
1276
|
-
|
|
1277
|
-
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
1278
|
-
payload: winningTrial.caseRow
|
|
1306
|
+
caseRow: winningTrial.caseRow
|
|
1279
1307
|
});
|
|
1280
1308
|
preparedEval.evalCaseRows.push(winningTrial.caseRow);
|
|
1281
1309
|
}
|
|
@@ -1437,13 +1465,52 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
|
|
|
1437
1465
|
preparedEvals.push(preparedEval);
|
|
1438
1466
|
for (const evalCase of cases) {
|
|
1439
1467
|
const trialResults = [];
|
|
1468
|
+
const liveCaseRow = {
|
|
1469
|
+
caseId: evalCase.id,
|
|
1470
|
+
evalId: evalMeta.id,
|
|
1471
|
+
evalKey: evalMeta.key,
|
|
1472
|
+
caseKey: buildCaseKey({
|
|
1473
|
+
filePath: evalMeta.filePath,
|
|
1474
|
+
evalId: evalMeta.id,
|
|
1475
|
+
caseId: evalCase.id
|
|
1476
|
+
}),
|
|
1477
|
+
tags: evalCase.tags,
|
|
1478
|
+
status: "pending",
|
|
1479
|
+
durationMs: null,
|
|
1480
|
+
cacheHits: 0,
|
|
1481
|
+
cacheOperations: 0,
|
|
1482
|
+
columns: {},
|
|
1483
|
+
trial: 0
|
|
1484
|
+
};
|
|
1440
1485
|
const preparedCase = {
|
|
1441
1486
|
caseId: evalCase.id,
|
|
1487
|
+
liveCaseRow,
|
|
1442
1488
|
trialResults,
|
|
1443
1489
|
finalized: false
|
|
1444
1490
|
};
|
|
1445
1491
|
preparedCases.push(preparedCase);
|
|
1492
|
+
upsertCaseRow(runState.cases, liveCaseRow);
|
|
1493
|
+
emitCaseRowEvent({
|
|
1494
|
+
runState,
|
|
1495
|
+
emitEvent,
|
|
1496
|
+
type: "case.updated",
|
|
1497
|
+
caseRow: liveCaseRow
|
|
1498
|
+
});
|
|
1446
1499
|
for (let trial = 0; trial < request.trials; trial++) queuedCases.push({
|
|
1500
|
+
onStart: () => {
|
|
1501
|
+
if (preparedCase.finalized) return;
|
|
1502
|
+
preparedCase.liveCaseRow = {
|
|
1503
|
+
...preparedCase.liveCaseRow,
|
|
1504
|
+
status: "running"
|
|
1505
|
+
};
|
|
1506
|
+
upsertCaseRow(runState.cases, preparedCase.liveCaseRow);
|
|
1507
|
+
emitCaseRowEvent({
|
|
1508
|
+
runState,
|
|
1509
|
+
emitEvent,
|
|
1510
|
+
type: "case.started",
|
|
1511
|
+
caseRow: preparedCase.liveCaseRow
|
|
1512
|
+
});
|
|
1513
|
+
},
|
|
1447
1514
|
execute: async ({ startTime, globalTraceDisplay }) => await executeCaseChild({
|
|
1448
1515
|
evalId: evalMeta.id,
|
|
1449
1516
|
evalKey: evalMeta.key,
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-D_pz2NON.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-
|
|
2
|
-
import "./src-
|
|
1
|
+
import { n as createRunner } from "./cli-HBwXIJsg.mjs";
|
|
2
|
+
import "./src-AeXGBJ26.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import "./cli-
|
|
1
|
+
import { G as matchesEvalTags$1, Pt as defineEval$1 } from "./runExecution-pHJ0_TzH.mjs";
|
|
2
|
+
import "./cli-HBwXIJsg.mjs";
|
|
3
3
|
//#region src/index.ts
|
|
4
4
|
/** Register an eval definition with typed tag support. */
|
|
5
5
|
function defineEval(definition) {
|
package/package.json
CHANGED
|
@@ -35,6 +35,9 @@ display rules), read the TypeScript declarations shipped with the package:
|
|
|
35
35
|
- `agent-evals app` watches `agent-evals.config.ts` and reloads config in
|
|
36
36
|
place when the runner is idle. If config changes during an active run, the
|
|
37
37
|
reload applies after the current run reaches a terminal state.
|
|
38
|
+
- App-triggered runs log the queued target evals, resolved case concurrency,
|
|
39
|
+
each case start for evals that are actually running, and the terminal run
|
|
40
|
+
summary in the server terminal.
|
|
38
41
|
|
|
39
42
|
Assume that enumerated tables in this document may lag behind the types —
|
|
40
43
|
treat the types as source of truth when they disagree.
|
|
@@ -360,7 +363,18 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
|
|
|
360
363
|
The older object-returning function form remains supported. Global
|
|
361
364
|
derivations run first; runtime outputs are never overwritten, and eval-level
|
|
362
365
|
derivations only fill keys still missing after global derivations. In keyed
|
|
363
|
-
form, return `undefined` to omit one output for that case.
|
|
366
|
+
form, return `undefined` to omit one output for that case. Do not call
|
|
367
|
+
`evalAssert(...)` or `evalExpect(...)` from `deriveFromTracing`; use
|
|
368
|
+
`tracingAssertions` for trace-derived pass/fail checks.
|
|
369
|
+
- `tracingAssertions` can be authored globally or locally on one eval when a
|
|
370
|
+
finished-trace invariant should pass or fail the case without creating a fake
|
|
371
|
+
score column. It receives the same `{ trace, input, case }` context as
|
|
372
|
+
`deriveFromTracing`; call `evalAssert(...)` or `evalExpect(...)` inside it.
|
|
373
|
+
Useful trace helpers include `trace.findSpan(name)`, `trace.findSpans(name)`,
|
|
374
|
+
`trace.hasSpan(name)`, `trace.findSpansByKind(kind)`,
|
|
375
|
+
`trace.findToolCallSpans()`, `trace.listToolCallSpanNames()`,
|
|
376
|
+
`trace.hasToolCallSpan(name)`, `trace.listSpanNames(kind?)`,
|
|
377
|
+
`trace.listSpanNamesDfs(kind?)`, and `trace.flattenDfs()`.
|
|
364
378
|
- `traceDisplay` promotes selected span attributes into the trace tree and
|
|
365
379
|
detail pane; it supports aggregation across subtrees (`scope`, `mode`) and
|
|
366
380
|
user-defined `transform(...)` for derived views (e.g. currency conversion).
|
|
@@ -629,8 +643,9 @@ When adding or changing evals:
|
|
|
629
643
|
3. `evalAssert` for hard invariants and truthy type narrowing. It records
|
|
630
644
|
pass/fail entries in case-detail `assertions`; failed entries are also kept
|
|
631
645
|
in `assertionFailures` and fail the case. Use `evalExpect` for non-trivial
|
|
632
|
-
comparisons, `
|
|
633
|
-
scores
|
|
646
|
+
comparisons, `tracingAssertions` for invariants derived from the finished
|
|
647
|
+
trace, `scores` for graded signals, and `passThreshold` only on scores that
|
|
648
|
+
should gate pass/fail.
|
|
634
649
|
4. Surface reviewable values through execute-context `setOutput` or ambient
|
|
635
650
|
`setEvalOutput` in shared workflow code, and shape them with `columns`
|
|
636
651
|
formats from the `ColumnFormat` type.
|