@ls-stack/agent-eval 0.58.1 → 0.58.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- import { Et as caseRowSchema, Pt as runWithEvalRegistry, Tt as caseDetailSchema, X as runWithEvalClock, _t as validateEvalTagName, bt as runSummarySchema, d as loadEvalModule, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, g as commitPendingCacheWrites, gt as matchesTagsFilter, ht as dedupeEvalTags, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromChildStatuses, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveStatusFromCaseRows, q as runInEvalRuntimeScope, t as filterEvalCases, u as runWithModuleIsolation, vt as validateTagsFilterExpression, wt as getCaseRowCaseKey, yt as runManifestSchema } from "./runExecution-d42Lm0i5.mjs";
1
+ import { Dt as caseDetailSchema, Et as getCaseRowCaseKey, It as runWithEvalRegistry, J as runInEvalRuntimeScope, Ot as caseRowSchema, Z as runWithEvalClock, _t as matchesTagsFilter, bt as runManifestSchema, d as loadEvalModule, f as resolveEvalDefaultConfig, g as commitPendingCacheWrites, gt as dedupeEvalTags, ht as deriveStatusFromChildStatuses, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromCaseRows, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveScopedSummaryFromCases, t as filterEvalCases, u as runWithModuleIsolation, vt as validateEvalTagName, wt as buildCaseKey, xt as runSummarySchema, yt as validateTagsFilterExpression } from "./runExecution-pHJ0_TzH.mjs";
2
2
  import { readFile, readdir, rm, writeFile } from "node:fs/promises";
3
3
  import { dirname, join } from "node:path";
4
4
  import { existsSync } from "node:fs";
@@ -1030,6 +1030,7 @@ async function executeQueuedCases(params) {
1030
1030
  async function executeQueuedCase(params) {
1031
1031
  const { queuedCase, globalTraceDisplay } = params;
1032
1032
  const startTime = Date.now();
1033
+ await queuedCase.onStart?.();
1033
1034
  const result = await queuedCase.execute({
1034
1035
  globalTraceDisplay,
1035
1036
  startTime
@@ -1249,6 +1250,32 @@ function buildRunErrorMessage(errors) {
1249
1250
  return `[${entry.evalId}] ${messageLine}\n${details}`;
1250
1251
  }).join("\n");
1251
1252
  }
1253
+ function upsertCaseRow(caseRows, nextCaseRow) {
1254
+ const existingIndex = caseRows.findIndex((caseRow) => getCaseRowCaseKey(caseRow) === getCaseRowCaseKey(nextCaseRow) && caseRow.trial === nextCaseRow.trial);
1255
+ if (existingIndex === -1) {
1256
+ caseRows.push(nextCaseRow);
1257
+ return;
1258
+ }
1259
+ caseRows[existingIndex] = nextCaseRow;
1260
+ }
1261
+ function removeLiveCaseRows(caseRows, nextCaseRow) {
1262
+ const caseKey = getCaseRowCaseKey(nextCaseRow);
1263
+ for (let i = caseRows.length - 1; i >= 0; i--) {
1264
+ const caseRow = caseRows[i];
1265
+ if (caseRow === void 0) continue;
1266
+ if (getCaseRowCaseKey(caseRow) !== caseKey) continue;
1267
+ if (caseRow.status !== "pending" && caseRow.status !== "running") continue;
1268
+ caseRows.splice(i, 1);
1269
+ }
1270
+ }
1271
+ function emitCaseRowEvent(params) {
1272
+ params.emitEvent(params.runState, {
1273
+ type: params.type,
1274
+ runId: params.runState.manifest.id,
1275
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
1276
+ payload: params.caseRow
1277
+ });
1278
+ }
1252
1279
  async function finalizePreparedCase(params) {
1253
1280
  const { runState, runDir, preparedEval, preparedCase, onCaseFinished, emitEvent } = params;
1254
1281
  if (preparedCase.finalized || preparedCase.trialResults.length === 0) return;
@@ -1263,7 +1290,8 @@ async function finalizePreparedCase(params) {
1263
1290
  pendingWrites: winningTrial.pendingCacheWrites
1264
1291
  });
1265
1292
  const artifactFileId = getCaseArtifactFileId(runState, winningTrial.caseRow);
1266
- runState.cases.push(winningTrial.caseRow);
1293
+ removeLiveCaseRows(runState.cases, winningTrial.caseRow);
1294
+ upsertCaseRow(runState.cases, winningTrial.caseRow);
1267
1295
  runState.caseDetails.set(getCaseRowCaseKey(winningTrial.caseRow), winningTrial.caseDetail);
1268
1296
  if (winningTrial.caseRow.status === "pass") runState.summary.passedCases++;
1269
1297
  else if (winningTrial.caseRow.status === "error") runState.summary.errorCases++;
@@ -1271,11 +1299,11 @@ async function finalizePreparedCase(params) {
1271
1299
  await writeFile(join(runDir, "traces", `${encodeURIComponent(artifactFileId)}.json`), JSON.stringify(winningTrial.caseDetail.trace, null, 2));
1272
1300
  await persistCaseDetail(runDir, winningTrial.caseDetail, artifactFileId);
1273
1301
  onCaseFinished?.(winningTrial.caseDetail, winningTrial.caseRow);
1274
- emitEvent(runState, {
1302
+ emitCaseRowEvent({
1303
+ runState,
1304
+ emitEvent,
1275
1305
  type: "case.finished",
1276
- runId: runState.manifest.id,
1277
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
1278
- payload: winningTrial.caseRow
1306
+ caseRow: winningTrial.caseRow
1279
1307
  });
1280
1308
  preparedEval.evalCaseRows.push(winningTrial.caseRow);
1281
1309
  }
@@ -1437,13 +1465,52 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
1437
1465
  preparedEvals.push(preparedEval);
1438
1466
  for (const evalCase of cases) {
1439
1467
  const trialResults = [];
1468
+ const liveCaseRow = {
1469
+ caseId: evalCase.id,
1470
+ evalId: evalMeta.id,
1471
+ evalKey: evalMeta.key,
1472
+ caseKey: buildCaseKey({
1473
+ filePath: evalMeta.filePath,
1474
+ evalId: evalMeta.id,
1475
+ caseId: evalCase.id
1476
+ }),
1477
+ tags: evalCase.tags,
1478
+ status: "pending",
1479
+ durationMs: null,
1480
+ cacheHits: 0,
1481
+ cacheOperations: 0,
1482
+ columns: {},
1483
+ trial: 0
1484
+ };
1440
1485
  const preparedCase = {
1441
1486
  caseId: evalCase.id,
1487
+ liveCaseRow,
1442
1488
  trialResults,
1443
1489
  finalized: false
1444
1490
  };
1445
1491
  preparedCases.push(preparedCase);
1492
+ upsertCaseRow(runState.cases, liveCaseRow);
1493
+ emitCaseRowEvent({
1494
+ runState,
1495
+ emitEvent,
1496
+ type: "case.updated",
1497
+ caseRow: liveCaseRow
1498
+ });
1446
1499
  for (let trial = 0; trial < request.trials; trial++) queuedCases.push({
1500
+ onStart: () => {
1501
+ if (preparedCase.finalized) return;
1502
+ preparedCase.liveCaseRow = {
1503
+ ...preparedCase.liveCaseRow,
1504
+ status: "running"
1505
+ };
1506
+ upsertCaseRow(runState.cases, preparedCase.liveCaseRow);
1507
+ emitCaseRowEvent({
1508
+ runState,
1509
+ emitEvent,
1510
+ type: "case.started",
1511
+ caseRow: preparedCase.liveCaseRow
1512
+ });
1513
+ },
1447
1514
  execute: async ({ startTime, globalTraceDisplay }) => await executeCaseChild({
1448
1515
  evalId: evalMeta.id,
1449
1516
  evalKey: evalMeta.key,
@@ -1,2 +1,2 @@
1
- import { n as initRunner, t as getRunnerInstance } from "./runner-MSr8sAWm.mjs";
1
+ import { n as initRunner, t as getRunnerInstance } from "./runner-D_pz2NON.mjs";
2
2
  export { getRunnerInstance, initRunner };
@@ -1,5 +1,5 @@
1
- import { n as createRunner } from "./cli-_g2qOMK6.mjs";
2
- import "./src-CdZsOn6y.mjs";
1
+ import { n as createRunner } from "./cli-HBwXIJsg.mjs";
2
+ import "./src-AeXGBJ26.mjs";
3
3
  //#region ../../apps/server/src/runner.ts
4
4
  let runnerInstance = null;
5
5
  function getRunnerInstance() {
@@ -1,5 +1,5 @@
1
- import { Mt as defineEval$1, W as matchesEvalTags$1 } from "./runExecution-d42Lm0i5.mjs";
2
- import "./cli-_g2qOMK6.mjs";
1
+ import { G as matchesEvalTags$1, Pt as defineEval$1 } from "./runExecution-pHJ0_TzH.mjs";
2
+ import "./cli-HBwXIJsg.mjs";
3
3
  //#region src/index.ts
4
4
  /** Register an eval definition with typed tag support. */
5
5
  function defineEval(definition) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ls-stack/agent-eval",
3
- "version": "0.58.1",
3
+ "version": "0.58.2",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "agent-evals": "./dist/bin.mjs"
@@ -33,8 +33,8 @@
33
33
  "@types/node": "^24.7.2",
34
34
  "typescript": "^5.9.2",
35
35
  "@agent-evals/runner": "0.0.1",
36
- "@agent-evals/shared": "0.0.1",
37
- "@agent-evals/sdk": "0.0.1"
36
+ "@agent-evals/sdk": "0.0.1",
37
+ "@agent-evals/shared": "0.0.1"
38
38
  },
39
39
  "scripts": {
40
40
  "build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",
@@ -35,6 +35,9 @@ display rules), read the TypeScript declarations shipped with the package:
35
35
  - `agent-evals app` watches `agent-evals.config.ts` and reloads config in
36
36
  place when the runner is idle. If config changes during an active run, the
37
37
  reload applies after the current run reaches a terminal state.
38
+ - App-triggered runs log the queued target evals, resolved case concurrency,
39
+ each case start for evals that are actually running, and the terminal run
40
+ summary in the server terminal.
38
41
 
39
42
  Assume that enumerated tables in this document may lag behind the types —
40
43
  treat the types as source of truth when they disagree.
@@ -360,7 +363,18 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
360
363
  The older object-returning function form remains supported. Global
361
364
  derivations run first; runtime outputs are never overwritten, and eval-level
362
365
  derivations only fill keys still missing after global derivations. In keyed
363
- form, return `undefined` to omit one output for that case.
366
+ form, return `undefined` to omit one output for that case. Do not call
367
+ `evalAssert(...)` or `evalExpect(...)` from `deriveFromTracing`; use
368
+ `tracingAssertions` for trace-derived pass/fail checks.
369
+ - `tracingAssertions` can be authored globally or locally on one eval when a
370
+ finished-trace invariant should pass or fail the case without creating a fake
371
+ score column. It receives the same `{ trace, input, case }` context as
372
+ `deriveFromTracing`; call `evalAssert(...)` or `evalExpect(...)` inside it.
373
+ Useful trace helpers include `trace.findSpan(name)`, `trace.findSpans(name)`,
374
+ `trace.hasSpan(name)`, `trace.findSpansByKind(kind)`,
375
+ `trace.findToolCallSpans()`, `trace.listToolCallSpanNames()`,
376
+ `trace.hasToolCallSpan(name)`, `trace.listSpanNames(kind?)`,
377
+ `trace.listSpanNamesDfs(kind?)`, and `trace.flattenDfs()`.
364
378
  - `traceDisplay` promotes selected span attributes into the trace tree and
365
379
  detail pane; it supports aggregation across subtrees (`scope`, `mode`) and
366
380
  user-defined `transform(...)` for derived views (e.g. currency conversion).
@@ -629,8 +643,9 @@ When adding or changing evals:
629
643
  3. `evalAssert` for hard invariants and truthy type narrowing. It records
630
644
  pass/fail entries in case-detail `assertions`; failed entries are also kept
631
645
  in `assertionFailures` and fail the case. Use `evalExpect` for non-trivial
632
- comparisons, `scores` for graded signals, and `passThreshold` only on
633
- scores that should gate pass/fail.
646
+ comparisons, `tracingAssertions` for invariants derived from the finished
647
+ trace, `scores` for graded signals, and `passThreshold` only on scores that
648
+ should gate pass/fail.
634
649
  4. Surface reviewable values through execute-context `setOutput` or ambient
635
650
  `setEvalOutput` in shared workflow code, and shape them with `columns`
636
651
  formats from the `ColumnFormat` type.