@ls-stack/agent-eval 0.57.0 → 0.58.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -289,7 +289,8 @@ z.object({
289
289
  key: z.string(),
290
290
  namespace: z.string(),
291
291
  storedAt: z.string(),
292
- lastAccessedAt: z.string()
292
+ /** Last successful cache hit time. `null` means the entry has not been hit yet. */
293
+ lastAccessedAt: z.string().nullable()
293
294
  });
294
295
  z.object({
295
296
  removedCacheFiles: z.number(),
@@ -814,8 +815,7 @@ const caseRowSchema = z.object({
814
815
  /** Winning trial index for the persisted case result. */
815
816
  trial: z.number()
816
817
  });
817
- /** Structured assertion failure metadata captured for one case run. */
818
- const assertionFailureSchema = z.object({
818
+ const assertionBaseSchema = z.object({
819
819
  /**
820
820
  * Error class or category label rendered alongside the message (e.g.
821
821
  * `EvalAssertionError`, `OutputsSchemaError`). Optional for legacy entries
@@ -827,7 +827,19 @@ const assertionFailureSchema = z.object({
827
827
  /** Stack trace captured from the originating error when available. */
828
828
  stack: z.string().optional()
829
829
  });
830
+ /** Structured assertion failure metadata captured for one case run. */
831
+ const assertionFailureSchema = assertionBaseSchema;
832
+ /** Pass/fail outcome for one recorded eval assertion. */
833
+ const assertionStatusSchema = z.enum(["pass", "fail"]);
834
+ /** Structured assertion result metadata captured for one case run. */
835
+ const assertionResultSchema = assertionBaseSchema.extend({
836
+ /** Whether the recorded assertion passed or failed. */
837
+ status: assertionStatusSchema });
830
838
  const legacyAssertionFailureSchema = z.string().transform((message) => ({ message }));
839
+ const legacyAssertionResultSchema = z.string().transform((message) => ({
840
+ message,
841
+ status: "fail"
842
+ }));
831
843
  /** Severity level for one log captured during a case run. */
832
844
  const runLogLevelSchema = z.enum([
833
845
  "log",
@@ -922,6 +934,12 @@ const caseDetailSchema = z.object({
922
934
  * These complement eval-level `columns` without changing discovery metadata.
923
935
  */
924
936
  outputColumnDefs: z.array(columnDefSchema).optional(),
937
+ /**
938
+ * Pass/fail assertion records captured from eval assertion helpers. New run
939
+ * artifacts include this alongside `assertionFailures`; older artifacts may
940
+ * omit it and should fall back to `assertionFailures` for failed outcomes.
941
+ */
942
+ assertions: z.array(z.union([assertionResultSchema, legacyAssertionResultSchema])).optional(),
925
943
  assertionFailures: z.array(z.union([assertionFailureSchema, legacyAssertionFailureSchema])),
926
944
  /** Logs captured from manual `evalLog(...)` calls and enabled console calls. */
927
945
  logs: z.array(runLogEntrySchema).default([]),
@@ -1405,6 +1423,7 @@ const agentEvalsConfigSchema = z.object({
1405
1423
  maxEntriesPerNamespace: z.preprocess((value) => typeof value === "number" && Number.isFinite(value) ? value : void 0, z.number().optional()),
1406
1424
  maxEntriesByNamespace: z.record(z.string(), z.number()).optional(),
1407
1425
  pruneIdleDelayMs: z.preprocess((value) => typeof value === "number" && Number.isFinite(value) ? value : void 0, z.number().optional()),
1426
+ lastAccessedAtUpdateIntervalMs: z.preprocess((value) => typeof value === "number" && Number.isFinite(value) ? value : void 0, z.number().optional()),
1408
1427
  maxEntriesPerEval: z.preprocess((value) => typeof value === "number" && Number.isFinite(value) ? value : void 0, z.number().optional())
1409
1428
  }).optional()
1410
1429
  });
@@ -3086,6 +3105,7 @@ async function runInEvalScope(caseId, fn, options = {}) {
3086
3105
  tags: options.tags ?? [],
3087
3106
  outputs: {},
3088
3107
  outputColumnOverrides: {},
3108
+ assertions: [],
3089
3109
  assertionFailures: [],
3090
3110
  logs: [],
3091
3111
  spans: [],
@@ -3211,7 +3231,7 @@ function mergeEvalOutput(key, patch) {
3211
3231
  return;
3212
3232
  }
3213
3233
  if (!isObjectRecord(existing)) {
3214
- scope.assertionFailures.push(toAssertionFailure$1(`mergeEvalOutput("${key}"): existing value is ${Array.isArray(existing) ? "array" : typeof existing}, expected object`));
3234
+ recordAssertionFailure$1(scope, toAssertionFailure$1(`mergeEvalOutput("${key}"): existing value is ${Array.isArray(existing) ? "array" : typeof existing}, expected object`));
3215
3235
  return;
3216
3236
  }
3217
3237
  scope.outputs[key] = {
@@ -3244,7 +3264,7 @@ function incrementEvalOutput(key, delta) {
3244
3264
  return;
3245
3265
  }
3246
3266
  if (typeof existing !== "number") {
3247
- scope.assertionFailures.push(toAssertionFailure$1(`incrementEvalOutput("${key}"): existing value is ${typeof existing}, expected number`));
3267
+ recordAssertionFailure$1(scope, toAssertionFailure$1(`incrementEvalOutput("${key}"): existing value is ${typeof existing}, expected number`));
3248
3268
  return;
3249
3269
  }
3250
3270
  scope.outputs[key] = existing + delta;
@@ -3263,13 +3283,26 @@ function incrementEvalOutput(key, delta) {
3263
3283
  * call.
3264
3284
  */
3265
3285
  function evalAssert(condition, message) {
3266
- if (condition) return;
3267
3286
  const scope = getCurrentScope();
3287
+ if (condition) {
3288
+ if (scope) scope.assertions.push({
3289
+ message,
3290
+ status: "pass"
3291
+ });
3292
+ return;
3293
+ }
3268
3294
  if (!scope) return;
3269
3295
  const error = new EvalAssertionError(message);
3270
- scope.assertionFailures.push(toAssertionFailure$1(message, error));
3296
+ recordAssertionFailure$1(scope, toAssertionFailure$1(message, error));
3271
3297
  throw error;
3272
3298
  }
3299
+ function recordAssertionFailure$1(scope, failure) {
3300
+ scope.assertionFailures.push(failure);
3301
+ scope.assertions.push({
3302
+ ...failure,
3303
+ status: "fail"
3304
+ });
3305
+ }
3273
3306
  //#endregion
3274
3307
  //#region ../sdk/src/evalExpect.ts
3275
3308
  const expectFormatOptions = {
@@ -4185,14 +4218,28 @@ function applyRecordingOp(scope, parentSpan, op, options) {
4185
4218
  ...existing,
4186
4219
  ...op.patch
4187
4220
  };
4188
- else scope.assertionFailures.push({ message: `replay mergeEvalOutput("${op.key}"): existing value is ${valueKind$1(existing)}, expected object` });
4221
+ else {
4222
+ const message = `replay mergeEvalOutput("${op.key}"): existing value is ${valueKind$1(existing)}, expected object`;
4223
+ scope.assertionFailures.push({ message });
4224
+ scope.assertions.push({
4225
+ message,
4226
+ status: "fail"
4227
+ });
4228
+ }
4189
4229
  return;
4190
4230
  }
4191
4231
  if (op.kind === "incrementOutput") {
4192
4232
  const existing = scope.outputs[op.key];
4193
4233
  if (existing === void 0) scope.outputs[op.key] = op.delta;
4194
4234
  else if (typeof existing === "number") scope.outputs[op.key] = existing + op.delta;
4195
- else scope.assertionFailures.push({ message: `replay incrementEvalOutput("${op.key}"): existing value is ${valueKind$1(existing)}, expected number` });
4235
+ else {
4236
+ const message = `replay incrementEvalOutput("${op.key}"): existing value is ${valueKind$1(existing)}, expected number`;
4237
+ scope.assertionFailures.push({ message });
4238
+ scope.assertions.push({
4239
+ message,
4240
+ status: "fail"
4241
+ });
4242
+ }
4196
4243
  return;
4197
4244
  }
4198
4245
  if (op.kind === "checkpoint") {
@@ -4478,6 +4525,10 @@ function recordSpanAttributeAssertion(message) {
4478
4525
  const scope = getCurrentScope();
4479
4526
  if (!scope) return;
4480
4527
  scope.assertionFailures.push({ message });
4528
+ scope.assertions.push({
4529
+ message,
4530
+ status: "fail"
4531
+ });
4481
4532
  }
4482
4533
  function incrementSpanAttribute(span, key, delta) {
4483
4534
  const existing = span.attributes?.[key];
@@ -4983,6 +5034,24 @@ function buildTraceTree(spans, checkpoints) {
4983
5034
  };
4984
5035
  }
4985
5036
  //#endregion
5037
+ //#region ../runner/src/cacheAccessTime.ts
5038
+ const defaultLastAccessedAtUpdateIntervalMs = 14400 * 1e3;
5039
+ function normalizeLastAccessedAtUpdateIntervalMs(value) {
5040
+ if (value === void 0 || !Number.isFinite(value) || value < 0) return defaultLastAccessedAtUpdateIntervalMs;
5041
+ return Math.floor(value);
5042
+ }
5043
+ function cacheAccessSortTime(entry) {
5044
+ return entry.lastAccessedAt ?? entry.storedAt;
5045
+ }
5046
/**
 * Decide whether a cache entry's `lastAccessedAt` timestamp should be rewritten.
 *
 * @param {{ lastAccessedAt: string | null, nowMs: number, updateIntervalMs: number }} params
 *   `lastAccessedAt` is the stored timestamp string, or `null` when the entry
 *   has not been hit yet; `nowMs` is the current time in epoch milliseconds;
 *   `updateIntervalMs` is the age the stored timestamp must exceed before it
 *   is refreshed.
 * @returns {boolean} `true` when the timestamp is `null`, cannot be parsed, or
 *   is more than `updateIntervalMs` older than `nowMs`; otherwise `false`.
 */
function shouldRefreshLastAccessedAt(params) {
  const { lastAccessedAt, nowMs, updateIntervalMs } = params;
  if (lastAccessedAt === null) return true;
  const lastAccessedMs = Date.parse(lastAccessedAt);
  // An unparseable timestamp yields NaN, and every comparison against NaN is
  // false. Without this guard a corrupt value would never be refreshed.
  if (Number.isNaN(lastAccessedMs)) return true;
  return nowMs - lastAccessedMs > updateIntervalMs;
}
5049
+ //#endregion
5050
+ //#region ../runner/src/cacheKeys.ts
5051
+ function toPendingKey(namespace, keyHash) {
5052
+ return `${namespace}::${keyHash}`;
5053
+ }
5054
+ //#endregion
4986
5055
  //#region ../runner/src/cacheStore.ts
4987
5056
  const defaultMaxEntriesPerNamespace = 100;
4988
5057
  const cacheSerializationMarker = "__aecs";
@@ -5015,6 +5084,7 @@ function createFsCacheStore(options) {
5015
5084
  primaryDir: blobDir
5016
5085
  });
5017
5086
  const defaultMaxEntries = normalizeMaxEntries(options.maxEntriesPerNamespace);
5087
+ const lastAccessedAtUpdateIntervalMs = normalizeLastAccessedAtUpdateIntervalMs(options.lastAccessedAtUpdateIntervalMs);
5018
5088
  return {
5019
5089
  externalJsonStore,
5020
5090
  dir() {
@@ -5034,7 +5104,12 @@ function createFsCacheStore(options) {
5034
5104
  });
5035
5105
  if (entry === null) return null;
5036
5106
  const materialized = await materializeExternalJsonCacheEntryOrNull(entry, externalJsonStore);
5037
- if (materialized !== null) await updateCacheIndexLastAccessedAt(cacheDir, namespace, keyHash);
5107
+ if (materialized !== null) await updateCacheIndexLastAccessedAt({
5108
+ cacheDir,
5109
+ key: keyHash,
5110
+ namespace,
5111
+ updateIntervalMs: lastAccessedAtUpdateIntervalMs
5112
+ });
5038
5113
  return materialized;
5039
5114
  },
5040
5115
  async lookupWithDebug(namespace, keyHash) {
@@ -5063,7 +5138,7 @@ function createFsCacheStore(options) {
5063
5138
  const index = await readNamespaceIndex(cacheDir, entry.namespace);
5064
5139
  index.entries[entry.key] = {
5065
5140
  storedAt: entry.storedAt,
5066
- lastAccessedAt: entry.storedAt,
5141
+ lastAccessedAt: null,
5067
5142
  blobRefs: await collectExternalJsonBlobRefs(entry, blobDirs)
5068
5143
  };
5069
5144
  await writeNamespaceIndex(cacheDir, index);
@@ -5082,7 +5157,7 @@ function createFsCacheStore(options) {
5082
5157
  async list() {
5083
5158
  const items = [];
5084
5159
  for (const index of await listCacheIndexes(cacheDir)) for (const [key, entry] of Object.entries(index.entries)) items.push(toCacheListItem(index.namespace, key, entry));
5085
- items.sort((a, b) => a.lastAccessedAt < b.lastAccessedAt ? 1 : -1);
5160
+ items.sort((a, b) => cacheAccessSortTime(a) < cacheAccessSortTime(b) ? 1 : -1);
5086
5161
  return items;
5087
5162
  },
5088
5163
  async clear(filter) {
@@ -5183,9 +5258,6 @@ function maxEntriesForNamespace(namespace, defaultMaxEntries, maxEntriesByNamesp
5183
5258
  const namespaceMaxEntries = maxEntriesByNamespace?.[namespace];
5184
5259
  return namespaceMaxEntries === void 0 ? defaultMaxEntries : normalizeMaxEntries(namespaceMaxEntries, defaultMaxEntries);
5185
5260
  }
5186
- function toPendingKey(namespace, keyHash) {
5187
- return `${namespace}::${keyHash}`;
5188
- }
5189
5261
  function sanitizeSegment$1(segment) {
5190
5262
  return segment.replace(/[^a-zA-Z0-9_.-]/g, "_");
5191
5263
  }
@@ -5231,16 +5303,22 @@ async function readIndexedCacheEntry(params) {
5231
5303
  return fileEntry.entry;
5232
5304
  });
5233
5305
  }
5234
- async function updateCacheIndexLastAccessedAt(cacheDir, namespace, key) {
5235
- await withCacheFileLock(namespaceLockPath(cacheDir, namespace), async () => {
5236
- const index = await readNamespaceIndex(cacheDir, namespace);
5237
- const entry = index.entries[key];
5306
+ async function updateCacheIndexLastAccessedAt(params) {
5307
+ await withCacheFileLock(namespaceLockPath(params.cacheDir, params.namespace), async () => {
5308
+ const index = await readNamespaceIndex(params.cacheDir, params.namespace);
5309
+ const entry = index.entries[params.key];
5238
5310
  if (entry === void 0) return;
5239
- index.entries[key] = {
5311
+ const nowMs = getRealDateNowMs();
5312
+ if (!shouldRefreshLastAccessedAt({
5313
+ lastAccessedAt: entry.lastAccessedAt,
5314
+ nowMs,
5315
+ updateIntervalMs: params.updateIntervalMs
5316
+ })) return;
5317
+ index.entries[params.key] = {
5240
5318
  ...entry,
5241
- lastAccessedAt: new Date(getRealDateNowMs()).toISOString()
5319
+ lastAccessedAt: new Date(nowMs).toISOString()
5242
5320
  };
5243
- await writeNamespaceIndex(cacheDir, index);
5321
+ await writeNamespaceIndex(params.cacheDir, index);
5244
5322
  });
5245
5323
  }
5246
5324
  async function readCacheEntryFilePath(filePath, expected) {
@@ -5371,7 +5449,7 @@ function parseCacheIndexFile(value, expectedNamespace) {
5371
5449
  }
5372
5450
  function parseCacheIndexEntry(value) {
5373
5451
  if (!isRecordLike(value)) return null;
5374
- if (typeof value.storedAt !== "string" || typeof value.lastAccessedAt !== "string") return null;
5452
+ if (typeof value.storedAt !== "string" || value.lastAccessedAt !== null && typeof value.lastAccessedAt !== "string") return null;
5375
5453
  if (!Array.isArray(value.blobRefs)) return null;
5376
5454
  const blobRefs = [];
5377
5455
  for (const blobRef of value.blobRefs) {
@@ -5441,7 +5519,7 @@ function entryMatchesFilter(entry, filter) {
5441
5519
  async function pruneCacheEntriesForNamespace(params) {
5442
5520
  const { cacheDir, index, maxEntries } = params;
5443
5521
  const entries = Object.entries(index.entries);
5444
- const sorted = entries.toSorted(([, a], [, b]) => a.lastAccessedAt < b.lastAccessedAt ? 1 : -1);
5522
+ const sorted = entries.toSorted(([, a], [, b]) => cacheAccessSortTime(a) < cacheAccessSortTime(b) ? 1 : -1);
5445
5523
  const keptKeys = /* @__PURE__ */ new Set();
5446
5524
  for (const [key] of sorted) {
5447
5525
  if (keptKeys.size >= maxEntries) break;
@@ -6606,7 +6684,7 @@ async function runDeriveFromTracingConfig(params) {
6606
6684
  });
6607
6685
  } catch (e) {
6608
6686
  const message = `deriveFromTracing threw: ${e instanceof Error ? e.message : String(e)}`;
6609
- params.scope.assertionFailures.push(toAssertionFailure(message, e instanceof Error ? e : void 0));
6687
+ recordAssertionFailure(params.scope, toAssertionFailure(message, e instanceof Error ? e : void 0));
6610
6688
  }
6611
6689
  }
6612
6690
  async function runCase(params) {
@@ -6656,7 +6734,7 @@ async function runCase(params) {
6656
6734
  });
6657
6735
  const traceTree = buildTraceTree(spansWithDerivedAttributes, scope.checkpoints);
6658
6736
  const nonAssertError = executeError && !(executeError instanceof EvalAssertionError) ? executeError : null;
6659
- if (executeError instanceof EvalAssertionError && scope.assertionFailures.length === 0) scope.assertionFailures.push(toAssertionFailure(executeError.message, executeError));
6737
+ if (executeError instanceof EvalAssertionError && scope.assertionFailures.length === 0) recordAssertionFailure(scope, toAssertionFailure(executeError.message, executeError));
6660
6738
  if (!nonAssertError) {
6661
6739
  await runDeriveFromTracingConfig({
6662
6740
  deriveFromTracing: globalDeriveFromTracing,
@@ -6686,7 +6764,7 @@ async function runCase(params) {
6686
6764
  ...scope.outputs,
6687
6765
  ...parsedOutputs.data
6688
6766
  };
6689
- else scope.assertionFailures.push(toAssertionFailure(formatOutputsSchemaError(parsedOutputs.error), void 0, "OutputsSchemaError"));
6767
+ else recordAssertionFailure(scope, toAssertionFailure(formatOutputsSchemaError(parsedOutputs.error), void 0, "OutputsSchemaError"));
6690
6768
  }
6691
6769
  const scoreResults = /* @__PURE__ */ new Map();
6692
6770
  const scoringTraces = {};
@@ -6728,8 +6806,7 @@ async function runCase(params) {
6728
6806
  };
6729
6807
  const rawValue = scoreRun.result;
6730
6808
  if (scoreRun.error) {
6731
- const message = `score "${key}" threw: ${scoreRun.error.message}`;
6732
- scope.assertionFailures.push(toAssertionFailure(message, scoreRun.error));
6809
+ recordAssertionFailure(scope, toAssertionFailure(`score "${key}" threw: ${scoreRun.error.message}`, scoreRun.error));
6733
6810
  scope.outputs[key] = 0;
6734
6811
  scoreResults.set(key, {
6735
6812
  value: 0,
@@ -6739,7 +6816,7 @@ async function runCase(params) {
6739
6816
  continue;
6740
6817
  }
6741
6818
  if (typeof rawValue !== "number") {
6742
- scope.assertionFailures.push(toAssertionFailure(`score "${key}" must return a number`));
6819
+ recordAssertionFailure(scope, toAssertionFailure(`score "${key}" must return a number`));
6743
6820
  scope.outputs[key] = 0;
6744
6821
  scoreResults.set(key, {
6745
6822
  value: 0,
@@ -6801,6 +6878,7 @@ async function runCase(params) {
6801
6878
  traceDisplay,
6802
6879
  columns,
6803
6880
  ...outputColumnDefs.length > 0 ? { outputColumnDefs } : {},
6881
+ assertions: scope.assertions,
6804
6882
  assertionFailures: scope.assertionFailures,
6805
6883
  logs: scope.logs,
6806
6884
  error: errorInfo,
@@ -6852,5 +6930,12 @@ function toAssertionFailure(message, error = void 0, nameOverride = void 0) {
6852
6930
  ...stack !== void 0 ? { stack } : {}
6853
6931
  };
6854
6932
  }
6933
+ function recordAssertionFailure(scope, failure) {
6934
+ scope.assertionFailures.push(failure);
6935
+ scope.assertions.push({
6936
+ ...failure,
6937
+ status: "fail"
6938
+ });
6939
+ }
6855
6940
  //#endregion
6856
6941
  export { startEvalBackgroundJob as $, repoFile as A, evalChartsConfigSchema as At, getCurrentScope as B, evalTracer as C, buildEvalKey as Ct, deserializeCacheValue as D, evalStatAggregateSchema as Dt, deserializeCacheRecording as E, caseRowSchema as Et, appendToEvalOutput as F, mergeEvalOutput as G, incrementEvalOutput as H, configureEvalRunLogs as I, runInEvalScope as J, nextEvalId as K, evalAssert as L, readManualInputFile as M, defineEval as Mt, evalExpect as N, getEvalRegistry as Nt, serializeCacheRecording as O, evalStatsConfigSchema as Ot, EvalAssertionError as P, runWithEvalRegistry as Pt, setScopeCacheContext as Q, evalLog as R, evalSpan as S, resolveLlmCallsConfig as St, hashCacheKeySync as T, caseDetailSchema as Tt, isInEvalScope as U, getEvalCaseInput as V, matchesEvalTags as W, runWithEvalClock as X, runInExistingEvalScope as Y, setEvalOutput as Z, createBufferedCacheStore as _, validateEvalTagName as _t, isCaseChildParentMessage as a, extractLlmCalls as at, buildTraceTree as b, runSummarySchema as bt, resolveArtifactPath as c, applyDerivedCallAttributes as ct, loadEvalModule as d, getEvalDisplayStatus as dt, createRunRequestSchema as et, resolveEvalDefaultConfig as f, deriveScopedSummaryFromCases as ft, commitPendingCacheWrites as g, matchesTagsFilter as gt, normalizeScoreDef as h, dedupeEvalTags as ht, isCaseChildMessage as i, extractApiCalls as it, manualInputFileValueSchema as j, columnDefSchema as jt, serializeCacheValue as k, manualInputDescriptorSchema as kt, registerAgentEvalsPackageResolutionHooks as l, getNestedAttribute as lt, buildDeclaredColumnDefs as m, deriveStatusFromChildStatuses as mt, resolveRunnableEvalCases as n, extractCacheEntries as nt, stripTerminalControlCodes as o, simulateLlmCallCost as ot, loadConfig as p, deriveStatusFromCaseRows as pt, runInEvalRuntimeScope as q, runCase as r, extractCacheHits as rt, resolveTracePresentation as s, simulateTokenAllocation as st, filterEvalCases as t, updateManualScoreRequestSchema as tt, 
runWithModuleIsolation as u, getEvalTitle as ut, createFsCacheStore as v, validateTagsFilterExpression as vt, hashCacheKey as w, getCaseRowCaseKey as wt, captureEvalSpanError as x, resolveApiCallsConfig as xt, z$1 as y, runManifestSchema as yt, evalTime as z };
@@ -1,4 +1,4 @@
1
- import { Et as caseRowSchema, Pt as runWithEvalRegistry, Tt as caseDetailSchema, X as runWithEvalClock, _t as validateEvalTagName, bt as runSummarySchema, d as loadEvalModule, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, g as commitPendingCacheWrites, gt as matchesTagsFilter, ht as dedupeEvalTags, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromChildStatuses, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveStatusFromCaseRows, q as runInEvalRuntimeScope, t as filterEvalCases, u as runWithModuleIsolation, vt as validateTagsFilterExpression, wt as getCaseRowCaseKey, yt as runManifestSchema } from "./runExecution-BH7DlMXl.mjs";
1
+ import { Et as caseRowSchema, Pt as runWithEvalRegistry, Tt as caseDetailSchema, X as runWithEvalClock, _t as validateEvalTagName, bt as runSummarySchema, d as loadEvalModule, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, g as commitPendingCacheWrites, gt as matchesTagsFilter, ht as dedupeEvalTags, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromChildStatuses, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveStatusFromCaseRows, q as runInEvalRuntimeScope, t as filterEvalCases, u as runWithModuleIsolation, vt as validateTagsFilterExpression, wt as getCaseRowCaseKey, yt as runManifestSchema } from "./runExecution-C4kAOhC1.mjs";
2
2
  import { readFile, readdir, rm, writeFile } from "node:fs/promises";
3
3
  import { dirname, join } from "node:path";
4
4
  import { existsSync } from "node:fs";
@@ -1,2 +1,2 @@
1
- import { n as initRunner, t as getRunnerInstance } from "./runner-B3hEOT_I.mjs";
1
+ import { n as initRunner, t as getRunnerInstance } from "./runner-bjd_UB9i.mjs";
2
2
  export { getRunnerInstance, initRunner };
@@ -1,5 +1,5 @@
1
- import { n as createRunner } from "./cli-Ck0mqxd-.mjs";
2
- import "./src-B3iq-tuv.mjs";
1
+ import { n as createRunner } from "./cli-Cf37PZKi.mjs";
2
+ import "./src-303BocMW.mjs";
3
3
  //#region ../../apps/server/src/runner.ts
4
4
  let runnerInstance = null;
5
5
  function getRunnerInstance() {
@@ -1,5 +1,5 @@
1
- import { Mt as defineEval$1, W as matchesEvalTags$1 } from "./runExecution-BH7DlMXl.mjs";
2
- import "./cli-Ck0mqxd-.mjs";
1
+ import { Mt as defineEval$1, W as matchesEvalTags$1 } from "./runExecution-C4kAOhC1.mjs";
2
+ import "./cli-Cf37PZKi.mjs";
3
3
  //#region src/index.ts
4
4
  /** Register an eval definition with typed tag support. */
5
5
  function defineEval(definition) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ls-stack/agent-eval",
3
- "version": "0.57.0",
3
+ "version": "0.58.0",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "agent-evals": "./dist/bin.mjs"
@@ -626,9 +626,11 @@ When adding or changing evals:
626
626
  1. Put the tracing + ambient SDK calls in the product code that runs in both
627
627
  production and evals. Keep eval files thin.
628
628
  2. Use realistic cases drawn from real product flows; avoid placeholder inputs.
629
- 3. `evalAssert` for hard invariants and truthy type narrowing, `evalExpect`
630
- for non-trivial comparisons, `scores` for graded signals, `passThreshold`
631
- only on scores that should gate pass/fail.
629
+ 3. `evalAssert` for hard invariants and truthy type narrowing. It records
630
+ pass/fail entries in case-detail `assertions`; failed entries are also kept
631
+ in `assertionFailures` and fail the case. Use `evalExpect` for non-trivial
632
+ comparisons, `scores` for graded signals, and `passThreshold` only on
633
+ scores that should gate pass/fail.
632
634
  4. Surface reviewable values through execute-context `setOutput` or ambient
633
635
  `setEvalOutput` in shared workflow code, and shape them with `columns`
634
636
  formats from the `ColumnFormat` type.