@ls-stack/agent-eval 0.57.0 → 0.58.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -289,7 +289,8 @@ z.object({
289
289
  key: z.string(),
290
290
  namespace: z.string(),
291
291
  storedAt: z.string(),
292
- lastAccessedAt: z.string()
292
+ /** Last successful cache hit time. `null` means the entry has not been hit yet. */
293
+ lastAccessedAt: z.string().nullable()
293
294
  });
294
295
  z.object({
295
296
  removedCacheFiles: z.number(),
@@ -814,8 +815,7 @@ const caseRowSchema = z.object({
814
815
  /** Winning trial index for the persisted case result. */
815
816
  trial: z.number()
816
817
  });
817
- /** Structured assertion failure metadata captured for one case run. */
818
- const assertionFailureSchema = z.object({
818
+ const assertionBaseSchema = z.object({
819
819
  /**
820
820
  * Error class or category label rendered alongside the message (e.g.
821
821
  * `EvalAssertionError`, `OutputsSchemaError`). Optional for legacy entries
@@ -827,7 +827,19 @@ const assertionFailureSchema = z.object({
827
827
  /** Stack trace captured from the originating error when available. */
828
828
  stack: z.string().optional()
829
829
  });
830
+ /** Structured assertion failure metadata captured for one case run. */
831
+ const assertionFailureSchema = assertionBaseSchema;
832
+ /** Pass/fail outcome for one recorded eval assertion. */
833
+ const assertionStatusSchema = z.enum(["pass", "fail"]);
834
+ /** Structured assertion result metadata captured for one case run. */
835
+ const assertionResultSchema = assertionBaseSchema.extend({
836
+ /** Whether the recorded assertion passed or failed. */
837
+ status: assertionStatusSchema });
830
838
  const legacyAssertionFailureSchema = z.string().transform((message) => ({ message }));
839
+ const legacyAssertionResultSchema = z.string().transform((message) => ({
840
+ message,
841
+ status: "fail"
842
+ }));
831
843
  /** Severity level for one log captured during a case run. */
832
844
  const runLogLevelSchema = z.enum([
833
845
  "log",
@@ -922,6 +934,12 @@ const caseDetailSchema = z.object({
922
934
  * These complement eval-level `columns` without changing discovery metadata.
923
935
  */
924
936
  outputColumnDefs: z.array(columnDefSchema).optional(),
937
+ /**
938
+ * Pass/fail assertion records captured from eval assertion helpers. New run
939
+ * artifacts include this alongside `assertionFailures`; older artifacts may
940
+ * omit it and should fall back to `assertionFailures` for failed outcomes.
941
+ */
942
+ assertions: z.array(z.union([assertionResultSchema, legacyAssertionResultSchema])).optional(),
925
943
  assertionFailures: z.array(z.union([assertionFailureSchema, legacyAssertionFailureSchema])),
926
944
  /** Logs captured from manual `evalLog(...)` calls and enabled console calls. */
927
945
  logs: z.array(runLogEntrySchema).default([]),
@@ -1405,6 +1423,7 @@ const agentEvalsConfigSchema = z.object({
1405
1423
  maxEntriesPerNamespace: z.preprocess((value) => typeof value === "number" && Number.isFinite(value) ? value : void 0, z.number().optional()),
1406
1424
  maxEntriesByNamespace: z.record(z.string(), z.number()).optional(),
1407
1425
  pruneIdleDelayMs: z.preprocess((value) => typeof value === "number" && Number.isFinite(value) ? value : void 0, z.number().optional()),
1426
+ lastAccessedAtUpdateIntervalMs: z.preprocess((value) => typeof value === "number" && Number.isFinite(value) ? value : void 0, z.number().optional()),
1408
1427
  maxEntriesPerEval: z.preprocess((value) => typeof value === "number" && Number.isFinite(value) ? value : void 0, z.number().optional())
1409
1428
  }).optional()
1410
1429
  });
@@ -2632,6 +2651,7 @@ const scopeStorage = new AsyncLocalStorage();
2632
2651
  const runtimeScopeStorage = new AsyncLocalStorage();
2633
2652
  const evalClockStorage = new AsyncLocalStorage();
2634
2653
  const activeSpanStackStorage = new AsyncLocalStorage();
2654
+ const recordingStackStorage = new AsyncLocalStorage();
2635
2655
  let activeEvalScopeCount = 0;
2636
2656
  let activeEvalRuntimeScopeCount = 0;
2637
2657
  let consoleCaptureEnabled = true;
@@ -2784,6 +2804,20 @@ async function runWithActiveSpan(span, fn) {
2784
2804
  const currentStack = activeSpanStackStorage.getStore() ?? [];
2785
2805
  return await activeSpanStackStorage.run([...currentStack, span], fn);
2786
2806
  }
2807
+ /** Execute a callback with a cache recording frame scoped to this async branch. */
2808
+ async function runWithCacheRecordingFrame(frame, fn) {
2809
+ const currentStack = recordingStackStorage.getStore() ?? [];
2810
+ return await recordingStackStorage.run([...currentStack, frame], fn);
2811
+ }
2812
+ function getCurrentCacheRecordingFrame(scope) {
2813
+ if (scope.replayingDepth > 0) return void 0;
2814
+ return recordingStackStorage.getStore()?.at(-1);
2815
+ }
2816
+ /** Mark a span as created by the active cache recorder, when one exists. */
2817
+ function recordSpanForActiveCacheRecording(scope, spanId) {
2818
+ if (scope.replayingDepth > 0) return;
2819
+ for (const frame of recordingStackStorage.getStore() ?? []) frame.spanIds.add(spanId);
2820
+ }
2787
2821
  /**
2788
2822
  * Return the current eval runner phase for this async execution.
2789
2823
  *
@@ -3086,11 +3120,11 @@ async function runInEvalScope(caseId, fn, options = {}) {
3086
3120
  tags: options.tags ?? [],
3087
3121
  outputs: {},
3088
3122
  outputColumnOverrides: {},
3123
+ assertions: [],
3089
3124
  assertionFailures: [],
3090
3125
  logs: [],
3091
3126
  spans: [],
3092
3127
  checkpoints: /* @__PURE__ */ new Map(),
3093
- recordingStack: [],
3094
3128
  replayingDepth: 0,
3095
3129
  cacheContext: options.cacheContext,
3096
3130
  caseCacheRefs: [],
@@ -3130,10 +3164,16 @@ function nextEvalId() {
3130
3164
  scope.nextEvalIdCounter++;
3131
3165
  return `${scope.idPrefix}-${scope.nextEvalIdCounter}`;
3132
3166
  }
3133
- function recordOpIfActive(scope, op) {
3134
- if (scope.replayingDepth > 0) return;
3135
- const top = scope.recordingStack.at(-1);
3136
- if (top) top.ops.push(op);
3167
+ function recordCacheRecordingOpIfActive(scope, op) {
3168
+ getCurrentCacheRecordingFrame(scope)?.ops.push(op);
3169
+ }
3170
+ function recordCacheRecordingAttributesIfActive(scope, span, attributes) {
3171
+ const frames = recordingStackStorage.getStore();
3172
+ if (scope.replayingDepth > 0 || frames === void 0) return;
3173
+ for (const [key, value] of Object.entries(attributes)) {
3174
+ if (key.startsWith("cache.")) continue;
3175
+ for (const frame of frames) if (span.id === frame.replayParentSpanId) frame.finalAttributes[key] = value;
3176
+ }
3137
3177
  }
3138
3178
  function normalizeEvalOutputOptions(options) {
3139
3179
  if (options === void 0) return void 0;
@@ -3165,7 +3205,7 @@ function setEvalOutput(key, value, options = void 0) {
3165
3205
  scope.outputs[key] = value;
3166
3206
  const column = normalizeEvalOutputOptions(options);
3167
3207
  if (column !== void 0) scope.outputColumnOverrides[key] = column;
3168
- recordOpIfActive(scope, {
3208
+ recordCacheRecordingOpIfActive(scope, {
3169
3209
  kind: "setOutput",
3170
3210
  key,
3171
3211
  value,
@@ -3185,7 +3225,7 @@ function appendToEvalOutput(key, value) {
3185
3225
  if (existing === void 0) scope.outputs[key] = [value];
3186
3226
  else if (Array.isArray(existing)) scope.outputs[key] = [...copyArray$1(existing), value];
3187
3227
  else scope.outputs[key] = [existing, value];
3188
- recordOpIfActive(scope, {
3228
+ recordCacheRecordingOpIfActive(scope, {
3189
3229
  kind: "appendOutput",
3190
3230
  key,
3191
3231
  value
@@ -3203,7 +3243,7 @@ function mergeEvalOutput(key, patch) {
3203
3243
  const existing = scope.outputs[key];
3204
3244
  if (existing === void 0) {
3205
3245
  scope.outputs[key] = { ...patch };
3206
- recordOpIfActive(scope, {
3246
+ recordCacheRecordingOpIfActive(scope, {
3207
3247
  kind: "mergeOutput",
3208
3248
  key,
3209
3249
  patch
@@ -3211,14 +3251,14 @@ function mergeEvalOutput(key, patch) {
3211
3251
  return;
3212
3252
  }
3213
3253
  if (!isObjectRecord(existing)) {
3214
- scope.assertionFailures.push(toAssertionFailure$1(`mergeEvalOutput("${key}"): existing value is ${Array.isArray(existing) ? "array" : typeof existing}, expected object`));
3254
+ recordAssertionFailure$1(scope, toAssertionFailure$1(`mergeEvalOutput("${key}"): existing value is ${Array.isArray(existing) ? "array" : typeof existing}, expected object`));
3215
3255
  return;
3216
3256
  }
3217
3257
  scope.outputs[key] = {
3218
3258
  ...existing,
3219
3259
  ...patch
3220
3260
  };
3221
- recordOpIfActive(scope, {
3261
+ recordCacheRecordingOpIfActive(scope, {
3222
3262
  kind: "mergeOutput",
3223
3263
  key,
3224
3264
  patch
@@ -3236,7 +3276,7 @@ function incrementEvalOutput(key, delta) {
3236
3276
  const existing = scope.outputs[key];
3237
3277
  if (existing === void 0) {
3238
3278
  scope.outputs[key] = delta;
3239
- recordOpIfActive(scope, {
3279
+ recordCacheRecordingOpIfActive(scope, {
3240
3280
  kind: "incrementOutput",
3241
3281
  key,
3242
3282
  delta
@@ -3244,11 +3284,11 @@ function incrementEvalOutput(key, delta) {
3244
3284
  return;
3245
3285
  }
3246
3286
  if (typeof existing !== "number") {
3247
- scope.assertionFailures.push(toAssertionFailure$1(`incrementEvalOutput("${key}"): existing value is ${typeof existing}, expected number`));
3287
+ recordAssertionFailure$1(scope, toAssertionFailure$1(`incrementEvalOutput("${key}"): existing value is ${typeof existing}, expected number`));
3248
3288
  return;
3249
3289
  }
3250
3290
  scope.outputs[key] = existing + delta;
3251
- recordOpIfActive(scope, {
3291
+ recordCacheRecordingOpIfActive(scope, {
3252
3292
  kind: "incrementOutput",
3253
3293
  key,
3254
3294
  delta
@@ -3263,13 +3303,26 @@ function incrementEvalOutput(key, delta) {
3263
3303
  * call.
3264
3304
  */
3265
3305
  function evalAssert(condition, message) {
3266
- if (condition) return;
3267
3306
  const scope = getCurrentScope();
3307
+ if (condition) {
3308
+ if (scope) scope.assertions.push({
3309
+ message,
3310
+ status: "pass"
3311
+ });
3312
+ return;
3313
+ }
3268
3314
  if (!scope) return;
3269
3315
  const error = new EvalAssertionError(message);
3270
- scope.assertionFailures.push(toAssertionFailure$1(message, error));
3316
+ recordAssertionFailure$1(scope, toAssertionFailure$1(message, error));
3271
3317
  throw error;
3272
3318
  }
3319
+ function recordAssertionFailure$1(scope, failure) {
3320
+ scope.assertionFailures.push(failure);
3321
+ scope.assertions.push({
3322
+ ...failure,
3323
+ status: "fail"
3324
+ });
3325
+ }
3273
3326
  //#endregion
3274
3327
  //#region ../sdk/src/evalExpect.ts
3275
3328
  const expectFormatOptions = {
@@ -3642,10 +3695,6 @@ async function materializeExternalJsonValues(value, store) {
3642
3695
  if (!isRecordLike$3(value)) return value;
3643
3696
  return Object.fromEntries(await Promise.all(Object.entries(value).map(async ([key, entryValue]) => [key, await materializeExternalJsonValues(entryValue, store)])));
3644
3697
  }
3645
- /** Clone one value through the same serialization path used for cache data. */
3646
- async function cloneCacheValue(value, options = void 0) {
3647
- return deserializeCacheValue(await serializeCacheValue(value, options));
3648
- }
3649
3698
  function normalizeCacheSerializationOptions(options) {
3650
3699
  return {
3651
3700
  compress: options?.compress !== false,
@@ -4076,29 +4125,6 @@ function valueKind$1(value) {
4076
4125
  function copyArray(value) {
4077
4126
  return value.map((item) => item);
4078
4127
  }
4079
- function stripCacheAttributes(attributes) {
4080
- if (!attributes) return {};
4081
- const result = {};
4082
- for (const [key, value] of Object.entries(attributes)) if (!key.startsWith("cache.")) result[key] = value;
4083
- return result;
4084
- }
4085
- async function snapshotNonCacheAttributes(span) {
4086
- const snapshot = await cloneCacheValue(stripCacheAttributes(span?.attributes));
4087
- return isRecordLike$2(snapshot) ? snapshot : {};
4088
- }
4089
- function diffNonCacheAttributes(before, after) {
4090
- const result = {};
4091
- for (const [key, value] of Object.entries(after)) if (!cacheAttributeValuesEqual(before[key], value)) result[key] = value;
4092
- return result;
4093
- }
4094
- function cacheAttributeValuesEqual(left, right) {
4095
- if (Object.is(left, right)) return true;
4096
- try {
4097
- return JSON.stringify(left) === JSON.stringify(right);
4098
- } catch {
4099
- return false;
4100
- }
4101
- }
4102
4128
  function appendCacheRef(span, ref) {
4103
4129
  if (span === void 0) return;
4104
4130
  const existing = span.attributes?.["cache.refs"];
@@ -4117,7 +4143,7 @@ function recordCacheRef(scope, span, ref) {
4117
4143
  }
4118
4144
  scope.caseCacheRefs.push(ref);
4119
4145
  }
4120
- function serializeSubSpanTree(scope, spanId) {
4146
+ function serializeSubSpanTree(scope, spanId, spanIds) {
4121
4147
  const original = scope.spans.find((s) => s.id === spanId);
4122
4148
  if (!original) return {
4123
4149
  kind: "custom",
@@ -4130,7 +4156,7 @@ function serializeSubSpanTree(scope, spanId) {
4130
4156
  warnings: void 0,
4131
4157
  children: []
4132
4158
  };
4133
- const children = scope.spans.filter((s) => s.parentId === spanId).map((child) => serializeSubSpanTree(scope, child.id));
4159
+ const children = scope.spans.filter((s) => s.parentId === spanId && spanIds.has(s.id)).map((child) => serializeSubSpanTree(scope, child.id, spanIds));
4134
4160
  return {
4135
4161
  kind: original.kind,
4136
4162
  name: original.name,
@@ -4146,9 +4172,9 @@ function serializeSubSpanTree(scope, spanId) {
4146
4172
  function appendSubSpanOps(scope, frame) {
4147
4173
  for (let i = frame.baseSpanIndex; i < scope.spans.length; i++) {
4148
4174
  const candidate = scope.spans[i];
4149
- if (candidate?.parentId === frame.replayParentSpanId) frame.ops.push({
4175
+ if (candidate?.parentId === frame.replayParentSpanId && frame.spanIds.has(candidate.id)) frame.ops.push({
4150
4176
  kind: "subSpan",
4151
- span: serializeSubSpanTree(scope, candidate.id)
4177
+ span: serializeSubSpanTree(scope, candidate.id, frame.spanIds)
4152
4178
  });
4153
4179
  }
4154
4180
  }
@@ -4185,14 +4211,28 @@ function applyRecordingOp(scope, parentSpan, op, options) {
4185
4211
  ...existing,
4186
4212
  ...op.patch
4187
4213
  };
4188
- else scope.assertionFailures.push({ message: `replay mergeEvalOutput("${op.key}"): existing value is ${valueKind$1(existing)}, expected object` });
4214
+ else {
4215
+ const message = `replay mergeEvalOutput("${op.key}"): existing value is ${valueKind$1(existing)}, expected object`;
4216
+ scope.assertionFailures.push({ message });
4217
+ scope.assertions.push({
4218
+ message,
4219
+ status: "fail"
4220
+ });
4221
+ }
4189
4222
  return;
4190
4223
  }
4191
4224
  if (op.kind === "incrementOutput") {
4192
4225
  const existing = scope.outputs[op.key];
4193
4226
  if (existing === void 0) scope.outputs[op.key] = op.delta;
4194
4227
  else if (typeof existing === "number") scope.outputs[op.key] = existing + op.delta;
4195
- else scope.assertionFailures.push({ message: `replay incrementEvalOutput("${op.key}"): existing value is ${valueKind$1(existing)}, expected number` });
4228
+ else {
4229
+ const message = `replay incrementEvalOutput("${op.key}"): existing value is ${valueKind$1(existing)}, expected number`;
4230
+ scope.assertionFailures.push({ message });
4231
+ scope.assertions.push({
4232
+ message,
4233
+ status: "fail"
4234
+ });
4235
+ }
4196
4236
  return;
4197
4237
  }
4198
4238
  if (op.kind === "checkpoint") {
@@ -4390,25 +4430,21 @@ function createTraceCache(generateSpanId) {
4390
4430
  key: keyHash,
4391
4431
  status: "bypass"
4392
4432
  });
4393
- const beforeAttributes = await snapshotNonCacheAttributes(activeSpan);
4394
4433
  const frame = {
4395
4434
  baseSpanIndex: scope.spans.length,
4396
4435
  replayParentSpanId: activeSpan?.id ?? null,
4436
+ spanIds: /* @__PURE__ */ new Set(),
4437
+ finalAttributes: {},
4397
4438
  ops: []
4398
4439
  };
4399
- scope.recordingStack.push(frame);
4400
- let bodyResult;
4401
- try {
4402
- bodyResult = await fn();
4403
- } finally {
4404
- scope.recordingStack.pop();
4405
- }
4440
+ const bodyResult = await runWithCacheRecordingFrame(frame, async () => {
4441
+ return await fn();
4442
+ });
4406
4443
  appendSubSpanOps(scope, frame);
4407
4444
  if (canStore) {
4408
- const finalAttributes = diffNonCacheAttributes(beforeAttributes, await snapshotNonCacheAttributes(activeSpan));
4409
4445
  const recording = {
4410
4446
  returnValue: bodyResult,
4411
- finalAttributes,
4447
+ finalAttributes: frame.finalAttributes,
4412
4448
  ops: frame.ops
4413
4449
  };
4414
4450
  await cacheCtx.adapter.write({
@@ -4467,6 +4503,13 @@ function mergeSpanAttributes(span, attributes) {
4467
4503
  ...span.attributes,
4468
4504
  ...attributes
4469
4505
  };
4506
+ const scope = getCurrentScope();
4507
+ if (scope !== void 0) recordCacheRecordingAttributesIfActive(scope, span, attributes);
4508
+ }
4509
+ function copyNonCacheAttributes(attributes) {
4510
+ const result = {};
4511
+ for (const [key, value] of Object.entries(attributes ?? {})) if (!key.startsWith("cache.")) result[key] = value;
4512
+ return result;
4470
4513
  }
4471
4514
  function isRecordLike$1(value) {
4472
4515
  return typeof value === "object" && value !== null && !Array.isArray(value);
@@ -4478,6 +4521,10 @@ function recordSpanAttributeAssertion(message) {
4478
4521
  const scope = getCurrentScope();
4479
4522
  if (!scope) return;
4480
4523
  scope.assertionFailures.push({ message });
4524
+ scope.assertions.push({
4525
+ message,
4526
+ status: "fail"
4527
+ });
4481
4528
  }
4482
4529
  function incrementSpanAttribute(span, key, delta) {
4483
4530
  const existing = span.attributes?.[key];
@@ -4637,6 +4684,7 @@ function startExternalSpan(info) {
4637
4684
  status: "running",
4638
4685
  attributes: info.attributes
4639
4686
  });
4687
+ recordSpanForActiveCacheRecording(scope, id);
4640
4688
  return createExternalSpanHandle(id);
4641
4689
  }
4642
4690
  function updateExternalSpan(info) {
@@ -4695,6 +4743,7 @@ function recordExternalSpan(info) {
4695
4743
  warning: info.warning,
4696
4744
  warnings: info.warnings
4697
4745
  });
4746
+ recordSpanForActiveCacheRecording(scope, id);
4698
4747
  return id;
4699
4748
  }
4700
4749
  /**
@@ -4780,6 +4829,7 @@ async function traceSpanInternal(info, fn) {
4780
4829
  attributes: info.attributes
4781
4830
  };
4782
4831
  scope.spans.push(spanRecord);
4832
+ recordSpanForActiveCacheRecording(scope, id);
4783
4833
  const activeSpan = createSpanHandle(spanRecord);
4784
4834
  return await runWithActiveSpan(spanRecord, async () => {
4785
4835
  try {
@@ -4829,21 +4879,19 @@ async function traceSpanInternal(info, fn) {
4829
4879
  const frame = {
4830
4880
  baseSpanIndex: scope.spans.length,
4831
4881
  replayParentSpanId: id,
4882
+ spanIds: /* @__PURE__ */ new Set(),
4883
+ finalAttributes: copyNonCacheAttributes(spanRecord.attributes),
4832
4884
  ops: []
4833
4885
  };
4834
- scope.recordingStack.push(frame);
4835
- let bodyResult;
4836
- try {
4837
- bodyResult = await fn(activeSpan);
4838
- } finally {
4839
- scope.recordingStack.pop();
4840
- }
4886
+ const bodyResult = await runWithCacheRecordingFrame(frame, async () => {
4887
+ return await fn(activeSpan);
4888
+ });
4841
4889
  appendSubSpanOps(scope, frame);
4842
4890
  finishSpanWithoutThrownError(spanRecord, realStartedAt);
4843
4891
  if (canStore) {
4844
4892
  const recording = {
4845
4893
  returnValue: bodyResult,
4846
- finalAttributes: stripCacheAttributes(spanRecord.attributes),
4894
+ finalAttributes: frame.finalAttributes,
4847
4895
  finalStatus: spanRecord.status,
4848
4896
  finalError: spanRecord.error,
4849
4897
  finalErrors: spanRecord.errors,
@@ -4947,14 +4995,12 @@ const evalTracer = {
4947
4995
  status: "ok",
4948
4996
  attributes: { value: data }
4949
4997
  });
4950
- if (scope.replayingDepth === 0) {
4951
- const top = scope.recordingStack.at(-1);
4952
- if (top) top.ops.push({
4953
- kind: "checkpoint",
4954
- name,
4955
- data
4956
- });
4957
- }
4998
+ recordSpanForActiveCacheRecording(scope, id);
4999
+ recordCacheRecordingOpIfActive(scope, {
5000
+ kind: "checkpoint",
5001
+ name,
5002
+ data
5003
+ });
4958
5004
  }
4959
5005
  };
4960
5006
  /** Build a queryable trace tree helper from a flat span list and checkpoints. */
@@ -4983,6 +5029,24 @@ function buildTraceTree(spans, checkpoints) {
4983
5029
  };
4984
5030
  }
4985
5031
  //#endregion
5032
+ //#region ../runner/src/cacheAccessTime.ts
5033
+ const defaultLastAccessedAtUpdateIntervalMs = 14400 * 1e3;
5034
+ function normalizeLastAccessedAtUpdateIntervalMs(value) {
5035
+ if (value === void 0 || !Number.isFinite(value) || value < 0) return defaultLastAccessedAtUpdateIntervalMs;
5036
+ return Math.floor(value);
5037
+ }
5038
+ function cacheAccessSortTime(entry) {
5039
+ return entry.lastAccessedAt ?? entry.storedAt;
5040
+ }
5041
+ function shouldRefreshLastAccessedAt(params) {
5042
+ return params.lastAccessedAt === null || params.nowMs - Date.parse(params.lastAccessedAt) > params.updateIntervalMs;
5043
+ }
5044
+ //#endregion
5045
+ //#region ../runner/src/cacheKeys.ts
5046
+ function toPendingKey(namespace, keyHash) {
5047
+ return `${namespace}::${keyHash}`;
5048
+ }
5049
+ //#endregion
4986
5050
  //#region ../runner/src/cacheStore.ts
4987
5051
  const defaultMaxEntriesPerNamespace = 100;
4988
5052
  const cacheSerializationMarker = "__aecs";
@@ -5015,6 +5079,7 @@ function createFsCacheStore(options) {
5015
5079
  primaryDir: blobDir
5016
5080
  });
5017
5081
  const defaultMaxEntries = normalizeMaxEntries(options.maxEntriesPerNamespace);
5082
+ const lastAccessedAtUpdateIntervalMs = normalizeLastAccessedAtUpdateIntervalMs(options.lastAccessedAtUpdateIntervalMs);
5018
5083
  return {
5019
5084
  externalJsonStore,
5020
5085
  dir() {
@@ -5034,7 +5099,12 @@ function createFsCacheStore(options) {
5034
5099
  });
5035
5100
  if (entry === null) return null;
5036
5101
  const materialized = await materializeExternalJsonCacheEntryOrNull(entry, externalJsonStore);
5037
- if (materialized !== null) await updateCacheIndexLastAccessedAt(cacheDir, namespace, keyHash);
5102
+ if (materialized !== null) await updateCacheIndexLastAccessedAt({
5103
+ cacheDir,
5104
+ key: keyHash,
5105
+ namespace,
5106
+ updateIntervalMs: lastAccessedAtUpdateIntervalMs
5107
+ });
5038
5108
  return materialized;
5039
5109
  },
5040
5110
  async lookupWithDebug(namespace, keyHash) {
@@ -5063,7 +5133,7 @@ function createFsCacheStore(options) {
5063
5133
  const index = await readNamespaceIndex(cacheDir, entry.namespace);
5064
5134
  index.entries[entry.key] = {
5065
5135
  storedAt: entry.storedAt,
5066
- lastAccessedAt: entry.storedAt,
5136
+ lastAccessedAt: null,
5067
5137
  blobRefs: await collectExternalJsonBlobRefs(entry, blobDirs)
5068
5138
  };
5069
5139
  await writeNamespaceIndex(cacheDir, index);
@@ -5082,7 +5152,7 @@ function createFsCacheStore(options) {
5082
5152
  async list() {
5083
5153
  const items = [];
5084
5154
  for (const index of await listCacheIndexes(cacheDir)) for (const [key, entry] of Object.entries(index.entries)) items.push(toCacheListItem(index.namespace, key, entry));
5085
- items.sort((a, b) => a.lastAccessedAt < b.lastAccessedAt ? 1 : -1);
5155
+ items.sort((a, b) => cacheAccessSortTime(a) < cacheAccessSortTime(b) ? 1 : -1);
5086
5156
  return items;
5087
5157
  },
5088
5158
  async clear(filter) {
@@ -5183,9 +5253,6 @@ function maxEntriesForNamespace(namespace, defaultMaxEntries, maxEntriesByNamesp
5183
5253
  const namespaceMaxEntries = maxEntriesByNamespace?.[namespace];
5184
5254
  return namespaceMaxEntries === void 0 ? defaultMaxEntries : normalizeMaxEntries(namespaceMaxEntries, defaultMaxEntries);
5185
5255
  }
5186
- function toPendingKey(namespace, keyHash) {
5187
- return `${namespace}::${keyHash}`;
5188
- }
5189
5256
  function sanitizeSegment$1(segment) {
5190
5257
  return segment.replace(/[^a-zA-Z0-9_.-]/g, "_");
5191
5258
  }
@@ -5231,16 +5298,22 @@ async function readIndexedCacheEntry(params) {
5231
5298
  return fileEntry.entry;
5232
5299
  });
5233
5300
  }
5234
- async function updateCacheIndexLastAccessedAt(cacheDir, namespace, key) {
5235
- await withCacheFileLock(namespaceLockPath(cacheDir, namespace), async () => {
5236
- const index = await readNamespaceIndex(cacheDir, namespace);
5237
- const entry = index.entries[key];
5301
+ async function updateCacheIndexLastAccessedAt(params) {
5302
+ await withCacheFileLock(namespaceLockPath(params.cacheDir, params.namespace), async () => {
5303
+ const index = await readNamespaceIndex(params.cacheDir, params.namespace);
5304
+ const entry = index.entries[params.key];
5238
5305
  if (entry === void 0) return;
5239
- index.entries[key] = {
5306
+ const nowMs = getRealDateNowMs();
5307
+ if (!shouldRefreshLastAccessedAt({
5308
+ lastAccessedAt: entry.lastAccessedAt,
5309
+ nowMs,
5310
+ updateIntervalMs: params.updateIntervalMs
5311
+ })) return;
5312
+ index.entries[params.key] = {
5240
5313
  ...entry,
5241
- lastAccessedAt: new Date(getRealDateNowMs()).toISOString()
5314
+ lastAccessedAt: new Date(nowMs).toISOString()
5242
5315
  };
5243
- await writeNamespaceIndex(cacheDir, index);
5316
+ await writeNamespaceIndex(params.cacheDir, index);
5244
5317
  });
5245
5318
  }
5246
5319
  async function readCacheEntryFilePath(filePath, expected) {
@@ -5371,7 +5444,7 @@ function parseCacheIndexFile(value, expectedNamespace) {
5371
5444
  }
5372
5445
  function parseCacheIndexEntry(value) {
5373
5446
  if (!isRecordLike(value)) return null;
5374
- if (typeof value.storedAt !== "string" || typeof value.lastAccessedAt !== "string") return null;
5447
+ if (typeof value.storedAt !== "string" || value.lastAccessedAt !== null && typeof value.lastAccessedAt !== "string") return null;
5375
5448
  if (!Array.isArray(value.blobRefs)) return null;
5376
5449
  const blobRefs = [];
5377
5450
  for (const blobRef of value.blobRefs) {
@@ -5441,7 +5514,7 @@ function entryMatchesFilter(entry, filter) {
5441
5514
  async function pruneCacheEntriesForNamespace(params) {
5442
5515
  const { cacheDir, index, maxEntries } = params;
5443
5516
  const entries = Object.entries(index.entries);
5444
- const sorted = entries.toSorted(([, a], [, b]) => a.lastAccessedAt < b.lastAccessedAt ? 1 : -1);
5517
+ const sorted = entries.toSorted(([, a], [, b]) => cacheAccessSortTime(a) < cacheAccessSortTime(b) ? 1 : -1);
5445
5518
  const keptKeys = /* @__PURE__ */ new Set();
5446
5519
  for (const [key] of sorted) {
5447
5520
  if (keptKeys.size >= maxEntries) break;
@@ -6606,7 +6679,7 @@ async function runDeriveFromTracingConfig(params) {
6606
6679
  });
6607
6680
  } catch (e) {
6608
6681
  const message = `deriveFromTracing threw: ${e instanceof Error ? e.message : String(e)}`;
6609
- params.scope.assertionFailures.push(toAssertionFailure(message, e instanceof Error ? e : void 0));
6682
+ recordAssertionFailure(params.scope, toAssertionFailure(message, e instanceof Error ? e : void 0));
6610
6683
  }
6611
6684
  }
6612
6685
  async function runCase(params) {
@@ -6656,7 +6729,7 @@ async function runCase(params) {
6656
6729
  });
6657
6730
  const traceTree = buildTraceTree(spansWithDerivedAttributes, scope.checkpoints);
6658
6731
  const nonAssertError = executeError && !(executeError instanceof EvalAssertionError) ? executeError : null;
6659
- if (executeError instanceof EvalAssertionError && scope.assertionFailures.length === 0) scope.assertionFailures.push(toAssertionFailure(executeError.message, executeError));
6732
+ if (executeError instanceof EvalAssertionError && scope.assertionFailures.length === 0) recordAssertionFailure(scope, toAssertionFailure(executeError.message, executeError));
6660
6733
  if (!nonAssertError) {
6661
6734
  await runDeriveFromTracingConfig({
6662
6735
  deriveFromTracing: globalDeriveFromTracing,
@@ -6686,7 +6759,7 @@ async function runCase(params) {
6686
6759
  ...scope.outputs,
6687
6760
  ...parsedOutputs.data
6688
6761
  };
6689
- else scope.assertionFailures.push(toAssertionFailure(formatOutputsSchemaError(parsedOutputs.error), void 0, "OutputsSchemaError"));
6762
+ else recordAssertionFailure(scope, toAssertionFailure(formatOutputsSchemaError(parsedOutputs.error), void 0, "OutputsSchemaError"));
6690
6763
  }
6691
6764
  const scoreResults = /* @__PURE__ */ new Map();
6692
6765
  const scoringTraces = {};
@@ -6728,8 +6801,7 @@ async function runCase(params) {
6728
6801
  };
6729
6802
  const rawValue = scoreRun.result;
6730
6803
  if (scoreRun.error) {
6731
- const message = `score "${key}" threw: ${scoreRun.error.message}`;
6732
- scope.assertionFailures.push(toAssertionFailure(message, scoreRun.error));
6804
+ recordAssertionFailure(scope, toAssertionFailure(`score "${key}" threw: ${scoreRun.error.message}`, scoreRun.error));
6733
6805
  scope.outputs[key] = 0;
6734
6806
  scoreResults.set(key, {
6735
6807
  value: 0,
@@ -6739,7 +6811,7 @@ async function runCase(params) {
6739
6811
  continue;
6740
6812
  }
6741
6813
  if (typeof rawValue !== "number") {
6742
- scope.assertionFailures.push(toAssertionFailure(`score "${key}" must return a number`));
6814
+ recordAssertionFailure(scope, toAssertionFailure(`score "${key}" must return a number`));
6743
6815
  scope.outputs[key] = 0;
6744
6816
  scoreResults.set(key, {
6745
6817
  value: 0,
@@ -6801,6 +6873,7 @@ async function runCase(params) {
6801
6873
  traceDisplay,
6802
6874
  columns,
6803
6875
  ...outputColumnDefs.length > 0 ? { outputColumnDefs } : {},
6876
+ assertions: scope.assertions,
6804
6877
  assertionFailures: scope.assertionFailures,
6805
6878
  logs: scope.logs,
6806
6879
  error: errorInfo,
@@ -6852,5 +6925,12 @@ function toAssertionFailure(message, error = void 0, nameOverride = void 0) {
6852
6925
  ...stack !== void 0 ? { stack } : {}
6853
6926
  };
6854
6927
  }
6928
+ function recordAssertionFailure(scope, failure) {
6929
+ scope.assertionFailures.push(failure);
6930
+ scope.assertions.push({
6931
+ ...failure,
6932
+ status: "fail"
6933
+ });
6934
+ }
6855
6935
  //#endregion
6856
6936
  export { startEvalBackgroundJob as $, repoFile as A, evalChartsConfigSchema as At, getCurrentScope as B, evalTracer as C, buildEvalKey as Ct, deserializeCacheValue as D, evalStatAggregateSchema as Dt, deserializeCacheRecording as E, caseRowSchema as Et, appendToEvalOutput as F, mergeEvalOutput as G, incrementEvalOutput as H, configureEvalRunLogs as I, runInEvalScope as J, nextEvalId as K, evalAssert as L, readManualInputFile as M, defineEval as Mt, evalExpect as N, getEvalRegistry as Nt, serializeCacheRecording as O, evalStatsConfigSchema as Ot, EvalAssertionError as P, runWithEvalRegistry as Pt, setScopeCacheContext as Q, evalLog as R, evalSpan as S, resolveLlmCallsConfig as St, hashCacheKeySync as T, caseDetailSchema as Tt, isInEvalScope as U, getEvalCaseInput as V, matchesEvalTags as W, runWithEvalClock as X, runInExistingEvalScope as Y, setEvalOutput as Z, createBufferedCacheStore as _, validateEvalTagName as _t, isCaseChildParentMessage as a, extractLlmCalls as at, buildTraceTree as b, runSummarySchema as bt, resolveArtifactPath as c, applyDerivedCallAttributes as ct, loadEvalModule as d, getEvalDisplayStatus as dt, createRunRequestSchema as et, resolveEvalDefaultConfig as f, deriveScopedSummaryFromCases as ft, commitPendingCacheWrites as g, matchesTagsFilter as gt, normalizeScoreDef as h, dedupeEvalTags as ht, isCaseChildMessage as i, extractApiCalls as it, manualInputFileValueSchema as j, columnDefSchema as jt, serializeCacheValue as k, manualInputDescriptorSchema as kt, registerAgentEvalsPackageResolutionHooks as l, getNestedAttribute as lt, buildDeclaredColumnDefs as m, deriveStatusFromChildStatuses as mt, resolveRunnableEvalCases as n, extractCacheEntries as nt, stripTerminalControlCodes as o, simulateLlmCallCost as ot, loadConfig as p, deriveStatusFromCaseRows as pt, runInEvalRuntimeScope as q, runCase as r, extractCacheHits as rt, resolveTracePresentation as s, simulateTokenAllocation as st, filterEvalCases as t, updateManualScoreRequestSchema as tt, runWithModuleIsolation as u, getEvalTitle as ut, createFsCacheStore as v, validateTagsFilterExpression as vt, hashCacheKey as w, getCaseRowCaseKey as wt, captureEvalSpanError as x, resolveApiCallsConfig as xt, z$1 as y, runManifestSchema as yt, evalTime as z };
@@ -1,4 +1,4 @@
1
- import { Et as caseRowSchema, Pt as runWithEvalRegistry, Tt as caseDetailSchema, X as runWithEvalClock, _t as validateEvalTagName, bt as runSummarySchema, d as loadEvalModule, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, g as commitPendingCacheWrites, gt as matchesTagsFilter, ht as dedupeEvalTags, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromChildStatuses, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveStatusFromCaseRows, q as runInEvalRuntimeScope, t as filterEvalCases, u as runWithModuleIsolation, vt as validateTagsFilterExpression, wt as getCaseRowCaseKey, yt as runManifestSchema } from "./runExecution-BH7DlMXl.mjs";
1
+ import { Et as caseRowSchema, Pt as runWithEvalRegistry, Tt as caseDetailSchema, X as runWithEvalClock, _t as validateEvalTagName, bt as runSummarySchema, d as loadEvalModule, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, g as commitPendingCacheWrites, gt as matchesTagsFilter, ht as dedupeEvalTags, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromChildStatuses, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveStatusFromCaseRows, q as runInEvalRuntimeScope, t as filterEvalCases, u as runWithModuleIsolation, vt as validateTagsFilterExpression, wt as getCaseRowCaseKey, yt as runManifestSchema } from "./runExecution-d42Lm0i5.mjs";
2
2
  import { readFile, readdir, rm, writeFile } from "node:fs/promises";
3
3
  import { dirname, join } from "node:path";
4
4
  import { existsSync } from "node:fs";
@@ -1,2 +1,2 @@
1
- import { n as initRunner, t as getRunnerInstance } from "./runner-B3hEOT_I.mjs";
1
+ import { n as initRunner, t as getRunnerInstance } from "./runner-MSr8sAWm.mjs";
2
2
  export { getRunnerInstance, initRunner };
@@ -1,5 +1,5 @@
1
- import { n as createRunner } from "./cli-Ck0mqxd-.mjs";
2
- import "./src-B3iq-tuv.mjs";
1
+ import { n as createRunner } from "./cli-_g2qOMK6.mjs";
2
+ import "./src-CdZsOn6y.mjs";
3
3
  //#region ../../apps/server/src/runner.ts
4
4
  let runnerInstance = null;
5
5
  function getRunnerInstance() {
@@ -1,5 +1,5 @@
1
- import { Mt as defineEval$1, W as matchesEvalTags$1 } from "./runExecution-BH7DlMXl.mjs";
2
- import "./cli-Ck0mqxd-.mjs";
1
+ import { Mt as defineEval$1, W as matchesEvalTags$1 } from "./runExecution-d42Lm0i5.mjs";
2
+ import "./cli-_g2qOMK6.mjs";
3
3
  //#region src/index.ts
4
4
  /** Register an eval definition with typed tag support. */
5
5
  function defineEval(definition) {