@ls-stack/agent-eval 0.47.0 → 0.51.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -621,6 +621,8 @@ const hideIfNoValueShape = {
621
621
  hideIfNoValue: z.boolean().optional() };
622
622
  /**
623
623
  * One entry in the EvalCard stats row. Built-in kinds use latest run totals;
624
+ * `cacheHits` counts Agent Eval operation-level cache hits from spans and
625
+ * `evalTracer.cache(...)` refs, not LLM provider prompt-cache read tokens.
624
626
  * `column` aggregates a score or numeric output column across the latest run.
625
627
  */
626
628
  const evalStatItemSchema = z.discriminatedUnion("kind", [
@@ -637,6 +639,10 @@ const evalStatItemSchema = z.discriminatedUnion("kind", [
637
639
  kind: z.literal("duration"),
638
640
  ...hideIfNoValueShape
639
641
  }),
642
+ z.object({
643
+ kind: z.literal("cacheHits"),
644
+ ...hideIfNoValueShape
645
+ }),
640
646
  z.object({
641
647
  kind: z.literal("column"),
642
648
  key: z.string(),
@@ -731,6 +737,23 @@ const caseRowSchema = z.object({
731
737
  ]),
732
738
  /** Elapsed case execution duration in milliseconds, or null before completion. */
733
739
  durationMs: z.number().nullable(),
740
+ /**
741
+ * Agent Eval operation-level cache hits recorded for this case.
742
+ *
743
+ * This counts persisted operation cache hits from spans and
744
+ * `evalTracer.cache(...)` refs. It does not count LLM provider prompt-cache
745
+ * read tokens such as `cachedInputTokens`. Older run artifacts may omit it
746
+ * and should be treated as zero by aggregate readers.
747
+ */
748
+ cacheHits: z.number().optional(),
749
+ /**
750
+ * Agent Eval operation-level cache activity entries recorded for this case.
751
+ *
752
+ * This is the denominator for `cacheHits`, counting hits plus misses and
753
+ * refreshes that appear in the Cache tab. Older run artifacts may omit it
754
+ * and should be treated as zero by aggregate readers.
755
+ */
756
+ cacheOperations: z.number().optional(),
734
757
  costUsd: z.number().nullable().optional(),
735
758
  columns: z.record(z.string(), cellValueSchema),
736
759
  /** Winning trial index for the persisted case result. */
@@ -771,7 +794,13 @@ const runLogLocationSchema = z.object({
771
794
  /** 1-based source line reported by the JavaScript stack frame. */
772
795
  line: z.number(),
773
796
  /** 1-based source column reported by the JavaScript stack frame. */
774
- column: z.number()
797
+ column: z.number(),
798
+ /**
799
+ * Full JavaScript stack captured when the log was emitted.
800
+ *
801
+ * Older run artifacts may only include the primary file, line, and column.
802
+ */
803
+ stack: z.string().optional()
775
804
  });
776
805
  /** Schema for one persisted log entry captured during a case run. */
777
806
  const runLogEntrySchema = z.object({
@@ -1692,6 +1721,8 @@ function deriveScopedSummaryFromCases(params) {
1692
1721
  let runningCases = 0;
1693
1722
  let totalDurationMs = 0;
1694
1723
  let hasDuration = false;
1724
+ let cacheHits = 0;
1725
+ let cacheOperations = 0;
1695
1726
  for (const caseRow of caseRows) {
1696
1727
  if (caseRow.status === "pass") passedCases += 1;
1697
1728
  else if (caseRow.status === "fail") failedCases += 1;
@@ -1703,6 +1734,8 @@ function deriveScopedSummaryFromCases(params) {
1703
1734
  totalDurationMs += caseRow.durationMs;
1704
1735
  hasDuration = true;
1705
1736
  }
1737
+ cacheHits += caseRow.cacheHits ?? 0;
1738
+ cacheOperations += caseRow.cacheOperations ?? 0;
1706
1739
  }
1707
1740
  return {
1708
1741
  status: deriveStatusFromCaseRows({
@@ -1716,7 +1749,9 @@ function deriveScopedSummaryFromCases(params) {
1716
1749
  cancelledCases,
1717
1750
  pendingCases,
1718
1751
  runningCases,
1719
- totalDurationMs: hasDuration ? totalDurationMs : null
1752
+ totalDurationMs: hasDuration ? totalDurationMs : null,
1753
+ cacheHits,
1754
+ cacheOperations
1720
1755
  };
1721
1756
  }
1722
1757
  //#endregion
@@ -2508,6 +2543,7 @@ function stripTerminalControlCodes$1(value) {
2508
2543
  const scopeStorage = new AsyncLocalStorage();
2509
2544
  const runtimeScopeStorage = new AsyncLocalStorage();
2510
2545
  const evalClockStorage = new AsyncLocalStorage();
2546
+ const activeSpanStackStorage = new AsyncLocalStorage();
2511
2547
  let activeEvalScopeCount = 0;
2512
2548
  let activeEvalRuntimeScopeCount = 0;
2513
2549
  let consoleCaptureEnabled = true;
@@ -2650,6 +2686,16 @@ function getCurrentScope() {
2650
2686
  if (activeEvalScopeCount === 0) return void 0;
2651
2687
  return scopeStorage.getStore();
2652
2688
  }
2689
+ /** Return the span currently active in this async execution, if any. */
2690
+ function getCurrentActiveSpan() {
2691
+ if (activeEvalScopeCount === 0) return void 0;
2692
+ return activeSpanStackStorage.getStore()?.at(-1);
2693
+ }
2694
+ /** Execute a callback with a span added to this async execution's active stack. */
2695
+ async function runWithActiveSpan(span, fn) {
2696
+ const currentStack = activeSpanStackStorage.getStore() ?? [];
2697
+ return await activeSpanStackStorage.run([...currentStack, span], fn);
2698
+ }
2653
2699
  /**
2654
2700
  * Return the current eval runner phase for this async execution.
2655
2701
  *
@@ -2787,7 +2833,8 @@ function normalizeStackFile(value) {
2787
2833
  return decodeURIComponent(value.replace(fileUrlPrefixPattern, ""));
2788
2834
  }
2789
2835
  function isInternalLogFrame(file) {
2790
- return file.includes("/packages/sdk/src/runtime.ts") || file.includes("/node:internal/") || file.startsWith("node:internal/");
2836
+ const normalizedFile = file.replaceAll("\\", "/");
2837
+ return normalizedFile.includes("/packages/sdk/src/runtime.ts") || normalizedFile.includes("/packages/sdk/dist/") || normalizedFile.includes("/node_modules/@agent-evals/sdk/dist/") || normalizedFile.includes("/node_modules/@ls-stack/agent-eval/dist/") || normalizedFile.includes("/node:internal/") || normalizedFile.startsWith("node:internal/");
2791
2838
  }
2792
2839
  function parseStackFrameLocation(line) {
2793
2840
  const match = stackFrameLocationPattern.exec(line.trim());
@@ -2808,7 +2855,10 @@ function getLogLocation() {
2808
2855
  for (const line of stack.split("\n").slice(1)) {
2809
2856
  const location = parseStackFrameLocation(line);
2810
2857
  if (location === null || isInternalLogFrame(location.file)) continue;
2811
- return location;
2858
+ return {
2859
+ ...location,
2860
+ stack
2861
+ };
2812
2862
  }
2813
2863
  }
2814
2864
  function recordEvalLog(level, args) {
@@ -2951,8 +3001,6 @@ async function runInEvalScope(caseId, fn, options = {}) {
2951
3001
  logs: [],
2952
3002
  spans: [],
2953
3003
  checkpoints: /* @__PURE__ */ new Map(),
2954
- spanStack: [],
2955
- activeSpanStack: [],
2956
3004
  recordingStack: [],
2957
3005
  replayingDepth: 0,
2958
3006
  cacheContext: options.cacheContext,
@@ -4185,7 +4233,7 @@ function createTraceCache(generateSpanId) {
4185
4233
  namespace,
4186
4234
  key: info.key
4187
4235
  }, { serializeFileBytes: info.serializeFileBytes === true });
4188
- const activeSpan = scope.activeSpanStack.at(-1);
4236
+ const activeSpan = getCurrentActiveSpan();
4189
4237
  const canRead = cacheCtx.mode === "use" && cacheCtx.read !== false;
4190
4238
  const canStore = cacheCtx.mode !== "bypass" && cacheCtx.store !== false;
4191
4239
  if (canRead) {
@@ -4283,7 +4331,7 @@ function generateSpanId() {
4283
4331
  return `span_${String(Date.now())}_${String(spanIdCounter)}`;
4284
4332
  }
4285
4333
  function updateCurrentSpan(update) {
4286
- const currentSpan = getCurrentScope()?.activeSpanStack.at(-1);
4334
+ const currentSpan = getCurrentActiveSpan();
4287
4335
  if (!currentSpan) return;
4288
4336
  update(currentSpan);
4289
4337
  }
@@ -4454,9 +4502,9 @@ function toIsoTimestamp(value) {
4454
4502
  function findSpan(scope, id) {
4455
4503
  return scope.spans.find((span) => span.id === id);
4456
4504
  }
4457
- function resolveExternalParentId(scope, parentId) {
4505
+ function resolveExternalParentId(parentId) {
4458
4506
  if (parentId !== void 0) return parentId;
4459
- return scope.activeSpanStack.at(-1)?.id ?? null;
4507
+ return getCurrentActiveSpan()?.id ?? null;
4460
4508
  }
4461
4509
  function startExternalSpan(info) {
4462
4510
  const id = info.id ?? generateSpanId();
@@ -4464,7 +4512,7 @@ function startExternalSpan(info) {
4464
4512
  if (!scope) return noopExternalSpan(id);
4465
4513
  const existing = findSpan(scope, id);
4466
4514
  if (existing) {
4467
- existing.parentId = resolveExternalParentId(scope, info.parentId);
4515
+ existing.parentId = resolveExternalParentId(info.parentId);
4468
4516
  existing.kind = info.kind;
4469
4517
  existing.name = info.name;
4470
4518
  existing.startedAt = toIsoTimestamp(info.startedAt);
@@ -4475,7 +4523,7 @@ function startExternalSpan(info) {
4475
4523
  }
4476
4524
  scope.spans.push({
4477
4525
  id,
4478
- parentId: resolveExternalParentId(scope, info.parentId),
4526
+ parentId: resolveExternalParentId(info.parentId),
4479
4527
  caseId: scope.caseId,
4480
4528
  kind: info.kind,
4481
4529
  name: info.name,
@@ -4516,7 +4564,7 @@ function recordExternalSpan(info) {
4516
4564
  const existing = findSpan(scope, id);
4517
4565
  const status = info.status ?? (info.error ? "error" : "ok");
4518
4566
  if (existing) {
4519
- existing.parentId = resolveExternalParentId(scope, info.parentId);
4567
+ existing.parentId = resolveExternalParentId(info.parentId);
4520
4568
  existing.kind = info.kind;
4521
4569
  existing.name = info.name;
4522
4570
  existing.startedAt = startedAt;
@@ -4530,7 +4578,7 @@ function recordExternalSpan(info) {
4530
4578
  }
4531
4579
  scope.spans.push({
4532
4580
  id,
4533
- parentId: resolveExternalParentId(scope, info.parentId),
4581
+ parentId: resolveExternalParentId(info.parentId),
4534
4582
  caseId: scope.caseId,
4535
4583
  kind: info.kind,
4536
4584
  name: info.name,
@@ -4613,7 +4661,7 @@ async function traceSpanInternal(info, fn) {
4613
4661
  const scope = getCurrentScope();
4614
4662
  if (!scope) return await fn(noopActiveSpan());
4615
4663
  const id = generateSpanId();
4616
- const parentId = scope.activeSpanStack.at(-1)?.id ?? null;
4664
+ const parentId = getCurrentActiveSpan()?.id ?? null;
4617
4665
  const realStartedAt = getRealDateNowMs();
4618
4666
  const spanRecord = {
4619
4667
  id,
@@ -4627,109 +4675,106 @@ async function traceSpanInternal(info, fn) {
4627
4675
  attributes: info.attributes
4628
4676
  };
4629
4677
  scope.spans.push(spanRecord);
4630
- scope.spanStack.push(id);
4631
- scope.activeSpanStack.push(spanRecord);
4632
4678
  const activeSpan = createSpanHandle(spanRecord);
4633
- try {
4634
- const cacheOpts = info.cache;
4635
- const cacheCtx = scope.cacheContext;
4636
- if (cacheOpts !== void 0 && cacheCtx !== void 0 && scope.replayingDepth === 0) {
4637
- const ctx = cacheCtx;
4638
- const namespace = getRequiredSpanCacheNamespace(cacheOpts);
4639
- const keyHash = await hashCacheKey({
4640
- namespace,
4641
- key: cacheOpts.key
4642
- }, { serializeFileBytes: cacheOpts.serializeFileBytes === true });
4643
- const canRead = ctx.mode === "use" && ctx.read !== false;
4644
- const canStore = ctx.mode !== "bypass" && ctx.store !== false;
4645
- mergeSpanAttributes(spanRecord, {
4646
- "cache.key": keyHash,
4647
- "cache.namespace": namespace
4648
- });
4649
- if (canRead) {
4650
- const hit = await ctx.adapter.lookup(namespace, keyHash);
4651
- if (hit) {
4652
- const storedAt = hit.storedAt;
4679
+ return await runWithActiveSpan(spanRecord, async () => {
4680
+ try {
4681
+ const cacheOpts = info.cache;
4682
+ const cacheCtx = scope.cacheContext;
4683
+ if (cacheOpts !== void 0 && cacheCtx !== void 0 && scope.replayingDepth === 0) {
4684
+ const ctx = cacheCtx;
4685
+ const namespace = getRequiredSpanCacheNamespace(cacheOpts);
4686
+ const keyHash = await hashCacheKey({
4687
+ namespace,
4688
+ key: cacheOpts.key
4689
+ }, { serializeFileBytes: cacheOpts.serializeFileBytes === true });
4690
+ const canRead = ctx.mode === "use" && ctx.read !== false;
4691
+ const canStore = ctx.mode !== "bypass" && ctx.store !== false;
4692
+ mergeSpanAttributes(spanRecord, {
4693
+ "cache.key": keyHash,
4694
+ "cache.namespace": namespace
4695
+ });
4696
+ if (canRead) {
4697
+ const hit = await ctx.adapter.lookup(namespace, keyHash);
4698
+ if (hit) {
4699
+ const storedAt = hit.storedAt;
4700
+ mergeSpanAttributes(spanRecord, {
4701
+ "cache.status": "hit",
4702
+ "cache.storedAt": storedAt,
4703
+ "cache.age": getRealDateNowMs() - new Date(storedAt).getTime()
4704
+ });
4705
+ const recording = deserializeCacheRecording(hit.recording);
4706
+ replayRecording(scope, spanRecord, recording, { generateSpanId });
4707
+ spanRecord.status = recording.finalStatus ?? (hasSpanError(spanRecord) ? "error" : "ok");
4708
+ spanRecord.endedAt = addElapsedMsToTimestamp(spanRecord.startedAt, getRealDateNowMs() - realStartedAt);
4709
+ return recording.returnValue;
4710
+ }
4653
4711
  mergeSpanAttributes(spanRecord, {
4654
- "cache.status": "hit",
4655
- "cache.storedAt": storedAt,
4656
- "cache.age": getRealDateNowMs() - new Date(storedAt).getTime()
4712
+ "cache.status": "miss",
4713
+ ...canStore ? {} : { "cache.stored": false }
4657
4714
  });
4658
- const recording = deserializeCacheRecording(hit.recording);
4659
- replayRecording(scope, spanRecord, recording, { generateSpanId });
4660
- spanRecord.status = recording.finalStatus ?? (hasSpanError(spanRecord) ? "error" : "ok");
4661
- spanRecord.endedAt = addElapsedMsToTimestamp(spanRecord.startedAt, getRealDateNowMs() - realStartedAt);
4662
- return recording.returnValue;
4663
- }
4664
- mergeSpanAttributes(spanRecord, {
4715
+ } else if (ctx.mode === "use" && canStore) mergeSpanAttributes(spanRecord, {
4665
4716
  "cache.status": "miss",
4717
+ "cache.read": false
4718
+ });
4719
+ else if (ctx.mode === "refresh") mergeSpanAttributes(spanRecord, {
4720
+ "cache.status": "refresh",
4666
4721
  ...canStore ? {} : { "cache.stored": false }
4667
4722
  });
4668
- } else if (ctx.mode === "use" && canStore) mergeSpanAttributes(spanRecord, {
4669
- "cache.status": "miss",
4670
- "cache.read": false
4671
- });
4672
- else if (ctx.mode === "refresh") mergeSpanAttributes(spanRecord, {
4673
- "cache.status": "refresh",
4674
- ...canStore ? {} : { "cache.stored": false }
4675
- });
4676
- else mergeSpanAttributes(spanRecord, { "cache.status": "bypass" });
4677
- const frame = {
4678
- baseSpanIndex: scope.spans.length,
4679
- replayParentSpanId: id,
4680
- ops: []
4681
- };
4682
- scope.recordingStack.push(frame);
4683
- let bodyResult;
4684
- try {
4685
- bodyResult = await fn(activeSpan);
4686
- } finally {
4687
- scope.recordingStack.pop();
4688
- }
4689
- appendSubSpanOps(scope, frame);
4690
- finishSpanWithoutThrownError(spanRecord, realStartedAt);
4691
- if (canStore) {
4692
- const recording = {
4693
- returnValue: bodyResult,
4694
- finalAttributes: stripCacheAttributes(spanRecord.attributes),
4695
- finalStatus: spanRecord.status,
4696
- finalError: spanRecord.error,
4697
- finalErrors: spanRecord.errors,
4698
- finalWarning: spanRecord.warning,
4699
- finalWarnings: spanRecord.warnings,
4700
- ops: frame.ops
4701
- };
4702
- const entry = {
4703
- version: 1,
4704
- key: keyHash,
4705
- namespace,
4706
- operationType: "span",
4707
- operationName: info.name,
4708
- spanName: info.name,
4709
- spanKind: info.kind,
4710
- storedAt: new Date(getRealDateNowMs()).toISOString(),
4711
- recording: await serializeCacheRecording(recording, { externalJsonStore: ctx.adapter.externalJsonStore })
4723
+ else mergeSpanAttributes(spanRecord, { "cache.status": "bypass" });
4724
+ const frame = {
4725
+ baseSpanIndex: scope.spans.length,
4726
+ replayParentSpanId: id,
4727
+ ops: []
4712
4728
  };
4713
- await ctx.adapter.write(entry, {
4714
- rawKey: cacheOpts.key,
4715
- operationType: "span",
4716
- operationName: info.name
4717
- });
4729
+ scope.recordingStack.push(frame);
4730
+ let bodyResult;
4731
+ try {
4732
+ bodyResult = await fn(activeSpan);
4733
+ } finally {
4734
+ scope.recordingStack.pop();
4735
+ }
4736
+ appendSubSpanOps(scope, frame);
4737
+ finishSpanWithoutThrownError(spanRecord, realStartedAt);
4738
+ if (canStore) {
4739
+ const recording = {
4740
+ returnValue: bodyResult,
4741
+ finalAttributes: stripCacheAttributes(spanRecord.attributes),
4742
+ finalStatus: spanRecord.status,
4743
+ finalError: spanRecord.error,
4744
+ finalErrors: spanRecord.errors,
4745
+ finalWarning: spanRecord.warning,
4746
+ finalWarnings: spanRecord.warnings,
4747
+ ops: frame.ops
4748
+ };
4749
+ const entry = {
4750
+ version: 1,
4751
+ key: keyHash,
4752
+ namespace,
4753
+ operationType: "span",
4754
+ operationName: info.name,
4755
+ spanName: info.name,
4756
+ spanKind: info.kind,
4757
+ storedAt: new Date(getRealDateNowMs()).toISOString(),
4758
+ recording: await serializeCacheRecording(recording, { externalJsonStore: ctx.adapter.externalJsonStore })
4759
+ };
4760
+ await ctx.adapter.write(entry, {
4761
+ rawKey: cacheOpts.key,
4762
+ operationType: "span",
4763
+ operationName: info.name
4764
+ });
4765
+ }
4766
+ return bodyResult;
4718
4767
  }
4719
- return bodyResult;
4768
+ const result = await fn(activeSpan);
4769
+ finishSpanWithoutThrownError(spanRecord, realStartedAt);
4770
+ return result;
4771
+ } catch (error) {
4772
+ spanRecord.status = "error";
4773
+ spanRecord.endedAt = addElapsedMsToTimestamp(spanRecord.startedAt, getRealDateNowMs() - realStartedAt);
4774
+ spanRecord.error = normalizeTraceError(error);
4775
+ throw error;
4720
4776
  }
4721
- const result = await fn(activeSpan);
4722
- finishSpanWithoutThrownError(spanRecord, realStartedAt);
4723
- return result;
4724
- } catch (error) {
4725
- spanRecord.status = "error";
4726
- spanRecord.endedAt = addElapsedMsToTimestamp(spanRecord.startedAt, getRealDateNowMs() - realStartedAt);
4727
- spanRecord.error = normalizeTraceError(error);
4728
- throw error;
4729
- } finally {
4730
- scope.spanStack.pop();
4731
- scope.activeSpanStack.pop();
4732
- }
4777
+ });
4733
4778
  }
4734
4779
  function getRequiredSpanCacheNamespace(cacheOpts) {
4735
4780
  if (!isRecordLike$1(cacheOpts)) throw new Error("Cached spans require a non-empty cache.namespace");
@@ -4785,7 +4830,7 @@ const evalTracer = {
4785
4830
  if (!scope) return;
4786
4831
  scope.checkpoints.set(name, data);
4787
4832
  const id = generateSpanId();
4788
- const parentId = scope.spanStack.at(-1) ?? null;
4833
+ const parentId = getCurrentActiveSpan()?.id ?? null;
4789
4834
  scope.spans.push({
4790
4835
  id,
4791
4836
  parentId,
@@ -7169,12 +7214,16 @@ async function runCase(params) {
7169
7214
  };
7170
7215
  if (Object.keys(scoringTraces).length > 0) caseDetail.scoringTraces = scoringTraces;
7171
7216
  const elapsedMs = Date.now() - startTime;
7217
+ const cacheEntries = extractCacheEntries(displayTrace, scope.caseCacheRefs);
7218
+ const cacheHits = cacheEntries.filter((entry) => entry.status === "hit");
7172
7219
  return {
7173
7220
  caseDetail,
7174
7221
  caseRowUpdate: {
7175
7222
  tags: evalCase.tags ?? [],
7176
7223
  status,
7177
7224
  durationMs: elapsedMs,
7225
+ cacheHits: cacheHits.length,
7226
+ cacheOperations: cacheEntries.length,
7178
7227
  columns
7179
7228
  }
7180
7229
  };
@@ -7670,6 +7719,8 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
7670
7719
  tags: caseDetail.tags,
7671
7720
  status: caseRowUpdate.status ?? "pending",
7672
7721
  durationMs: caseRowUpdate.durationMs ?? null,
7722
+ cacheHits: caseRowUpdate.cacheHits ?? 0,
7723
+ cacheOperations: caseRowUpdate.cacheOperations ?? 0,
7673
7724
  columns: caseRowUpdate.columns ?? {},
7674
7725
  trial
7675
7726
  }
@@ -1,5 +1,5 @@
1
- import { n as createRunner } from "./cli-vdJYkEVk.mjs";
2
- import "./src-BRqs3kSA.mjs";
1
+ import { n as createRunner } from "./cli-Cvs7tc2v.mjs";
2
+ import "./src-Jahivm6d.mjs";
3
3
  //#region ../../apps/server/src/runner.ts
4
4
  let runnerInstance = null;
5
5
  function getRunnerInstance() {
@@ -1,2 +1,2 @@
1
- import { n as initRunner, t as getRunnerInstance } from "./runner-DJWn_7p0.mjs";
1
+ import { n as initRunner, t as getRunnerInstance } from "./runner-LdMiDmAN.mjs";
2
2
  export { getRunnerInstance, initRunner };
@@ -1,5 +1,5 @@
1
- import { It as defineEval$1, rt as matchesEvalTags$1 } from "./runOrchestration-BFdxG9ws.mjs";
2
- import "./cli-vdJYkEVk.mjs";
1
+ import { It as defineEval$1, rt as matchesEvalTags$1 } from "./runOrchestration-o38J7uZO.mjs";
2
+ import "./cli-Cvs7tc2v.mjs";
3
3
  //#region src/index.ts
4
4
  /** Register an eval definition with typed tag support. */
5
5
  function defineEval(definition) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ls-stack/agent-eval",
3
- "version": "0.47.0",
3
+ "version": "0.51.0",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "agent-evals": "./dist/bin.mjs"
@@ -33,8 +33,8 @@
33
33
  "@types/node": "^24.7.2",
34
34
  "typescript": "^5.9.2",
35
35
  "@agent-evals/runner": "0.0.1",
36
- "@agent-evals/sdk": "0.0.1",
37
- "@agent-evals/shared": "0.0.1"
36
+ "@agent-evals/shared": "0.0.1",
37
+ "@agent-evals/sdk": "0.0.1"
38
38
  },
39
39
  "scripts": {
40
40
  "build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",
@@ -400,11 +400,17 @@ cacheCreationInputTokens` so cache details are not double-counted.
400
400
  - `runLogs` (in `agent-evals.config.ts`) controls case log capture. Use
401
401
  `runLogs: { captureConsole: false }` to keep console output in the terminal
402
402
  without persisting console calls to case details. Manual `evalLog(...)` calls
403
- are still captured.
403
+ are still captured. Captured log locations store the selected user-facing
404
+ source frame and the full JavaScript stack so agents can inspect additional
405
+ frames in persisted artifacts when diagnosing where a log came from.
404
406
 
405
407
  Stats rows and history charts can be authored via `stats` / `charts` on the eval
406
408
  definition. Global `stats` in `agent-evals.config.ts` combine with eval-level
407
- stats. Usage stats and LLM usage charts are added by default unless removed with
409
+ stats. Native stat kinds include `cases`, `passRate`, `duration`, and
410
+ `cacheHits`; `cacheHits` shows Agent Eval operation-level cache hits over total
411
+ cache operations (`hits/total`) from spans and `evalTracer.cache(...)` refs, not
412
+ LLM provider prompt-cache read tokens such as `cachedInputTokens`. Usage stats
413
+ and LLM usage charts are added by default unless removed with
408
414
  `removeDefaultConfig`. Column stats can override `format` and `numberFormat`,
409
415
  otherwise they inherit from the matching column. Number formats use
410
416
  `maxDecimalPlaces` to cap decimals and `minDecimalPlaces` to pad trailing