@ls-stack/agent-eval 0.46.0 → 0.50.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,8 +25,8 @@
25
25
  href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
26
26
  rel="stylesheet"
27
27
  />
28
- <script type="module" crossorigin src="/assets/index-CaU84fHq.js"></script>
29
- <link rel="stylesheet" crossorigin href="/assets/index-B5JrV3_C.css">
28
+ <script type="module" crossorigin src="/assets/index-BkXnL_y8.js"></script>
29
+ <link rel="stylesheet" crossorigin href="/assets/index-BQY_snr3.css">
30
30
  </head>
31
31
  <body>
32
32
  <div id="root"></div>
package/dist/bin.mjs CHANGED
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env node
2
- import { t as runCli } from "./cli-vdJYkEVk.mjs";
2
+ import { t as runCli } from "./cli-R7_V6YWa.mjs";
3
3
  import { spawn } from "node:child_process";
4
4
  //#region src/bin.ts
5
5
  const moduleMocksFlag = "--experimental-test-module-mocks";
@@ -1,4 +1,4 @@
1
- import { A as validateCharts, At as buildEvalKey, C as deriveEvalFreshness, Ct as getEvalDisplayStatus, D as loadConfig, Dt as runSummarySchema, E as resolveEvalDefaultConfig, Lt as getEvalRegistry, O as buildDeclaredColumnDefs, Ot as resolveApiCallsConfig, S as parseManualInputValues, St as getEvalTitle, T as parseEvalDiscovery, Tt as matchesTagsFilter, _ as recomputePersistedCaseStatus, a as validateTagsFilters, b as resolveArtifactPath, bt as applyDerivedCallAttributes, c as getLastRunStatuses, d as loadPersistedRunSnapshots, f as nextShortIdFromSnapshots, g as recomputeEvalStatusesInRuns, h as persistRunState, i as resolveEvalTags, j as createFsCacheStore, jt as getCaseRowCaseKey, k as normalizeScoreDef, kt as resolveLlmCallsConfig, l as getLatestRunInfos, m as deleteTemporaryRuns, n as getTargetEvalKeys, o as stripTerminalControlCodes, p as persistCaseDetail, s as generateRunId, u as loadPersistedRunSnapshot, v as runTouchesEval, w as loadEvalModule, wt as deriveScopedSummaryFromCases, x as buildManualInputDescriptor, y as resolveTracePresentation } from "./runOrchestration-BFdxG9ws.mjs";
1
+ import { A as validateCharts, At as buildEvalKey, C as deriveEvalFreshness, Ct as getEvalDisplayStatus, D as loadConfig, Dt as runSummarySchema, E as resolveEvalDefaultConfig, Lt as getEvalRegistry, O as buildDeclaredColumnDefs, Ot as resolveApiCallsConfig, S as parseManualInputValues, St as getEvalTitle, T as parseEvalDiscovery, Tt as matchesTagsFilter, _ as recomputePersistedCaseStatus, a as validateTagsFilters, b as resolveArtifactPath, bt as applyDerivedCallAttributes, c as getLastRunStatuses, d as loadPersistedRunSnapshots, f as nextShortIdFromSnapshots, g as recomputeEvalStatusesInRuns, h as persistRunState, i as resolveEvalTags, j as createFsCacheStore, jt as getCaseRowCaseKey, k as normalizeScoreDef, kt as resolveLlmCallsConfig, l as getLatestRunInfos, m as deleteTemporaryRuns, n as getTargetEvalKeys, o as stripTerminalControlCodes, p as persistCaseDetail, s as generateRunId, u as loadPersistedRunSnapshot, v as runTouchesEval, w as loadEvalModule, wt as deriveScopedSummaryFromCases, x as buildManualInputDescriptor, y as resolveTracePresentation } from "./runOrchestration-CokPQet7.mjs";
2
2
  import { copyFile, mkdir, readFile, rm, writeFile } from "node:fs/promises";
3
3
  import { basename, dirname, extname, isAbsolute, join, relative, resolve, sep } from "node:path";
4
4
  import { createHash, randomUUID } from "node:crypto";
@@ -2095,8 +2095,8 @@ async function commandApp(args) {
2095
2095
  const { serve } = await import("@hono/node-server");
2096
2096
  const bundledWebDist = resolve(currentDir, "apps/web/dist");
2097
2097
  if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
2098
- const appModule = await import("./app-BZmhhSFZ.mjs");
2099
- const runnerModule = await import("./runner--aH0jO4Z.mjs");
2098
+ const appModule = await import("./app-DR9WPMA4.mjs");
2099
+ const runnerModule = await import("./runner-B8dLVAyM.mjs");
2100
2100
  if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
2101
2101
  if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
2102
2102
  await runnerModule.initRunner();
package/dist/index.d.mts CHANGED
@@ -152,6 +152,9 @@ declare const evalStatsConfigSchema$1: z$1.ZodArray<z$1.ZodDiscriminatedUnion<[z
152
152
  }, z$1.core.$strip>, z$1.ZodObject<{
153
153
  hideIfNoValue: z$1.ZodOptional<z$1.ZodBoolean>;
154
154
  kind: z$1.ZodLiteral<"duration">;
155
+ }, z$1.core.$strip>, z$1.ZodObject<{
156
+ hideIfNoValue: z$1.ZodOptional<z$1.ZodBoolean>;
157
+ kind: z$1.ZodLiteral<"cacheHits">;
155
158
  }, z$1.core.$strip>, z$1.ZodObject<{
156
159
  hideIfNoValue: z$1.ZodOptional<z$1.ZodBoolean>;
157
160
  kind: z$1.ZodLiteral<"column">;
@@ -222,6 +225,7 @@ declare const runLogEntrySchema$1: z$1.ZodObject<{
222
225
  file: z$1.ZodString;
223
226
  line: z$1.ZodNumber;
224
227
  column: z$1.ZodNumber;
228
+ stack: z$1.ZodOptional<z$1.ZodString>;
225
229
  }, z$1.core.$strip>>;
226
230
  source: z$1.ZodOptional<z$1.ZodString>;
227
231
  }, z$1.core.$strip>;
@@ -1012,13 +1016,15 @@ type EvalDefinitionBase<TInput = unknown, TOutputs extends EvalOutputs = EvalOut
1012
1016
  * Opt-in: when omitted (or empty) the EvalCard renders no stats row at all.
1013
1017
  * When provided, the stats render in order, left to right.
1014
1018
  *
1015
- * Built-in kinds (`cases`, `passRate`, `duration`, `cost`) read from the
1016
- * latest run summary. `kind: 'column'` aggregates a score or numeric output
1017
- * column across the latest run's cases — `key` must match one of the eval's
1018
- * score or column keys, and only finite numeric values participate in the
1019
- * reduction. When no case has a numeric value for the key the stat renders
1020
- * an em dash, or hides when `hideIfNoValue` is true. `label`, `format`, and
1021
- * `numberFormat` default to the matching `ColumnDef`.
1019
+ * Built-in kinds (`cases`, `passRate`, `duration`, `cacheHits`) read from
1020
+ * the latest run summary. `cacheHits` counts Agent Eval operation-level cache
1021
+ * hits over total cache operations, not LLM provider prompt-cache read
1022
+ * tokens. `kind: 'column'` aggregates a score or numeric output column across
1023
+ * the latest run's cases — `key` must match one of the eval's score or column
1024
+ * keys, and only finite numeric values participate in the reduction. When no
1025
+ * case has a numeric value for the key the stat renders an em dash, or hides
1026
+ * when `hideIfNoValue` is true. `label`, `format`, and `numberFormat` default
1027
+ * to the matching `ColumnDef`.
1022
1028
  */
1023
1029
  stats?: EvalStatsConfig$1;
1024
1030
  /**
@@ -1984,8 +1990,8 @@ declare const traceSpanSchema$1: z$1.ZodObject<{
1984
1990
  status: z$1.ZodEnum<{
1985
1991
  error: "error";
1986
1992
  running: "running";
1987
- cancelled: "cancelled";
1988
1993
  ok: "ok";
1994
+ cancelled: "cancelled";
1989
1995
  }>;
1990
1996
  attributes: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>>;
1991
1997
  error: z$1.ZodOptional<z$1.ZodObject<{
@@ -2036,6 +2042,8 @@ declare const evalStatAggregateSchema: z$1.ZodEnum<{
2036
2042
  type EvalStatAggregate = z$1.infer<typeof evalStatAggregateSchema>;
2037
2043
  /**
2038
2044
  * One entry in the EvalCard stats row. Built-in kinds use latest run totals;
2045
+ * `cacheHits` counts Agent Eval operation-level cache hits from spans and
2046
+ * `evalTracer.cache(...)` refs, not LLM provider prompt-cache read tokens.
2039
2047
  * `column` aggregates a score or numeric output column across the latest run.
2040
2048
  */
2041
2049
  declare const evalStatItemSchema: z$1.ZodDiscriminatedUnion<[z$1.ZodObject<{
@@ -2048,6 +2056,9 @@ declare const evalStatItemSchema: z$1.ZodDiscriminatedUnion<[z$1.ZodObject<{
2048
2056
  }, z$1.core.$strip>, z$1.ZodObject<{
2049
2057
  hideIfNoValue: z$1.ZodOptional<z$1.ZodBoolean>;
2050
2058
  kind: z$1.ZodLiteral<"duration">;
2059
+ }, z$1.core.$strip>, z$1.ZodObject<{
2060
+ hideIfNoValue: z$1.ZodOptional<z$1.ZodBoolean>;
2061
+ kind: z$1.ZodLiteral<"cacheHits">;
2051
2062
  }, z$1.core.$strip>, z$1.ZodObject<{
2052
2063
  hideIfNoValue: z$1.ZodOptional<z$1.ZodBoolean>;
2053
2064
  kind: z$1.ZodLiteral<"column">;
@@ -2090,6 +2101,9 @@ declare const evalStatsConfigSchema: z$1.ZodArray<z$1.ZodDiscriminatedUnion<[z$1
2090
2101
  }, z$1.core.$strip>, z$1.ZodObject<{
2091
2102
  hideIfNoValue: z$1.ZodOptional<z$1.ZodBoolean>;
2092
2103
  kind: z$1.ZodLiteral<"duration">;
2104
+ }, z$1.core.$strip>, z$1.ZodObject<{
2105
+ hideIfNoValue: z$1.ZodOptional<z$1.ZodBoolean>;
2106
+ kind: z$1.ZodLiteral<"cacheHits">;
2093
2107
  }, z$1.core.$strip>, z$1.ZodObject<{
2094
2108
  hideIfNoValue: z$1.ZodOptional<z$1.ZodBoolean>;
2095
2109
  kind: z$1.ZodLiteral<"column">;
@@ -2177,10 +2191,10 @@ declare const evalSummarySchema$1: z$1.ZodObject<{
2177
2191
  caseIds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
2178
2192
  lastRunStatus: z$1.ZodNullable<z$1.ZodEnum<{
2179
2193
  error: "error";
2180
- pass: "pass";
2181
- fail: "fail";
2182
2194
  running: "running";
2183
2195
  cancelled: "cancelled";
2196
+ pass: "pass";
2197
+ fail: "fail";
2184
2198
  unscored: "unscored";
2185
2199
  }>>;
2186
2200
  stats: z$1.ZodOptional<z$1.ZodArray<z$1.ZodDiscriminatedUnion<[z$1.ZodObject<{
@@ -2193,6 +2207,9 @@ declare const evalSummarySchema$1: z$1.ZodObject<{
2193
2207
  }, z$1.core.$strip>, z$1.ZodObject<{
2194
2208
  hideIfNoValue: z$1.ZodOptional<z$1.ZodBoolean>;
2195
2209
  kind: z$1.ZodLiteral<"duration">;
2210
+ }, z$1.core.$strip>, z$1.ZodObject<{
2211
+ hideIfNoValue: z$1.ZodOptional<z$1.ZodBoolean>;
2212
+ kind: z$1.ZodLiteral<"cacheHits">;
2196
2213
  }, z$1.core.$strip>, z$1.ZodObject<{
2197
2214
  hideIfNoValue: z$1.ZodOptional<z$1.ZodBoolean>;
2198
2215
  kind: z$1.ZodLiteral<"column">;
@@ -2396,13 +2413,15 @@ declare const caseRowSchema$1: z$1.ZodObject<{
2396
2413
  tags: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
2397
2414
  status: z$1.ZodEnum<{
2398
2415
  error: "error";
2399
- pass: "pass";
2400
- fail: "fail";
2401
2416
  running: "running";
2402
2417
  cancelled: "cancelled";
2418
+ pass: "pass";
2419
+ fail: "fail";
2403
2420
  pending: "pending";
2404
2421
  }>;
2405
2422
  durationMs: z$1.ZodNullable<z$1.ZodNumber>;
2423
+ cacheHits: z$1.ZodOptional<z$1.ZodNumber>;
2424
+ cacheOperations: z$1.ZodOptional<z$1.ZodNumber>;
2406
2425
  costUsd: z$1.ZodOptional<z$1.ZodNullable<z$1.ZodNumber>>;
2407
2426
  columns: z$1.ZodRecord<z$1.ZodString, z$1.ZodUnion<readonly [z$1.ZodType<string | number | boolean | Record<string, unknown> | unknown[] | null, unknown, z$1.core.$ZodTypeInternals<string | number | boolean | Record<string, unknown> | unknown[] | null, unknown>>, z$1.ZodUnion<readonly [z$1.ZodObject<{
2408
2427
  source: z$1.ZodLiteral<"repo">;
@@ -2449,8 +2468,9 @@ declare const runLogLocationSchema: z$1.ZodObject<{
2449
2468
  file: z$1.ZodString;
2450
2469
  line: z$1.ZodNumber;
2451
2470
  column: z$1.ZodNumber;
2471
+ stack: z$1.ZodOptional<z$1.ZodString>;
2452
2472
  }, z$1.core.$strip>;
2453
- /** Best-effort source location for one captured case log. */
2473
+ /** Best-effort source location and captured stack for one case log. */
2454
2474
  type RunLogLocation = z$1.infer<typeof runLogLocationSchema>;
2455
2475
  /** Schema for one persisted log entry captured during a case run. */
2456
2476
  declare const runLogEntrySchema: z$1.ZodObject<{
@@ -2474,6 +2494,7 @@ declare const runLogEntrySchema: z$1.ZodObject<{
2474
2494
  file: z$1.ZodString;
2475
2495
  line: z$1.ZodNumber;
2476
2496
  column: z$1.ZodNumber;
2497
+ stack: z$1.ZodOptional<z$1.ZodString>;
2477
2498
  }, z$1.core.$strip>>;
2478
2499
  source: z$1.ZodOptional<z$1.ZodString>;
2479
2500
  }, z$1.core.$strip>;
@@ -2492,8 +2513,8 @@ declare const scoreTraceSchema: z$1.ZodObject<{
2492
2513
  status: z$1.ZodEnum<{
2493
2514
  error: "error";
2494
2515
  running: "running";
2495
- cancelled: "cancelled";
2496
2516
  ok: "ok";
2517
+ cancelled: "cancelled";
2497
2518
  }>;
2498
2519
  attributes: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>>;
2499
2520
  error: z$1.ZodOptional<z$1.ZodObject<{
@@ -2561,10 +2582,10 @@ declare const caseDetailSchema$1: z$1.ZodObject<{
2561
2582
  tags: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
2562
2583
  status: z$1.ZodEnum<{
2563
2584
  error: "error";
2564
- pass: "pass";
2565
- fail: "fail";
2566
2585
  running: "running";
2567
2586
  cancelled: "cancelled";
2587
+ pass: "pass";
2588
+ fail: "fail";
2568
2589
  pending: "pending";
2569
2590
  }>;
2570
2591
  input: z$1.ZodUnknown;
@@ -2579,8 +2600,8 @@ declare const caseDetailSchema$1: z$1.ZodObject<{
2579
2600
  status: z$1.ZodEnum<{
2580
2601
  error: "error";
2581
2602
  running: "running";
2582
- cancelled: "cancelled";
2583
2603
  ok: "ok";
2604
+ cancelled: "cancelled";
2584
2605
  }>;
2585
2606
  attributes: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>>;
2586
2607
  error: z$1.ZodOptional<z$1.ZodObject<{
@@ -2648,8 +2669,8 @@ declare const caseDetailSchema$1: z$1.ZodObject<{
2648
2669
  status: z$1.ZodEnum<{
2649
2670
  error: "error";
2650
2671
  running: "running";
2651
- cancelled: "cancelled";
2652
2672
  ok: "ok";
2673
+ cancelled: "cancelled";
2653
2674
  }>;
2654
2675
  attributes: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>>;
2655
2676
  error: z$1.ZodOptional<z$1.ZodObject<{
@@ -2746,6 +2767,7 @@ declare const caseDetailSchema$1: z$1.ZodObject<{
2746
2767
  file: z$1.ZodString;
2747
2768
  line: z$1.ZodNumber;
2748
2769
  column: z$1.ZodNumber;
2770
+ stack: z$1.ZodOptional<z$1.ZodString>;
2749
2771
  }, z$1.core.$strip>>;
2750
2772
  source: z$1.ZodOptional<z$1.ZodString>;
2751
2773
  }, z$1.core.$strip>>>;
@@ -2761,10 +2783,10 @@ declare const caseDetailSchema$1: z$1.ZodObject<{
2761
2783
  namespace: z$1.ZodString;
2762
2784
  key: z$1.ZodString;
2763
2785
  status: z$1.ZodEnum<{
2786
+ bypass: "bypass";
2787
+ refresh: "refresh";
2764
2788
  hit: "hit";
2765
2789
  miss: "miss";
2766
- refresh: "refresh";
2767
- bypass: "bypass";
2768
2790
  }>;
2769
2791
  read: z$1.ZodOptional<z$1.ZodBoolean>;
2770
2792
  stored: z$1.ZodOptional<z$1.ZodBoolean>;
@@ -3137,9 +3159,9 @@ declare const runManifestSchema$1: z$1.ZodObject<{
3137
3159
  median: "median";
3138
3160
  }>>>;
3139
3161
  cacheMode: z$1.ZodOptional<z$1.ZodEnum<{
3140
- refresh: "refresh";
3141
- bypass: "bypass";
3142
3162
  use: "use";
3163
+ bypass: "bypass";
3164
+ refresh: "refresh";
3143
3165
  }>>;
3144
3166
  }, z$1.core.$strip>;
3145
3167
  /** Persisted lifecycle metadata for a single eval run. */
@@ -3189,6 +3211,21 @@ type ScopedCaseSummary = {
3189
3211
  pendingCases: number;
3190
3212
  runningCases: number;
3191
3213
  totalDurationMs: number | null;
3214
+ /**
3215
+ * Sum of Agent Eval operation-level cache hits across the scoped case rows.
3216
+ *
3217
+ * Missing values from older run artifacts count as zero. This is separate
3218
+ * from LLM prompt-cache token reads such as `cachedInputTokens`.
3219
+ */
3220
+ cacheHits: number;
3221
+ /**
3222
+ * Sum of Agent Eval operation-level cache activity entries across the scoped
3223
+ * case rows.
3224
+ *
3225
+ * This is the denominator for `cacheHits`. Missing values from older run
3226
+ * artifacts count as zero.
3227
+ */
3228
+ cacheOperations: number;
3192
3229
  };
3193
3230
  //#endregion
3194
3231
  //#region src/evalStatus.d.ts
@@ -4026,9 +4063,9 @@ declare function extractApiCalls(spans: EvalTraceSpan$1[], config: ResolvedApiCa
4026
4063
  * - `refresh`: never read, always write (forces re-execution and overwrites).
4027
4064
  */
4028
4065
  declare const cacheModeSchema: z$1.ZodEnum<{
4029
- refresh: "refresh";
4030
- bypass: "bypass";
4031
4066
  use: "use";
4067
+ bypass: "bypass";
4068
+ refresh: "refresh";
4032
4069
  }>;
4033
4070
  /** Mode controlling how cached spans behave during a run. */
4034
4071
  type CacheMode = z$1.infer<typeof cacheModeSchema>;
@@ -4049,10 +4086,10 @@ declare const cacheOperationTypeSchema: z$1.ZodEnum<{
4049
4086
  type CacheOperationType = z$1.infer<typeof cacheOperationTypeSchema>;
4050
4087
  /** Status of a cache lookup recorded on a span or case scope. */
4051
4088
  declare const cacheStatusSchema: z$1.ZodEnum<{
4089
+ bypass: "bypass";
4090
+ refresh: "refresh";
4052
4091
  hit: "hit";
4053
4092
  miss: "miss";
4054
- refresh: "refresh";
4055
- bypass: "bypass";
4056
4093
  }>;
4057
4094
  /** Status of a cache lookup recorded on a span or case scope. */
4058
4095
  type CacheStatus = z$1.infer<typeof cacheStatusSchema>;
@@ -4069,10 +4106,10 @@ declare const traceCacheRefSchema: z$1.ZodObject<{
4069
4106
  namespace: z$1.ZodString;
4070
4107
  key: z$1.ZodString;
4071
4108
  status: z$1.ZodEnum<{
4109
+ bypass: "bypass";
4110
+ refresh: "refresh";
4072
4111
  hit: "hit";
4073
4112
  miss: "miss";
4074
- refresh: "refresh";
4075
- bypass: "bypass";
4076
4113
  }>;
4077
4114
  read: z$1.ZodOptional<z$1.ZodBoolean>;
4078
4115
  stored: z$1.ZodOptional<z$1.ZodBoolean>;
@@ -4149,8 +4186,8 @@ declare const cacheRecordingSchema: z$1.ZodObject<{
4149
4186
  finalStatus: z$1.ZodOptional<z$1.ZodEnum<{
4150
4187
  error: "error";
4151
4188
  running: "running";
4152
- cancelled: "cancelled";
4153
4189
  ok: "ok";
4190
+ cancelled: "cancelled";
4154
4191
  }>>;
4155
4192
  finalError: z$1.ZodOptional<z$1.ZodObject<{
4156
4193
  name: z$1.ZodOptional<z$1.ZodString>;
@@ -4222,8 +4259,8 @@ declare const cacheEntrySchema: z$1.ZodObject<{
4222
4259
  finalStatus: z$1.ZodOptional<z$1.ZodEnum<{
4223
4260
  error: "error";
4224
4261
  running: "running";
4225
- cancelled: "cancelled";
4226
4262
  ok: "ok";
4263
+ cancelled: "cancelled";
4227
4264
  }>>;
4228
4265
  finalError: z$1.ZodOptional<z$1.ZodObject<{
4229
4266
  name: z$1.ZodOptional<z$1.ZodString>;
@@ -4312,8 +4349,8 @@ declare const cacheDebugKeyEntrySchema: z$1.ZodObject<{
4312
4349
  finalStatus: z$1.ZodOptional<z$1.ZodEnum<{
4313
4350
  error: "error";
4314
4351
  running: "running";
4315
- cancelled: "cancelled";
4316
4352
  ok: "ok";
4353
+ cancelled: "cancelled";
4317
4354
  }>>;
4318
4355
  finalError: z$1.ZodOptional<z$1.ZodObject<{
4319
4356
  name: z$1.ZodOptional<z$1.ZodString>;
@@ -4391,8 +4428,8 @@ declare const cacheEntryWithDebugKeySchema$1: z$1.ZodObject<{
4391
4428
  finalStatus: z$1.ZodOptional<z$1.ZodEnum<{
4392
4429
  error: "error";
4393
4430
  running: "running";
4394
- cancelled: "cancelled";
4395
4431
  ok: "ok";
4432
+ cancelled: "cancelled";
4396
4433
  }>>;
4397
4434
  finalError: z$1.ZodOptional<z$1.ZodObject<{
4398
4435
  name: z$1.ZodOptional<z$1.ZodString>;
@@ -4472,8 +4509,8 @@ declare const cacheEntryWithDebugKeySchema$1: z$1.ZodObject<{
4472
4509
  finalStatus: z$1.ZodOptional<z$1.ZodEnum<{
4473
4510
  error: "error";
4474
4511
  running: "running";
4475
- cancelled: "cancelled";
4476
4512
  ok: "ok";
4513
+ cancelled: "cancelled";
4477
4514
  }>>;
4478
4515
  finalError: z$1.ZodOptional<z$1.ZodObject<{
4479
4516
  name: z$1.ZodOptional<z$1.ZodString>;
@@ -4551,8 +4588,8 @@ declare const cacheFileSchema: z$1.ZodObject<{
4551
4588
  finalStatus: z$1.ZodOptional<z$1.ZodEnum<{
4552
4589
  error: "error";
4553
4590
  running: "running";
4554
- cancelled: "cancelled";
4555
4591
  ok: "ok";
4592
+ cancelled: "cancelled";
4556
4593
  }>>;
4557
4594
  finalError: z$1.ZodOptional<z$1.ZodObject<{
4558
4595
  name: z$1.ZodOptional<z$1.ZodString>;
@@ -4640,8 +4677,8 @@ declare const cacheDebugKeyFileSchema: z$1.ZodObject<{
4640
4677
  finalStatus: z$1.ZodOptional<z$1.ZodEnum<{
4641
4678
  error: "error";
4642
4679
  running: "running";
4643
- cancelled: "cancelled";
4644
4680
  ok: "ok";
4681
+ cancelled: "cancelled";
4645
4682
  }>>;
4646
4683
  finalError: z$1.ZodOptional<z$1.ZodObject<{
4647
4684
  name: z$1.ZodOptional<z$1.ZodString>;
@@ -4811,9 +4848,9 @@ declare const createRunRequestSchema$1: z$1.ZodObject<{
4811
4848
  temporary: z$1.ZodOptional<z$1.ZodBoolean>;
4812
4849
  cache: z$1.ZodOptional<z$1.ZodObject<{
4813
4850
  mode: z$1.ZodDefault<z$1.ZodEnum<{
4814
- refresh: "refresh";
4815
- bypass: "bypass";
4816
4851
  use: "use";
4852
+ bypass: "bypass";
4853
+ refresh: "refresh";
4817
4854
  }>>;
4818
4855
  }, z$1.core.$strip>>;
4819
4856
  manualInputs: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>>;
@@ -5148,6 +5185,9 @@ declare const evalSummarySchema: z$1.ZodObject<{
5148
5185
  }, z$1.core.$strip>, z$1.ZodObject<{
5149
5186
  hideIfNoValue: z$1.ZodOptional<z$1.ZodBoolean>;
5150
5187
  kind: z$1.ZodLiteral<"duration">;
5188
+ }, z$1.core.$strip>, z$1.ZodObject<{
5189
+ hideIfNoValue: z$1.ZodOptional<z$1.ZodBoolean>;
5190
+ kind: z$1.ZodLiteral<"cacheHits">;
5151
5191
  }, z$1.core.$strip>, z$1.ZodObject<{
5152
5192
  hideIfNoValue: z$1.ZodOptional<z$1.ZodBoolean>;
5153
5193
  kind: z$1.ZodLiteral<"column">;
@@ -5358,6 +5398,8 @@ declare const caseRowSchema: z$1.ZodObject<{
5358
5398
  pending: "pending";
5359
5399
  }>;
5360
5400
  durationMs: z$1.ZodNullable<z$1.ZodNumber>;
5401
+ cacheHits: z$1.ZodOptional<z$1.ZodNumber>;
5402
+ cacheOperations: z$1.ZodOptional<z$1.ZodNumber>;
5361
5403
  costUsd: z$1.ZodOptional<z$1.ZodNullable<z$1.ZodNumber>>;
5362
5404
  columns: z$1.ZodRecord<z$1.ZodString, z$1.ZodUnion<readonly [z$1.ZodType<string | number | boolean | Record<string, unknown> | unknown[] | null, unknown, z$1.core.$ZodTypeInternals<string | number | boolean | Record<string, unknown> | unknown[] | null, unknown>>, z$1.ZodUnion<readonly [z$1.ZodObject<{
5363
5405
  source: z$1.ZodLiteral<"repo">;
@@ -5567,6 +5609,7 @@ declare const caseDetailSchema: z$1.ZodObject<{
5567
5609
  file: z$1.ZodString;
5568
5610
  line: z$1.ZodNumber;
5569
5611
  column: z$1.ZodNumber;
5612
+ stack: z$1.ZodOptional<z$1.ZodString>;
5570
5613
  }, z$1.core.$strip>>;
5571
5614
  source: z$1.ZodOptional<z$1.ZodString>;
5572
5615
  }, z$1.core.$strip>>>;
package/dist/index.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { $ as getCurrentScope, B as deserializeCacheValue, F as evalSpan, G as readManualInputFile, H as serializeCacheValue, I as evalTracer, J as appendToEvalOutput, K as evalExpect, L as hashCacheKey, Lt as getEvalRegistry, M as z, N as buildTraceTree, P as captureEvalSpanError, Q as evalTime, R as hashCacheKeySync, U as repoFile, V as serializeCacheRecording, W as manualInputFileValueSchema, X as evalAssert, Z as evalLog, _t as extractLlmCalls, at as nextEvalId, ct as runInExistingEvalScope, dt as startEvalBackgroundJob, et as getEvalCaseInput, gt as extractApiCalls, ht as extractCacheHits, it as mergeEvalOutput, lt as setEvalOutput, mt as extractCacheEntries, nt as isInEvalScope, ot as runInEvalRuntimeScope, q as EvalAssertionError, st as runInEvalScope, tt as incrementEvalOutput, ut as setScopeCacheContext, vt as simulateLlmCallCost, xt as getNestedAttribute, yt as simulateTokenAllocation, z as deserializeCacheRecording } from "./runOrchestration-BFdxG9ws.mjs";
2
- import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-vdJYkEVk.mjs";
3
- import { n as matchesEvalTags, t as defineEval } from "./src-BRqs3kSA.mjs";
1
+ import { $ as getCurrentScope, B as deserializeCacheValue, F as evalSpan, G as readManualInputFile, H as serializeCacheValue, I as evalTracer, J as appendToEvalOutput, K as evalExpect, L as hashCacheKey, Lt as getEvalRegistry, M as z, N as buildTraceTree, P as captureEvalSpanError, Q as evalTime, R as hashCacheKeySync, U as repoFile, V as serializeCacheRecording, W as manualInputFileValueSchema, X as evalAssert, Z as evalLog, _t as extractLlmCalls, at as nextEvalId, ct as runInExistingEvalScope, dt as startEvalBackgroundJob, et as getEvalCaseInput, gt as extractApiCalls, ht as extractCacheHits, it as mergeEvalOutput, lt as setEvalOutput, mt as extractCacheEntries, nt as isInEvalScope, ot as runInEvalRuntimeScope, q as EvalAssertionError, st as runInEvalScope, tt as incrementEvalOutput, ut as setScopeCacheContext, vt as simulateLlmCallCost, xt as getNestedAttribute, yt as simulateTokenAllocation, z as deserializeCacheRecording } from "./runOrchestration-CokPQet7.mjs";
2
+ import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-R7_V6YWa.mjs";
3
+ import { n as matchesEvalTags, t as defineEval } from "./src-B43qR0Ea.mjs";
4
4
  export { EvalAssertionError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
package/dist/runChild.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { At as buildEvalKey, D as loadConfig, Dt as runSummarySchema, Et as runManifestSchema, Ft as columnDefSchema, Mt as evalStatsConfigSchema, Nt as manualInputDescriptorSchema, Pt as evalChartsConfigSchema, T as parseEvalDiscovery, Y as configureEvalRunLogs, ft as createRunRequestSchema, h as persistRunState, j as createFsCacheStore, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-BFdxG9ws.mjs";
1
+ import { At as buildEvalKey, D as loadConfig, Dt as runSummarySchema, Et as runManifestSchema, Ft as columnDefSchema, Mt as evalStatsConfigSchema, Nt as manualInputDescriptorSchema, Pt as evalChartsConfigSchema, T as parseEvalDiscovery, Y as configureEvalRunLogs, ft as createRunRequestSchema, h as persistRunState, j as createFsCacheStore, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-CokPQet7.mjs";
2
2
  import { z } from "zod/v4";
3
3
  import { readFile } from "node:fs/promises";
4
4
  import { relative } from "node:path";
@@ -621,6 +621,8 @@ const hideIfNoValueShape = {
621
621
  hideIfNoValue: z.boolean().optional() };
622
622
  /**
623
623
  * One entry in the EvalCard stats row. Built-in kinds use latest run totals;
624
+ * `cacheHits` counts Agent Eval operation-level cache hits from spans and
625
+ * `evalTracer.cache(...)` refs, not LLM provider prompt-cache read tokens.
624
626
  * `column` aggregates a score or numeric output column across the latest run.
625
627
  */
626
628
  const evalStatItemSchema = z.discriminatedUnion("kind", [
@@ -637,6 +639,10 @@ const evalStatItemSchema = z.discriminatedUnion("kind", [
637
639
  kind: z.literal("duration"),
638
640
  ...hideIfNoValueShape
639
641
  }),
642
+ z.object({
643
+ kind: z.literal("cacheHits"),
644
+ ...hideIfNoValueShape
645
+ }),
640
646
  z.object({
641
647
  kind: z.literal("column"),
642
648
  key: z.string(),
@@ -731,6 +737,23 @@ const caseRowSchema = z.object({
731
737
  ]),
732
738
  /** Elapsed case execution duration in milliseconds, or null before completion. */
733
739
  durationMs: z.number().nullable(),
740
+ /**
741
+ * Agent Eval operation-level cache hits recorded for this case.
742
+ *
743
+ * This counts persisted operation cache hits from spans and
744
+ * `evalTracer.cache(...)` refs. It does not count LLM provider prompt-cache
745
+ * read tokens such as `cachedInputTokens`. Older run artifacts may omit it
746
+ * and should be treated as zero by aggregate readers.
747
+ */
748
+ cacheHits: z.number().optional(),
749
+ /**
750
+ * Agent Eval operation-level cache activity entries recorded for this case.
751
+ *
752
+ * This is the denominator for `cacheHits`, counting hits plus misses and
753
+ * refreshes that appear in the Cache tab. Older run artifacts may omit it
754
+ * and should be treated as zero by aggregate readers.
755
+ */
756
+ cacheOperations: z.number().optional(),
734
757
  costUsd: z.number().nullable().optional(),
735
758
  columns: z.record(z.string(), cellValueSchema),
736
759
  /** Winning trial index for the persisted case result. */
@@ -771,7 +794,13 @@ const runLogLocationSchema = z.object({
771
794
  /** 1-based source line reported by the JavaScript stack frame. */
772
795
  line: z.number(),
773
796
  /** 1-based source column reported by the JavaScript stack frame. */
774
- column: z.number()
797
+ column: z.number(),
798
+ /**
799
+ * Full JavaScript stack captured when the log was emitted.
800
+ *
801
+ * Older run artifacts may only include the primary file, line, and column.
802
+ */
803
+ stack: z.string().optional()
775
804
  });
776
805
  /** Schema for one persisted log entry captured during a case run. */
777
806
  const runLogEntrySchema = z.object({
@@ -1692,6 +1721,8 @@ function deriveScopedSummaryFromCases(params) {
1692
1721
  let runningCases = 0;
1693
1722
  let totalDurationMs = 0;
1694
1723
  let hasDuration = false;
1724
+ let cacheHits = 0;
1725
+ let cacheOperations = 0;
1695
1726
  for (const caseRow of caseRows) {
1696
1727
  if (caseRow.status === "pass") passedCases += 1;
1697
1728
  else if (caseRow.status === "fail") failedCases += 1;
@@ -1703,6 +1734,8 @@ function deriveScopedSummaryFromCases(params) {
1703
1734
  totalDurationMs += caseRow.durationMs;
1704
1735
  hasDuration = true;
1705
1736
  }
1737
+ cacheHits += caseRow.cacheHits ?? 0;
1738
+ cacheOperations += caseRow.cacheOperations ?? 0;
1706
1739
  }
1707
1740
  return {
1708
1741
  status: deriveStatusFromCaseRows({
@@ -1716,7 +1749,9 @@ function deriveScopedSummaryFromCases(params) {
1716
1749
  cancelledCases,
1717
1750
  pendingCases,
1718
1751
  runningCases,
1719
- totalDurationMs: hasDuration ? totalDurationMs : null
1752
+ totalDurationMs: hasDuration ? totalDurationMs : null,
1753
+ cacheHits,
1754
+ cacheOperations
1720
1755
  };
1721
1756
  }
1722
1757
  //#endregion
@@ -2787,7 +2822,8 @@ function normalizeStackFile(value) {
2787
2822
  return decodeURIComponent(value.replace(fileUrlPrefixPattern, ""));
2788
2823
  }
2789
2824
  function isInternalLogFrame(file) {
2790
- return file.includes("/packages/sdk/src/runtime.ts") || file.includes("/node:internal/") || file.startsWith("node:internal/");
2825
+ const normalizedFile = file.replaceAll("\\", "/");
2826
+ return normalizedFile.includes("/packages/sdk/src/runtime.ts") || normalizedFile.includes("/packages/sdk/dist/") || normalizedFile.includes("/node_modules/@agent-evals/sdk/dist/") || normalizedFile.includes("/node_modules/@ls-stack/agent-eval/dist/") || normalizedFile.includes("/node:internal/") || normalizedFile.startsWith("node:internal/");
2791
2827
  }
2792
2828
  function parseStackFrameLocation(line) {
2793
2829
  const match = stackFrameLocationPattern.exec(line.trim());
@@ -2808,7 +2844,10 @@ function getLogLocation() {
2808
2844
  for (const line of stack.split("\n").slice(1)) {
2809
2845
  const location = parseStackFrameLocation(line);
2810
2846
  if (location === null || isInternalLogFrame(location.file)) continue;
2811
- return location;
2847
+ return {
2848
+ ...location,
2849
+ stack
2850
+ };
2812
2851
  }
2813
2852
  }
2814
2853
  function recordEvalLog(level, args) {
@@ -7169,12 +7208,16 @@ async function runCase(params) {
7169
7208
  };
7170
7209
  if (Object.keys(scoringTraces).length > 0) caseDetail.scoringTraces = scoringTraces;
7171
7210
  const elapsedMs = Date.now() - startTime;
7211
+ const cacheEntries = extractCacheEntries(displayTrace, scope.caseCacheRefs);
7212
+ const cacheHits = cacheEntries.filter((entry) => entry.status === "hit");
7172
7213
  return {
7173
7214
  caseDetail,
7174
7215
  caseRowUpdate: {
7175
7216
  tags: evalCase.tags ?? [],
7176
7217
  status,
7177
7218
  durationMs: elapsedMs,
7219
+ cacheHits: cacheHits.length,
7220
+ cacheOperations: cacheEntries.length,
7178
7221
  columns
7179
7222
  }
7180
7223
  };
@@ -7670,6 +7713,8 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
7670
7713
  tags: caseDetail.tags,
7671
7714
  status: caseRowUpdate.status ?? "pending",
7672
7715
  durationMs: caseRowUpdate.durationMs ?? null,
7716
+ cacheHits: caseRowUpdate.cacheHits ?? 0,
7717
+ cacheOperations: caseRowUpdate.cacheOperations ?? 0,
7673
7718
  columns: caseRowUpdate.columns ?? {},
7674
7719
  trial
7675
7720
  }
@@ -1,2 +1,2 @@
1
- import { n as initRunner, t as getRunnerInstance } from "./runner-DJWn_7p0.mjs";
1
+ import { n as initRunner, t as getRunnerInstance } from "./runner-Coc9wBWz.mjs";
2
2
  export { getRunnerInstance, initRunner };
@@ -1,5 +1,5 @@
1
- import { n as createRunner } from "./cli-vdJYkEVk.mjs";
2
- import "./src-BRqs3kSA.mjs";
1
+ import { n as createRunner } from "./cli-R7_V6YWa.mjs";
2
+ import "./src-B43qR0Ea.mjs";
3
3
  //#region ../../apps/server/src/runner.ts
4
4
  let runnerInstance = null;
5
5
  function getRunnerInstance() {
@@ -1,5 +1,5 @@
1
- import { It as defineEval$1, rt as matchesEvalTags$1 } from "./runOrchestration-BFdxG9ws.mjs";
2
- import "./cli-vdJYkEVk.mjs";
1
+ import { It as defineEval$1, rt as matchesEvalTags$1 } from "./runOrchestration-CokPQet7.mjs";
2
+ import "./cli-R7_V6YWa.mjs";
3
3
  //#region src/index.ts
4
4
  /** Register an eval definition with typed tag support. */
5
5
  function defineEval(definition) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ls-stack/agent-eval",
3
- "version": "0.46.0",
3
+ "version": "0.50.0",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "agent-evals": "./dist/bin.mjs"
@@ -400,11 +400,17 @@ cacheCreationInputTokens` so cache details are not double-counted.
400
400
  - `runLogs` (in `agent-evals.config.ts`) controls case log capture. Use
401
401
  `runLogs: { captureConsole: false }` to keep console output in the terminal
402
402
  without persisting console calls to case details. Manual `evalLog(...)` calls
403
- are still captured.
403
+ are still captured. Captured log locations store the selected user-facing
404
+ source frame and the full JavaScript stack so agents can inspect additional
405
+ frames in persisted artifacts when diagnosing where a log came from.
404
406
 
405
407
  Stats rows and history charts can be authored via `stats` / `charts` on the eval
406
408
  definition. Global `stats` in `agent-evals.config.ts` combine with eval-level
407
- stats. Usage stats and LLM usage charts are added by default unless removed with
409
+ stats. Native stat kinds include `cases`, `passRate`, `duration`, and
410
+ `cacheHits`; `cacheHits` shows Agent Eval operation-level cache hits over total
411
+ cache operations (`hits/total`) from spans and `evalTracer.cache(...)` refs, not
412
+ LLM provider prompt-cache read tokens such as `cachedInputTokens`. Usage stats
413
+ and LLM usage charts are added by default unless removed with
408
414
  `removeDefaultConfig`. Column stats can override `format` and `numberFormat`,
409
415
  otherwise they inherit from the matching column. Number formats use
410
416
  `maxDecimalPlaces` to cap decimals and `minDecimalPlaces` to pad trailing