npm - @ls-stack/agent-eval - Versions diffs - 0.7.0 → 0.9.0 - Mend

@ls-stack/agent-eval 0.7.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

package/dist/{app-TjV5nDMM.mjs → app-hkNNN_jn.mjs} +53 -5
package/dist/apps/web/dist/assets/index-ChgByJbI.css +1 -0
package/dist/apps/web/dist/assets/index-CmY0_D5Z.js +113 -0
package/dist/apps/web/dist/index.html +2 -2
package/dist/bin.mjs +1 -1
package/dist/{cli-BTtgQLjB.mjs → cli-DrPk66xh.mjs} +13 -4
package/dist/index.d.mts +466 -78
package/dist/index.mjs +4 -4
package/dist/runChild.mjs +3 -2
package/dist/{runOrchestration-HaMahl6b.mjs → runOrchestration-DA4Rh5g0.mjs} +2379 -179
package/dist/{runner-CBDZos0Z.mjs → runner-BzT3B9OF.mjs} +1 -1
package/dist/{runner-DGVoOyJt.mjs → runner-DTP5Ui4_.mjs} +2 -2
package/dist/src-CfprG1RW.mjs +3 -0
package/package.json +3 -3
package/dist/apps/web/dist/assets/index-ClE28i5w.css +0 -1
package/dist/apps/web/dist/assets/index-gGumCEnD.js +0 -112
package/dist/src-Bt5Fz9HS.mjs +0 -3

package/dist/index.d.mts CHANGED

@@ -913,6 +913,20 @@ declare const caseDetailSchema: z$1.ZodObject<{
     stack: z$1.ZodOptional<z$1.ZodString>;
   }, z$1.core.$strip>>;
   trial: z$1.ZodNumber;
+  cacheRefs: z$1.ZodDefault<z$1.ZodArray<z$1.ZodObject<{
+    type: z$1.ZodLiteral<"value">;
+    name: z$1.ZodString;
+    namespace: z$1.ZodString;
+    key: z$1.ZodString;
+    status: z$1.ZodEnum<{
+      hit: "hit";
+      miss: "miss";
+      refresh: "refresh";
+      bypass: "bypass";
+    }>;
+    storedAt: z$1.ZodOptional<z$1.ZodString>;
+    age: z$1.ZodOptional<z$1.ZodNumber>;
+  }, z$1.core.$strip>>>;
 }, z$1.core.$strip>;
 /** Full case payload including inputs, trace, outputs, and failures. */
 type CaseDetail = z$1.infer<typeof caseDetailSchema>;
@@ -1363,60 +1377,16 @@ type EvalTitleLike = {
  */
 declare function getEvalTitle(evalLike: EvalTitleLike): string;
 //#endregion
-//#region ../shared/src/schemas/sse.d.ts
-declare const sseEventTypeSchema: z$1.ZodEnum<{
-  "discovery.updated": "discovery.updated";
-  "run.started": "run.started";
-  "run.summary": "run.summary";
-  "case.started": "case.started";
-  "case.updated": "case.updated";
-  "case.finished": "case.finished";
-  "trace.span": "trace.span";
-  "run.finished": "run.finished";
-  "run.cancelled": "run.cancelled";
-  "run.error": "run.error";
-}>;
-/** Server-sent event name emitted by the runner or backend. */
-type SseEventType = z$1.infer<typeof sseEventTypeSchema>;
-/** Schema for the SSE envelope used to stream run updates to clients. */
-declare const sseEnvelopeSchema: z$1.ZodObject<{
-  type: z$1.ZodString;
-  runId: z$1.ZodOptional<z$1.ZodString>;
-  timestamp: z$1.ZodString;
-  payload: z$1.ZodUnknown;
-}, z$1.core.$strip>;
-/** Wire format for a streamed event emitted during eval execution. */
-type SseEnvelope = z$1.infer<typeof sseEnvelopeSchema>;
-//#endregion
-//#region ../shared/src/schemas/api.d.ts
-/** Schema for the API request that starts a new eval run. */
-declare const createRunRequestSchema: z$1.ZodObject<{
-  target: z$1.ZodObject<{
-    mode: z$1.ZodEnum<{
-      all: "all";
-      evalIds: "evalIds";
-      caseIds: "caseIds";
-    }>;
-    evalIds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
-    caseIds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
-  }, z$1.core.$strip>;
-  trials: z$1.ZodNumber;
-  cache: z$1.ZodOptional<z$1.ZodObject<{
-    mode: z$1.ZodDefault<z$1.ZodEnum<{
-      use: "use";
-      bypass: "bypass";
-      refresh: "refresh";
-    }>>;
-  }, z$1.core.$strip>>;
-}, z$1.core.$strip>;
-/** Request payload accepted by the run creation endpoint. */
-type CreateRunRequest = z$1.infer<typeof createRunRequestSchema>;
-/** Schema for updating a UI-authored manual score on one persisted case. */
-declare const updateManualScoreRequestSchema: z$1.ZodObject<{
-  value: z$1.ZodNullable<z$1.ZodNumber>;
-}, z$1.core.$strip>;
-/** Request payload accepted by the manual score update endpoint. */
-type UpdateManualScoreRequest = z$1.infer<typeof updateManualScoreRequestSchema>;
+//#region ../shared/src/utils/getNestedAttribute.d.ts
+/**
+ * Read a value from `source` by walking a dot-separated path.
+ *
+ * Returns `undefined` when any segment of the path is missing or when an
+ * intermediate value is not a plain object. Used by trace-attribute display,
+ * the LLM calls extractor, and any consumer that needs to look up nested
+ * properties from a span's `attributes` record.
+ */
+declare function getNestedAttribute(value: unknown, path: string): unknown;
 //#endregion
 //#region ../shared/src/schemas/config.d.ts
 /** Strategy used to collapse repeated trials into one stored case result. */
@@ -1426,6 +1396,144 @@ declare const trialSelectionModeSchema: z$1.ZodEnum<{
 }>;
 /** Strategy used to collapse repeated trials into one stored case result. */
 type TrialSelectionMode = z$1.infer<typeof trialSelectionModeSchema>;
+/** Render formats supported by an LLM-call metric in the UI. */
+declare const llmCallMetricFormatSchema: z$1.ZodEnum<{
+  string: "string";
+  number: "number";
+  boolean: "boolean";
+  duration: "duration";
+  json: "json";
+}>;
+/** Render format applied to an LLM-call metric value. */
+type LlmCallMetricFormat = z$1.infer<typeof llmCallMetricFormatSchema>;
+/** Where an LLM-call metric is rendered inside the LLM calls tab. */
+declare const llmCallMetricPlacementSchema: z$1.ZodEnum<{
+  header: "header";
+  body: "body";
+}>;
+/** Placement option for an LLM-call metric. */
+type LlmCallMetricPlacement = z$1.infer<typeof llmCallMetricPlacementSchema>;
+/**
+ * Schema for a single user-defined metric attached to LLM call rows.
+ *
+ * Each metric reads `path` from the span's `attributes` and renders the value
+ * with the configured `format` and `numberFormat`. `placements` controls
+ * whether the metric appears as a chip on the collapsed row header, as a row
+ * inside the expanded body, or both. Defaults to `['body']` when omitted.
+ */
+declare const llmCallMetricSchema: z$1.ZodObject<{
+  label: z$1.ZodString;
+  tooltip: z$1.ZodOptional<z$1.ZodString>;
+  path: z$1.ZodString;
+  format: z$1.ZodOptional<z$1.ZodEnum<{
+    string: "string";
+    number: "number";
+    boolean: "boolean";
+    duration: "duration";
+    json: "json";
+  }>>;
+  numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
+  placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
+    header: "header";
+    body: "body";
+  }>>>;
+}, z$1.core.$strip>;
+/** User-defined metric authored in `agent-evals.config.ts`. */
+type LlmCallMetric = z$1.infer<typeof llmCallMetricSchema>;
+/** Schema for the global LLM calls config block in `agent-evals.config.ts`. */
+declare const llmCallsConfigSchema: z$1.ZodObject<{
+  kinds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
+  attributes: z$1.ZodOptional<z$1.ZodObject<{
+    model: z$1.ZodOptional<z$1.ZodString>;
+    provider: z$1.ZodOptional<z$1.ZodString>;
+    inputTokens: z$1.ZodOptional<z$1.ZodString>;
+    outputTokens: z$1.ZodOptional<z$1.ZodString>;
+    cachedInputTokens: z$1.ZodOptional<z$1.ZodString>;
+    cacheCreationInputTokens: z$1.ZodOptional<z$1.ZodString>;
+    reasoningTokens: z$1.ZodOptional<z$1.ZodString>;
+    totalTokens: z$1.ZodOptional<z$1.ZodString>;
+    cost: z$1.ZodOptional<z$1.ZodString>;
+    inputCost: z$1.ZodOptional<z$1.ZodString>;
+    outputCost: z$1.ZodOptional<z$1.ZodString>;
+    cachedInputCost: z$1.ZodOptional<z$1.ZodString>;
+    cacheCreationInputCost: z$1.ZodOptional<z$1.ZodString>;
+    reasoningCost: z$1.ZodOptional<z$1.ZodString>;
+    steps: z$1.ZodOptional<z$1.ZodString>;
+    finishReason: z$1.ZodOptional<z$1.ZodString>;
+    input: z$1.ZodOptional<z$1.ZodString>;
+    output: z$1.ZodOptional<z$1.ZodString>;
+    reasoning: z$1.ZodOptional<z$1.ZodString>;
+    toolCalls: z$1.ZodOptional<z$1.ZodString>;
+  }, z$1.core.$strip>>;
+  metrics: z$1.ZodOptional<z$1.ZodArray<z$1.ZodObject<{
+    label: z$1.ZodString;
+    tooltip: z$1.ZodOptional<z$1.ZodString>;
+    path: z$1.ZodString;
+    format: z$1.ZodOptional<z$1.ZodEnum<{
+      string: "string";
+      number: "number";
+      boolean: "boolean";
+      duration: "duration";
+      json: "json";
+    }>>;
+    numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
+    placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
+      header: "header";
+      body: "body";
+    }>>>;
+  }, z$1.core.$strip>>>;
+}, z$1.core.$strip>;
+/** Authored LLM calls config accepted from `agent-evals.config.ts`. */
+type LlmCallsConfigInput = z$1.infer<typeof llmCallsConfigSchema>;
+/** Resolved LLM-calls config sent to the UI with all defaults applied. */
+type ResolvedLlmCallsConfig = {
+  kinds: string[];
+  attributes: {
+    model: string;
+    provider: string;
+    inputTokens: string;
+    outputTokens: string;
+    cachedInputTokens: string;
+    cacheCreationInputTokens: string;
+    reasoningTokens: string;
+    totalTokens: string;
+    cost: string;
+    inputCost: string;
+    outputCost: string;
+    cachedInputCost: string;
+    cacheCreationInputCost: string;
+    reasoningCost: string;
+    steps: string;
+    finishReason: string;
+    input: string;
+    output: string;
+    reasoning: string;
+    toolCalls: string;
+  };
+  metrics: ResolvedLlmCallMetric[];
+};
+/** Fully-resolved LLM-call metric used by the runner and UI. */
+type ResolvedLlmCallMetric = {
+  label: string;
+  tooltip?: string;
+  path: string;
+  format: LlmCallMetricFormat;
+  numberFormat?: NumberDisplayOptions;
+  placements: LlmCallMetricPlacement[];
+};
+/** Default LLM-calls config the UI uses before the workspace fetch resolves. */
+declare const DEFAULT_LLM_CALLS_CONFIG: ResolvedLlmCallsConfig;
+/**
+ * Resolve the user-authored LLM-calls config to a fully-defaulted shape used
+ * by the UI to derive the LLM calls tab.
+ *
+ * - Missing or empty `kinds` falls back to `['llm']`.
+ * - Missing `attributes.<field>` falls back to the corresponding default
+ *   attribute path.
+ * - Missing `metrics[].format` defaults to `'string'`.
+ * - Missing `metrics[].placements` defaults to `['body']`.
+ */
+declare function resolveLlmCallsConfig(input: LlmCallsConfigInput | undefined): ResolvedLlmCallsConfig;
 /** Top-level config authored in `agent-evals.config.ts`. */
 type AgentEvalsConfig = {
   /** Root directory used to resolve all relative paths. Defaults to `process.cwd()`. */workspaceRoot?: string; /** Glob patterns (relative to `workspaceRoot`) used to discover eval files. */
@@ -1455,6 +1563,32 @@ type AgentEvalsConfig = {
    * definition taking precedence for matching `key` or `path` entries.
    */
   traceDisplay?: TraceDisplayInputConfig;
+  /**
+   * Configuration for the "LLM calls" tab in the case-run drawer.
+   *
+   * Determines which trace spans are treated as LLM calls (`kinds`), how
+   * structured fields like `model` and `usage.inputTokens` are read from
+   * span attributes, and which custom user-defined metrics are surfaced on
+   * each call. All fields are optional and fall back to the documented
+   * defaults; the LLM calls tab is shown automatically when at least one
+   * matching span exists in a case run.
+   *
+   * @example
+   * ```ts
+   * llmCalls: {
+   *   kinds: ['llm', 'ai-sdk.generateText'],
+   *   attributes: {
+   *     cachedInputTokens: 'usage.cache_read_input_tokens',
+   *   },
+   *   metrics: [
+   *     { label: 'Tokens/sec', path: 'tokensPerSecond', format: 'number',
+   *       numberFormat: { decimalPlaces: 1 }, placements: ['header', 'body'] },
+   *     { label: 'Retries', path: 'retryCount', format: 'number' },
+   *   ],
+   * }
+   * ```
+   */
+  llmCalls?: LlmCallsConfigInput;
   /**
    * Optional controls for the operation cache. When omitted, the cache is
    * enabled and stored under `<workspaceRoot>/.agent-evals/cache`.
@@ -1463,9 +1597,15 @@ type AgentEvalsConfig = {
     /** Disable the cache entirely; spans with `cache` options execute as if uncached. */enabled?: boolean; /** Override the directory used to persist cache entries. */
     dir?: string;
     /**
-     * Maximum entries retained in each per-eval cache file. Defaults to `100`;
-     * non-positive or non-finite values fall back to the default.
+     * Default maximum entries retained for each cache namespace. Defaults to
+     * `100`; non-positive or non-finite values fall back to the default.
+     */
+    maxEntriesPerNamespace?: number;
+    /**
+     * Exact namespace-specific retention caps. Values override
+     * `maxEntriesPerNamespace` for matching namespaces.
      */
+    maxEntriesByNamespace?: Record<string, number>; /** Legacy alias for `maxEntriesPerNamespace`, retained so older config files keep working. */
     maxEntriesPerEval?: number;
   };
 };
@@ -1509,13 +1649,118 @@ declare const agentEvalsConfigSchema: z$1.ZodObject<{
       transform: z$1.ZodOptional<z$1.ZodCustom<TraceAttributeTransform, TraceAttributeTransform>>;
     }, z$1.core.$strip>>>;
   }, z$1.core.$strip>>;
+  llmCalls: z$1.ZodOptional<z$1.ZodObject<{
+    kinds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
+    attributes: z$1.ZodOptional<z$1.ZodObject<{
+      model: z$1.ZodOptional<z$1.ZodString>;
+      provider: z$1.ZodOptional<z$1.ZodString>;
+      inputTokens: z$1.ZodOptional<z$1.ZodString>;
+      outputTokens: z$1.ZodOptional<z$1.ZodString>;
+      cachedInputTokens: z$1.ZodOptional<z$1.ZodString>;
+      cacheCreationInputTokens: z$1.ZodOptional<z$1.ZodString>;
+      reasoningTokens: z$1.ZodOptional<z$1.ZodString>;
+      totalTokens: z$1.ZodOptional<z$1.ZodString>;
+      cost: z$1.ZodOptional<z$1.ZodString>;
+      inputCost: z$1.ZodOptional<z$1.ZodString>;
+      outputCost: z$1.ZodOptional<z$1.ZodString>;
+      cachedInputCost: z$1.ZodOptional<z$1.ZodString>;
+      cacheCreationInputCost: z$1.ZodOptional<z$1.ZodString>;
+      reasoningCost: z$1.ZodOptional<z$1.ZodString>;
+      steps: z$1.ZodOptional<z$1.ZodString>;
+      finishReason: z$1.ZodOptional<z$1.ZodString>;
+      input: z$1.ZodOptional<z$1.ZodString>;
+      output: z$1.ZodOptional<z$1.ZodString>;
+      reasoning: z$1.ZodOptional<z$1.ZodString>;
+      toolCalls: z$1.ZodOptional<z$1.ZodString>;
+    }, z$1.core.$strip>>;
+    metrics: z$1.ZodOptional<z$1.ZodArray<z$1.ZodObject<{
+      label: z$1.ZodString;
+      tooltip: z$1.ZodOptional<z$1.ZodString>;
+      path: z$1.ZodString;
+      format: z$1.ZodOptional<z$1.ZodEnum<{
+        string: "string";
+        number: "number";
+        boolean: "boolean";
+        duration: "duration";
+        json: "json";
+      }>>;
+      numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
+      placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
+        header: "header";
+        body: "body";
+      }>>>;
+    }, z$1.core.$strip>>>;
+  }, z$1.core.$strip>>;
   cache: z$1.ZodOptional<z$1.ZodObject<{
     enabled: z$1.ZodOptional<z$1.ZodBoolean>;
     dir: z$1.ZodOptional<z$1.ZodString>;
+    maxEntriesPerNamespace: z$1.ZodPipe<z$1.ZodTransform<number | undefined, unknown>, z$1.ZodOptional<z$1.ZodNumber>>;
+    maxEntriesByNamespace: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodNumber>>;
     maxEntriesPerEval: z$1.ZodPipe<z$1.ZodTransform<number | undefined, unknown>, z$1.ZodOptional<z$1.ZodNumber>>;
   }, z$1.core.$strip>>;
 }, z$1.core.$strip>;
 //#endregion
+//#region ../shared/src/utils/extractLlmCalls.d.ts
+/** Resolved value for one user-defined metric on an LLM call row. */
+type LlmCallMetricValue = {
+  label: string;
+  tooltip: string | undefined;
+  rawValue: unknown;
+  format: LlmCallMetricFormat;
+  numberFormat: NumberDisplayOptions | undefined;
+  placements: LlmCallMetricPlacement[];
+};
+/** Single entry rendered as one expandable row in the LLM calls tab. */
+type LlmCallEntry = {
+  id: string;
+  name: string;
+  kind: string;
+  status: EvalTraceSpan['status'];
+  model: string | null;
+  provider: string | null;
+  inputTokens: number | null;
+  outputTokens: number | null;
+  cachedInputTokens: number | null;
+  cacheCreationInputTokens: number | null;
+  reasoningTokens: number | null;
+  totalTokens: number | null;
+  costUsd: number | null;
+  inputCostUsd: number | null;
+  outputCostUsd: number | null;
+  cachedInputCostUsd: number | null;
+  cacheCreationInputCostUsd: number | null;
+  reasoningCostUsd: number | null; /** Number of inference rounds. Derived from the array length when `stepDetails` is set. */
+  stepCount: number | null; /** Per-step breakdown when the configured `steps` attribute resolves to an array. */
+  stepDetails: unknown[] | null;
+  finishReason: string | null;
+  latencyMs: number | null;
+  input: unknown;
+  output: unknown;
+  reasoning: unknown;
+  toolCalls: unknown;
+  metrics: LlmCallMetricValue[];
+  warnings: EvalTraceSpanWarning[];
+  error: EvalTraceSpanError | null;
+};
+/**
+ * Filter `spans` down to LLM calls and project each one to the structured
+ * shape consumed by the LLM calls tab.
+ *
+ * Spans whose `kind` is not in `config.kinds` are dropped. Structured fields
+ * (`model`, token counts, cost, etc.) are read via `getNestedAttribute` from
+ * the configured paths, with safe coercion to `string | null` / `number |
+ * null`. `totalTokens` falls back to a sum of input + output + cached when no
+ * explicit total attribute is present. The `steps` attribute path may resolve
+ * to either a number (rendered as the inference-round count) or an array of
+ * per-step detail objects (rendered as a Steps section in the body, with
+ * `stepCount` derived from the array length). `latencyMs` is `null` while the
+ * span is still running. User-defined `metrics` whose path resolves to
+ * `undefined` are dropped, but `null`, `0`, and `false` are preserved as
+ * legitimate values worth displaying. Original span order is preserved so the
+ * LLM calls tab matches the ordering in the Trace tab.
+ */
+declare function extractLlmCalls(spans: EvalTraceSpan[], config: ResolvedLlmCallsConfig): LlmCallEntry[];
+//#endregion
 //#region ../shared/src/schemas/cache.d.ts
 /**
  * Mode that controls how the cache is consulted for a given run.
@@ -1535,6 +1780,7 @@ type CacheMode = z$1.infer<typeof cacheModeSchema>;
 declare const spanCacheOptionsSchema: z$1.ZodObject<{
   key: z$1.ZodUnknown;
   namespace: z$1.ZodOptional<z$1.ZodString>;
+  serializeFileBytes: z$1.ZodOptional<z$1.ZodBoolean>;
 }, z$1.core.$strip>;
 /** Options accepted by an `evalTracer.span` call to opt the span into caching. */
 type SpanCacheOptions = z$1.infer<typeof spanCacheOptionsSchema>;
@@ -1545,6 +1791,38 @@ declare const cacheOperationTypeSchema: z$1.ZodEnum<{
 }>;
 /** Category of operation stored in the eval cache. */
 type CacheOperationType = z$1.infer<typeof cacheOperationTypeSchema>;
+/** Status of a cache lookup recorded on a span or case scope. */
+declare const cacheStatusSchema: z$1.ZodEnum<{
+  bypass: "bypass";
+  refresh: "refresh";
+  hit: "hit";
+  miss: "miss";
+}>;
+/** Status of a cache lookup recorded on a span or case scope. */
+type CacheStatus = z$1.infer<typeof cacheStatusSchema>;
+/**
+ * Reference to a value-cache lookup performed via `evalTracer.cache(...)`.
+ *
+ * Refs are appended to the active span's `cache.refs` attribute when the call
+ * happens inside a `traceSpan(...)` body, or to the case scope's
+ * `caseCacheRefs` bucket when the call is made directly from the case body.
+ */
+declare const traceCacheRefSchema: z$1.ZodObject<{
+  type: z$1.ZodLiteral<"value">;
+  name: z$1.ZodString;
+  namespace: z$1.ZodString;
+  key: z$1.ZodString;
+  status: z$1.ZodEnum<{
+    bypass: "bypass";
+    refresh: "refresh";
+    hit: "hit";
+    miss: "miss";
+  }>;
+  storedAt: z$1.ZodOptional<z$1.ZodString>;
+  age: z$1.ZodOptional<z$1.ZodNumber>;
+}, z$1.core.$strip>;
+/** Reference to a value-cache lookup performed via `evalTracer.cache(...)`. */
+type TraceCacheRef = z$1.infer<typeof traceCacheRefSchema>;
 /** Summary of a single persisted cache entry, used by list/delete endpoints. */
 declare const cacheListItemSchema: z$1.ZodObject<{
   key: z$1.ZodString;
@@ -1824,6 +2102,93 @@ declare const cacheFileSchema: z$1.ZodObject<{
 /** Persisted per-owner cache file contents. */
 type CacheFile = z$1.infer<typeof cacheFileSchema>;
 //#endregion
+//#region ../shared/src/utils/extractCacheHits.d.ts
+/**
+ * Single cache-hit entry rendered as one row in the case drawer's
+ * "Cache hits" tab.
+ *
+ * `origin === 'span'` rows came from a span's `cache.status` attribute or from
+ * a `cache.refs` ref attached to a span body. `origin === 'caseRoot'` rows
+ * came from `evalTracer.cache(...)` calls made directly from the case body
+ * (no surrounding `traceSpan`), which would otherwise be invisible.
+ */
+type CacheHitEntry = {
+  id: string;
+  source: 'span' | 'value';
+  origin: 'span' | 'caseRoot';
+  name: string;
+  namespace: string;
+  key: string;
+  storedAt: string | undefined;
+  age: number | undefined;
+  spanId: string | undefined;
+};
+/**
+ * Collect every `status === 'hit'` cache event recorded for a case run.
+ *
+ * Walks `spans` for span-level cache hits (`attributes['cache.status'] ===
+ * 'hit'`) and per-span value-cache refs (`attributes['cache.refs']`), then
+ * appends spanless value-cache refs persisted on the case scope. Non-hit
+ * statuses (`miss`/`refresh`/`bypass`) are skipped — they remain visible
+ * inline in the Trace tab.
+ */
+declare function extractCacheHits(spans: EvalTraceSpan[], caseCacheRefs: TraceCacheRef[]): CacheHitEntry[];
+//#endregion
+//#region ../shared/src/schemas/sse.d.ts
+declare const sseEventTypeSchema: z$1.ZodEnum<{
+  "discovery.updated": "discovery.updated";
+  "run.started": "run.started";
+  "run.summary": "run.summary";
+  "case.started": "case.started";
+  "case.updated": "case.updated";
+  "case.finished": "case.finished";
+  "trace.span": "trace.span";
+  "run.finished": "run.finished";
+  "run.cancelled": "run.cancelled";
+  "run.error": "run.error";
+}>;
+/** Server-sent event name emitted by the runner or backend. */
+type SseEventType = z$1.infer<typeof sseEventTypeSchema>;
+/** Schema for the SSE envelope used to stream run updates to clients. */
+declare const sseEnvelopeSchema: z$1.ZodObject<{
+  type: z$1.ZodString;
+  runId: z$1.ZodOptional<z$1.ZodString>;
+  timestamp: z$1.ZodString;
+  payload: z$1.ZodUnknown;
+}, z$1.core.$strip>;
+/** Wire format for a streamed event emitted during eval execution. */
+type SseEnvelope = z$1.infer<typeof sseEnvelopeSchema>;
+//#endregion
+//#region ../shared/src/schemas/api.d.ts
+/** Schema for the API request that starts a new eval run. */
+declare const createRunRequestSchema: z$1.ZodObject<{
+  target: z$1.ZodObject<{
+    mode: z$1.ZodEnum<{
+      all: "all";
+      evalIds: "evalIds";
+      caseIds: "caseIds";
+    }>;
+    evalIds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
+    caseIds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
+  }, z$1.core.$strip>;
+  trials: z$1.ZodNumber;
+  cache: z$1.ZodOptional<z$1.ZodObject<{
+    mode: z$1.ZodDefault<z$1.ZodEnum<{
+      use: "use";
+      bypass: "bypass";
+      refresh: "refresh";
+    }>>;
+  }, z$1.core.$strip>>;
+}, z$1.core.$strip>;
+/** Request payload accepted by the run creation endpoint. */
+type CreateRunRequest = z$1.infer<typeof createRunRequestSchema>;
+/** Schema for updating a UI-authored manual score on one persisted case. */
+declare const updateManualScoreRequestSchema: z$1.ZodObject<{
+  value: z$1.ZodNullable<z$1.ZodNumber>;
+}, z$1.core.$strip>;
+/** Request payload accepted by the manual score update endpoint. */
+type UpdateManualScoreRequest = z$1.infer<typeof updateManualScoreRequestSchema>;
+//#endregion
 //#region ../sdk/src/types.d.ts
 /** Single authored eval case with its stable identifier and input payload. */
 type EvalCase<TInput> = {
@@ -2077,6 +2442,12 @@ type EvalCaseScope = {
    */
   replayingDepth: number; /** Runner-provided cache adapter + mode; absent when caching is disabled. */
   cacheContext: CacheScopeContext | undefined;
+  /**
+   * Value-cache refs recorded by `evalTracer.cache(...)` calls made with no
+   * active span. Span-bound refs are appended to the owning span's
+   * `cache.refs` attribute instead.
+   */
+  caseCacheRefs: TraceCacheRef[];
 };
 /** Error thrown when an eval assertion fails during case execution. */
 declare class EvalAssertionError extends Error {
@@ -2171,43 +2542,46 @@ type CaptureEvalSpanErrorOptions = {
   level?: CaptureEvalSpanErrorLevel;
 };
 //#endregion
-//#region ../sdk/src/cacheRecording.d.ts
-/** Cache reference appended to the active span by `evalTracer.cache(...)`. */
-type TraceCacheRef = {
-  type: 'value';
-  name: string;
-  namespace: string;
-  key: string;
-  status: 'hit' | 'miss' | 'refresh' | 'bypass';
-  storedAt?: string;
-  age?: number;
-};
-//#endregion
 //#region ../sdk/src/valueCache.d.ts
 /** Info accepted by `evalTracer.cache(info, fn)` for spanless value caching. */
 type TraceCacheInfo = {
   /** Display name used for cache listings and the default namespace. */name: string; /** Arbitrary JSON-safe value used to derive the cache key. */
   key: unknown; /** Override the default namespace (`${evalId}__${name}`). */
   namespace?: string;
+  /**
+   * Include native `Blob`/`File` bytes in the cache key. By default only stable
+   * metadata (`type`, `size`, plus `name`/`lastModified` for `File`) is used.
+   */
+  serializeFileBytes?: boolean;
 };
 //#endregion
 //#region ../sdk/src/cacheKey.d.ts
+/** Components folded into a deterministic cache key hash. */
 type CacheKeyHashInput = {
-  namespace: string;
-  codeFingerprint: string;
+  /** Cache namespace, usually derived from the eval id and operation name. */namespace: string; /** Eval source fingerprint used to invalidate cache entries on code edits. */
+  codeFingerprint: string; /** User-authored cache key value. */
   key: unknown;
 };
+/** Optional controls for cache key hashing. */
+type CacheKeyHashOptions = {
+  /**
+   * When true, native `Blob` and `File` values are read asynchronously and
+   * hashed by bytes plus stable metadata. Defaults to metadata-only hashing.
+   */
+  serializeFileBytes?: boolean;
+};
 /**
  * Hash the components of a cache key into a deterministic hex digest.
  *
- * Native `Blob` and `File` values are read asynchronously and hashed by
- * content. Use `hashCacheKeySync` only when the key contains no async values.
+ * Native `Blob` and `File` values use stable metadata by default. Pass
+ * `serializeFileBytes: true` to read them asynchronously and include their byte
+ * hash in the key.
  */
-declare function hashCacheKey(input: CacheKeyHashInput): Promise<string>;
+declare function hashCacheKey(input: CacheKeyHashInput, options?: CacheKeyHashOptions): Promise<string>;
 /**
  * Synchronously hash cache key components. This supports JSON-like data and
  * in-memory binary values such as `Buffer`, `ArrayBuffer`, and typed arrays,
- * but cannot content-hash native `Blob` or `File` values.
+ * plus stable metadata for native `Blob` and `File` values.
  */
 declare function hashCacheKeySync(input: CacheKeyHashInput): string;
 //#endregion
@@ -2296,8 +2670,8 @@ type TraceSpanInfoUncached = TraceSpanInfoBase & {
 /**
  * Info accepted by `evalTracer.span(info, fn)` when opting in to caching.
  *
- * Cached spans return `Promise<unknown>` because the replayed value comes from
- * a JSON round-trip on cache hit. Narrow the value yourself when you need a
+ * Cached spans return `Promise<unknown>` because the replayed value is revived
+ * from persisted cache data on hit. Narrow the value yourself when you need a
  * typed return.
  */
 type TraceSpanInfoCached = TraceSpanInfoBase & {
@@ -2388,9 +2762,23 @@ type EvalRunner = {
   subscribe(runId: string, listener: (event: SseEnvelope) => void): () => void; /** Subscribe to discovery updates triggered by file changes or manual refresh. */
   subscribeDiscovery(listener: (event: SseEnvelope) => void): () => void; /** Stop background filesystem watchers owned by this runner instance. */
   close(): Promise<void>; /** Resolve the workspace root backing this runner instance. */
-  getWorkspaceRoot(): string; /** Resolve a persisted artifact path when artifact storage is supported. */
+  getWorkspaceRoot(): string;
+  /**
+   * Resolved LLM-calls config used by the UI to derive the LLM calls tab.
+   *
+   * Returns the workspace's `llmCalls` config block from
+   * `agent-evals.config.ts` with all defaults applied.
+   */
+  getLlmCallsConfig(): ResolvedLlmCallsConfig; /** Resolve a persisted artifact path when artifact storage is supported. */
   getArtifactPath(artifactId: string): string | undefined; /** Return summaries for every persisted cache entry in the workspace. */
   listCache(): Promise<CacheListItem[]>;
+  /**
+   * Return the full persisted cache entry for `namespace` + `key`, including
+   * its recording. Returns `null` when no entry matches. Used by the case
+   * drawer's Cache hits tab to lazily fetch the cached return value when a
+   * row is expanded.
+   */
+  getCacheEntry(namespace: string, key: string): Promise<CacheEntry | null>;
   /**
    * Remove cache entries matching `filter`, or all entries when no filter is
    * supplied.
@@ -2445,4 +2833,4 @@ declare function createRunner({
  */
 declare function runCli(argv: string[]): Promise<void>;
 //#endregion
-export { type AgentEvalsConfig, type AssertionFailure, type CacheAdapter, type CacheEntry, type CacheFile, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, type DerivedStatus, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type NumberDisplayOptions, type RepoFileRef, type RunArtifactRef, type RunInEvalScopeOptions, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, agentEvalsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, 
cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, mergeEvalOutput, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, runArtifactRefSchema, runCli, runInEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
+export { type AgentEvalsConfig, type AssertionFailure, type CacheAdapter, type CacheEntry, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_LLM_CALLS_CONFIG, type DerivedStatus, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallsConfigInput, type NumberDisplayOptions, type RepoFileRef, type ResolvedLlmCallMetric, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type 
TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, agentEvalsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallsConfigSchema, mergeEvalOutput, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, 
traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };