@ls-stack/agent-eval 0.7.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -913,6 +913,20 @@ declare const caseDetailSchema: z$1.ZodObject<{
913
913
  stack: z$1.ZodOptional<z$1.ZodString>;
914
914
  }, z$1.core.$strip>>;
915
915
  trial: z$1.ZodNumber;
916
+ cacheRefs: z$1.ZodDefault<z$1.ZodArray<z$1.ZodObject<{
917
+ type: z$1.ZodLiteral<"value">;
918
+ name: z$1.ZodString;
919
+ namespace: z$1.ZodString;
920
+ key: z$1.ZodString;
921
+ status: z$1.ZodEnum<{
922
+ hit: "hit";
923
+ miss: "miss";
924
+ refresh: "refresh";
925
+ bypass: "bypass";
926
+ }>;
927
+ storedAt: z$1.ZodOptional<z$1.ZodString>;
928
+ age: z$1.ZodOptional<z$1.ZodNumber>;
929
+ }, z$1.core.$strip>>>;
916
930
  }, z$1.core.$strip>;
917
931
  /** Full case payload including inputs, trace, outputs, and failures. */
918
932
  type CaseDetail = z$1.infer<typeof caseDetailSchema>;
@@ -1363,60 +1377,16 @@ type EvalTitleLike = {
1363
1377
  */
1364
1378
  declare function getEvalTitle(evalLike: EvalTitleLike): string;
1365
1379
  //#endregion
1366
- //#region ../shared/src/schemas/sse.d.ts
1367
- declare const sseEventTypeSchema: z$1.ZodEnum<{
1368
- "discovery.updated": "discovery.updated";
1369
- "run.started": "run.started";
1370
- "run.summary": "run.summary";
1371
- "case.started": "case.started";
1372
- "case.updated": "case.updated";
1373
- "case.finished": "case.finished";
1374
- "trace.span": "trace.span";
1375
- "run.finished": "run.finished";
1376
- "run.cancelled": "run.cancelled";
1377
- "run.error": "run.error";
1378
- }>;
1379
- /** Server-sent event name emitted by the runner or backend. */
1380
- type SseEventType = z$1.infer<typeof sseEventTypeSchema>;
1381
- /** Schema for the SSE envelope used to stream run updates to clients. */
1382
- declare const sseEnvelopeSchema: z$1.ZodObject<{
1383
- type: z$1.ZodString;
1384
- runId: z$1.ZodOptional<z$1.ZodString>;
1385
- timestamp: z$1.ZodString;
1386
- payload: z$1.ZodUnknown;
1387
- }, z$1.core.$strip>;
1388
- /** Wire format for a streamed event emitted during eval execution. */
1389
- type SseEnvelope = z$1.infer<typeof sseEnvelopeSchema>;
1390
- //#endregion
1391
- //#region ../shared/src/schemas/api.d.ts
1392
- /** Schema for the API request that starts a new eval run. */
1393
- declare const createRunRequestSchema: z$1.ZodObject<{
1394
- target: z$1.ZodObject<{
1395
- mode: z$1.ZodEnum<{
1396
- all: "all";
1397
- evalIds: "evalIds";
1398
- caseIds: "caseIds";
1399
- }>;
1400
- evalIds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
1401
- caseIds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
1402
- }, z$1.core.$strip>;
1403
- trials: z$1.ZodNumber;
1404
- cache: z$1.ZodOptional<z$1.ZodObject<{
1405
- mode: z$1.ZodDefault<z$1.ZodEnum<{
1406
- use: "use";
1407
- bypass: "bypass";
1408
- refresh: "refresh";
1409
- }>>;
1410
- }, z$1.core.$strip>>;
1411
- }, z$1.core.$strip>;
1412
- /** Request payload accepted by the run creation endpoint. */
1413
- type CreateRunRequest = z$1.infer<typeof createRunRequestSchema>;
1414
- /** Schema for updating a UI-authored manual score on one persisted case. */
1415
- declare const updateManualScoreRequestSchema: z$1.ZodObject<{
1416
- value: z$1.ZodNullable<z$1.ZodNumber>;
1417
- }, z$1.core.$strip>;
1418
- /** Request payload accepted by the manual score update endpoint. */
1419
- type UpdateManualScoreRequest = z$1.infer<typeof updateManualScoreRequestSchema>;
1380
+ //#region ../shared/src/utils/getNestedAttribute.d.ts
1381
+ /**
1382
+ * Read a value from `value` by walking a dot-separated path.
1383
+ *
1384
+ * Returns `undefined` when any segment of the path is missing or when an
1385
+ * intermediate value is not a plain object. Used by trace-attribute display,
1386
+ * the LLM calls extractor, and any consumer that needs to look up nested
1387
+ * properties from a span's `attributes` record.
1388
+ */
1389
+ declare function getNestedAttribute(value: unknown, path: string): unknown;
1420
1390
  //#endregion
1421
1391
  //#region ../shared/src/schemas/config.d.ts
1422
1392
  /** Strategy used to collapse repeated trials into one stored case result. */
@@ -1426,6 +1396,144 @@ declare const trialSelectionModeSchema: z$1.ZodEnum<{
1426
1396
  }>;
1427
1397
  /** Strategy used to collapse repeated trials into one stored case result. */
1428
1398
  type TrialSelectionMode = z$1.infer<typeof trialSelectionModeSchema>;
1399
+ /** Render formats supported by an LLM-call metric in the UI. */
1400
+ declare const llmCallMetricFormatSchema: z$1.ZodEnum<{
1401
+ string: "string";
1402
+ number: "number";
1403
+ boolean: "boolean";
1404
+ duration: "duration";
1405
+ json: "json";
1406
+ }>;
1407
+ /** Render format applied to an LLM-call metric value. */
1408
+ type LlmCallMetricFormat = z$1.infer<typeof llmCallMetricFormatSchema>;
1409
+ /** Where an LLM-call metric is rendered inside the LLM calls tab. */
1410
+ declare const llmCallMetricPlacementSchema: z$1.ZodEnum<{
1411
+ header: "header";
1412
+ body: "body";
1413
+ }>;
1414
+ /** Placement option for an LLM-call metric. */
1415
+ type LlmCallMetricPlacement = z$1.infer<typeof llmCallMetricPlacementSchema>;
1416
+ /**
1417
+ * Schema for a single user-defined metric attached to LLM call rows.
1418
+ *
1419
+ * Each metric reads `path` from the span's `attributes` and renders the value
1420
+ * with the configured `format` and `numberFormat`. `placements` controls
1421
+ * whether the metric appears as a chip on the collapsed row header, as a row
1422
+ * inside the expanded body, or both. Defaults to `['body']` when omitted.
1423
+ */
1424
+ declare const llmCallMetricSchema: z$1.ZodObject<{
1425
+ label: z$1.ZodString;
1426
+ tooltip: z$1.ZodOptional<z$1.ZodString>;
1427
+ path: z$1.ZodString;
1428
+ format: z$1.ZodOptional<z$1.ZodEnum<{
1429
+ string: "string";
1430
+ number: "number";
1431
+ boolean: "boolean";
1432
+ duration: "duration";
1433
+ json: "json";
1434
+ }>>;
1435
+ numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
1436
+ placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
1437
+ header: "header";
1438
+ body: "body";
1439
+ }>>>;
1440
+ }, z$1.core.$strip>;
1441
+ /** User-defined metric authored in `agent-evals.config.ts`. */
1442
+ type LlmCallMetric = z$1.infer<typeof llmCallMetricSchema>;
1443
+ /** Schema for the global LLM calls config block in `agent-evals.config.ts`. */
1444
+ declare const llmCallsConfigSchema: z$1.ZodObject<{
1445
+ kinds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
1446
+ attributes: z$1.ZodOptional<z$1.ZodObject<{
1447
+ model: z$1.ZodOptional<z$1.ZodString>;
1448
+ provider: z$1.ZodOptional<z$1.ZodString>;
1449
+ inputTokens: z$1.ZodOptional<z$1.ZodString>;
1450
+ outputTokens: z$1.ZodOptional<z$1.ZodString>;
1451
+ cachedInputTokens: z$1.ZodOptional<z$1.ZodString>;
1452
+ cacheCreationInputTokens: z$1.ZodOptional<z$1.ZodString>;
1453
+ reasoningTokens: z$1.ZodOptional<z$1.ZodString>;
1454
+ totalTokens: z$1.ZodOptional<z$1.ZodString>;
1455
+ cost: z$1.ZodOptional<z$1.ZodString>;
1456
+ inputCost: z$1.ZodOptional<z$1.ZodString>;
1457
+ outputCost: z$1.ZodOptional<z$1.ZodString>;
1458
+ cachedInputCost: z$1.ZodOptional<z$1.ZodString>;
1459
+ cacheCreationInputCost: z$1.ZodOptional<z$1.ZodString>;
1460
+ reasoningCost: z$1.ZodOptional<z$1.ZodString>;
1461
+ steps: z$1.ZodOptional<z$1.ZodString>;
1462
+ finishReason: z$1.ZodOptional<z$1.ZodString>;
1463
+ input: z$1.ZodOptional<z$1.ZodString>;
1464
+ output: z$1.ZodOptional<z$1.ZodString>;
1465
+ reasoning: z$1.ZodOptional<z$1.ZodString>;
1466
+ toolCalls: z$1.ZodOptional<z$1.ZodString>;
1467
+ }, z$1.core.$strip>>;
1468
+ metrics: z$1.ZodOptional<z$1.ZodArray<z$1.ZodObject<{
1469
+ label: z$1.ZodString;
1470
+ tooltip: z$1.ZodOptional<z$1.ZodString>;
1471
+ path: z$1.ZodString;
1472
+ format: z$1.ZodOptional<z$1.ZodEnum<{
1473
+ string: "string";
1474
+ number: "number";
1475
+ boolean: "boolean";
1476
+ duration: "duration";
1477
+ json: "json";
1478
+ }>>;
1479
+ numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
1480
+ placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
1481
+ header: "header";
1482
+ body: "body";
1483
+ }>>>;
1484
+ }, z$1.core.$strip>>>;
1485
+ }, z$1.core.$strip>;
1486
+ /** Authored LLM calls config accepted from `agent-evals.config.ts`. */
1487
+ type LlmCallsConfigInput = z$1.infer<typeof llmCallsConfigSchema>;
1488
+ /** Resolved LLM-calls config sent to the UI with all defaults applied. */
1489
+ type ResolvedLlmCallsConfig = {
1490
+ kinds: string[];
1491
+ attributes: {
1492
+ model: string;
1493
+ provider: string;
1494
+ inputTokens: string;
1495
+ outputTokens: string;
1496
+ cachedInputTokens: string;
1497
+ cacheCreationInputTokens: string;
1498
+ reasoningTokens: string;
1499
+ totalTokens: string;
1500
+ cost: string;
1501
+ inputCost: string;
1502
+ outputCost: string;
1503
+ cachedInputCost: string;
1504
+ cacheCreationInputCost: string;
1505
+ reasoningCost: string;
1506
+ steps: string;
1507
+ finishReason: string;
1508
+ input: string;
1509
+ output: string;
1510
+ reasoning: string;
1511
+ toolCalls: string;
1512
+ };
1513
+ metrics: ResolvedLlmCallMetric[];
1514
+ };
1515
+ /** Fully-resolved LLM-call metric used by the runner and UI. */
1516
+ type ResolvedLlmCallMetric = {
1517
+ label: string;
1518
+ tooltip?: string;
1519
+ path: string;
1520
+ format: LlmCallMetricFormat;
1521
+ numberFormat?: NumberDisplayOptions;
1522
+ placements: LlmCallMetricPlacement[];
1523
+ };
1524
+ /** Default LLM-calls config the UI uses before the workspace fetch resolves. */
1525
+ declare const DEFAULT_LLM_CALLS_CONFIG: ResolvedLlmCallsConfig;
1526
+ /**
1527
+ * Resolve the user-authored LLM-calls config to a fully-defaulted shape used
1528
+ * by the UI to derive the LLM calls tab.
1529
+ *
1530
+ * - Missing or empty `kinds` falls back to `['llm']`.
1531
+ * - Missing `attributes.<field>` falls back to the corresponding default
1532
+ * attribute path.
1533
+ * - Missing `metrics[].format` defaults to `'string'`.
1534
+ * - Missing `metrics[].placements` defaults to `['body']`.
1535
+ */
1536
+ declare function resolveLlmCallsConfig(input: LlmCallsConfigInput | undefined): ResolvedLlmCallsConfig;
1429
1537
  /** Top-level config authored in `agent-evals.config.ts`. */
1430
1538
  type AgentEvalsConfig = {
1431
1539
  /** Root directory used to resolve all relative paths. Defaults to `process.cwd()`. */workspaceRoot?: string; /** Glob patterns (relative to `workspaceRoot`) used to discover eval files. */
@@ -1455,6 +1563,32 @@ type AgentEvalsConfig = {
1455
1563
  * definition taking precedence for matching `key` or `path` entries.
1456
1564
  */
1457
1565
  traceDisplay?: TraceDisplayInputConfig;
1566
+ /**
1567
+ * Configuration for the "LLM calls" tab in the case-run drawer.
1568
+ *
1569
+ * Determines which trace spans are treated as LLM calls (`kinds`), how
1570
+ * structured fields like `model` and `usage.inputTokens` are read from
1571
+ * span attributes, and which custom user-defined metrics are surfaced on
1572
+ * each call. All fields are optional and fall back to the documented
1573
+ * defaults; the LLM calls tab is shown automatically when at least one
1574
+ * matching span exists in a case run.
1575
+ *
1576
+ * @example
1577
+ * ```ts
1578
+ * llmCalls: {
1579
+ * kinds: ['llm', 'ai-sdk.generateText'],
1580
+ * attributes: {
1581
+ * cachedInputTokens: 'usage.cache_read_input_tokens',
1582
+ * },
1583
+ * metrics: [
1584
+ * { label: 'Tokens/sec', path: 'tokensPerSecond', format: 'number',
1585
+ * numberFormat: { decimalPlaces: 1 }, placements: ['header', 'body'] },
1586
+ * { label: 'Retries', path: 'retryCount', format: 'number' },
1587
+ * ],
1588
+ * }
1589
+ * ```
1590
+ */
1591
+ llmCalls?: LlmCallsConfigInput;
1458
1592
  /**
1459
1593
  * Optional controls for the operation cache. When omitted, the cache is
1460
1594
  * enabled and stored under `<workspaceRoot>/.agent-evals/cache`.
@@ -1463,9 +1597,15 @@ type AgentEvalsConfig = {
1463
1597
  /** Disable the cache entirely; spans with `cache` options execute as if uncached. */enabled?: boolean; /** Override the directory used to persist cache entries. */
1464
1598
  dir?: string;
1465
1599
  /**
1466
- * Maximum entries retained in each per-eval cache file. Defaults to `100`;
1467
- * non-positive or non-finite values fall back to the default.
1600
+ * Default maximum entries retained for each cache namespace. Defaults to
1601
+ * `100`; non-positive or non-finite values fall back to the default.
1602
+ */
1603
+ maxEntriesPerNamespace?: number;
1604
+ /**
1605
+ * Exact namespace-specific retention caps. Values override
1606
+ * `maxEntriesPerNamespace` for matching namespaces.
1468
1607
  */
1608
+ maxEntriesByNamespace?: Record<string, number>; /** Legacy alias for `maxEntriesPerNamespace`, retained so older config files keep working. */
1469
1609
  maxEntriesPerEval?: number;
1470
1610
  };
1471
1611
  };
@@ -1509,13 +1649,118 @@ declare const agentEvalsConfigSchema: z$1.ZodObject<{
1509
1649
  transform: z$1.ZodOptional<z$1.ZodCustom<TraceAttributeTransform, TraceAttributeTransform>>;
1510
1650
  }, z$1.core.$strip>>>;
1511
1651
  }, z$1.core.$strip>>;
1652
+ llmCalls: z$1.ZodOptional<z$1.ZodObject<{
1653
+ kinds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
1654
+ attributes: z$1.ZodOptional<z$1.ZodObject<{
1655
+ model: z$1.ZodOptional<z$1.ZodString>;
1656
+ provider: z$1.ZodOptional<z$1.ZodString>;
1657
+ inputTokens: z$1.ZodOptional<z$1.ZodString>;
1658
+ outputTokens: z$1.ZodOptional<z$1.ZodString>;
1659
+ cachedInputTokens: z$1.ZodOptional<z$1.ZodString>;
1660
+ cacheCreationInputTokens: z$1.ZodOptional<z$1.ZodString>;
1661
+ reasoningTokens: z$1.ZodOptional<z$1.ZodString>;
1662
+ totalTokens: z$1.ZodOptional<z$1.ZodString>;
1663
+ cost: z$1.ZodOptional<z$1.ZodString>;
1664
+ inputCost: z$1.ZodOptional<z$1.ZodString>;
1665
+ outputCost: z$1.ZodOptional<z$1.ZodString>;
1666
+ cachedInputCost: z$1.ZodOptional<z$1.ZodString>;
1667
+ cacheCreationInputCost: z$1.ZodOptional<z$1.ZodString>;
1668
+ reasoningCost: z$1.ZodOptional<z$1.ZodString>;
1669
+ steps: z$1.ZodOptional<z$1.ZodString>;
1670
+ finishReason: z$1.ZodOptional<z$1.ZodString>;
1671
+ input: z$1.ZodOptional<z$1.ZodString>;
1672
+ output: z$1.ZodOptional<z$1.ZodString>;
1673
+ reasoning: z$1.ZodOptional<z$1.ZodString>;
1674
+ toolCalls: z$1.ZodOptional<z$1.ZodString>;
1675
+ }, z$1.core.$strip>>;
1676
+ metrics: z$1.ZodOptional<z$1.ZodArray<z$1.ZodObject<{
1677
+ label: z$1.ZodString;
1678
+ tooltip: z$1.ZodOptional<z$1.ZodString>;
1679
+ path: z$1.ZodString;
1680
+ format: z$1.ZodOptional<z$1.ZodEnum<{
1681
+ string: "string";
1682
+ number: "number";
1683
+ boolean: "boolean";
1684
+ duration: "duration";
1685
+ json: "json";
1686
+ }>>;
1687
+ numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
1688
+ placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
1689
+ header: "header";
1690
+ body: "body";
1691
+ }>>>;
1692
+ }, z$1.core.$strip>>>;
1693
+ }, z$1.core.$strip>>;
1512
1694
  cache: z$1.ZodOptional<z$1.ZodObject<{
1513
1695
  enabled: z$1.ZodOptional<z$1.ZodBoolean>;
1514
1696
  dir: z$1.ZodOptional<z$1.ZodString>;
1697
+ maxEntriesPerNamespace: z$1.ZodPipe<z$1.ZodTransform<number | undefined, unknown>, z$1.ZodOptional<z$1.ZodNumber>>;
1698
+ maxEntriesByNamespace: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodNumber>>;
1515
1699
  maxEntriesPerEval: z$1.ZodPipe<z$1.ZodTransform<number | undefined, unknown>, z$1.ZodOptional<z$1.ZodNumber>>;
1516
1700
  }, z$1.core.$strip>>;
1517
1701
  }, z$1.core.$strip>;
1518
1702
  //#endregion
1703
+ //#region ../shared/src/utils/extractLlmCalls.d.ts
1704
+ /** Resolved value for one user-defined metric on an LLM call row. */
1705
+ type LlmCallMetricValue = {
1706
+ label: string;
1707
+ tooltip: string | undefined;
1708
+ rawValue: unknown;
1709
+ format: LlmCallMetricFormat;
1710
+ numberFormat: NumberDisplayOptions | undefined;
1711
+ placements: LlmCallMetricPlacement[];
1712
+ };
1713
+ /** Single entry rendered as one expandable row in the LLM calls tab. */
1714
+ type LlmCallEntry = {
1715
+ id: string;
1716
+ name: string;
1717
+ kind: string;
1718
+ status: EvalTraceSpan['status'];
1719
+ model: string | null;
1720
+ provider: string | null;
1721
+ inputTokens: number | null;
1722
+ outputTokens: number | null;
1723
+ cachedInputTokens: number | null;
1724
+ cacheCreationInputTokens: number | null;
1725
+ reasoningTokens: number | null;
1726
+ totalTokens: number | null;
1727
+ costUsd: number | null;
1728
+ inputCostUsd: number | null;
1729
+ outputCostUsd: number | null;
1730
+ cachedInputCostUsd: number | null;
1731
+ cacheCreationInputCostUsd: number | null;
1732
+ reasoningCostUsd: number | null; /** Number of inference rounds. Derived from the array length when `stepDetails` is set. */
1733
+ stepCount: number | null; /** Per-step breakdown when the configured `steps` attribute resolves to an array. */
1734
+ stepDetails: unknown[] | null;
1735
+ finishReason: string | null;
1736
+ latencyMs: number | null;
1737
+ input: unknown;
1738
+ output: unknown;
1739
+ reasoning: unknown;
1740
+ toolCalls: unknown;
1741
+ metrics: LlmCallMetricValue[];
1742
+ warnings: EvalTraceSpanWarning[];
1743
+ error: EvalTraceSpanError | null;
1744
+ };
1745
+ /**
1746
+ * Filter `spans` down to LLM calls and project each one to the structured
1747
+ * shape consumed by the LLM calls tab.
1748
+ *
1749
+ * Spans whose `kind` is not in `config.kinds` are dropped. Structured fields
1750
+ * (`model`, token counts, cost, etc.) are read via `getNestedAttribute` from
1751
+ * the configured paths, with safe coercion to `string | null` / `number |
1752
+ * null`. `totalTokens` falls back to a sum of input + output + cached when no
1753
+ * explicit total attribute is present. The `steps` attribute path may resolve
1754
+ * to either a number (rendered as the inference-round count) or an array of
1755
+ * per-step detail objects (rendered as a Steps section in the body, with
1756
+ * `stepCount` derived from the array length). `latencyMs` is `null` while the
1757
+ * span is still running. User-defined `metrics` whose path resolves to
1758
+ * `undefined` are dropped, but `null`, `0`, and `false` are preserved as
1759
+ * legitimate values worth displaying. Original span order is preserved so the
1760
+ * LLM calls tab matches the ordering in the Trace tab.
1761
+ */
1762
+ declare function extractLlmCalls(spans: EvalTraceSpan[], config: ResolvedLlmCallsConfig): LlmCallEntry[];
1763
+ //#endregion
1519
1764
  //#region ../shared/src/schemas/cache.d.ts
1520
1765
  /**
1521
1766
  * Mode that controls how the cache is consulted for a given run.
@@ -1535,6 +1780,7 @@ type CacheMode = z$1.infer<typeof cacheModeSchema>;
1535
1780
  declare const spanCacheOptionsSchema: z$1.ZodObject<{
1536
1781
  key: z$1.ZodUnknown;
1537
1782
  namespace: z$1.ZodOptional<z$1.ZodString>;
1783
+ serializeFileBytes: z$1.ZodOptional<z$1.ZodBoolean>;
1538
1784
  }, z$1.core.$strip>;
1539
1785
  /** Options accepted by an `evalTracer.span` call to opt the span into caching. */
1540
1786
  type SpanCacheOptions = z$1.infer<typeof spanCacheOptionsSchema>;
@@ -1545,6 +1791,38 @@ declare const cacheOperationTypeSchema: z$1.ZodEnum<{
1545
1791
  }>;
1546
1792
  /** Category of operation stored in the eval cache. */
1547
1793
  type CacheOperationType = z$1.infer<typeof cacheOperationTypeSchema>;
1794
+ /** Status of a cache lookup recorded on a span or case scope. */
1795
+ declare const cacheStatusSchema: z$1.ZodEnum<{
1796
+ bypass: "bypass";
1797
+ refresh: "refresh";
1798
+ hit: "hit";
1799
+ miss: "miss";
1800
+ }>;
1801
+ /** Status of a cache lookup recorded on a span or case scope. */
1802
+ type CacheStatus = z$1.infer<typeof cacheStatusSchema>;
1803
+ /**
1804
+ * Reference to a value-cache lookup performed via `evalTracer.cache(...)`.
1805
+ *
1806
+ * Refs are appended to the active span's `cache.refs` attribute when the call
1807
+ * happens inside a `traceSpan(...)` body, or to the case scope's
1808
+ * `caseCacheRefs` bucket when the call is made directly from the case body.
1809
+ */
1810
+ declare const traceCacheRefSchema: z$1.ZodObject<{
1811
+ type: z$1.ZodLiteral<"value">;
1812
+ name: z$1.ZodString;
1813
+ namespace: z$1.ZodString;
1814
+ key: z$1.ZodString;
1815
+ status: z$1.ZodEnum<{
1816
+ bypass: "bypass";
1817
+ refresh: "refresh";
1818
+ hit: "hit";
1819
+ miss: "miss";
1820
+ }>;
1821
+ storedAt: z$1.ZodOptional<z$1.ZodString>;
1822
+ age: z$1.ZodOptional<z$1.ZodNumber>;
1823
+ }, z$1.core.$strip>;
1824
+ /** Reference to a value-cache lookup performed via `evalTracer.cache(...)`. */
1825
+ type TraceCacheRef = z$1.infer<typeof traceCacheRefSchema>;
1548
1826
  /** Summary of a single persisted cache entry, used by list/delete endpoints. */
1549
1827
  declare const cacheListItemSchema: z$1.ZodObject<{
1550
1828
  key: z$1.ZodString;
@@ -1824,6 +2102,93 @@ declare const cacheFileSchema: z$1.ZodObject<{
1824
2102
  /** Persisted per-owner cache file contents. */
1825
2103
  type CacheFile = z$1.infer<typeof cacheFileSchema>;
1826
2104
  //#endregion
2105
+ //#region ../shared/src/utils/extractCacheHits.d.ts
2106
+ /**
2107
+ * Single cache-hit entry rendered as one row in the case drawer's
2108
+ * "Cache hits" tab.
2109
+ *
2110
+ * `origin === 'span'` rows came from a span's `cache.status` attribute or from
2111
+ * a `cache.refs` ref attached to a span body. `origin === 'caseRoot'` rows
2112
+ * came from `evalTracer.cache(...)` calls made directly from the case body
2113
+ * (no surrounding `traceSpan`), which would otherwise be invisible.
2114
+ */
2115
+ type CacheHitEntry = {
2116
+ id: string;
2117
+ source: 'span' | 'value';
2118
+ origin: 'span' | 'caseRoot';
2119
+ name: string;
2120
+ namespace: string;
2121
+ key: string;
2122
+ storedAt: string | undefined;
2123
+ age: number | undefined;
2124
+ spanId: string | undefined;
2125
+ };
2126
+ /**
2127
+ * Collect every `status === 'hit'` cache event recorded for a case run.
2128
+ *
2129
+ * Walks `spans` for span-level cache hits (`attributes['cache.status'] ===
2130
+ * 'hit'`) and per-span value-cache refs (`attributes['cache.refs']`), then
2131
+ * appends spanless value-cache refs persisted on the case scope. Non-hit
2132
+ * statuses (`miss`/`refresh`/`bypass`) are skipped — they remain visible
2133
+ * inline in the Trace tab.
2134
+ */
2135
+ declare function extractCacheHits(spans: EvalTraceSpan[], caseCacheRefs: TraceCacheRef[]): CacheHitEntry[];
2136
+ //#endregion
2137
+ //#region ../shared/src/schemas/sse.d.ts
2138
+ declare const sseEventTypeSchema: z$1.ZodEnum<{
2139
+ "discovery.updated": "discovery.updated";
2140
+ "run.started": "run.started";
2141
+ "run.summary": "run.summary";
2142
+ "case.started": "case.started";
2143
+ "case.updated": "case.updated";
2144
+ "case.finished": "case.finished";
2145
+ "trace.span": "trace.span";
2146
+ "run.finished": "run.finished";
2147
+ "run.cancelled": "run.cancelled";
2148
+ "run.error": "run.error";
2149
+ }>;
2150
+ /** Server-sent event name emitted by the runner or backend. */
2151
+ type SseEventType = z$1.infer<typeof sseEventTypeSchema>;
2152
+ /** Schema for the SSE envelope used to stream run updates to clients. */
2153
+ declare const sseEnvelopeSchema: z$1.ZodObject<{
2154
+ type: z$1.ZodString;
2155
+ runId: z$1.ZodOptional<z$1.ZodString>;
2156
+ timestamp: z$1.ZodString;
2157
+ payload: z$1.ZodUnknown;
2158
+ }, z$1.core.$strip>;
2159
+ /** Wire format for a streamed event emitted during eval execution. */
2160
+ type SseEnvelope = z$1.infer<typeof sseEnvelopeSchema>;
2161
+ //#endregion
2162
+ //#region ../shared/src/schemas/api.d.ts
2163
+ /** Schema for the API request that starts a new eval run. */
2164
+ declare const createRunRequestSchema: z$1.ZodObject<{
2165
+ target: z$1.ZodObject<{
2166
+ mode: z$1.ZodEnum<{
2167
+ all: "all";
2168
+ evalIds: "evalIds";
2169
+ caseIds: "caseIds";
2170
+ }>;
2171
+ evalIds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
2172
+ caseIds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
2173
+ }, z$1.core.$strip>;
2174
+ trials: z$1.ZodNumber;
2175
+ cache: z$1.ZodOptional<z$1.ZodObject<{
2176
+ mode: z$1.ZodDefault<z$1.ZodEnum<{
2177
+ use: "use";
2178
+ bypass: "bypass";
2179
+ refresh: "refresh";
2180
+ }>>;
2181
+ }, z$1.core.$strip>>;
2182
+ }, z$1.core.$strip>;
2183
+ /** Request payload accepted by the run creation endpoint. */
2184
+ type CreateRunRequest = z$1.infer<typeof createRunRequestSchema>;
2185
+ /** Schema for updating a UI-authored manual score on one persisted case. */
2186
+ declare const updateManualScoreRequestSchema: z$1.ZodObject<{
2187
+ value: z$1.ZodNullable<z$1.ZodNumber>;
2188
+ }, z$1.core.$strip>;
2189
+ /** Request payload accepted by the manual score update endpoint. */
2190
+ type UpdateManualScoreRequest = z$1.infer<typeof updateManualScoreRequestSchema>;
2191
+ //#endregion
1827
2192
  //#region ../sdk/src/types.d.ts
1828
2193
  /** Single authored eval case with its stable identifier and input payload. */
1829
2194
  type EvalCase<TInput> = {
@@ -2077,6 +2442,12 @@ type EvalCaseScope = {
2077
2442
  */
2078
2443
  replayingDepth: number; /** Runner-provided cache adapter + mode; absent when caching is disabled. */
2079
2444
  cacheContext: CacheScopeContext | undefined;
2445
+ /**
2446
+ * Value-cache refs recorded by `evalTracer.cache(...)` calls made with no
2447
+ * active span. Span-bound refs are appended to the owning span's
2448
+ * `cache.refs` attribute instead.
2449
+ */
2450
+ caseCacheRefs: TraceCacheRef[];
2080
2451
  };
2081
2452
  /** Error thrown when an eval assertion fails during case execution. */
2082
2453
  declare class EvalAssertionError extends Error {
@@ -2171,43 +2542,46 @@ type CaptureEvalSpanErrorOptions = {
2171
2542
  level?: CaptureEvalSpanErrorLevel;
2172
2543
  };
2173
2544
  //#endregion
2174
- //#region ../sdk/src/cacheRecording.d.ts
2175
- /** Cache reference appended to the active span by `evalTracer.cache(...)`. */
2176
- type TraceCacheRef = {
2177
- type: 'value';
2178
- name: string;
2179
- namespace: string;
2180
- key: string;
2181
- status: 'hit' | 'miss' | 'refresh' | 'bypass';
2182
- storedAt?: string;
2183
- age?: number;
2184
- };
2185
- //#endregion
2186
2545
  //#region ../sdk/src/valueCache.d.ts
2187
2546
  /** Info accepted by `evalTracer.cache(info, fn)` for spanless value caching. */
2188
2547
  type TraceCacheInfo = {
2189
2548
  /** Display name used for cache listings and the default namespace. */name: string; /** Arbitrary JSON-safe value used to derive the cache key. */
2190
2549
  key: unknown; /** Override the default namespace (`${evalId}__${name}`). */
2191
2550
  namespace?: string;
2551
+ /**
2552
+ * Include native `Blob`/`File` bytes in the cache key. By default only stable
2553
+ * metadata (`type`, `size`, plus `name`/`lastModified` for `File`) is used.
2554
+ */
2555
+ serializeFileBytes?: boolean;
2192
2556
  };
2193
2557
  //#endregion
2194
2558
  //#region ../sdk/src/cacheKey.d.ts
2559
+ /** Components folded into a deterministic cache key hash. */
2195
2560
  type CacheKeyHashInput = {
2196
- namespace: string;
2197
- codeFingerprint: string;
2561
+ /** Cache namespace, usually derived from the eval id and operation name. */namespace: string; /** Eval source fingerprint used to invalidate cache entries on code edits. */
2562
+ codeFingerprint: string; /** User-authored cache key value. */
2198
2563
  key: unknown;
2199
2564
  };
2565
+ /** Optional controls for cache key hashing. */
2566
+ type CacheKeyHashOptions = {
2567
+ /**
2568
+ * When true, native `Blob` and `File` values are read asynchronously and
2569
+ * hashed by bytes plus stable metadata. Defaults to metadata-only hashing.
2570
+ */
2571
+ serializeFileBytes?: boolean;
2572
+ };
2200
2573
  /**
2201
2574
  * Hash the components of a cache key into a deterministic hex digest.
2202
2575
  *
2203
- * Native `Blob` and `File` values are read asynchronously and hashed by
2204
- * content. Use `hashCacheKeySync` only when the key contains no async values.
2576
+ * Native `Blob` and `File` values use stable metadata by default. Pass
2577
+ * `serializeFileBytes: true` to read them asynchronously and include their byte
2578
+ * hash in the key.
2205
2579
  */
2206
- declare function hashCacheKey(input: CacheKeyHashInput): Promise<string>;
2580
+ declare function hashCacheKey(input: CacheKeyHashInput, options?: CacheKeyHashOptions): Promise<string>;
2207
2581
  /**
2208
2582
  * Synchronously hash cache key components. This supports JSON-like data and
2209
2583
  * in-memory binary values such as `Buffer`, `ArrayBuffer`, and typed arrays,
2210
- * but cannot content-hash native `Blob` or `File` values.
2584
+ * plus stable metadata for native `Blob` and `File` values.
2211
2585
  */
2212
2586
  declare function hashCacheKeySync(input: CacheKeyHashInput): string;
2213
2587
  //#endregion
@@ -2296,8 +2670,8 @@ type TraceSpanInfoUncached = TraceSpanInfoBase & {
2296
2670
  /**
2297
2671
  * Info accepted by `evalTracer.span(info, fn)` when opting in to caching.
2298
2672
  *
2299
- * Cached spans return `Promise<unknown>` because the replayed value comes from
2300
- * a JSON round-trip on cache hit. Narrow the value yourself when you need a
2673
+ * Cached spans return `Promise<unknown>` because the replayed value is revived
2674
+ * from persisted cache data on hit. Narrow the value yourself when you need a
2301
2675
  * typed return.
2302
2676
  */
2303
2677
  type TraceSpanInfoCached = TraceSpanInfoBase & {
@@ -2388,9 +2762,23 @@ type EvalRunner = {
2388
2762
  subscribe(runId: string, listener: (event: SseEnvelope) => void): () => void; /** Subscribe to discovery updates triggered by file changes or manual refresh. */
2389
2763
  subscribeDiscovery(listener: (event: SseEnvelope) => void): () => void; /** Stop background filesystem watchers owned by this runner instance. */
2390
2764
  close(): Promise<void>; /** Resolve the workspace root backing this runner instance. */
2391
- getWorkspaceRoot(): string; /** Resolve a persisted artifact path when artifact storage is supported. */
2765
+ getWorkspaceRoot(): string;
2766
+ /**
2767
+ * Resolved LLM-calls config used by the UI to derive the LLM calls tab.
2768
+ *
2769
+ * Returns the workspace's `llmCalls` config block from
2770
+ * `agent-evals.config.ts` with all defaults applied.
2771
+ */
2772
+ getLlmCallsConfig(): ResolvedLlmCallsConfig; /** Resolve a persisted artifact path when artifact storage is supported. */
2392
2773
  getArtifactPath(artifactId: string): string | undefined; /** Return summaries for every persisted cache entry in the workspace. */
2393
2774
  listCache(): Promise<CacheListItem[]>;
2775
+ /**
2776
+ * Return the full persisted cache entry for `namespace` + `key`, including
2777
+ * its recording. Returns `null` when no entry matches. Used by the case
2778
+ * drawer's Cache hits tab to lazily fetch the cached return value when a
2779
+ * row is expanded.
2780
+ */
2781
+ getCacheEntry(namespace: string, key: string): Promise<CacheEntry | null>;
2394
2782
  /**
2395
2783
  * Remove cache entries matching `filter`, or all entries when no filter is
2396
2784
  * supplied.
@@ -2445,4 +2833,4 @@ declare function createRunner({
2445
2833
  */
2446
2834
  declare function runCli(argv: string[]): Promise<void>;
2447
2835
  //#endregion
2448
- export { type AgentEvalsConfig, type AssertionFailure, type CacheAdapter, type CacheEntry, type CacheFile, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, type DerivedStatus, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type NumberDisplayOptions, type RepoFileRef, type RunArtifactRef, type RunInEvalScopeOptions, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, agentEvalsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, 
cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, mergeEvalOutput, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, runArtifactRefSchema, runCli, runInEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
2836
+ export { type AgentEvalsConfig, type AssertionFailure, type CacheAdapter, type CacheEntry, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_LLM_CALLS_CONFIG, type DerivedStatus, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallsConfigInput, type NumberDisplayOptions, type RepoFileRef, type ResolvedLlmCallMetric, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type 
TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, agentEvalsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallsConfigSchema, mergeEvalOutput, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, 
traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };