@ls-stack/agent-eval 0.26.2 → 0.26.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,7 +25,7 @@
25
25
  href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
26
26
  rel="stylesheet"
27
27
  />
28
- <script type="module" crossorigin src="/assets/index-CFF1eYUm.js"></script>
28
+ <script type="module" crossorigin src="/assets/index-DR2haqvV.js"></script>
29
29
  <link rel="stylesheet" crossorigin href="/assets/index-DOXT0Y9V.css">
30
30
  </head>
31
31
  <body>
package/dist/bin.mjs CHANGED
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env node
2
- import { t as runCli } from "./cli-C4yumCXE.mjs";
2
+ import { t as runCli } from "./cli-T6FYE2Bk.mjs";
3
3
  import { spawn } from "node:child_process";
4
4
  //#region src/bin.ts
5
5
  const moduleMocksFlag = "--experimental-test-module-mocks";
@@ -1,4 +1,4 @@
1
- import { B as runSummarySchema, C as validateCharts, F as getEvalDisplayStatus, I as deriveScopedSummaryFromCases, P as getEvalTitle, S as normalizeScoreDef, Un as getEvalRegistry, _ as loadEvalModule, a as getLastRunStatuses, b as loadConfig, c as loadPersistedRunSnapshots, ct as getCaseRowEvalKey, d as persistRunState, f as recomputeEvalStatusesInRuns, g as deriveEvalFreshness, h as resolveArtifactPath, i as generateRunId, l as nextShortIdFromSnapshots, m as runTouchesEval, n as getTargetEvalKeys, nt as resolveLlmCallsConfig, o as getLatestRunInfos, ot as buildEvalKey, p as recomputePersistedCaseStatus, s as loadPersistedRunSnapshot, st as getCaseRowCaseKey, tt as resolveApiCallsConfig, u as persistCaseDetail, v as parseEvalDiscovery, w as createFsCacheStore, x as buildDeclaredColumnDefs, y as resolveEvalDefaultConfig } from "./runOrchestration-B31SV_Bq.mjs";
1
+ import { B as runSummarySchema, C as validateCharts, F as getEvalDisplayStatus, I as deriveScopedSummaryFromCases, P as getEvalTitle, S as normalizeScoreDef, Un as getEvalRegistry, _ as loadEvalModule, a as getLastRunStatuses, b as loadConfig, c as loadPersistedRunSnapshots, ct as getCaseRowEvalKey, d as persistRunState, f as recomputeEvalStatusesInRuns, g as deriveEvalFreshness, h as resolveArtifactPath, i as generateRunId, l as nextShortIdFromSnapshots, m as runTouchesEval, n as getTargetEvalKeys, nt as resolveLlmCallsConfig, o as getLatestRunInfos, ot as buildEvalKey, p as recomputePersistedCaseStatus, s as loadPersistedRunSnapshot, st as getCaseRowCaseKey, tt as resolveApiCallsConfig, u as persistCaseDetail, v as parseEvalDiscovery, w as createFsCacheStore, x as buildDeclaredColumnDefs, y as resolveEvalDefaultConfig } from "./runOrchestration-CW_Fzr2c.mjs";
2
2
  import { createHash } from "node:crypto";
3
3
  import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
4
4
  import { dirname, join, relative, resolve } from "node:path";
@@ -1041,8 +1041,8 @@ async function commandApp(args) {
1041
1041
  const { serve } = await import("@hono/node-server");
1042
1042
  const bundledWebDist = resolve(currentDir, "apps/web/dist");
1043
1043
  if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
1044
- const appModule = await import("./app-BxR4SbjA.mjs");
1045
- const runnerModule = await import("./runner-D5QMY3B_.mjs");
1044
+ const appModule = await import("./app-C-yzV9J8.mjs");
1045
+ const runnerModule = await import("./runner-DesCrD7Z.mjs");
1046
1046
  if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
1047
1047
  if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
1048
1048
  await runnerModule.initRunner();
package/dist/index.d.mts CHANGED
@@ -2288,7 +2288,7 @@ type CacheMode = z$1.infer<typeof cacheModeSchema>;
2288
2288
  /** Options accepted by an `evalTracer.span` call to opt the span into caching. */
2289
2289
  declare const spanCacheOptionsSchema: z$1.ZodObject<{
2290
2290
  key: z$1.ZodUnknown;
2291
- namespace: z$1.ZodOptional<z$1.ZodString>;
2291
+ namespace: z$1.ZodString;
2292
2292
  serializeFileBytes: z$1.ZodOptional<z$1.ZodBoolean>;
2293
2293
  }, z$1.core.$strip>;
2294
2294
  /** Options accepted by an `evalTracer.span` call to opt the span into caching. */
@@ -3168,7 +3168,7 @@ type CacheAdapter = {
3168
3168
  type CacheScopeContext = {
3169
3169
  adapter: CacheAdapter;
3170
3170
  mode: CacheMode;
3171
- evalId: string; /** Hash of the eval source file; used to invalidate on code changes. */
3171
+ evalId: string; /** Hash of the eval source file, stored as cache metadata for inspection. */
3172
3172
  codeFingerprint: string;
3173
3173
  };
3174
3174
  /** Active recording frame captured while a cached operation body executes. */
@@ -3419,8 +3419,7 @@ type TraceCacheInfo = {
3419
3419
  //#region ../sdk/src/cacheKey.d.ts
3420
3420
  /** Components folded into a deterministic cache key hash. */
3421
3421
  type CacheKeyHashInput = {
3422
- /** Cache namespace, usually derived from the eval id and operation name. */namespace: string; /** Eval source fingerprint used to invalidate cache entries on code edits. */
3423
- codeFingerprint: string; /** User-authored cache key value. */
3422
+ /** Cache namespace, usually derived from the eval id and operation name. */namespace: string; /** User-authored cache key value. */
3424
3423
  key: unknown;
3425
3424
  };
3426
3425
  /** Optional controls for cache key hashing. */
package/dist/index.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { $ as llmCallsConfigSchema, $t as traceSpanKindSchema, A as extractApiCalls, An as getEvalStartTime, At as evalChartTypeSchema, B as runSummarySchema, Bn as startEvalBackgroundJob, Bt as cacheRecordingOpSchema, Cn as advanceEvalTime, Ct as evalChartAggregateSchema, D as sseEnvelopeSchema, Dn as evalLog, Dt as evalChartConfigSchema, E as updateManualScoreRequestSchema, En as evalAssert, Et as evalChartColorSchema, F as getEvalDisplayStatus, Fn as runInEvalRuntimeScope, Ft as cacheEntryWithDebugKeySchema, G as apiCallMetricPlacementSchema, Gt as traceCacheRefSchema, H as DEFAULT_LLM_CALLS_CONFIG, Hn as defineEval, Ht as cacheStatusSchema, I as deriveScopedSummaryFromCases, In as runInEvalScope, It as cacheFileSchema, J as defaultConfigKeySchema, Jt as traceAttributeDisplayPlacementSchema, K as apiCallMetricSchema, Kt as traceAttributeDisplayFormatSchema, L as deriveStatusFromCaseRows, Ln as runInExistingEvalScope, Lt as cacheListItemSchema, M as applyDerivedCallAttributes, Mn as isInEvalScope, Mt as cacheDebugKeyEntrySchema, N as getNestedAttribute, Nn as mergeEvalOutput, Nt as cacheDebugKeyFileSchema, O as extractCacheEntries, On as getCurrentScope, Ot as evalChartMetricSchema, P as getEvalTitle, Pn as nextEvalId, Pt as cacheEntrySchema, Q as llmCallPricingSchema, Qt as traceSpanErrorSchema, R as deriveStatusFromChildStatuses, Rn as setEvalOutput, Rt as cacheModeSchema, Sn as EvalAssertionError, St as scoreTraceSchema, T as createRunRequestSchema, Tt as evalChartBuiltinMetricSchema, U as agentEvalsConfigSchema, Un as getEvalRegistry, Ut as serializedCacheSpanSchema, V as DEFAULT_API_CALLS_CONFIG, Vn as repoFile, Vt as cacheRecordingSchema, W as apiCallMetricFormatSchema, Wt as spanCacheOptionsSchema, X as llmCallMetricPlacementSchema, Xt as traceDisplayConfigSchema, Y as llmCallMetricFormatSchema, Yt as traceAttributeDisplaySchema, Z as llmCallMetricSchema, Zt as traceDisplayInputConfigSchema, _n as hashCacheKeySync, _t as evalSummarySchema, an as columnKindSchema, at as buildCaseKey, bn as serializeCacheRecording, bt as runLogLocationSchema, cn as numberDisplayOptionsSchema, ct as getCaseRowEvalKey, dn as z, dt as caseRowSchema, en as traceSpanSchema, et as removeDefaultConfigSchema, fn as buildTraceTree, ft as discoveryIssueSchema, gn as hashCacheKey, gt as evalStatsConfigSchema, hn as evalTracer, ht as evalStatItemSchema, in as columnFormatSchema, it as trialSelectionModeSchema, j as extractLlmCalls, jn as incrementEvalOutput, jt as evalChartsConfigSchema, k as extractCacheHits, kn as getEvalCaseInput, kt as evalChartTooltipExtraSchema, ln as repoFileRefSchema, lt as assertionFailureSchema, mn as evalSpan, mt as evalStatAggregateSchema, nn as cellValueSchema, nt as resolveLlmCallsConfig, on as fileRefSchema, ot as buildEvalKey, pn as captureEvalSpanError, pt as evalFreshnessStatusSchema, q as apiCallsConfigSchema, qt as traceAttributeDisplayInputSchema, rn as columnDefSchema, rt as runLogsConfigSchema, sn as jsonCellSchema, st as getCaseRowCaseKey, tn as traceSpanWarningSchema, tt as resolveApiCallsConfig, un as runArtifactRefSchema, ut as caseDetailSchema, vn as deserializeCacheRecording, vt as runLogEntrySchema, wn as appendToEvalOutput, wt as evalChartAxisSchema, xn as serializeCacheValue, xt as runLogPhaseSchema, yn as deserializeCacheValue, yt as runLogLevelSchema, z as runManifestSchema, zn as setScopeCacheContext, zt as cacheOperationTypeSchema } from "./runOrchestration-B31SV_Bq.mjs";
2
- import { n as createRunner, t as runCli } from "./cli-C4yumCXE.mjs";
3
- import "./src-CLMrRle2.mjs";
1
+ import { $ as llmCallsConfigSchema, $t as traceSpanKindSchema, A as extractApiCalls, An as getEvalStartTime, At as evalChartTypeSchema, B as runSummarySchema, Bn as startEvalBackgroundJob, Bt as cacheRecordingOpSchema, Cn as advanceEvalTime, Ct as evalChartAggregateSchema, D as sseEnvelopeSchema, Dn as evalLog, Dt as evalChartConfigSchema, E as updateManualScoreRequestSchema, En as evalAssert, Et as evalChartColorSchema, F as getEvalDisplayStatus, Fn as runInEvalRuntimeScope, Ft as cacheEntryWithDebugKeySchema, G as apiCallMetricPlacementSchema, Gt as traceCacheRefSchema, H as DEFAULT_LLM_CALLS_CONFIG, Hn as defineEval, Ht as cacheStatusSchema, I as deriveScopedSummaryFromCases, In as runInEvalScope, It as cacheFileSchema, J as defaultConfigKeySchema, Jt as traceAttributeDisplayPlacementSchema, K as apiCallMetricSchema, Kt as traceAttributeDisplayFormatSchema, L as deriveStatusFromCaseRows, Ln as runInExistingEvalScope, Lt as cacheListItemSchema, M as applyDerivedCallAttributes, Mn as isInEvalScope, Mt as cacheDebugKeyEntrySchema, N as getNestedAttribute, Nn as mergeEvalOutput, Nt as cacheDebugKeyFileSchema, O as extractCacheEntries, On as getCurrentScope, Ot as evalChartMetricSchema, P as getEvalTitle, Pn as nextEvalId, Pt as cacheEntrySchema, Q as llmCallPricingSchema, Qt as traceSpanErrorSchema, R as deriveStatusFromChildStatuses, Rn as setEvalOutput, Rt as cacheModeSchema, Sn as EvalAssertionError, St as scoreTraceSchema, T as createRunRequestSchema, Tt as evalChartBuiltinMetricSchema, U as agentEvalsConfigSchema, Un as getEvalRegistry, Ut as serializedCacheSpanSchema, V as DEFAULT_API_CALLS_CONFIG, Vn as repoFile, Vt as cacheRecordingSchema, W as apiCallMetricFormatSchema, Wt as spanCacheOptionsSchema, X as llmCallMetricPlacementSchema, Xt as traceDisplayConfigSchema, Y as llmCallMetricFormatSchema, Yt as traceAttributeDisplaySchema, Z as llmCallMetricSchema, Zt as traceDisplayInputConfigSchema, _n as hashCacheKeySync, _t as evalSummarySchema, an as columnKindSchema, at as buildCaseKey, bn as serializeCacheRecording, bt as runLogLocationSchema, cn as numberDisplayOptionsSchema, ct as getCaseRowEvalKey, dn as z, dt as caseRowSchema, en as traceSpanSchema, et as removeDefaultConfigSchema, fn as buildTraceTree, ft as discoveryIssueSchema, gn as hashCacheKey, gt as evalStatsConfigSchema, hn as evalTracer, ht as evalStatItemSchema, in as columnFormatSchema, it as trialSelectionModeSchema, j as extractLlmCalls, jn as incrementEvalOutput, jt as evalChartsConfigSchema, k as extractCacheHits, kn as getEvalCaseInput, kt as evalChartTooltipExtraSchema, ln as repoFileRefSchema, lt as assertionFailureSchema, mn as evalSpan, mt as evalStatAggregateSchema, nn as cellValueSchema, nt as resolveLlmCallsConfig, on as fileRefSchema, ot as buildEvalKey, pn as captureEvalSpanError, pt as evalFreshnessStatusSchema, q as apiCallsConfigSchema, qt as traceAttributeDisplayInputSchema, rn as columnDefSchema, rt as runLogsConfigSchema, sn as jsonCellSchema, st as getCaseRowCaseKey, tn as traceSpanWarningSchema, tt as resolveApiCallsConfig, un as runArtifactRefSchema, ut as caseDetailSchema, vn as deserializeCacheRecording, vt as runLogEntrySchema, wn as appendToEvalOutput, wt as evalChartAxisSchema, xn as serializeCacheValue, xt as runLogPhaseSchema, yn as deserializeCacheValue, yt as runLogLevelSchema, z as runManifestSchema, zn as setScopeCacheContext, zt as cacheOperationTypeSchema } from "./runOrchestration-CW_Fzr2c.mjs";
2
+ import { n as createRunner, t as runCli } from "./cli-T6FYE2Bk.mjs";
3
+ import "./src-CbVnqjW3.mjs";
4
4
  export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildCaseKey, buildEvalKey, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, discoveryIssueSchema, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCaseRowCaseKey, getCaseRowEvalKey, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
package/dist/runChild.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { B as runSummarySchema, T as createRunRequestSchema, Tn as configureEvalRunLogs, b as loadConfig, gt as evalStatsConfigSchema, jt as evalChartsConfigSchema, ot as buildEvalKey, r as getTargetEvals$1, rn as columnDefSchema, t as executeRun, v as parseEvalDiscovery, w as createFsCacheStore, z as runManifestSchema } from "./runOrchestration-B31SV_Bq.mjs";
1
+ import { B as runSummarySchema, T as createRunRequestSchema, Tn as configureEvalRunLogs, b as loadConfig, gt as evalStatsConfigSchema, jt as evalChartsConfigSchema, ot as buildEvalKey, r as getTargetEvals$1, rn as columnDefSchema, t as executeRun, v as parseEvalDiscovery, w as createFsCacheStore, z as runManifestSchema } from "./runOrchestration-CW_Fzr2c.mjs";
2
2
  import { createHash } from "node:crypto";
3
3
  import { readFile } from "node:fs/promises";
4
4
  import { relative } from "node:path";
@@ -1436,7 +1436,6 @@ function createTraceCache(generateSpanId) {
1436
1436
  const namespace = info.namespace ?? `${cacheCtx.evalId}__${info.name}`;
1437
1437
  const keyHash = await hashCacheKey({
1438
1438
  namespace,
1439
- codeFingerprint: cacheCtx.codeFingerprint,
1440
1439
  key: info.key
1441
1440
  }, { serializeFileBytes: info.serializeFileBytes === true });
1442
1441
  const activeSpan = scope.activeSpanStack.at(-1);
@@ -1879,10 +1878,9 @@ async function traceSpanInternal(info, fn) {
1879
1878
  const cacheCtx = scope.cacheContext;
1880
1879
  if (cacheOpts !== void 0 && cacheCtx !== void 0 && scope.replayingDepth === 0) {
1881
1880
  const ctx = cacheCtx;
1882
- const namespace = cacheOpts.namespace ?? `${ctx.evalId}__${info.name}`;
1881
+ const namespace = getRequiredSpanCacheNamespace(cacheOpts);
1883
1882
  const keyHash = await hashCacheKey({
1884
1883
  namespace,
1885
- codeFingerprint: ctx.codeFingerprint,
1886
1884
  key: cacheOpts.key
1887
1885
  }, { serializeFileBytes: cacheOpts.serializeFileBytes === true });
1888
1886
  mergeSpanAttributes(spanRecord, {
@@ -1966,6 +1964,12 @@ async function traceSpanInternal(info, fn) {
1966
1964
  scope.activeSpanStack.pop();
1967
1965
  }
1968
1966
  }
1967
+ function getRequiredSpanCacheNamespace(cacheOpts) {
1968
+ if (!isRecordLike$1(cacheOpts)) throw new Error("Cached spans require a non-empty cache.namespace");
1969
+ const namespace = cacheOpts.namespace;
1970
+ if (typeof namespace !== "string" || namespace.length === 0) throw new Error("Cached spans require a non-empty cache.namespace");
1971
+ return namespace;
1972
+ }
1969
1973
  /**
1970
1974
  * Trace builder used to create hierarchical spans and checkpoints during eval
1971
1975
  * execution.
@@ -2248,8 +2252,8 @@ const cacheModeSchema = z.enum([
2248
2252
  const spanCacheOptionsSchema = z.object({
2249
2253
  /** Arbitrary JSON-safe value used to derive the cache key. */
2250
2254
  key: z.unknown(),
2251
- /** Override the default namespace (`${evalId}__${spanName}`). */
2252
- namespace: z.string().optional(),
2255
+ /** Required cache namespace shared by span cache entries in the same domain. */
2256
+ namespace: z.string().min(1),
2253
2257
  /**
2254
2258
  * Include native `Blob`/`File` bytes in the cache key. By default only stable
2255
2259
  * metadata (`type`, `size`, plus `name`/`lastModified` for `File`) is used.
@@ -4197,7 +4201,7 @@ async function writeDebugKeyFile(debugDir, debugFile) {
4197
4201
  await mkdir(debugDir, { recursive: true });
4198
4202
  const filePath = ownerPath(debugDir, debugFile.owner);
4199
4203
  const tmpPath = `${filePath}.${process.pid.toString()}.tmp`;
4200
- await writeFile(tmpPath, JSON.stringify(debugFile));
4204
+ await writeFile(tmpPath, JSON.stringify(debugFile, null, 2));
4201
4205
  await rename(tmpPath, filePath);
4202
4206
  }
4203
4207
  function pruneEntries(entries, namespace, maxEntries, protectedKey) {
@@ -1,5 +1,5 @@
1
- import { n as createRunner } from "./cli-C4yumCXE.mjs";
2
- import "./src-CLMrRle2.mjs";
1
+ import { n as createRunner } from "./cli-T6FYE2Bk.mjs";
2
+ import "./src-CbVnqjW3.mjs";
3
3
  //#region ../../apps/server/src/runner.ts
4
4
  let runnerInstance = null;
5
5
  function getRunnerInstance() {
@@ -1,2 +1,2 @@
1
- import { n as initRunner, t as getRunnerInstance } from "./runner-LHN7L-xk.mjs";
1
+ import { n as initRunner, t as getRunnerInstance } from "./runner-BsRloAm3.mjs";
2
2
  export { getRunnerInstance, initRunner };
@@ -0,0 +1,3 @@
1
+ import "./runOrchestration-CW_Fzr2c.mjs";
2
+ import "./cli-T6FYE2Bk.mjs";
3
+ export {};
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ls-stack/agent-eval",
3
- "version": "0.26.2",
3
+ "version": "0.26.3",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "agent-evals": "./dist/bin.mjs"
@@ -87,7 +87,10 @@ export async function runRefundWorkflow(input: RefundInput) {
87
87
  {
88
88
  kind: 'llm',
89
89
  name: 'plan-refund',
90
- cache: { key: { prompt: input.message, model: 'gpt-4o-mini' } },
90
+ cache: {
91
+ namespace: 'refund-workflow__plan-refund',
92
+ key: { prompt: input.message, model: 'gpt-4o-mini' },
93
+ },
91
94
  },
92
95
  async () => {
93
96
  let text: string;
@@ -319,15 +322,18 @@ Their shapes live in the types; no need to memorize the option set.
319
322
 
320
323
  ## Cached operations
321
324
 
322
- Wrap a costly pure span in `cache: { key }` so later runs replay its recorded
323
- effects without re-executing:
325
+ Wrap a costly pure span in `cache: { namespace, key }` so later runs replay its
326
+ recorded effects without re-executing:
324
327
 
325
328
  ```ts
326
329
  await evalTracer.span(
327
330
  {
328
331
  kind: 'llm',
329
332
  name: 'plan-refund',
330
- cache: { key: { prompt: input.message, model: 'gpt-4o-mini' } },
333
+ cache: {
334
+ namespace: 'refund-workflow__plan-refund',
335
+ key: { prompt: input.message, model: 'gpt-4o-mini' },
336
+ },
331
337
  },
332
338
  async () => {
333
339
  const result = await llm.complete(input.message);
@@ -368,12 +374,13 @@ Mental model:
368
374
  namespace, and hit/miss status. When called directly from the case body
369
375
  (no surrounding span), the ref is recorded on the case detail's `cacheRefs`
370
376
  array.
371
- - The cache key folds in a source-file fingerprint, so editing the eval busts
372
- the cache automatically.
373
- - `cache.namespace` on spans or `namespace` on value caches can share entries
374
- across operations/evals, but the source-file fingerprint still participates
375
- in the final key. Shared namespaces are reusable across evals in the same
376
- file; evals in different files miss even with the same namespace and key.
377
+ - Cache identity is the namespace plus the authored key. Source-file
378
+ fingerprints are stored as metadata for inspection, but do not participate in
379
+ cache-key hashing.
380
+ - Cached spans require an explicit `cache.namespace`; value caches default to
381
+ `${evalId}__${name}` and can be overridden with `namespace`. Matching
382
+ namespaces share entries across operations/evals that use the same authored
383
+ key.
377
384
  - Authored eval ids are unique within one eval file. The exact eval identity is
378
385
  the workspace-relative file path plus eval id, so the same id can be reused in
379
386
  different files. Case ids must be unique within one eval; duplicate case ids
@@ -462,9 +469,9 @@ When adding or changing evals:
462
469
  formats from the `ColumnFormat` type.
463
470
  5. Promote high-signal span attributes with `traceDisplay` so they surface in
464
471
  the trace tree and detail pane.
465
- 6. Cache costly pure spans with `cache: { key }` and pure spanless values with
466
- `evalTracer.cache(...)`; never cache operations whose external side effects
467
- you depend on.
472
+ 6. Cache costly pure spans with `cache: { namespace, key }` and pure spanless
473
+ values with `evalTracer.cache(...)`; never cache operations whose external
474
+ side effects you depend on.
468
475
  7. Sanity-check after changes: `agent-evals list`, then
469
476
  `agent-evals run --eval <id>`; use `--file <path|glob>` to target one file
470
477
  when multiple files use the same eval id.
@@ -1,3 +0,0 @@
1
- import "./runOrchestration-B31SV_Bq.mjs";
2
- import "./cli-C4yumCXE.mjs";
3
- export {};