@ls-stack/agent-eval 0.52.2 → 0.52.3

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -25,7 +25,7 @@
25
25
  href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
26
26
  rel="stylesheet"
27
27
  />
28
- <script type="module" crossorigin src="/assets/index-gJHi1MdH.js"></script>
28
+ <script type="module" crossorigin src="/assets/index-D9HUAH8K.js"></script>
29
29
  <link rel="stylesheet" crossorigin href="/assets/index-BHc4gfUO.css">
30
30
  </head>
31
31
  <body>
package/dist/bin.mjs CHANGED
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env node
2
- import { t as runCli } from "./cli-Cvs7tc2v.mjs";
2
+ import { t as runCli } from "./cli-DbVFgRO3.mjs";
3
3
  import { spawn } from "node:child_process";
4
4
  //#region src/bin.ts
5
5
  const moduleMocksFlag = "--experimental-test-module-mocks";
@@ -1,4 +1,4 @@
1
- import { A as validateCharts, At as buildEvalKey, C as deriveEvalFreshness, Ct as getEvalDisplayStatus, D as loadConfig, Dt as runSummarySchema, E as resolveEvalDefaultConfig, Lt as getEvalRegistry, O as buildDeclaredColumnDefs, Ot as resolveApiCallsConfig, S as parseManualInputValues, St as getEvalTitle, T as parseEvalDiscovery, Tt as matchesTagsFilter, _ as recomputePersistedCaseStatus, a as validateTagsFilters, b as resolveArtifactPath, bt as applyDerivedCallAttributes, c as getLastRunStatuses, d as loadPersistedRunSnapshots, f as nextShortIdFromSnapshots, g as recomputeEvalStatusesInRuns, h as persistRunState, i as resolveEvalTags, j as createFsCacheStore, jt as getCaseRowCaseKey, k as normalizeScoreDef, kt as resolveLlmCallsConfig, l as getLatestRunInfos, m as deleteTemporaryRuns, n as getTargetEvalKeys, o as stripTerminalControlCodes, p as persistCaseDetail, s as generateRunId, u as loadPersistedRunSnapshot, v as runTouchesEval, w as loadEvalModule, wt as deriveScopedSummaryFromCases, x as buildManualInputDescriptor, y as resolveTracePresentation } from "./runOrchestration-o38J7uZO.mjs";
1
+ import { A as validateCharts, At as buildEvalKey, C as deriveEvalFreshness, Ct as getEvalDisplayStatus, D as loadConfig, Dt as runSummarySchema, E as resolveEvalDefaultConfig, Lt as getEvalRegistry, O as buildDeclaredColumnDefs, Ot as resolveApiCallsConfig, S as parseManualInputValues, St as getEvalTitle, T as parseEvalDiscovery, Tt as matchesTagsFilter, _ as recomputePersistedCaseStatus, a as validateTagsFilters, b as resolveArtifactPath, bt as applyDerivedCallAttributes, c as getLastRunStatuses, d as loadPersistedRunSnapshots, f as nextShortIdFromSnapshots, g as recomputeEvalStatusesInRuns, h as persistRunState, i as resolveEvalTags, j as createFsCacheStore, jt as getCaseRowCaseKey, k as normalizeScoreDef, kt as resolveLlmCallsConfig, l as getLatestRunInfos, m as deleteTemporaryRuns, n as getTargetEvalKeys, o as stripTerminalControlCodes, p as persistCaseDetail, s as generateRunId, u as loadPersistedRunSnapshot, v as runTouchesEval, w as loadEvalModule, wt as deriveScopedSummaryFromCases, x as buildManualInputDescriptor, y as resolveTracePresentation } from "./runOrchestration-DT6cje9E.mjs";
2
2
  import { copyFile, mkdir, readFile, rm, writeFile } from "node:fs/promises";
3
3
  import { basename, dirname, extname, isAbsolute, join, relative, resolve, sep } from "node:path";
4
4
  import { createHash, randomUUID } from "node:crypto";
@@ -2095,8 +2095,8 @@ async function commandApp(args) {
2095
2095
  const { serve } = await import("@hono/node-server");
2096
2096
  const bundledWebDist = resolve(currentDir, "apps/web/dist");
2097
2097
  if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
2098
- const appModule = await import("./app-CzLj4ZX0.mjs");
2099
- const runnerModule = await import("./runner-iWtmKx9z.mjs");
2098
+ const appModule = await import("./app-BsFcUIQp.mjs");
2099
+ const runnerModule = await import("./runner-CyTUvbHE.mjs");
2100
2100
  if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
2101
2101
  if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
2102
2102
  await runnerModule.initRunner();
package/dist/index.d.mts CHANGED
@@ -2568,6 +2568,22 @@ declare const scoreTraceSchema: z$1.ZodObject<{
2568
2568
  }>>;
2569
2569
  }, z$1.core.$strip>>>;
2570
2570
  }, z$1.core.$strip>;
2571
+ cacheRefs: z$1.ZodDefault<z$1.ZodArray<z$1.ZodObject<{
2572
+ type: z$1.ZodLiteral<"value">;
2573
+ name: z$1.ZodString;
2574
+ namespace: z$1.ZodString;
2575
+ key: z$1.ZodString;
2576
+ status: z$1.ZodEnum<{
2577
+ hit: "hit";
2578
+ miss: "miss";
2579
+ refresh: "refresh";
2580
+ bypass: "bypass";
2581
+ }>;
2582
+ read: z$1.ZodOptional<z$1.ZodBoolean>;
2583
+ stored: z$1.ZodOptional<z$1.ZodBoolean>;
2584
+ storedAt: z$1.ZodOptional<z$1.ZodString>;
2585
+ age: z$1.ZodOptional<z$1.ZodNumber>;
2586
+ }, z$1.core.$strip>>>;
2571
2587
  }, z$1.core.$strip>;
2572
2588
  /** Trace payload captured while computing one score for a case. */
2573
2589
  type ScoreTrace = z$1.infer<typeof scoreTraceSchema>;
@@ -2724,6 +2740,22 @@ declare const caseDetailSchema$1: z$1.ZodObject<{
2724
2740
  }>>;
2725
2741
  }, z$1.core.$strip>>>;
2726
2742
  }, z$1.core.$strip>;
2743
+ cacheRefs: z$1.ZodDefault<z$1.ZodArray<z$1.ZodObject<{
2744
+ type: z$1.ZodLiteral<"value">;
2745
+ name: z$1.ZodString;
2746
+ namespace: z$1.ZodString;
2747
+ key: z$1.ZodString;
2748
+ status: z$1.ZodEnum<{
2749
+ hit: "hit";
2750
+ miss: "miss";
2751
+ refresh: "refresh";
2752
+ bypass: "bypass";
2753
+ }>;
2754
+ read: z$1.ZodOptional<z$1.ZodBoolean>;
2755
+ stored: z$1.ZodOptional<z$1.ZodBoolean>;
2756
+ storedAt: z$1.ZodOptional<z$1.ZodString>;
2757
+ age: z$1.ZodOptional<z$1.ZodNumber>;
2758
+ }, z$1.core.$strip>>>;
2727
2759
  }, z$1.core.$strip>>>;
2728
2760
  columns: z$1.ZodRecord<z$1.ZodString, z$1.ZodUnion<readonly [z$1.ZodType<string | number | boolean | Record<string, unknown> | unknown[] | null, unknown, z$1.core.$ZodTypeInternals<string | number | boolean | Record<string, unknown> | unknown[] | null, unknown>>, z$1.ZodUnion<readonly [z$1.ZodObject<{
2729
2761
  source: z$1.ZodLiteral<"repo">;
@@ -5566,6 +5598,22 @@ declare const caseDetailSchema: z$1.ZodObject<{
5566
5598
  }>>;
5567
5599
  }, z$1.core.$strip>>>;
5568
5600
  }, z$1.core.$strip>;
5601
+ cacheRefs: z$1.ZodDefault<z$1.ZodArray<z$1.ZodObject<{
5602
+ type: z$1.ZodLiteral<"value">;
5603
+ name: z$1.ZodString;
5604
+ namespace: z$1.ZodString;
5605
+ key: z$1.ZodString;
5606
+ status: z$1.ZodEnum<{
5607
+ hit: "hit";
5608
+ miss: "miss";
5609
+ refresh: "refresh";
5610
+ bypass: "bypass";
5611
+ }>;
5612
+ read: z$1.ZodOptional<z$1.ZodBoolean>;
5613
+ stored: z$1.ZodOptional<z$1.ZodBoolean>;
5614
+ storedAt: z$1.ZodOptional<z$1.ZodString>;
5615
+ age: z$1.ZodOptional<z$1.ZodNumber>;
5616
+ }, z$1.core.$strip>>>;
5569
5617
  }, z$1.core.$strip>>>;
5570
5618
  columns: z$1.ZodRecord<z$1.ZodString, z$1.ZodUnion<readonly [z$1.ZodType<string | number | boolean | Record<string, unknown> | unknown[] | null, unknown, z$1.core.$ZodTypeInternals<string | number | boolean | Record<string, unknown> | unknown[] | null, unknown>>, z$1.ZodUnion<readonly [z$1.ZodObject<{
5571
5619
  source: z$1.ZodLiteral<"repo">;
package/dist/index.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { $ as getCurrentScope, B as deserializeCacheValue, F as evalSpan, G as readManualInputFile, H as serializeCacheValue, I as evalTracer, J as appendToEvalOutput, K as evalExpect, L as hashCacheKey, Lt as getEvalRegistry, M as z, N as buildTraceTree, P as captureEvalSpanError, Q as evalTime, R as hashCacheKeySync, U as repoFile, V as serializeCacheRecording, W as manualInputFileValueSchema, X as evalAssert, Z as evalLog, _t as extractLlmCalls, at as nextEvalId, ct as runInExistingEvalScope, dt as startEvalBackgroundJob, et as getEvalCaseInput, gt as extractApiCalls, ht as extractCacheHits, it as mergeEvalOutput, lt as setEvalOutput, mt as extractCacheEntries, nt as isInEvalScope, ot as runInEvalRuntimeScope, q as EvalAssertionError, st as runInEvalScope, tt as incrementEvalOutput, ut as setScopeCacheContext, vt as simulateLlmCallCost, xt as getNestedAttribute, yt as simulateTokenAllocation, z as deserializeCacheRecording } from "./runOrchestration-o38J7uZO.mjs";
2
- import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-Cvs7tc2v.mjs";
3
- import { n as matchesEvalTags, t as defineEval } from "./src-Jahivm6d.mjs";
1
+ import { $ as getCurrentScope, B as deserializeCacheValue, F as evalSpan, G as readManualInputFile, H as serializeCacheValue, I as evalTracer, J as appendToEvalOutput, K as evalExpect, L as hashCacheKey, Lt as getEvalRegistry, M as z, N as buildTraceTree, P as captureEvalSpanError, Q as evalTime, R as hashCacheKeySync, U as repoFile, V as serializeCacheRecording, W as manualInputFileValueSchema, X as evalAssert, Z as evalLog, _t as extractLlmCalls, at as nextEvalId, ct as runInExistingEvalScope, dt as startEvalBackgroundJob, et as getEvalCaseInput, gt as extractApiCalls, ht as extractCacheHits, it as mergeEvalOutput, lt as setEvalOutput, mt as extractCacheEntries, nt as isInEvalScope, ot as runInEvalRuntimeScope, q as EvalAssertionError, st as runInEvalScope, tt as incrementEvalOutput, ut as setScopeCacheContext, vt as simulateLlmCallCost, xt as getNestedAttribute, yt as simulateTokenAllocation, z as deserializeCacheRecording } from "./runOrchestration-DT6cje9E.mjs";
2
+ import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-DbVFgRO3.mjs";
3
+ import { n as matchesEvalTags, t as defineEval } from "./src-DlvYXPxG.mjs";
4
4
  export { EvalAssertionError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
package/dist/runChild.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { At as buildEvalKey, D as loadConfig, Dt as runSummarySchema, Et as runManifestSchema, Ft as columnDefSchema, Mt as evalStatsConfigSchema, Nt as manualInputDescriptorSchema, Pt as evalChartsConfigSchema, T as parseEvalDiscovery, Y as configureEvalRunLogs, ft as createRunRequestSchema, h as persistRunState, j as createFsCacheStore, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-o38J7uZO.mjs";
1
+ import { At as buildEvalKey, D as loadConfig, Dt as runSummarySchema, Et as runManifestSchema, Ft as columnDefSchema, Mt as evalStatsConfigSchema, Nt as manualInputDescriptorSchema, Pt as evalChartsConfigSchema, T as parseEvalDiscovery, Y as configureEvalRunLogs, ft as createRunRequestSchema, h as persistRunState, j as createFsCacheStore, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-DT6cje9E.mjs";
2
2
  import { z } from "zod/v4";
3
3
  import { readFile } from "node:fs/promises";
4
4
  import { relative } from "node:path";
@@ -827,7 +827,12 @@ const runLogEntrySchema = z.object({
827
827
  /** Trace payload captured while computing one score for a case. */
828
828
  const scoreTraceSchema = z.object({
829
829
  trace: z.array(traceSpanSchema),
830
- traceDisplay: traceDisplayConfigSchema
830
+ traceDisplay: traceDisplayConfigSchema,
831
+ /**
832
+ * Value-cache refs recorded by `evalTracer.cache(...)` calls made directly
833
+ * from the score compute body, with no surrounding scorer span.
834
+ */
835
+ cacheRefs: z.array(traceCacheRefSchema).default([])
831
836
  });
832
837
  /** Schema for the detailed payload shown when opening a specific case. */
833
838
  const caseDetailSchema = z.object({
@@ -7134,9 +7139,10 @@ async function runCase(params) {
7134
7139
  ...entry,
7135
7140
  source: key
7136
7141
  })));
7137
- if (trace.length > 0) scoringTraces[key] = {
7142
+ if (trace.length > 0 || scoreRun.scope.caseCacheRefs.length > 0) scoringTraces[key] = {
7138
7143
  trace,
7139
- traceDisplay
7144
+ traceDisplay,
7145
+ cacheRefs: scoreRun.scope.caseCacheRefs
7140
7146
  };
7141
7147
  const rawValue = scoreRun.result;
7142
7148
  if (scoreRun.error) {
@@ -1,2 +1,2 @@
1
- import { n as initRunner, t as getRunnerInstance } from "./runner-LdMiDmAN.mjs";
1
+ import { n as initRunner, t as getRunnerInstance } from "./runner-WRQdfG0r.mjs";
2
2
  export { getRunnerInstance, initRunner };
@@ -1,5 +1,5 @@
1
- import { n as createRunner } from "./cli-Cvs7tc2v.mjs";
2
- import "./src-Jahivm6d.mjs";
1
+ import { n as createRunner } from "./cli-DbVFgRO3.mjs";
2
+ import "./src-DlvYXPxG.mjs";
3
3
  //#region ../../apps/server/src/runner.ts
4
4
  let runnerInstance = null;
5
5
  function getRunnerInstance() {
@@ -1,5 +1,5 @@
1
- import { It as defineEval$1, rt as matchesEvalTags$1 } from "./runOrchestration-o38J7uZO.mjs";
2
- import "./cli-Cvs7tc2v.mjs";
1
+ import { It as defineEval$1, rt as matchesEvalTags$1 } from "./runOrchestration-DT6cje9E.mjs";
2
+ import "./cli-DbVFgRO3.mjs";
3
3
  //#region src/index.ts
4
4
  /** Register an eval definition with typed tag support. */
5
5
  function defineEval(definition) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ls-stack/agent-eval",
3
- "version": "0.52.2",
3
+ "version": "0.52.3",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "agent-evals": "./dist/bin.mjs"
@@ -303,7 +303,8 @@ or if the case errors. Scores without `passThreshold` are informational.
303
303
  Score functions run in their own trace scope, separate from the execution
304
304
  trace, so LLM-as-judge scorers can use `evalTracer.span(...)` and cached spans
305
305
  without polluting the agent trajectory. Outputs set inside a scorer stay
306
- private to that score.
306
+ private to that score. Spanless `evalTracer.cache(...)` calls made directly
307
+ inside a scorer are stored on that score trace's `cacheRefs` payload.
307
308
 
308
309
  `manualScores` declares score columns that reviewers fill in after a run.
309
310
  Pending values keep the eval in an `unscored` state instead of failing.
@@ -473,7 +474,8 @@ Mental model:
473
474
  span, that span gets a `cache.refs` entry with the value cache name, key,
474
475
  namespace, and hit/miss status. When called directly from the case body
475
476
  (no surrounding span), the ref is recorded on the case detail's `cacheRefs`
476
- array.
477
+ array. When called directly from a scorer, the ref is recorded on that
478
+ scoring trace's `cacheRefs` array.
477
479
  - Cache identity is the namespace plus the authored key. Source-file
478
480
  fingerprints are tracked for run freshness separately, but do not participate
479
481
  in cache-key hashing.