npm - @ls-stack/agent-eval - Versions diffs - 0.52.2 → 0.52.3 - Mend

@ls-stack/agent-eval 0.52.2 → 0.52.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

package/dist/{app-CzLj4ZX0.mjs → app-BsFcUIQp.mjs} +4 -4
package/dist/apps/web/dist/assets/{index-gJHi1MdH.js → index-D9HUAH8K.js} +74 -74
package/dist/apps/web/dist/index.html +1 -1
package/dist/bin.mjs +1 -1
package/dist/{cli-Cvs7tc2v.mjs → cli-DbVFgRO3.mjs} +3 -3
package/dist/index.d.mts +48 -0
package/dist/index.mjs +3 -3
package/dist/runChild.mjs +1 -1
package/dist/{runOrchestration-o38J7uZO.mjs → runOrchestration-DT6cje9E.mjs} +9 -3
package/dist/{runner-iWtmKx9z.mjs → runner-CyTUvbHE.mjs} +1 -1
package/dist/{runner-LdMiDmAN.mjs → runner-WRQdfG0r.mjs} +2 -2
package/dist/{src-Jahivm6d.mjs → src-DlvYXPxG.mjs} +2 -2
package/package.json +1 -1
package/skills/agent-eval/SKILL.md +4 -2

package/dist/apps/web/dist/index.html CHANGED Viewed

@@ -25,7 +25,7 @@
       href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
       rel="stylesheet"
     />
-    <script type="module" crossorigin src="/assets/index-gJHi1MdH.js"></script>
+    <script type="module" crossorigin src="/assets/index-D9HUAH8K.js"></script>
     <link rel="stylesheet" crossorigin href="/assets/index-BHc4gfUO.css">
   </head>
   <body>

package/dist/bin.mjs CHANGED Viewed

@@ -1,5 +1,5 @@
 #!/usr/bin/env node
-import { t as runCli } from "./cli-Cvs7tc2v.mjs";
+import { t as runCli } from "./cli-DbVFgRO3.mjs";
 import { spawn } from "node:child_process";
 //#region src/bin.ts
 const moduleMocksFlag = "--experimental-test-module-mocks";

package/dist/{cli-Cvs7tc2v.mjs → cli-DbVFgRO3.mjs} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { A as validateCharts, At as buildEvalKey, C as deriveEvalFreshness, Ct as getEvalDisplayStatus, D as loadConfig, Dt as runSummarySchema, E as resolveEvalDefaultConfig, Lt as getEvalRegistry, O as buildDeclaredColumnDefs, Ot as resolveApiCallsConfig, S as parseManualInputValues, St as getEvalTitle, T as parseEvalDiscovery, Tt as matchesTagsFilter, _ as recomputePersistedCaseStatus, a as validateTagsFilters, b as resolveArtifactPath, bt as applyDerivedCallAttributes, c as getLastRunStatuses, d as loadPersistedRunSnapshots, f as nextShortIdFromSnapshots, g as recomputeEvalStatusesInRuns, h as persistRunState, i as resolveEvalTags, j as createFsCacheStore, jt as getCaseRowCaseKey, k as normalizeScoreDef, kt as resolveLlmCallsConfig, l as getLatestRunInfos, m as deleteTemporaryRuns, n as getTargetEvalKeys, o as stripTerminalControlCodes, p as persistCaseDetail, s as generateRunId, u as loadPersistedRunSnapshot, v as runTouchesEval, w as loadEvalModule, wt as deriveScopedSummaryFromCases, x as buildManualInputDescriptor, y as resolveTracePresentation } from "./runOrchestration-o38J7uZO.mjs";
+import { A as validateCharts, At as buildEvalKey, C as deriveEvalFreshness, Ct as getEvalDisplayStatus, D as loadConfig, Dt as runSummarySchema, E as resolveEvalDefaultConfig, Lt as getEvalRegistry, O as buildDeclaredColumnDefs, Ot as resolveApiCallsConfig, S as parseManualInputValues, St as getEvalTitle, T as parseEvalDiscovery, Tt as matchesTagsFilter, _ as recomputePersistedCaseStatus, a as validateTagsFilters, b as resolveArtifactPath, bt as applyDerivedCallAttributes, c as getLastRunStatuses, d as loadPersistedRunSnapshots, f as nextShortIdFromSnapshots, g as recomputeEvalStatusesInRuns, h as persistRunState, i as resolveEvalTags, j as createFsCacheStore, jt as getCaseRowCaseKey, k as normalizeScoreDef, kt as resolveLlmCallsConfig, l as getLatestRunInfos, m as deleteTemporaryRuns, n as getTargetEvalKeys, o as stripTerminalControlCodes, p as persistCaseDetail, s as generateRunId, u as loadPersistedRunSnapshot, v as runTouchesEval, w as loadEvalModule, wt as deriveScopedSummaryFromCases, x as buildManualInputDescriptor, y as resolveTracePresentation } from "./runOrchestration-DT6cje9E.mjs";
 import { copyFile, mkdir, readFile, rm, writeFile } from "node:fs/promises";
 import { basename, dirname, extname, isAbsolute, join, relative, resolve, sep } from "node:path";
 import { createHash, randomUUID } from "node:crypto";
@@ -2095,8 +2095,8 @@ async function commandApp(args) {
 	const { serve } = await import("@hono/node-server");
 	const bundledWebDist = resolve(currentDir, "apps/web/dist");
 	if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
-	const appModule = await import("./app-CzLj4ZX0.mjs");
-	const runnerModule = await import("./runner-iWtmKx9z.mjs");
+	const appModule = await import("./app-BsFcUIQp.mjs");
+	const runnerModule = await import("./runner-CyTUvbHE.mjs");
 	if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
 	if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
 	await runnerModule.initRunner();

package/dist/index.d.mts CHANGED Viewed

@@ -2568,6 +2568,22 @@ declare const scoreTraceSchema: z$1.ZodObject<{
       }>>;
     }, z$1.core.$strip>>>;
   }, z$1.core.$strip>;
+  cacheRefs: z$1.ZodDefault<z$1.ZodArray<z$1.ZodObject<{
+    type: z$1.ZodLiteral<"value">;
+    name: z$1.ZodString;
+    namespace: z$1.ZodString;
+    key: z$1.ZodString;
+    status: z$1.ZodEnum<{
+      hit: "hit";
+      miss: "miss";
+      refresh: "refresh";
+      bypass: "bypass";
+    }>;
+    read: z$1.ZodOptional<z$1.ZodBoolean>;
+    stored: z$1.ZodOptional<z$1.ZodBoolean>;
+    storedAt: z$1.ZodOptional<z$1.ZodString>;
+    age: z$1.ZodOptional<z$1.ZodNumber>;
+  }, z$1.core.$strip>>>;
 }, z$1.core.$strip>;
 /** Trace payload captured while computing one score for a case. */
 type ScoreTrace = z$1.infer<typeof scoreTraceSchema>;
@@ -2724,6 +2740,22 @@ declare const caseDetailSchema$1: z$1.ZodObject<{
         }>>;
       }, z$1.core.$strip>>>;
     }, z$1.core.$strip>;
+    cacheRefs: z$1.ZodDefault<z$1.ZodArray<z$1.ZodObject<{
+      type: z$1.ZodLiteral<"value">;
+      name: z$1.ZodString;
+      namespace: z$1.ZodString;
+      key: z$1.ZodString;
+      status: z$1.ZodEnum<{
+        hit: "hit";
+        miss: "miss";
+        refresh: "refresh";
+        bypass: "bypass";
+      }>;
+      read: z$1.ZodOptional<z$1.ZodBoolean>;
+      stored: z$1.ZodOptional<z$1.ZodBoolean>;
+      storedAt: z$1.ZodOptional<z$1.ZodString>;
+      age: z$1.ZodOptional<z$1.ZodNumber>;
+    }, z$1.core.$strip>>>;
   }, z$1.core.$strip>>>;
   columns: z$1.ZodRecord<z$1.ZodString, z$1.ZodUnion<readonly [z$1.ZodType<string | number | boolean | Record<string, unknown> | unknown[] | null, unknown, z$1.core.$ZodTypeInternals<string | number | boolean | Record<string, unknown> | unknown[] | null, unknown>>, z$1.ZodUnion<readonly [z$1.ZodObject<{
     source: z$1.ZodLiteral<"repo">;
@@ -5566,6 +5598,22 @@ declare const caseDetailSchema: z$1.ZodObject<{
         }>>;
       }, z$1.core.$strip>>>;
     }, z$1.core.$strip>;
+    cacheRefs: z$1.ZodDefault<z$1.ZodArray<z$1.ZodObject<{
+      type: z$1.ZodLiteral<"value">;
+      name: z$1.ZodString;
+      namespace: z$1.ZodString;
+      key: z$1.ZodString;
+      status: z$1.ZodEnum<{
+        hit: "hit";
+        miss: "miss";
+        refresh: "refresh";
+        bypass: "bypass";
+      }>;
+      read: z$1.ZodOptional<z$1.ZodBoolean>;
+      stored: z$1.ZodOptional<z$1.ZodBoolean>;
+      storedAt: z$1.ZodOptional<z$1.ZodString>;
+      age: z$1.ZodOptional<z$1.ZodNumber>;
+    }, z$1.core.$strip>>>;
   }, z$1.core.$strip>>>;
   columns: z$1.ZodRecord<z$1.ZodString, z$1.ZodUnion<readonly [z$1.ZodType<string | number | boolean | Record<string, unknown> | unknown[] | null, unknown, z$1.core.$ZodTypeInternals<string | number | boolean | Record<string, unknown> | unknown[] | null, unknown>>, z$1.ZodUnion<readonly [z$1.ZodObject<{
     source: z$1.ZodLiteral<"repo">;

package/dist/index.mjs CHANGED Viewed

@@ -1,4 +1,4 @@
-import { $ as getCurrentScope, B as deserializeCacheValue, F as evalSpan, G as readManualInputFile, H as serializeCacheValue, I as evalTracer, J as appendToEvalOutput, K as evalExpect, L as hashCacheKey, Lt as getEvalRegistry, M as z, N as buildTraceTree, P as captureEvalSpanError, Q as evalTime, R as hashCacheKeySync, U as repoFile, V as serializeCacheRecording, W as manualInputFileValueSchema, X as evalAssert, Z as evalLog, _t as extractLlmCalls, at as nextEvalId, ct as runInExistingEvalScope, dt as startEvalBackgroundJob, et as getEvalCaseInput, gt as extractApiCalls, ht as extractCacheHits, it as mergeEvalOutput, lt as setEvalOutput, mt as extractCacheEntries, nt as isInEvalScope, ot as runInEvalRuntimeScope, q as EvalAssertionError, st as runInEvalScope, tt as incrementEvalOutput, ut as setScopeCacheContext, vt as simulateLlmCallCost, xt as getNestedAttribute, yt as simulateTokenAllocation, z as deserializeCacheRecording } from "./runOrchestration-o38J7uZO.mjs";
-import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-Cvs7tc2v.mjs";
-import { n as matchesEvalTags, t as defineEval } from "./src-Jahivm6d.mjs";
+import { $ as getCurrentScope, B as deserializeCacheValue, F as evalSpan, G as readManualInputFile, H as serializeCacheValue, I as evalTracer, J as appendToEvalOutput, K as evalExpect, L as hashCacheKey, Lt as getEvalRegistry, M as z, N as buildTraceTree, P as captureEvalSpanError, Q as evalTime, R as hashCacheKeySync, U as repoFile, V as serializeCacheRecording, W as manualInputFileValueSchema, X as evalAssert, Z as evalLog, _t as extractLlmCalls, at as nextEvalId, ct as runInExistingEvalScope, dt as startEvalBackgroundJob, et as getEvalCaseInput, gt as extractApiCalls, ht as extractCacheHits, it as mergeEvalOutput, lt as setEvalOutput, mt as extractCacheEntries, nt as isInEvalScope, ot as runInEvalRuntimeScope, q as EvalAssertionError, st as runInEvalScope, tt as incrementEvalOutput, ut as setScopeCacheContext, vt as simulateLlmCallCost, xt as getNestedAttribute, yt as simulateTokenAllocation, z as deserializeCacheRecording } from "./runOrchestration-DT6cje9E.mjs";
+import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-DbVFgRO3.mjs";
+import { n as matchesEvalTags, t as defineEval } from "./src-DlvYXPxG.mjs";
 export { EvalAssertionError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };

package/dist/runChild.mjs CHANGED Viewed

@@ -1,4 +1,4 @@
-import { At as buildEvalKey, D as loadConfig, Dt as runSummarySchema, Et as runManifestSchema, Ft as columnDefSchema, Mt as evalStatsConfigSchema, Nt as manualInputDescriptorSchema, Pt as evalChartsConfigSchema, T as parseEvalDiscovery, Y as configureEvalRunLogs, ft as createRunRequestSchema, h as persistRunState, j as createFsCacheStore, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-o38J7uZO.mjs";
+import { At as buildEvalKey, D as loadConfig, Dt as runSummarySchema, Et as runManifestSchema, Ft as columnDefSchema, Mt as evalStatsConfigSchema, Nt as manualInputDescriptorSchema, Pt as evalChartsConfigSchema, T as parseEvalDiscovery, Y as configureEvalRunLogs, ft as createRunRequestSchema, h as persistRunState, j as createFsCacheStore, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-DT6cje9E.mjs";
 import { z } from "zod/v4";
 import { readFile } from "node:fs/promises";
 import { relative } from "node:path";

package/dist/{runOrchestration-o38J7uZO.mjs → runOrchestration-DT6cje9E.mjs} RENAMED Viewed

@@ -827,7 +827,12 @@ const runLogEntrySchema = z.object({
 /** Trace payload captured while computing one score for a case. */
 const scoreTraceSchema = z.object({
 	trace: z.array(traceSpanSchema),
-	traceDisplay: traceDisplayConfigSchema
+	traceDisplay: traceDisplayConfigSchema,
+	/**
+	* Value-cache refs recorded by `evalTracer.cache(...)` calls made directly
+	* from the score compute body, with no surrounding scorer span.
+	*/
+	cacheRefs: z.array(traceCacheRefSchema).default([])
 });
 /** Schema for the detailed payload shown when opening a specific case. */
 const caseDetailSchema = z.object({
@@ -7134,9 +7139,10 @@ async function runCase(params) {
 			...entry,
 			source: key
 		})));
-		if (trace.length > 0) scoringTraces[key] = {
+		if (trace.length > 0 || scoreRun.scope.caseCacheRefs.length > 0) scoringTraces[key] = {
 			trace,
-			traceDisplay
+			traceDisplay,
+			cacheRefs: scoreRun.scope.caseCacheRefs
 		};
 		const rawValue = scoreRun.result;
 		if (scoreRun.error) {

package/dist/{runner-iWtmKx9z.mjs → runner-CyTUvbHE.mjs} RENAMED Viewed

@@ -1,2 +1,2 @@
-import { n as initRunner, t as getRunnerInstance } from "./runner-LdMiDmAN.mjs";
+import { n as initRunner, t as getRunnerInstance } from "./runner-WRQdfG0r.mjs";
 export { getRunnerInstance, initRunner };

package/dist/{runner-LdMiDmAN.mjs → runner-WRQdfG0r.mjs} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { n as createRunner } from "./cli-Cvs7tc2v.mjs";
-import "./src-Jahivm6d.mjs";
+import { n as createRunner } from "./cli-DbVFgRO3.mjs";
+import "./src-DlvYXPxG.mjs";
 //#region ../../apps/server/src/runner.ts
 let runnerInstance = null;
 function getRunnerInstance() {

package/dist/{src-Jahivm6d.mjs → src-DlvYXPxG.mjs} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { It as defineEval$1, rt as matchesEvalTags$1 } from "./runOrchestration-o38J7uZO.mjs";
-import "./cli-Cvs7tc2v.mjs";
+import { It as defineEval$1, rt as matchesEvalTags$1 } from "./runOrchestration-DT6cje9E.mjs";
+import "./cli-DbVFgRO3.mjs";
 //#region src/index.ts
 /** Register an eval definition with typed tag support. */
 function defineEval(definition) {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@ls-stack/agent-eval",
-  "version": "0.52.2",
+  "version": "0.52.3",
   "type": "module",
   "bin": {
     "agent-evals": "./dist/bin.mjs"

package/skills/agent-eval/SKILL.md CHANGED Viewed

@@ -303,7 +303,8 @@ or if the case errors. Scores without `passThreshold` are informational.
 Score functions run in their own trace scope, separate from the execution
 trace, so LLM-as-judge scorers can use `evalTracer.span(...)` and cached spans
 without polluting the agent trajectory. Outputs set inside a scorer stay
-private to that score.
+private to that score. Spanless `evalTracer.cache(...)` calls made directly
+inside a scorer are stored on that score trace's `cacheRefs` payload.
 `manualScores` declares score columns that reviewers fill in after a run.
 Pending values keep the eval in an `unscored` state instead of failing.
@@ -473,7 +474,8 @@ Mental model:
   span, that span gets a `cache.refs` entry with the value cache name, key,
   namespace, and hit/miss status. When called directly from the case body
   (no surrounding span), the ref is recorded on the case detail's `cacheRefs`
-  array.
+  array. When called directly from a scorer, the ref is recorded on that
+  scoring trace's `cacheRefs` array.
 - Cache identity is the namespace plus the authored key. Source-file
   fingerprints are tracked for run freshness separately, but do not participate
   in cache-key hashing.