npm - @ls-stack/agent-eval - Versions diffs - 0.60.1 → 0.60.3 - Mend

@ls-stack/agent-eval 0.60.1 → 0.60.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

package/dist/{app-CxKEVlng.mjs → app-l3ynaNsb.mjs} +4 -4
package/dist/bin.mjs +1 -1
package/dist/caseChild.mjs +1 -1
package/dist/{cli-CVBSlTD8.mjs → cli-BSVUCUxr.mjs} +34 -8
package/dist/index.d.mts +110 -110
package/dist/index.mjs +3 -3
package/dist/runChild.mjs +2 -2
package/dist/{runExecution-CjWJUUZ5.mjs → runExecution-Bq0Y3y_1.mjs} +2 -2
package/dist/{runOrchestration-DE2TFAS6.mjs → runOrchestration-C7qQISz2.mjs} +41 -8
package/dist/{runner-Cu1CQPTB.mjs → runner-C9xNJHt3.mjs} +1 -1
package/dist/{runner-DzDRasWV.mjs → runner-DmkSq-QG.mjs} +2 -2
package/dist/{src-DjOTPnDz.mjs → src-D5vGo2iv.mjs} +2 -2
package/package.json +3 -3
package/skills/agent-eval/SKILL.md +2 -0

package/dist/index.mjs CHANGED Viewed

@@ -1,4 +1,4 @@
-import { $ as setScopeCacheContext, A as repoFile, B as evalTime, C as evalTracer, D as deserializeCacheValue, E as deserializeCacheRecording, F as EvalRuntimeUsageError, Ft as getEvalRegistry, H as getEvalCaseInput, I as appendToEvalOutput, J as runInEvalRuntimeScope, K as mergeEvalOutput, M as readManualInputFile, N as evalExpect, O as serializeCacheRecording, P as EvalAssertionError, Q as setEvalOutput, R as evalAssert, S as evalSpan, T as hashCacheKeySync, U as incrementEvalOutput, V as getCurrentScope, W as isInEvalScope, X as runInExistingEvalScope, Y as runInEvalScope, at as extractApiCalls, b as buildTraceTree, ct as simulateTokenAllocation, et as startEvalBackgroundJob, it as extractCacheHits, j as manualInputFileValueSchema, k as serializeCacheValue, ot as extractLlmCalls, q as nextEvalId, rt as extractCacheEntries, st as simulateLlmCallCost, ut as getNestedAttribute, w as hashCacheKey, x as captureEvalSpanError, z as evalLog } from "./runExecution-CjWJUUZ5.mjs";
-import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-CVBSlTD8.mjs";
-import { n as matchesEvalTags, t as defineEval } from "./src-DjOTPnDz.mjs";
+import { $ as setScopeCacheContext, A as repoFile, B as evalTime, C as evalTracer, D as deserializeCacheValue, E as deserializeCacheRecording, F as EvalRuntimeUsageError, Ft as getEvalRegistry, H as getEvalCaseInput, I as appendToEvalOutput, J as runInEvalRuntimeScope, K as mergeEvalOutput, M as readManualInputFile, N as evalExpect, O as serializeCacheRecording, P as EvalAssertionError, Q as setEvalOutput, R as evalAssert, S as evalSpan, T as hashCacheKeySync, U as incrementEvalOutput, V as getCurrentScope, W as isInEvalScope, X as runInExistingEvalScope, Y as runInEvalScope, at as extractApiCalls, b as buildTraceTree, ct as simulateTokenAllocation, et as startEvalBackgroundJob, it as extractCacheHits, j as manualInputFileValueSchema, k as serializeCacheValue, ot as extractLlmCalls, q as nextEvalId, rt as extractCacheEntries, st as simulateLlmCallCost, ut as getNestedAttribute, w as hashCacheKey, x as captureEvalSpanError, z as evalLog } from "./runExecution-Bq0Y3y_1.mjs";
+import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-BSVUCUxr.mjs";
+import { n as matchesEvalTags, t as defineEval } from "./src-D5vGo2iv.mjs";
 export { EvalAssertionError, EvalRuntimeUsageError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob };

package/dist/runChild.mjs CHANGED Viewed

@@ -1,5 +1,5 @@
-import { At as evalStatsConfigSchema, L as configureEvalRunLogs, Mt as evalChartsConfigSchema, Nt as columnDefSchema, Tt as buildEvalKey, bt as runManifestSchema, jt as manualInputDescriptorSchema, kt as evalStatAggregateSchema, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, tt as createRunRequestSchema, v as createFsCacheStore, xt as runSummarySchema, y as getCacheRetentionOptions } from "./runExecution-CjWJUUZ5.mjs";
-import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-DE2TFAS6.mjs";
+import { At as evalStatsConfigSchema, L as configureEvalRunLogs, Mt as evalChartsConfigSchema, Nt as columnDefSchema, Tt as buildEvalKey, bt as runManifestSchema, jt as manualInputDescriptorSchema, kt as evalStatAggregateSchema, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, tt as createRunRequestSchema, v as createFsCacheStore, xt as runSummarySchema, y as getCacheRetentionOptions } from "./runExecution-Bq0Y3y_1.mjs";
+import { C as parseEvalDiscovery, h as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-C7qQISz2.mjs";
 import { z } from "zod";
 import { readFile } from "node:fs/promises";
 import { relative } from "node:path";

package/dist/{runExecution-CjWJUUZ5.mjs → runExecution-Bq0Y3y_1.mjs} RENAMED Viewed

@@ -289,7 +289,7 @@ z.object({
 	key: z.string(),
 	namespace: z.string(),
 	storedAt: z.string(),
-	/** Last successful cache hit time. `null` means the entry has not been hit yet. */
+	/** Last successful cache read or write time. Legacy entries may be `null`. */
 	lastAccessedAt: z.string().nullable()
 });
 z.object({
@@ -5395,7 +5395,7 @@ function createFsCacheStore(options) {
 				const index = await readNamespaceIndex(cacheDir, entry.namespace);
 				index.entries[entry.key] = {
 					storedAt: entry.storedAt,
-					lastAccessedAt: null,
+					lastAccessedAt: entry.storedAt,
 					blobRefs: await collectExternalJsonBlobRefs(entry, blobDirs)
 				};
 				await writeNamespaceIndex(cacheDir, index);

package/dist/{runOrchestration-DE2TFAS6.mjs → runOrchestration-C7qQISz2.mjs} RENAMED Viewed

@@ -1,8 +1,8 @@
-import { Dt as caseDetailSchema, Et as getCaseRowCaseKey, It as runWithEvalRegistry, J as runInEvalRuntimeScope, Ot as caseRowSchema, Z as runWithEvalClock, _t as matchesTagsFilter, bt as runManifestSchema, d as loadEvalModule, f as resolveEvalDefaultConfig, g as commitPendingCacheWrites, gt as dedupeEvalTags, ht as deriveStatusFromChildStatuses, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromCaseRows, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveScopedSummaryFromCases, t as filterEvalCases, u as runWithModuleIsolation, vt as validateEvalTagName, wt as buildCaseKey, xt as runSummarySchema, yt as validateTagsFilterExpression } from "./runExecution-CjWJUUZ5.mjs";
+import { Dt as caseDetailSchema, Et as getCaseRowCaseKey, It as runWithEvalRegistry, J as runInEvalRuntimeScope, Ot as caseRowSchema, Z as runWithEvalClock, _t as matchesTagsFilter, bt as runManifestSchema, d as loadEvalModule, f as resolveEvalDefaultConfig, g as commitPendingCacheWrites, gt as dedupeEvalTags, ht as deriveStatusFromChildStatuses, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromCaseRows, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveScopedSummaryFromCases, t as filterEvalCases, u as runWithModuleIsolation, vt as validateEvalTagName, wt as buildCaseKey, xt as runSummarySchema, yt as validateTagsFilterExpression } from "./runExecution-Bq0Y3y_1.mjs";
 import { Result, resultify } from "t-result";
 import { readFile, readdir, rm, writeFile } from "node:fs/promises";
 import { dirname, join } from "node:path";
-import { existsSync } from "node:fs";
+import { existsSync, readFileSync } from "node:fs";
 import { fileURLToPath } from "node:url";
 import { spawn } from "node:child_process";
 //#region ../runner/src/chartValidation.ts
@@ -670,7 +670,7 @@ async function recomputeEvalStatusesInRuns(params) {
 		let changed = false;
 		for (const caseRow of run.cases) {
 			if (caseRow.evalKey !== params.evalKey) continue;
-			const caseDetail = run.caseDetails.get(getCaseRowCaseKey(caseRow));
+			const caseDetail = params.getCaseDetail?.(run, caseRow) ?? run.caseDetails.get(getCaseRowCaseKey(caseRow));
 			const nextStatus = recomputePersistedCaseStatus(caseRow, caseDetail, params.scoreThresholds);
 			if (caseRow.status === nextStatus) continue;
 			caseRow.status = nextStatus;
@@ -725,14 +725,22 @@ function nextShortIdFromSnapshots(snapshots) {
 	}
 	return maxNum + 1;
 }
-async function loadPersistedRunSnapshots(localStateDir) {
+/**
+* Load persisted run metadata from the local state directory.
+*
+* Case details are skipped by default so long-running app processes can keep
+* run history in memory without retaining every trace payload. Pass
+* `includeCaseDetails` only for narrow maintenance flows that need full
+* details for every case.
+*/
+async function loadPersistedRunSnapshots(localStateDir, options = {}) {
 	const runsDir = join(localStateDir, "runs");
 	const entriesResult = await resultify(() => readdir(runsDir, { withFileTypes: true }));
 	if (entriesResult.error) return [];
 	const snapshots = [];
 	const runDirs = entriesResult.value.filter((entry) => entry.isDirectory()).map((entry) => join(runsDir, entry.name)).toSorted();
 	for (const runDir of runDirs) {
-		const snapshot = await loadPersistedRunSnapshot(runDir);
+		const snapshot = await loadPersistedRunSnapshot(runDir, options);
 		if (!snapshot) continue;
 		snapshots.push(snapshot);
 	}
@@ -766,7 +774,14 @@ function getLatestRunInfos(params) {
 function toLastRunStatus$1(status) {
 	return status === "pending" ? null : status;
 }
-async function loadPersistedRunSnapshot(runDir) {
+/**
+* Load one persisted run snapshot from disk.
+*
+* The returned snapshot includes manifest, summary, and case rows. Case
+* details are loaded only when `includeCaseDetails` is true; otherwise callers
+* should use `loadPersistedCaseDetail` for the specific case being inspected.
+*/
+async function loadPersistedRunSnapshot(runDir, options = {}) {
 	const manifest = await readParsedJsonFile(join(runDir, "run.json"), { safeParse: runManifestSchema.safeParse.bind(runManifestSchema) });
 	if (!manifest) return null;
 	const summary = await readParsedJsonFile(join(runDir, "summary.json"), { safeParse: runSummarySchema.safeParse.bind(runSummarySchema) });
@@ -776,9 +791,18 @@ async function loadPersistedRunSnapshot(runDir) {
 		manifest,
 		summary,
 		cases: await readCaseRows(runDir),
-		caseDetails: await readCaseDetails(runDir)
+		caseDetails: options.includeCaseDetails === true ? await readCaseDetails(runDir) : /* @__PURE__ */ new Map()
 	};
 }
+/**
+* Load one persisted case detail by its artifact file id.
+*
+* Returns `null` when the file is missing, invalid JSON, or no longer matches
+* the current case-detail schema.
+*/
+function loadPersistedCaseDetail(runDir, fileId) {
+	return readParsedJsonFileSync(join(runDir, "case-details", `${encodeCaseDetailFileName(fileId)}.json`), { safeParse: caseDetailSchema.safeParse.bind(caseDetailSchema) });
+}
 async function readParsedJsonFile(filePath, schema) {
 	const fileResult = await resultify(() => readFile(filePath, "utf-8"));
 	if (fileResult.error) return null;
@@ -788,6 +812,15 @@ async function readParsedJsonFile(filePath, schema) {
 	if (!parsed.success) return null;
 	return parsed.data;
 }
+function readParsedJsonFileSync(filePath, schema) {
+	const fileResult = resultify(() => readFileSync(filePath, "utf-8"));
+	if (fileResult.error) return null;
+	const jsonResult = resultify(() => JSON.parse(fileResult.value));
+	if (jsonResult.error) return null;
+	const parsed = schema.safeParse(jsonResult.value);
+	if (!parsed.success) return null;
+	return parsed.data;
+}
 async function readCaseRows(runDir) {
 	const fileResult = await resultify(() => readFile(join(runDir, "cases.jsonl"), "utf-8"));
 	if (fileResult.error) return [];
@@ -1660,4 +1693,4 @@ function toLastRunStatus(status) {
 	return status === "pending" ? null : status;
 }
 //#endregion
-export { validateCharts as C, parseEvalDiscovery as S, runTouchesEval as _, validateTagsFilters as a, deriveEvalFreshness as b, getLatestRunInfos as c, nextShortIdFromSnapshots as d, persistCaseDetail as f, recomputePersistedCaseStatus as g, recomputeEvalStatusesInRuns as h, resolveEvalTags as i, loadPersistedRunSnapshot as l, persistRunState as m, getTargetEvalKeys as n, generateRunId as o, deleteTemporaryRuns as p, getTargetEvals as r, getLastRunStatuses as s, executeRun as t, loadPersistedRunSnapshots as u, buildManualInputDescriptor as v, loadIsolatedEvalRegistry as x, parseManualInputValues as y };
+export { parseEvalDiscovery as C, loadIsolatedEvalRegistry as S, recomputePersistedCaseStatus as _, validateTagsFilters as a, parseManualInputValues as b, getLatestRunInfos as c, loadPersistedRunSnapshots as d, nextShortIdFromSnapshots as f, recomputeEvalStatusesInRuns as g, persistRunState as h, resolveEvalTags as i, loadPersistedCaseDetail as l, deleteTemporaryRuns as m, getTargetEvalKeys as n, generateRunId as o, persistCaseDetail as p, getTargetEvals as r, getLastRunStatuses as s, executeRun as t, loadPersistedRunSnapshot as u, runTouchesEval as v, validateCharts as w, deriveEvalFreshness as x, buildManualInputDescriptor as y };

package/dist/{runner-Cu1CQPTB.mjs → runner-C9xNJHt3.mjs} RENAMED Viewed

@@ -1,2 +1,2 @@
-import { n as initRunner, t as getRunnerInstance } from "./runner-DzDRasWV.mjs";
+import { n as initRunner, t as getRunnerInstance } from "./runner-DmkSq-QG.mjs";
 export { getRunnerInstance, initRunner };

package/dist/{runner-DzDRasWV.mjs → runner-DmkSq-QG.mjs} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { n as createRunner } from "./cli-CVBSlTD8.mjs";
-import "./src-DjOTPnDz.mjs";
+import { n as createRunner } from "./cli-BSVUCUxr.mjs";
+import "./src-D5vGo2iv.mjs";
 //#region ../../apps/server/src/runner.ts
 let runnerInstance = null;
 function getRunnerInstance({ loadEnv = true } = {}) {

package/dist/{src-DjOTPnDz.mjs → src-D5vGo2iv.mjs} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { G as matchesEvalTags$1, Pt as defineEval$1 } from "./runExecution-CjWJUUZ5.mjs";
-import "./cli-CVBSlTD8.mjs";
+import { G as matchesEvalTags$1, Pt as defineEval$1 } from "./runExecution-Bq0Y3y_1.mjs";
+import "./cli-BSVUCUxr.mjs";
 //#region src/index.ts
 /** Register an eval definition with typed tag support. */
 function defineEval(definition) {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@ls-stack/agent-eval",
-  "version": "0.60.1",
+  "version": "0.60.3",
   "type": "module",
   "bin": {
     "agent-evals": "./dist/bin.mjs"
@@ -32,9 +32,9 @@
   "devDependencies": {
     "@types/node": "^24.7.2",
     "typescript": "^5.9.2",
-    "@agent-evals/sdk": "0.0.1",
+    "@agent-evals/runner": "0.0.1",
     "@agent-evals/shared": "0.0.1",
-    "@agent-evals/runner": "0.0.1"
+    "@agent-evals/sdk": "0.0.1"
   },
   "scripts": {
     "build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",

package/skills/agent-eval/SKILL.md CHANGED Viewed

@@ -568,6 +568,8 @@ Mental model:
   runner stays idle for `cache.pruneIdleDelayMs ?? 5000` milliseconds. Configure
   `cache.maxEntries` as a number for the default cap, or as
   `{ default, namespaces }` for exact namespace-specific caps.
+  Writes initialize the row's last access time to the stored time; later cache
+  hits refresh that timestamp at the configured access-time update interval.
 - Unindexed legacy cache files are ignored by normal lookup/listing. Use
   `agent-evals cache repair` to remove unindexed cache files, stale index rows,
   debug sidecars, and unreferenced blob files.