@ls-stack/agent-eval 0.60.1 → 0.60.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { $ as setScopeCacheContext, A as repoFile, B as evalTime, C as evalTracer, D as deserializeCacheValue, E as deserializeCacheRecording, F as EvalRuntimeUsageError, Ft as getEvalRegistry, H as getEvalCaseInput, I as appendToEvalOutput, J as runInEvalRuntimeScope, K as mergeEvalOutput, M as readManualInputFile, N as evalExpect, O as serializeCacheRecording, P as EvalAssertionError, Q as setEvalOutput, R as evalAssert, S as evalSpan, T as hashCacheKeySync, U as incrementEvalOutput, V as getCurrentScope, W as isInEvalScope, X as runInExistingEvalScope, Y as runInEvalScope, at as extractApiCalls, b as buildTraceTree, ct as simulateTokenAllocation, et as startEvalBackgroundJob, it as extractCacheHits, j as manualInputFileValueSchema, k as serializeCacheValue, ot as extractLlmCalls, q as nextEvalId, rt as extractCacheEntries, st as simulateLlmCallCost, ut as getNestedAttribute, w as hashCacheKey, x as captureEvalSpanError, z as evalLog } from "./runExecution-CjWJUUZ5.mjs";
2
- import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-CVBSlTD8.mjs";
3
- import { n as matchesEvalTags, t as defineEval } from "./src-DjOTPnDz.mjs";
1
+ import { $ as setScopeCacheContext, A as repoFile, B as evalTime, C as evalTracer, D as deserializeCacheValue, E as deserializeCacheRecording, F as EvalRuntimeUsageError, Ft as getEvalRegistry, H as getEvalCaseInput, I as appendToEvalOutput, J as runInEvalRuntimeScope, K as mergeEvalOutput, M as readManualInputFile, N as evalExpect, O as serializeCacheRecording, P as EvalAssertionError, Q as setEvalOutput, R as evalAssert, S as evalSpan, T as hashCacheKeySync, U as incrementEvalOutput, V as getCurrentScope, W as isInEvalScope, X as runInExistingEvalScope, Y as runInEvalScope, at as extractApiCalls, b as buildTraceTree, ct as simulateTokenAllocation, et as startEvalBackgroundJob, it as extractCacheHits, j as manualInputFileValueSchema, k as serializeCacheValue, ot as extractLlmCalls, q as nextEvalId, rt as extractCacheEntries, st as simulateLlmCallCost, ut as getNestedAttribute, w as hashCacheKey, x as captureEvalSpanError, z as evalLog } from "./runExecution-Bq0Y3y_1.mjs";
2
+ import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-BSVUCUxr.mjs";
3
+ import { n as matchesEvalTags, t as defineEval } from "./src-D5vGo2iv.mjs";
4
4
  export { EvalAssertionError, EvalRuntimeUsageError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob };
package/dist/runChild.mjs CHANGED
@@ -1,5 +1,5 @@
1
- import { At as evalStatsConfigSchema, L as configureEvalRunLogs, Mt as evalChartsConfigSchema, Nt as columnDefSchema, Tt as buildEvalKey, bt as runManifestSchema, jt as manualInputDescriptorSchema, kt as evalStatAggregateSchema, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, tt as createRunRequestSchema, v as createFsCacheStore, xt as runSummarySchema, y as getCacheRetentionOptions } from "./runExecution-CjWJUUZ5.mjs";
2
- import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-DE2TFAS6.mjs";
1
+ import { At as evalStatsConfigSchema, L as configureEvalRunLogs, Mt as evalChartsConfigSchema, Nt as columnDefSchema, Tt as buildEvalKey, bt as runManifestSchema, jt as manualInputDescriptorSchema, kt as evalStatAggregateSchema, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, tt as createRunRequestSchema, v as createFsCacheStore, xt as runSummarySchema, y as getCacheRetentionOptions } from "./runExecution-Bq0Y3y_1.mjs";
2
+ import { C as parseEvalDiscovery, h as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-C7qQISz2.mjs";
3
3
  import { z } from "zod";
4
4
  import { readFile } from "node:fs/promises";
5
5
  import { relative } from "node:path";
@@ -289,7 +289,7 @@ z.object({
289
289
  key: z.string(),
290
290
  namespace: z.string(),
291
291
  storedAt: z.string(),
292
- /** Last successful cache hit time. `null` means the entry has not been hit yet. */
292
+ /** Last successful cache read or write time. Legacy entries may be `null`. */
293
293
  lastAccessedAt: z.string().nullable()
294
294
  });
295
295
  z.object({
@@ -5395,7 +5395,7 @@ function createFsCacheStore(options) {
5395
5395
  const index = await readNamespaceIndex(cacheDir, entry.namespace);
5396
5396
  index.entries[entry.key] = {
5397
5397
  storedAt: entry.storedAt,
5398
- lastAccessedAt: null,
5398
+ lastAccessedAt: entry.storedAt,
5399
5399
  blobRefs: await collectExternalJsonBlobRefs(entry, blobDirs)
5400
5400
  };
5401
5401
  await writeNamespaceIndex(cacheDir, index);
@@ -1,8 +1,8 @@
1
- import { Dt as caseDetailSchema, Et as getCaseRowCaseKey, It as runWithEvalRegistry, J as runInEvalRuntimeScope, Ot as caseRowSchema, Z as runWithEvalClock, _t as matchesTagsFilter, bt as runManifestSchema, d as loadEvalModule, f as resolveEvalDefaultConfig, g as commitPendingCacheWrites, gt as dedupeEvalTags, ht as deriveStatusFromChildStatuses, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromCaseRows, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveScopedSummaryFromCases, t as filterEvalCases, u as runWithModuleIsolation, vt as validateEvalTagName, wt as buildCaseKey, xt as runSummarySchema, yt as validateTagsFilterExpression } from "./runExecution-CjWJUUZ5.mjs";
1
+ import { Dt as caseDetailSchema, Et as getCaseRowCaseKey, It as runWithEvalRegistry, J as runInEvalRuntimeScope, Ot as caseRowSchema, Z as runWithEvalClock, _t as matchesTagsFilter, bt as runManifestSchema, d as loadEvalModule, f as resolveEvalDefaultConfig, g as commitPendingCacheWrites, gt as dedupeEvalTags, ht as deriveStatusFromChildStatuses, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromCaseRows, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveScopedSummaryFromCases, t as filterEvalCases, u as runWithModuleIsolation, vt as validateEvalTagName, wt as buildCaseKey, xt as runSummarySchema, yt as validateTagsFilterExpression } from "./runExecution-Bq0Y3y_1.mjs";
2
2
  import { Result, resultify } from "t-result";
3
3
  import { readFile, readdir, rm, writeFile } from "node:fs/promises";
4
4
  import { dirname, join } from "node:path";
5
- import { existsSync } from "node:fs";
5
+ import { existsSync, readFileSync } from "node:fs";
6
6
  import { fileURLToPath } from "node:url";
7
7
  import { spawn } from "node:child_process";
8
8
  //#region ../runner/src/chartValidation.ts
@@ -670,7 +670,7 @@ async function recomputeEvalStatusesInRuns(params) {
670
670
  let changed = false;
671
671
  for (const caseRow of run.cases) {
672
672
  if (caseRow.evalKey !== params.evalKey) continue;
673
- const caseDetail = run.caseDetails.get(getCaseRowCaseKey(caseRow));
673
+ const caseDetail = params.getCaseDetail?.(run, caseRow) ?? run.caseDetails.get(getCaseRowCaseKey(caseRow));
674
674
  const nextStatus = recomputePersistedCaseStatus(caseRow, caseDetail, params.scoreThresholds);
675
675
  if (caseRow.status === nextStatus) continue;
676
676
  caseRow.status = nextStatus;
@@ -725,14 +725,22 @@ function nextShortIdFromSnapshots(snapshots) {
725
725
  }
726
726
  return maxNum + 1;
727
727
  }
728
- async function loadPersistedRunSnapshots(localStateDir) {
728
+ /**
729
+ * Load persisted run metadata from the local state directory.
730
+ *
731
+ * Case details are skipped by default so long-running app processes can keep
732
+ * run history in memory without retaining every trace payload. Pass
733
+ * `includeCaseDetails` only for narrow maintenance flows that need full
734
+ * details for every case.
735
+ */
736
+ async function loadPersistedRunSnapshots(localStateDir, options = {}) {
729
737
  const runsDir = join(localStateDir, "runs");
730
738
  const entriesResult = await resultify(() => readdir(runsDir, { withFileTypes: true }));
731
739
  if (entriesResult.error) return [];
732
740
  const snapshots = [];
733
741
  const runDirs = entriesResult.value.filter((entry) => entry.isDirectory()).map((entry) => join(runsDir, entry.name)).toSorted();
734
742
  for (const runDir of runDirs) {
735
- const snapshot = await loadPersistedRunSnapshot(runDir);
743
+ const snapshot = await loadPersistedRunSnapshot(runDir, options);
736
744
  if (!snapshot) continue;
737
745
  snapshots.push(snapshot);
738
746
  }
@@ -766,7 +774,14 @@ function getLatestRunInfos(params) {
766
774
  function toLastRunStatus$1(status) {
767
775
  return status === "pending" ? null : status;
768
776
  }
769
- async function loadPersistedRunSnapshot(runDir) {
777
+ /**
778
+ * Load one persisted run snapshot from disk.
779
+ *
780
+ * The returned snapshot includes manifest, summary, and case rows. Case
781
+ * details are loaded only when `includeCaseDetails` is true; otherwise callers
782
+ * should use `loadPersistedCaseDetail` for the specific case being inspected.
783
+ */
784
+ async function loadPersistedRunSnapshot(runDir, options = {}) {
770
785
  const manifest = await readParsedJsonFile(join(runDir, "run.json"), { safeParse: runManifestSchema.safeParse.bind(runManifestSchema) });
771
786
  if (!manifest) return null;
772
787
  const summary = await readParsedJsonFile(join(runDir, "summary.json"), { safeParse: runSummarySchema.safeParse.bind(runSummarySchema) });
@@ -776,9 +791,18 @@ async function loadPersistedRunSnapshot(runDir) {
776
791
  manifest,
777
792
  summary,
778
793
  cases: await readCaseRows(runDir),
779
- caseDetails: await readCaseDetails(runDir)
794
+ caseDetails: options.includeCaseDetails === true ? await readCaseDetails(runDir) : /* @__PURE__ */ new Map()
780
795
  };
781
796
  }
797
+ /**
798
+ * Synchronously load one persisted case detail by its artifact file id.
799
+ *
800
+ * Reads `case-details/<encodeCaseDetailFileName(fileId)>.json` under `runDir`.
801
+ * Returns `null` when the file cannot be read, is not valid JSON, or is rejected by `caseDetailSchema`.
802
+ */
803
+ function loadPersistedCaseDetail(runDir, fileId) {
804
+ return readParsedJsonFileSync(join(runDir, "case-details", `${encodeCaseDetailFileName(fileId)}.json`), { safeParse: caseDetailSchema.safeParse.bind(caseDetailSchema) });
805
+ }
782
806
  async function readParsedJsonFile(filePath, schema) {
783
807
  const fileResult = await resultify(() => readFile(filePath, "utf-8"));
784
808
  if (fileResult.error) return null;
@@ -788,6 +812,15 @@ async function readParsedJsonFile(filePath, schema) {
788
812
  if (!parsed.success) return null;
789
813
  return parsed.data;
790
814
  }
815
+ function readParsedJsonFileSync(filePath, schema) { // Sync pipeline: read UTF-8 text -> JSON.parse -> schema.safeParse; returns parsed.data, or null if any step fails.
816
+ const fileResult = resultify(() => readFileSync(filePath, "utf-8"));
817
+ if (fileResult.error) return null; // the read reported an error (presumably resultify captures the thrown error, e.g. a missing file)
818
+ const jsonResult = resultify(() => JSON.parse(fileResult.value));
819
+ if (jsonResult.error) return null; // file contents could not be parsed as JSON
820
+ const parsed = schema.safeParse(jsonResult.value);
821
+ if (!parsed.success) return null; // parsed JSON was rejected by the supplied schema
822
+ return parsed.data;
823
+ }
791
824
  async function readCaseRows(runDir) {
792
825
  const fileResult = await resultify(() => readFile(join(runDir, "cases.jsonl"), "utf-8"));
793
826
  if (fileResult.error) return [];
@@ -1660,4 +1693,4 @@ function toLastRunStatus(status) {
1660
1693
  return status === "pending" ? null : status;
1661
1694
  }
1662
1695
  //#endregion
1663
- export { validateCharts as C, parseEvalDiscovery as S, runTouchesEval as _, validateTagsFilters as a, deriveEvalFreshness as b, getLatestRunInfos as c, nextShortIdFromSnapshots as d, persistCaseDetail as f, recomputePersistedCaseStatus as g, recomputeEvalStatusesInRuns as h, resolveEvalTags as i, loadPersistedRunSnapshot as l, persistRunState as m, getTargetEvalKeys as n, generateRunId as o, deleteTemporaryRuns as p, getTargetEvals as r, getLastRunStatuses as s, executeRun as t, loadPersistedRunSnapshots as u, buildManualInputDescriptor as v, loadIsolatedEvalRegistry as x, parseManualInputValues as y };
1696
+ export { parseEvalDiscovery as C, loadIsolatedEvalRegistry as S, recomputePersistedCaseStatus as _, validateTagsFilters as a, parseManualInputValues as b, getLatestRunInfos as c, loadPersistedRunSnapshots as d, nextShortIdFromSnapshots as f, recomputeEvalStatusesInRuns as g, persistRunState as h, resolveEvalTags as i, loadPersistedCaseDetail as l, deleteTemporaryRuns as m, getTargetEvalKeys as n, generateRunId as o, persistCaseDetail as p, getTargetEvals as r, getLastRunStatuses as s, executeRun as t, loadPersistedRunSnapshot as u, runTouchesEval as v, validateCharts as w, deriveEvalFreshness as x, buildManualInputDescriptor as y };
@@ -1,2 +1,2 @@
1
- import { n as initRunner, t as getRunnerInstance } from "./runner-DzDRasWV.mjs";
1
+ import { n as initRunner, t as getRunnerInstance } from "./runner-DmkSq-QG.mjs";
2
2
  export { getRunnerInstance, initRunner };
@@ -1,5 +1,5 @@
1
- import { n as createRunner } from "./cli-CVBSlTD8.mjs";
2
- import "./src-DjOTPnDz.mjs";
1
+ import { n as createRunner } from "./cli-BSVUCUxr.mjs";
2
+ import "./src-D5vGo2iv.mjs";
3
3
  //#region ../../apps/server/src/runner.ts
4
4
  let runnerInstance = null;
5
5
  function getRunnerInstance({ loadEnv = true } = {}) {
@@ -1,5 +1,5 @@
1
- import { G as matchesEvalTags$1, Pt as defineEval$1 } from "./runExecution-CjWJUUZ5.mjs";
2
- import "./cli-CVBSlTD8.mjs";
1
+ import { G as matchesEvalTags$1, Pt as defineEval$1 } from "./runExecution-Bq0Y3y_1.mjs";
2
+ import "./cli-BSVUCUxr.mjs";
3
3
  //#region src/index.ts
4
4
  /** Register an eval definition with typed tag support. */
5
5
  function defineEval(definition) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ls-stack/agent-eval",
3
- "version": "0.60.1",
3
+ "version": "0.60.3",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "agent-evals": "./dist/bin.mjs"
@@ -32,9 +32,9 @@
32
32
  "devDependencies": {
33
33
  "@types/node": "^24.7.2",
34
34
  "typescript": "^5.9.2",
35
- "@agent-evals/sdk": "0.0.1",
35
+ "@agent-evals/runner": "0.0.1",
36
36
  "@agent-evals/shared": "0.0.1",
37
- "@agent-evals/runner": "0.0.1"
37
+ "@agent-evals/sdk": "0.0.1"
38
38
  },
39
39
  "scripts": {
40
40
  "build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",
@@ -568,6 +568,8 @@ Mental model:
568
568
  runner stays idle for `cache.pruneIdleDelayMs ?? 5000` milliseconds. Configure
569
569
  `cache.maxEntries` as a number for the default cap, or as
570
570
  `{ default, namespaces }` for exact namespace-specific caps.
571
+ Writes initialize the row's last access time to the stored time; later cache
572
+ hits refresh that timestamp at the configured access-time update interval.
571
573
  - Unindexed legacy cache files are ignored by normal lookup/listing. Use
572
574
  `agent-evals cache repair` to remove unindexed cache files, stale index rows,
573
575
  debug sidecars, and unreferenced blob files.