@ls-stack/agent-eval 0.55.0 → 0.55.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,7 +25,7 @@
25
25
  href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
26
26
  rel="stylesheet"
27
27
  />
28
- <script type="module" crossorigin src="/assets/index-CvsPmlHl.js"></script>
28
+ <script type="module" crossorigin src="/assets/index-C7QjETk8.js"></script>
29
29
  <link rel="stylesheet" crossorigin href="/assets/index-2I-eWzVL.css">
30
30
  </head>
31
31
  <body>
package/dist/bin.mjs CHANGED
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env node
2
- import { t as runCli } from "./cli-rvPrUj6S.mjs";
2
+ import { t as runCli } from "./cli-Bu9347r1.mjs";
3
3
  import { spawn } from "node:child_process";
4
4
  //#region src/bin.ts
5
5
  const moduleMocksFlag = "--experimental-test-module-mocks";
@@ -0,0 +1,167 @@
1
+ import { I as configureEvalRunLogs, Pt as runWithEvalRegistry, St as resolveLlmCallsConfig, _ as createBufferedCacheStore, a as isCaseChildParentMessage, d as loadEvalModule, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, q as runInEvalRuntimeScope, r as runCase, v as createFsCacheStore, xt as resolveApiCallsConfig } from "./runExecution-C31dpemR.mjs";
2
+ //#region ../runner/src/caseChild.ts
3
// Set by handleFatalCaseChildError so a fatal error is reported only once.
+ let fatalErrorReported = false;
4
// Set to true right before this module calls process.disconnect(); the
// "disconnect" listener uses it to tell a planned disconnect from a lost parent.
+ let disconnectExpected = false;
5
// Set on the first "message" event; later messages are ignored.
+ let runStarted = false;
6
// Promises for sends started by sendMessage whose callback has not fired yet.
+ const pendingMessageSends = /* @__PURE__ */ new Set();
7
/**
 * Send `message` through `process.send` and track the send in
 * `pendingMessageSends` until its callback fires.
 *
 * Does nothing when `process.send` is undefined. Send failures are logged
 * with console.error and never thrown; the tracking promise always resolves.
 *
 * @param {unknown} message - Payload handed to `process.send`.
 * @returns {void}
 */
+ function sendMessage(message) {
8
+ if (process.send === void 0) return;
9
+ const sendPromise = new Promise((resolvePromise) => {
10
// A synchronous throw from process.send is handled the same way as an error
// passed to its callback: log it, then resolve so flushing cannot hang.
+ try {
11
+ process.send?.(message, (error) => {
12
+ if (error) {
13
+ console.error("Failed to send case child message:");
14
+ console.error(formatUnknownErrorDetails(error));
15
+ }
16
+ resolvePromise();
17
+ });
18
+ } catch (error) {
19
+ console.error("Failed to send case child message:");
20
+ console.error(formatUnknownErrorDetails(error));
21
+ resolvePromise();
22
+ }
23
+ });
24
+ pendingMessageSends.add(sendPromise);
25
// Drop the promise from the pending set once the send has settled.
+ sendPromise.finally(() => {
26
+ pendingMessageSends.delete(sendPromise);
27
+ });
28
+ }
29
/**
 * Wait until `pendingMessageSends` is empty.
 *
 * The size is re-checked after every wait, so sends that were added while a
 * previous batch was being awaited are awaited as well.
 *
 * @returns {Promise<void>}
 */
+ async function flushMessageSends() {
30
+ while (pendingMessageSends.size > 0) await Promise.allSettled([...pendingMessageSends]);
31
+ }
32
/**
 * Register one-shot process listeners for "uncaughtException" and
 * "unhandledRejection" that forward to reportFatalCaseChildErrorAndExit.
 *
 * Rejection reasons are converted with toUnhandledRejectionError first.
 * Both listeners use `process.once`, so each fires at most one time.
 *
 * @returns {void}
 */
+ function installFatalCaseChildErrorHandlers() {
33
+ process.once("uncaughtException", (error) => {
34
+ reportFatalCaseChildErrorAndExit(error);
35
+ });
36
+ process.once("unhandledRejection", (reason) => {
37
+ reportFatalCaseChildErrorAndExit(toUnhandledRejectionError(reason));
38
+ });
39
+ }
40
/**
 * Load the eval module, look up the entry registered under `params.evalId`,
 * and run `params.use` with the eval definition that `entry.use` provides.
 *
 * @param {object} params
 * @param {string} params.evalId - Key looked up in the registry after loading.
 * @param {string} params.evalFilePath - Passed to loadEvalModule.
 * @param {unknown} params.sourceFingerprint - Passed to loadEvalModule.
 * @param {(evalDef: unknown) => unknown} params.use - Callback run with the eval definition.
 * @returns {Promise<unknown>} Whatever `entry.use` resolves to.
 * @throws {Error} When no entry exists for `params.evalId` after loading.
 */
+ async function useEvalDefinition(params) {
41
// The module is loaded inside runWithEvalRegistry and an "env" runtime scope;
// the registry passed to the callback is returned so it can be queried below.
// NOTE(review): presumably loading the module is what registers evals — confirm.
+ const entry = (await runWithEvalRegistry(async (activeRegistry) => {
42
+ await runInEvalRuntimeScope("env", async () => {
43
+ await loadEvalModule(params.evalFilePath, params.sourceFingerprint);
44
+ });
45
+ return activeRegistry;
46
+ })).get(params.evalId);
47
+ if (entry === void 0) throw new Error(`Eval "${params.evalId}" was not registered after importing ${params.evalFilePath}`);
48
+ return await entry.use(async (evalDef) => await params.use(evalDef));
49
+ }
50
/**
 * Run a single eval case in this process and build the result payload.
 *
 * Steps, in order: change the working directory to the workspace root,
 * register package resolution hooks, load the config, configure run logs,
 * create the cache store(s), then run the case through useEvalDefinition and
 * runCase.
 *
 * @param {object} context - Start context taken from the parent's message;
 *   the fields read here are the ones referenced in the body.
 * @returns {Promise<{caseDetail: unknown, caseRow: object, pendingCacheWrites: unknown[]}>}
 */
+ async function executeCaseChild(context) {
51
+ process.chdir(context.workspaceRoot);
52
+ registerAgentEvalsPackageResolutionHooks();
53
+ const config = await loadConfig();
54
// Console capture stays on unless the config sets it to exactly `false`.
+ configureEvalRunLogs({ captureConsole: config.runLogs?.captureConsole !== false });
55
+ const cacheStore = createFsCacheStore({
56
+ workspaceRoot: context.workspaceRoot,
57
+ dir: config.cache?.dir,
58
// NOTE(review): maxEntriesPerEval looks like an older name kept as a fallback — confirm.
+ maxEntriesPerNamespace: config.cache?.maxEntriesPerNamespace ?? config.cache?.maxEntriesPerEval,
59
+ maxEntriesByNamespace: config.cache?.maxEntriesByNamespace
60
+ });
61
// A buffered store is only created when caching is enabled and the mode is not "bypass".
+ const bufferedCacheStore = context.cacheEnabled && context.cacheMode !== "bypass" ? createBufferedCacheStore(cacheStore) : null;
62
+ const llmCallsConfig = resolveLlmCallsConfig(config.llmCalls);
63
+ const apiCallsConfig = resolveApiCallsConfig(config.apiCalls);
64
+ const { caseDetail, caseRowUpdate } = await useEvalDefinition({
65
+ evalId: context.evalId,
66
+ evalFilePath: context.evalFilePath,
67
+ sourceFingerprint: context.sourceFingerprint,
68
+ use: async (evalDef) => await runCase({
69
+ evalDef,
70
+ evalId: context.evalId,
71
+ evalKey: context.evalKey,
72
+ evalCase: context.evalCase,
73
+ globalTraceDisplay: context.globalTraceDisplay,
74
+ globalColumns: config.columns,
75
+ globalDeriveFromTracing: config.deriveFromTracing,
76
+ llmCallsConfig,
77
+ apiCallsConfig,
78
+ globalRemoveDefaultConfig: config.removeDefaultConfig,
79
+ trial: context.trial,
80
+ startTime: context.startTime,
81
// Buffered store when one exists; otherwise the plain store if caching is
// enabled (the "bypass" mode case); otherwise no cache adapter at all.
+ cacheAdapter: bufferedCacheStore ?? (context.cacheEnabled ? cacheStore : null),
82
+ cacheMode: context.cacheMode,
83
+ moduleIsolation: void 0,
84
+ evalFilePath: context.evalFilePath,
85
+ evalFileRelativePath: context.evalFileRelativePath,
86
+ workspaceRoot: context.workspaceRoot,
87
+ artifactDir: context.artifactDir,
88
+ runId: context.runId
89
+ })
90
+ });
91
+ return {
92
+ caseDetail,
93
// Row fields missing from caseRowUpdate fall back to the defaults below.
+ caseRow: {
94
+ caseId: context.evalCase.id,
95
+ evalId: context.evalId,
96
+ evalKey: context.evalKey,
97
+ caseKey: caseDetail.caseKey,
98
+ tags: caseDetail.tags,
99
+ status: caseRowUpdate.status ?? "pending",
100
+ durationMs: caseRowUpdate.durationMs ?? null,
101
+ cacheHits: caseRowUpdate.cacheHits ?? 0,
102
+ cacheOperations: caseRowUpdate.cacheOperations ?? 0,
103
+ columns: caseRowUpdate.columns ?? {},
104
+ trial: context.trial
105
+ },
106
// Empty list when no buffered store was created.
+ pendingCacheWrites: bufferedCacheStore?.getPendingWrites() ?? []
107
+ };
108
+ }
109
/**
 * Report a fatal error once: set exit code 1, print the formatted error with
 * console.error, send an `{ type: "error", message }` message, and wait for
 * pending sends to finish.
 *
 * Calls after the first one return immediately without doing anything.
 * This function does not exit the process itself.
 *
 * @param {unknown} error - Any thrown or rejected value.
 * @returns {Promise<void>}
 */
+ async function handleFatalCaseChildError(error) {
110
+ if (fatalErrorReported) return;
111
+ fatalErrorReported = true;
112
+ const message = formatUnknownErrorDetails(error);
113
+ process.exitCode = 1;
114
+ console.error(message);
115
+ sendMessage({
116
+ type: "error",
117
+ message
118
+ });
119
+ await flushMessageSends();
120
+ }
121
/**
 * Render any thrown or rejected value as text for logs and error messages.
 *
 * - Error instances: the stack when present, otherwise the message.
 * - Strings: returned unchanged.
 * - Other values: `String(value)`, unless that yields the uninformative
 *   "[object Object]" or throws, in which case the JSON form is used when the
 *   value can be serialized.
 *
 * This runs inside fatal-error handling, so it is written to never throw.
 *
 * @param {unknown} error - Any value.
 * @returns {string} Human-readable description of `error`.
 */
function formatUnknownErrorDetails(error) {
  if (error instanceof Error) return error.stack ?? error.message;
  if (typeof error === "string") return error;

  let text;
  try {
    text = String(error);
  } catch {
    // e.g. null-prototype objects cannot be converted to a primitive.
    text = undefined;
  }
  if (text !== undefined && text !== "[object Object]") return text;

  try {
    const serialized = JSON.stringify(error);
    if (typeof serialized === "string") return serialized;
  } catch {
    // Circular structures, BigInt members, throwing getters: use the fallback below.
  }
  return text ?? Object.prototype.toString.call(error);
}
126
/**
 * Turn an unhandled-rejection reason into an Error.
 *
 * Error instances are returned as-is. Any other value is wrapped in a new
 * Error whose message embeds the formatted reason; the original value is
 * attached as `cause` so it is not reduced to its string form.
 *
 * @param {unknown} reason - Value the promise was rejected with.
 * @returns {Error}
 */
function toUnhandledRejectionError(reason) {
  if (reason instanceof Error) return reason;
  return new Error(`Unhandled rejection: ${formatUnknownErrorDetails(reason)}`, { cause: reason });
}
130
/**
 * Report a fatal error through handleFatalCaseChildError, then exit the
 * process with code 1.
 *
 * A failure while reporting is logged with console.error; the `finally`
 * block exits whether or not reporting succeeded.
 *
 * @param {unknown} error - Any thrown or rejected value.
 * @returns {Promise<void>}
 */
+ async function reportFatalCaseChildErrorAndExit(error) {
131
+ try {
132
+ await handleFatalCaseChildError(error);
133
+ } catch (reportError) {
134
+ console.error("Failed to report fatal case child error:");
135
+ console.error(formatUnknownErrorDetails(reportError));
136
+ } finally {
137
+ process.exit(1);
138
+ }
139
+ }
140
// Module entry: install the fatal-error listeners, then wire the "disconnect"
// and "message" process events.
+ installFatalCaseChildErrorHandlers();
141
// A disconnect that this module did not initiate ends the process with code 1.
+ process.on("disconnect", () => {
142
+ if (disconnectExpected) return;
143
+ process.exit(1);
144
+ });
145
// Only the first message is acted on; it must pass isCaseChildParentMessage
// and its `context` is what executeCaseChild runs with.
+ process.on("message", (message) => {
146
+ if (runStarted) return;
147
+ runStarted = true;
148
+ if (!isCaseChildParentMessage(message)) {
149
+ reportFatalCaseChildErrorAndExit(/* @__PURE__ */ new Error("Case child received an invalid start message"));
150
+ return;
151
+ }
152
// Success path: send the "done" message, wait for it to flush, then disconnect.
+ executeCaseChild(message.context).then(async (result) => {
153
+ sendMessage({
154
+ type: "done",
155
+ result
156
+ });
157
+ await flushMessageSends();
158
+ disconnectExpected = true;
159
+ process.disconnect();
160
// Failure path (also reached if the success handler above throws): report
// the error, then disconnect.
+ }).catch(async (error) => {
161
+ await handleFatalCaseChildError(error);
162
+ disconnectExpected = true;
163
+ process.disconnect();
164
+ });
165
+ });
166
+ //#endregion
167
+ export {};
@@ -1,4 +1,5 @@
1
- import { A as validateCharts, At as buildEvalKey, C as deriveEvalFreshness, Ct as getEvalDisplayStatus, D as loadConfig, Dt as runSummarySchema, E as resolveEvalDefaultConfig, O as buildDeclaredColumnDefs, Ot as resolveApiCallsConfig, Rt as getEvalRegistry, S as parseManualInputValues, St as getEvalTitle, T as parseEvalDiscovery, Tt as matchesTagsFilter, _ as recomputePersistedCaseStatus, a as validateTagsFilters, b as resolveArtifactPath, bt as applyDerivedCallAttributes, c as getLastRunStatuses, d as loadPersistedRunSnapshots, f as nextShortIdFromSnapshots, g as recomputeEvalStatusesInRuns, h as persistRunState, i as resolveEvalTags, j as createFsCacheStore, jt as getCaseRowCaseKey, k as normalizeScoreDef, kt as resolveLlmCallsConfig, l as getLatestRunInfos, m as deleteTemporaryRuns, n as getTargetEvalKeys, o as stripTerminalControlCodes, p as persistCaseDetail, s as generateRunId, u as loadPersistedRunSnapshot, v as runTouchesEval, w as loadIsolatedEvalRegistry, wt as deriveScopedSummaryFromCases, x as buildManualInputDescriptor, y as resolveTracePresentation } from "./runOrchestration-BWyE5lRX.mjs";
1
+ import { Ct as buildEvalKey, Nt as getEvalRegistry, St as resolveLlmCallsConfig, bt as runSummarySchema, c as resolveArtifactPath, ct as applyDerivedCallAttributes, dt as getEvalDisplayStatus, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, gt as matchesTagsFilter, h as normalizeScoreDef, m as buildDeclaredColumnDefs, o as stripTerminalControlCodes, p as loadConfig, s as resolveTracePresentation, ut as getEvalTitle, v as createFsCacheStore, wt as getCaseRowCaseKey, xt as resolveApiCallsConfig } from "./runExecution-C31dpemR.mjs";
2
+ import { C as validateCharts, S as parseEvalDiscovery, _ as runTouchesEval, a as validateTagsFilters, b as deriveEvalFreshness, c as getLatestRunInfos, d as nextShortIdFromSnapshots, f as persistCaseDetail, g as recomputePersistedCaseStatus, h as recomputeEvalStatusesInRuns, i as resolveEvalTags, l as loadPersistedRunSnapshot, m as persistRunState, n as getTargetEvalKeys, o as generateRunId, p as deleteTemporaryRuns, s as getLastRunStatuses, u as loadPersistedRunSnapshots, v as buildManualInputDescriptor, x as loadIsolatedEvalRegistry, y as parseManualInputValues } from "./runOrchestration-3RoHLW4U.mjs";
2
3
  import { copyFile, mkdir, readFile, rm, writeFile } from "node:fs/promises";
3
4
  import { basename, dirname, extname, isAbsolute, join, relative, resolve, sep } from "node:path";
4
5
  import { createHash, randomUUID } from "node:crypto";
@@ -793,6 +794,7 @@ async function markRunErrored(runState, message, managerContext) {
793
794
  payload: { message }
794
795
  });
795
796
  managerContext.emitDiscoveryEvent();
797
+ managerContext.onRunTerminal();
796
798
  }
797
799
  async function markRunTerminalFromChild(runState, event, managerContext) {
798
800
  const snapshot = await loadPersistedRunSnapshot(runState.runDir);
@@ -814,6 +816,7 @@ async function markRunTerminalFromChild(runState, event, managerContext) {
814
816
  }
815
817
  managerContext.emitEvent(runState, event);
816
818
  managerContext.emitDiscoveryEvent();
819
+ managerContext.onRunTerminal();
817
820
  }
818
821
  //#endregion
819
822
  //#region ../runner/src/runnerStateHydration.ts
@@ -882,6 +885,7 @@ function getWatchRootsForIncludePatterns(params) {
882
885
  }
883
886
  //#endregion
884
887
  //#region ../runner/src/runner.ts
888
+ const defaultCachePruneIdleDelayMs = 5e3;
885
889
  /** Create an in-memory eval runner bound to the current workspace config. */
886
890
  function createRunner({ watchForChanges = true } = {}) {
887
891
  let config;
@@ -902,6 +906,7 @@ function createRunner({ watchForChanges = true } = {}) {
902
906
  let runHistoryWatcher;
903
907
  let discoveryRefreshTimer;
904
908
  let runHistoryRefreshTimer;
909
+ let cachePruneIdleTimer;
905
910
  let registryLoadCounter = 0;
906
911
  const configReload = createConfigReloadController({
907
912
  getActiveRunCount,
@@ -941,6 +946,9 @@ function createRunner({ watchForChanges = true } = {}) {
941
946
  async clearCache(filter) {
942
947
  await cacheStore.clear(filter);
943
948
  },
949
+ async repairCache() {
950
+ return cacheStore.repair();
951
+ },
944
952
  async recomputeStatusesForEval(evalKey) {
945
953
  const evalMeta = resolveEvalMeta(evalKey);
946
954
  if (!evalMeta) return { updatedRuns: 0 };
@@ -1251,6 +1259,7 @@ function createRunner({ watchForChanges = true } = {}) {
1251
1259
  emitDiscoveryEvent();
1252
1260
  },
1253
1261
  async startRun(request) {
1262
+ cancelCacheRetentionPrune();
1254
1263
  const tagsFilterError = validateTagsFilters(request.target.tagsFilter);
1255
1264
  if (tagsFilterError !== null) throw new Error(tagsFilterError);
1256
1265
  const deletedTemporaryRuns = await deleteTemporaryRuns({
@@ -1348,7 +1357,8 @@ function createRunner({ watchForChanges = true } = {}) {
1348
1357
  workspaceRoot,
1349
1358
  evals,
1350
1359
  emitEvent,
1351
- emitDiscoveryEvent
1360
+ emitDiscoveryEvent,
1361
+ onRunTerminal: scheduleCacheRetentionPrune
1352
1362
  }
1353
1363
  });
1354
1364
  if (deletedTemporaryRuns > 0) emitDiscoveryEvent();
@@ -1397,6 +1407,7 @@ function createRunner({ watchForChanges = true } = {}) {
1397
1407
  payload: run.summary
1398
1408
  });
1399
1409
  emitDiscoveryEvent();
1410
+ scheduleCacheRetentionPrune();
1400
1411
  },
1401
1412
  getCaseDetail(runId, caseId) {
1402
1413
  const run = runs.get(runId);
@@ -1418,6 +1429,7 @@ function createRunner({ watchForChanges = true } = {}) {
1418
1429
  };
1419
1430
  },
1420
1431
  async close() {
1432
+ cancelCacheRetentionPrune();
1421
1433
  await Promise.all([closeWatchers(), configReload.close()]);
1422
1434
  },
1423
1435
  getWorkspaceRoot() {
@@ -1521,6 +1533,28 @@ function createRunner({ watchForChanges = true } = {}) {
1521
1533
  function getActiveRunCount() {
1522
1534
  return [...runs.values()].filter((run) => run.manifest.status === "running").length;
1523
1535
  }
1536
/**
 * Resolve the idle delay, in whole milliseconds, used before pruning the cache.
 *
 * Reads `config.cache?.pruneIdleDelayMs`. Falls back to
 * `defaultCachePruneIdleDelayMs` when the value is missing, not a finite
 * number, or floors to less than 1 ms. Values above the largest delay that
 * setTimeout accepts are clamped to that maximum; Node would otherwise run
 * the timer after 1 ms.
 *
 * @returns {number} Delay in milliseconds, between 1 and 2147483647.
 */
function getCachePruneIdleDelayMs() {
  // Largest delay setTimeout honours (2^31 - 1 ms).
  const maxTimerDelayMs = 2147483647;
  const configured = config.cache?.pruneIdleDelayMs;
  if (typeof configured !== "number" || !Number.isFinite(configured)) return defaultCachePruneIdleDelayMs;
  const wholeMs = Math.floor(configured);
  // Check after flooring so fractional values below 1 do not become a 0 ms delay.
  if (wholeMs < 1) return defaultCachePruneIdleDelayMs;
  return Math.min(wholeMs, maxTimerDelayMs);
}
1541
/**
 * Cancel the pending cache-prune timer, if one is set, and clear
 * `cachePruneIdleTimer`. Does nothing when no timer is pending.
 *
 * @returns {void}
 */
+ function cancelCacheRetentionPrune() {
1542
+ if (cachePruneIdleTimer === void 0) return;
1543
+ clearTimeout(cachePruneIdleTimer);
1544
+ cachePruneIdleTimer = void 0;
1545
+ }
1546
/**
 * (Re)start the idle timer that triggers a cache retention prune.
 *
 * Any pending timer is cancelled first, so repeated calls push the prune
 * back. The timer is unref'd so it does not keep the process alive.
 * The prune runs in the background; a failure is logged instead of being
 * left as an unhandled promise rejection.
 *
 * @returns {void}
 */
function scheduleCacheRetentionPrune() {
  cancelCacheRetentionPrune();
  cachePruneIdleTimer = setTimeout(() => {
    cachePruneIdleTimer = void 0;
    // Nothing awaits this promise, so rejections must be handled here.
    pruneCacheRetentionIfIdle().catch((error) => {
      console.error("Failed to prune cache retention:");
      console.error(error);
    });
  }, getCachePruneIdleDelayMs());
  cachePruneIdleTimer.unref();
}
1554
/**
 * Call `cacheStore.pruneRetention()` unless a run is currently active
 * (`getActiveRunCount() > 0`), in which case nothing happens.
 *
 * @returns {Promise<void>}
 */
+ async function pruneCacheRetentionIfIdle() {
1555
+ if (getActiveRunCount() > 0) return;
1556
+ await cacheStore.pruneRetention();
1557
+ }
1524
1558
  function emitDiscoveryEvent() {
1525
1559
  const lastRunStatuses = getLastRunStatuses({
1526
1560
  runs: runs.values(),
@@ -1656,19 +1690,20 @@ Flags:
1656
1690
  `);
1657
1691
  return;
1658
1692
  }
1659
- if (topic === "cache" || topic === "cache list" || topic === "cache clear") {
1693
+ if (topic === "cache" || topic === "cache list" || topic === "cache clear" || topic === "cache repair") {
1660
1694
  console.info(`
1661
- agent-evals cache - Manage cached operation entries
1695
+ agent-evals cache - Manage cached namespace/key entries
1662
1696
 
1663
1697
  Usage:
1664
1698
  agent-evals cache list [flags]
1665
1699
  agent-evals cache clear --eval <id>
1666
1700
  agent-evals cache clear --all
1701
+ agent-evals cache repair [flags]
1667
1702
 
1668
1703
  Flags:
1669
1704
  --eval <id> Clear entries for specific eval(s) (comma-separated)
1670
1705
  --all Confirm clearing every cached entry
1671
- --json Output cache listing as JSON
1706
+ --json Output cache listing or repair summary as JSON
1672
1707
  --no-env Disable automatic .env loading
1673
1708
  --help, -h Show this help
1674
1709
  `);
@@ -1682,9 +1717,10 @@ Commands:
1682
1717
  list List discovered evals
1683
1718
  run Run evals
1684
1719
  show-runs [id|latest] Show saved run artifact file paths
1685
- cache list List cached operation entries
1720
+ cache list List cached namespace/key entries
1686
1721
  cache clear --eval <id> Clear cache entries for one eval
1687
1722
  cache clear --all Clear every cached entry
1723
+ cache repair Remove unindexed/orphaned cache files
1688
1724
  help Show this help
1689
1725
 
1690
1726
  Options:
@@ -1960,7 +1996,7 @@ function parseArgs(argv) {
1960
1996
  let cursor = 1;
1961
1997
  if (args.command === "cache") {
1962
1998
  const sub = normalizedArgv[cursor];
1963
- if (sub === "list" || sub === "clear") {
1999
+ if (sub === "list" || sub === "clear" || sub === "repair") {
1964
2000
  args.subcommand = sub;
1965
2001
  args.helpTopic = `cache ${sub}`;
1966
2002
  cursor++;
@@ -2135,8 +2171,8 @@ async function commandApp(args) {
2135
2171
  const { serve } = await import("@hono/node-server");
2136
2172
  const bundledWebDist = resolve(currentDir, "apps/web/dist");
2137
2173
  if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
2138
- const appModule = await import("./app-CunZ8Dku.mjs");
2139
- const runnerModule = await import("./runner-C2fvjKZP.mjs");
2174
+ const appModule = await import("./app-NI4to6lp.mjs");
2175
+ const runnerModule = await import("./runner-CTp9zHbM.mjs");
2140
2176
  if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
2141
2177
  if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
2142
2178
  await runnerModule.initRunner();
@@ -2310,10 +2346,8 @@ async function commandCache(args) {
2310
2346
  for (const entry of entries) {
2311
2347
  console.info(` ${entry.namespace}`);
2312
2348
  console.info(` key: ${entry.key}`);
2313
- const operationLabel = entry.operationType === "span" ? `${entry.operationName} (span ${entry.spanKind ?? "unknown"})` : `${entry.operationName} (value)`;
2314
- console.info(` operation: ${operationLabel}`);
2315
2349
  console.info(` stored: ${entry.storedAt}`);
2316
- console.info(` size: ${String(entry.sizeBytes)} bytes`);
2350
+ console.info(` last accessed: ${entry.lastAccessedAt}`);
2317
2351
  console.info("");
2318
2352
  }
2319
2353
  return;
@@ -2341,6 +2375,20 @@ async function commandCache(args) {
2341
2375
  console.info("Refusing to clear cache without --eval <id> or --all. Use one of these flags to confirm.");
2342
2376
  process.exit(1);
2343
2377
  }
2378
+ if (args.subcommand === "repair") {
2379
+ const summary = await runner.repairCache();
2380
+ if (args.json) {
2381
+ console.info(JSON.stringify(summary, null, 2));
2382
+ return;
2383
+ }
2384
+ console.info("Cache repair complete.");
2385
+ console.info(`Removed cache files: ${String(summary.removedCacheFiles)}`);
2386
+ console.info(`Removed debug files: ${String(summary.removedDebugFiles)}`);
2387
+ console.info(`Removed blob files: ${String(summary.removedBlobFiles)}`);
2388
+ console.info(`Removed index rows: ${String(summary.removedIndexRows)}`);
2389
+ console.info(`Rewritten indexes: ${String(summary.rewrittenIndexes)}`);
2390
+ return;
2391
+ }
2344
2392
  printHelp(args.helpTopic);
2345
2393
  }
2346
2394
  function getSortedRunSnapshots(runner) {