@ls-stack/agent-eval 0.42.2 → 0.45.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,8 +25,8 @@
25
25
  href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
26
26
  rel="stylesheet"
27
27
  />
28
- <script type="module" crossorigin src="/assets/index-DH0qYD76.js"></script>
29
- <link rel="stylesheet" crossorigin href="/assets/index-eFM9VIsz.css">
28
+ <script type="module" crossorigin src="/assets/index-am2Knit6.js"></script>
29
+ <link rel="stylesheet" crossorigin href="/assets/index-BU3IqUso.css">
30
30
  </head>
31
31
  <body>
32
32
  <div id="root"></div>
package/dist/bin.mjs CHANGED
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env node
2
- import { t as runCli } from "./cli-BeJCJMQo.mjs";
2
+ import { t as runCli } from "./cli-vdJYkEVk.mjs";
3
3
  import { spawn } from "node:child_process";
4
4
  //#region src/bin.ts
5
5
  const moduleMocksFlag = "--experimental-test-module-mocks";
@@ -1,4 +1,4 @@
1
- import { A as validateCharts, At as buildEvalKey, C as deriveEvalFreshness, Ct as getEvalDisplayStatus, D as loadConfig, Dt as runSummarySchema, E as resolveEvalDefaultConfig, Lt as getEvalRegistry, O as buildDeclaredColumnDefs, Ot as resolveApiCallsConfig, S as parseManualInputValues, St as getEvalTitle, T as parseEvalDiscovery, Tt as matchesTagsFilter, _ as recomputePersistedCaseStatus, a as validateTagsFilters, b as resolveArtifactPath, bt as applyDerivedCallAttributes, c as getLastRunStatuses, d as loadPersistedRunSnapshots, f as nextShortIdFromSnapshots, g as recomputeEvalStatusesInRuns, h as persistRunState, i as resolveEvalTags, j as createFsCacheStore, jt as getCaseRowCaseKey, k as normalizeScoreDef, kt as resolveLlmCallsConfig, l as getLatestRunInfos, m as deleteTemporaryRuns, n as getTargetEvalKeys, o as stripTerminalControlCodes, p as persistCaseDetail, s as generateRunId, u as loadPersistedRunSnapshot, v as runTouchesEval, w as loadEvalModule, wt as deriveScopedSummaryFromCases, x as buildManualInputDescriptor, y as resolveTracePresentation } from "./runOrchestration-OVUFw1fL.mjs";
1
+ import { A as validateCharts, At as buildEvalKey, C as deriveEvalFreshness, Ct as getEvalDisplayStatus, D as loadConfig, Dt as runSummarySchema, E as resolveEvalDefaultConfig, Lt as getEvalRegistry, O as buildDeclaredColumnDefs, Ot as resolveApiCallsConfig, S as parseManualInputValues, St as getEvalTitle, T as parseEvalDiscovery, Tt as matchesTagsFilter, _ as recomputePersistedCaseStatus, a as validateTagsFilters, b as resolveArtifactPath, bt as applyDerivedCallAttributes, c as getLastRunStatuses, d as loadPersistedRunSnapshots, f as nextShortIdFromSnapshots, g as recomputeEvalStatusesInRuns, h as persistRunState, i as resolveEvalTags, j as createFsCacheStore, jt as getCaseRowCaseKey, k as normalizeScoreDef, kt as resolveLlmCallsConfig, l as getLatestRunInfos, m as deleteTemporaryRuns, n as getTargetEvalKeys, o as stripTerminalControlCodes, p as persistCaseDetail, s as generateRunId, u as loadPersistedRunSnapshot, v as runTouchesEval, w as loadEvalModule, wt as deriveScopedSummaryFromCases, x as buildManualInputDescriptor, y as resolveTracePresentation } from "./runOrchestration-BFdxG9ws.mjs";
2
2
  import { copyFile, mkdir, readFile, rm, writeFile } from "node:fs/promises";
3
3
  import { basename, dirname, extname, isAbsolute, join, relative, resolve, sep } from "node:path";
4
4
  import { createHash, randomUUID } from "node:crypto";
@@ -8,6 +8,29 @@ import { existsSync } from "node:fs";
8
8
  import { resultify } from "t-result";
9
9
  import { fileURLToPath } from "node:url";
10
10
  import { spawn, spawnSync } from "node:child_process";
11
+ //#region ../runner/src/caseDetailLookup.ts
12
+ function getCaseLookupIds(caseId) {
13
+ const decodedResult = resultify(() => decodeURIComponent(caseId));
14
+ const decodedCaseId = decodedResult.error ? caseId : decodedResult.value;
15
+ const lookupIds = new Set([caseId, decodedCaseId]);
16
+ for (const lookupId of [...lookupIds]) {
17
+ const segments = lookupId.split("#");
18
+ if (segments.length < 3) continue;
19
+ lookupIds.add(segments.map((segment) => encodeURIComponent(segment)).join("#"));
20
+ }
21
+ return [...lookupIds];
22
+ }
23
+ function resolveCaseDetailLookup(run, caseId) {
24
+ const lookupIds = new Set(getCaseLookupIds(caseId));
25
+ for (const lookupId of lookupIds) {
26
+ const caseDetail = run.caseDetails.get(lookupId);
27
+ if (caseDetail) return caseDetail;
28
+ }
29
+ const matchingCaseRow = run.cases.find((caseRow) => lookupIds.has(getCaseRowCaseKey(caseRow)) || lookupIds.has(caseRow.caseId));
30
+ if (matchingCaseRow === void 0) return void 0;
31
+ return run.caseDetails.get(getCaseRowCaseKey(matchingCaseRow));
32
+ }
33
+ //#endregion
11
34
  //#region ../runner/src/configReload.ts
12
35
  /** Coordinates idle-only reloads for `agent-evals.config.ts` in app mode. */
13
36
  function createConfigReloadController({ getActiveRunCount, closeRunnerWatchers, loadRunnerState, emitToDiscoveryListeners }) {
@@ -1338,7 +1361,7 @@ function createRunner({ watchForChanges = true } = {}) {
1338
1361
  getCaseDetail(runId, caseId) {
1339
1362
  const run = runs.get(runId);
1340
1363
  if (!run) return void 0;
1341
- return run.caseDetails.get(caseId) ?? run.caseDetails.get(getCaseRowCaseKey(run.cases.find((caseRow) => getCaseRowCaseKey(caseRow) === caseId || caseRow.caseId === caseId) ?? { caseId }));
1364
+ return resolveCaseDetailLookup(run, caseId);
1342
1365
  },
1343
1366
  subscribe(runId, listener) {
1344
1367
  const run = runs.get(runId);
@@ -2072,8 +2095,8 @@ async function commandApp(args) {
2072
2095
  const { serve } = await import("@hono/node-server");
2073
2096
  const bundledWebDist = resolve(currentDir, "apps/web/dist");
2074
2097
  if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
2075
- const appModule = await import("./app-DPamBr5R.mjs");
2076
- const runnerModule = await import("./runner-BJQq7cpd.mjs");
2098
+ const appModule = await import("./app-BZmhhSFZ.mjs");
2099
+ const runnerModule = await import("./runner--aH0jO4Z.mjs");
2077
2100
  if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
2078
2101
  if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
2079
2102
  await runnerModule.initRunner();
@@ -2260,7 +2283,7 @@ async function commandCache(args) {
2260
2283
  const evalIds = runner.getEvals().filter((ev) => (args.evalIds.length === 0 || args.evalIds.includes(ev.id)) && (args.files.length === 0 || args.files.some((file) => fileMatches(file, ev.filePath)))).map((ev) => ev.id);
2261
2284
  for (const evalId of evalIds) {
2262
2285
  const entries = await runner.listCache();
2263
- const prefix = `${evalId}__`;
2286
+ const prefix = `${evalId}.`;
2264
2287
  const matching = entries.filter((entry) => entry.namespace.startsWith(prefix));
2265
2288
  for (const entry of matching) await runner.clearCache({
2266
2289
  namespace: entry.namespace,