@ls-stack/agent-eval 0.53.0 → 0.55.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,8 +25,8 @@
25
25
  href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
26
26
  rel="stylesheet"
27
27
  />
28
- <script type="module" crossorigin src="/assets/index-RNejIyap.js"></script>
29
- <link rel="stylesheet" crossorigin href="/assets/index-vaLgWG8j.css">
28
+ <script type="module" crossorigin src="/assets/index-CvsPmlHl.js"></script>
29
+ <link rel="stylesheet" crossorigin href="/assets/index-2I-eWzVL.css">
30
30
  </head>
31
31
  <body>
32
32
  <div id="root"></div>
package/dist/bin.mjs CHANGED
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env node
2
- import { t as runCli } from "./cli-3FrKBc9l.mjs";
2
+ import { t as runCli } from "./cli-rvPrUj6S.mjs";
3
3
  import { spawn } from "node:child_process";
4
4
  //#region src/bin.ts
5
5
  const moduleMocksFlag = "--experimental-test-module-mocks";
@@ -1,4 +1,4 @@
1
- import { A as validateCharts, At as buildEvalKey, C as deriveEvalFreshness, Ct as getEvalDisplayStatus, D as loadConfig, Dt as runSummarySchema, E as resolveEvalDefaultConfig, Lt as getEvalRegistry, O as buildDeclaredColumnDefs, Ot as resolveApiCallsConfig, S as parseManualInputValues, St as getEvalTitle, T as parseEvalDiscovery, Tt as matchesTagsFilter, _ as recomputePersistedCaseStatus, a as validateTagsFilters, b as resolveArtifactPath, bt as applyDerivedCallAttributes, c as getLastRunStatuses, d as loadPersistedRunSnapshots, f as nextShortIdFromSnapshots, g as recomputeEvalStatusesInRuns, h as persistRunState, i as resolveEvalTags, j as createFsCacheStore, jt as getCaseRowCaseKey, k as normalizeScoreDef, kt as resolveLlmCallsConfig, l as getLatestRunInfos, m as deleteTemporaryRuns, n as getTargetEvalKeys, o as stripTerminalControlCodes, p as persistCaseDetail, s as generateRunId, u as loadPersistedRunSnapshot, v as runTouchesEval, w as loadEvalModule, wt as deriveScopedSummaryFromCases, x as buildManualInputDescriptor, y as resolveTracePresentation } from "./runOrchestration-Cn6fGL2s.mjs";
1
+ import { A as validateCharts, At as buildEvalKey, C as deriveEvalFreshness, Ct as getEvalDisplayStatus, D as loadConfig, Dt as runSummarySchema, E as resolveEvalDefaultConfig, O as buildDeclaredColumnDefs, Ot as resolveApiCallsConfig, Rt as getEvalRegistry, S as parseManualInputValues, St as getEvalTitle, T as parseEvalDiscovery, Tt as matchesTagsFilter, _ as recomputePersistedCaseStatus, a as validateTagsFilters, b as resolveArtifactPath, bt as applyDerivedCallAttributes, c as getLastRunStatuses, d as loadPersistedRunSnapshots, f as nextShortIdFromSnapshots, g as recomputeEvalStatusesInRuns, h as persistRunState, i as resolveEvalTags, j as createFsCacheStore, jt as getCaseRowCaseKey, k as normalizeScoreDef, kt as resolveLlmCallsConfig, l as getLatestRunInfos, m as deleteTemporaryRuns, n as getTargetEvalKeys, o as stripTerminalControlCodes, p as persistCaseDetail, s as generateRunId, u as loadPersistedRunSnapshot, v as runTouchesEval, w as loadIsolatedEvalRegistry, wt as deriveScopedSummaryFromCases, x as buildManualInputDescriptor, y as resolveTracePresentation } from "./runOrchestration-BWyE5lRX.mjs";
2
2
  import { copyFile, mkdir, readFile, rm, writeFile } from "node:fs/promises";
3
3
  import { basename, dirname, extname, isAbsolute, join, relative, resolve, sep } from "node:path";
4
4
  import { createHash, randomUUID } from "node:crypto";
@@ -749,6 +749,7 @@ function applyChildEvalMetas(evals, childMetas) {
749
749
  evalMeta.caseCount = childMeta.caseCount;
750
750
  evalMeta.caseIds = childMeta.caseIds;
751
751
  evalMeta.stats = childMeta.stats;
752
+ evalMeta.defaultStatAggregate = childMeta.defaultStatAggregate;
752
753
  evalMeta.charts = childMeta.charts;
753
754
  evalMeta.sourceFingerprint = childMeta.sourceFingerprint;
754
755
  }
@@ -901,6 +902,7 @@ function createRunner({ watchForChanges = true } = {}) {
901
902
  let runHistoryWatcher;
902
903
  let discoveryRefreshTimer;
903
904
  let runHistoryRefreshTimer;
905
+ let registryLoadCounter = 0;
904
906
  const configReload = createConfigReloadController({
905
907
  getActiveRunCount,
906
908
  closeRunnerWatchers: closeWatchers,
@@ -922,6 +924,10 @@ function createRunner({ watchForChanges = true } = {}) {
922
924
  function getSourceFingerprint(source) {
923
925
  return createHash("sha256").update(source).digest("hex");
924
926
  }
927
+ function nextRegistryLoadIsolationKey(prefix, filePath) {
928
+ registryLoadCounter++;
929
+ return `${prefix}:${String(registryLoadCounter)}:${filePath}`;
930
+ }
925
931
  const runner = {
926
932
  async init() {
927
933
  await loadRunnerState();
@@ -938,9 +944,15 @@ function createRunner({ watchForChanges = true } = {}) {
938
944
  async recomputeStatusesForEval(evalKey) {
939
945
  const evalMeta = resolveEvalMeta(evalKey);
940
946
  if (!evalMeta) return { updatedRuns: 0 };
941
- const registry = getEvalRegistry();
942
- await loadEvalModule(evalMeta.sourceFilePath, evalMeta.sourceFingerprint ?? void 0);
943
- const entry = registry.get(evalMeta.id);
947
+ const entry = (await loadIsolatedEvalRegistry({
948
+ evalFilePath: evalMeta.sourceFilePath,
949
+ sourceFingerprint: evalMeta.sourceFingerprint ?? void 0,
950
+ moduleIsolation: {
951
+ key: nextRegistryLoadIsolationKey("recompute-status", evalMeta.sourceFilePath),
952
+ workspaceRoot
953
+ },
954
+ runtimeScope: "env"
955
+ })).get(evalMeta.id);
944
956
  if (!entry) return { updatedRuns: 0 };
945
957
  const scoreThresholds = /* @__PURE__ */ new Map();
946
958
  entry.use((evalDef) => {
@@ -1068,6 +1080,24 @@ function createRunner({ watchForChanges = true } = {}) {
1068
1080
  emitDiscoveryEvent();
1069
1081
  return { deleted: true };
1070
1082
  },
1083
+ async promoteRun(runId) {
1084
+ const run = runs.get(runId);
1085
+ if (!run) return { promoted: false };
1086
+ const wasTemporary = run.manifest.temporary === true;
1087
+ if (wasTemporary) {
1088
+ run.manifest.temporary = false;
1089
+ await persistRunState(run);
1090
+ emitDiscoveryEvent();
1091
+ }
1092
+ return {
1093
+ promoted: wasTemporary,
1094
+ run: {
1095
+ manifest: run.manifest,
1096
+ summary: run.summary,
1097
+ cases: run.cases
1098
+ }
1099
+ };
1100
+ },
1071
1101
  validateManualInputs(request) {
1072
1102
  return validateManualInputsForRequest({
1073
1103
  evalMetas: getSortedEvalMetas(),
@@ -1127,17 +1157,24 @@ function createRunner({ watchForChanges = true } = {}) {
1127
1157
  message: `Duplicate eval id "${issue.evalId}" in ${toWorkspaceRelativePath(issue.filePath)}. Eval ids must be unique within one file.`
1128
1158
  })));
1129
1159
  const sourceFingerprint = getSourceFingerprint(content);
1130
- const registry = getEvalRegistry();
1131
- let moduleLoaded = false;
1160
+ let loadedRegistry;
1132
1161
  try {
1133
- await loadEvalModule(filePath, sourceFingerprint);
1134
- moduleLoaded = true;
1162
+ loadedRegistry = await loadIsolatedEvalRegistry({
1163
+ evalFilePath: filePath,
1164
+ sourceFingerprint,
1165
+ moduleIsolation: {
1166
+ key: nextRegistryLoadIsolationKey("discovery", filePath),
1167
+ workspaceRoot
1168
+ },
1169
+ runtimeScope: "env"
1170
+ });
1135
1171
  } catch {}
1136
1172
  for (const meta of discoveredMetas) {
1137
- const discoveredEntry = moduleLoaded ? registry.get(meta.id) : void 0;
1173
+ const discoveredEntry = loadedRegistry?.get(meta.id);
1138
1174
  const title = meta.title;
1139
1175
  let columnDefs = buildDeclaredColumnDefs(void 0, void 0, void 0);
1140
1176
  let stats;
1177
+ let defaultStatAggregate;
1141
1178
  let charts;
1142
1179
  let tags = [];
1143
1180
  let manualInputDescriptor;
@@ -1156,10 +1193,12 @@ function createRunner({ watchForChanges = true } = {}) {
1156
1193
  evalDef,
1157
1194
  globalColumns: config.columns,
1158
1195
  globalStats: config.stats,
1196
+ globalDefaultStatAggregate: config.defaultStatAggregate,
1159
1197
  globalRemove: config.removeDefaultConfig
1160
1198
  });
1161
1199
  columnDefs = buildDeclaredColumnDefs(defaultConfig.columns, evalDef.scores, evalDef.manualScores);
1162
1200
  stats = defaultConfig.stats;
1201
+ defaultStatAggregate = defaultConfig.defaultStatAggregate;
1163
1202
  const validated = validateCharts({
1164
1203
  charts: defaultConfig.charts,
1165
1204
  columnDefs,
@@ -1202,6 +1241,7 @@ function createRunner({ watchForChanges = true } = {}) {
1202
1241
  caseCount: null,
1203
1242
  caseIds: void 0,
1204
1243
  stats,
1244
+ defaultStatAggregate,
1205
1245
  charts,
1206
1246
  manualInputDescriptor,
1207
1247
  requiresManualInput
@@ -2095,8 +2135,8 @@ async function commandApp(args) {
2095
2135
  const { serve } = await import("@hono/node-server");
2096
2136
  const bundledWebDist = resolve(currentDir, "apps/web/dist");
2097
2137
  if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
2098
- const appModule = await import("./app-D0a57pVo.mjs");
2099
- const runnerModule = await import("./runner-Dsqj431i.mjs");
2138
+ const appModule = await import("./app-CunZ8Dku.mjs");
2139
+ const runnerModule = await import("./runner-C2fvjKZP.mjs");
2100
2140
  if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
2101
2141
  if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
2102
2142
  await runnerModule.initRunner();