@ls-stack/agent-eval 0.60.3 → 0.61.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,8 +25,8 @@
25
25
  href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
26
26
  rel="stylesheet"
27
27
  />
28
- <script type="module" crossorigin src="/assets/index-CM6MDNqo.js"></script>
29
- <link rel="stylesheet" crossorigin href="/assets/index-CqWfzcFb.css">
28
+ <script type="module" crossorigin src="/assets/index-DxZsizjg.js"></script>
29
+ <link rel="stylesheet" crossorigin href="/assets/index-CM_zUhl_.css">
30
30
  </head>
31
31
  <body>
32
32
  <div id="root"></div>
package/dist/bin.mjs CHANGED
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env node
2
- import { t as runCli } from "./cli-BSVUCUxr.mjs";
2
+ import { t as runCli } from "./cli-CPBIcMP-.mjs";
3
3
  import { spawn } from "node:child_process";
4
4
  //#region src/bin.ts
5
5
  const moduleMocksFlag = "--experimental-test-module-mocks";
@@ -1,4 +1,4 @@
1
- import { Ct as resolveLlmCallsConfig, It as runWithEvalRegistry, J as runInEvalRuntimeScope, L as configureEvalRunLogs, St as resolveApiCallsConfig, _ as createBufferedCacheStore, a as isCaseChildParentMessage, d as loadEvalModule, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, r as runCase, v as createFsCacheStore, y as getCacheRetentionOptions } from "./runExecution-Bq0Y3y_1.mjs";
1
+ import { Ft as runWithEvalRegistry, I as configureEvalRunLogs, St as resolveLlmCallsConfig, _ as createFsCacheStore, a as isCaseChildParentMessage, d as loadEvalModule, g as createBufferedCacheStore, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, q as runInEvalRuntimeScope, r as runCase, v as getCacheRetentionOptions, xt as resolveApiCallsConfig } from "./runExecution-D-CnSRYy.mjs";
2
2
  //#region ../runner/src/caseChild.ts
3
3
  let fatalErrorReported = false;
4
4
  let disconnectExpected = false;
@@ -1,5 +1,5 @@
1
- import { Ct as resolveLlmCallsConfig, Et as getCaseRowCaseKey, Ft as getEvalRegistry, Ot as caseRowSchema, St as resolveApiCallsConfig, Tt as buildEvalKey, _t as matchesTagsFilter, c as resolveArtifactPath, dt as getEvalTitle, f as resolveEvalDefaultConfig, ft as getEvalDisplayStatus, h as normalizeScoreDef, lt as applyDerivedCallAttributes, m as buildDeclaredColumnDefs, o as stripTerminalControlCodes, p as loadConfig, pt as deriveScopedSummaryFromCases, s as resolveTracePresentation, v as createFsCacheStore, xt as runSummarySchema, y as getCacheRetentionOptions } from "./runExecution-Bq0Y3y_1.mjs";
2
- import { C as parseEvalDiscovery, S as loadIsolatedEvalRegistry, _ as recomputePersistedCaseStatus, a as validateTagsFilters, b as parseManualInputValues, c as getLatestRunInfos, d as loadPersistedRunSnapshots, f as nextShortIdFromSnapshots, g as recomputeEvalStatusesInRuns, h as persistRunState, i as resolveEvalTags, l as loadPersistedCaseDetail, m as deleteTemporaryRuns, n as getTargetEvalKeys, o as generateRunId, p as persistCaseDetail, s as getLastRunStatuses, u as loadPersistedRunSnapshot, v as runTouchesEval, w as validateCharts, x as deriveEvalFreshness, y as buildManualInputDescriptor } from "./runOrchestration-C7qQISz2.mjs";
1
+ import { Dt as caseRowSchema, Pt as getEvalRegistry, St as resolveLlmCallsConfig, Tt as getCaseRowCaseKey, _ as createFsCacheStore, bt as runSummarySchema, c as resolveArtifactPath, ct as applyDerivedCallAttributes, dt as getEvalDisplayStatus, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, gt as matchesTagsFilter, m as buildDeclaredColumnDefs, o as stripTerminalControlCodes, p as loadConfig, s as resolveTracePresentation, ut as getEvalTitle, v as getCacheRetentionOptions, wt as buildEvalKey, xt as resolveApiCallsConfig } from "./runExecution-D-CnSRYy.mjs";
2
+ import { C as parseEvalDiscovery, S as loadIsolatedEvalRegistry, _ as recomputePersistedCaseStatus, a as validateTagsFilters, b as parseManualInputValues, c as getLatestRunInfos, d as loadPersistedRunSnapshots, f as nextShortIdFromSnapshots, g as recomputeEvalStatusesInRuns, h as persistRunState, i as resolveEvalTags, l as loadPersistedCaseDetail, m as deleteTemporaryRuns, n as getTargetEvalKeys, o as generateRunId, p as persistCaseDetail, s as getLastRunStatuses, u as loadPersistedRunSnapshot, v as runTouchesEval, w as validateCharts, x as deriveEvalFreshness, y as buildManualInputDescriptor } from "./runOrchestration-Basvyp4u.mjs";
3
3
  import { parseEnv } from "node:util";
4
4
  import { resultify } from "t-result";
5
5
  import { copyFile, mkdir, readFile, rm, writeFile } from "node:fs/promises";
@@ -973,7 +973,6 @@ function createRunner({ watchForChanges = true, loadEnv = true } = {}) {
973
973
  let discoveryRefreshTimer;
974
974
  let runHistoryRefreshTimer;
975
975
  let cachePruneIdleTimer;
976
- let registryLoadCounter = 0;
977
976
  const configReload = createConfigReloadController({
978
977
  getActiveRunCount,
979
978
  closeRunnerWatchers: closeWatchers,
@@ -1019,9 +1018,8 @@ function createRunner({ watchForChanges = true, loadEnv = true } = {}) {
1019
1018
  if (caseRow === void 0) return void 0;
1020
1019
  return hydrateCaseDetailForRow(run, caseRow);
1021
1020
  }
1022
- function nextRegistryLoadIsolationKey(prefix, filePath) {
1023
- registryLoadCounter++;
1024
- return `${prefix}:${String(registryLoadCounter)}:${filePath}`;
1021
+ function getDiscoveryModuleIsolationKey(filePath) {
1022
+ return `discovery:${filePath}`;
1025
1023
  }
1026
1024
  const runner = {
1027
1025
  async init() {
@@ -1042,24 +1040,11 @@ function createRunner({ watchForChanges = true, loadEnv = true } = {}) {
1042
1040
  async recomputeStatusesForEval(evalKey) {
1043
1041
  const evalMeta = resolveEvalMeta(evalKey);
1044
1042
  if (!evalMeta) return { updatedRuns: 0 };
1045
- const entry = (await loadIsolatedEvalRegistry({
1046
- evalFilePath: evalMeta.sourceFilePath,
1047
- sourceFingerprint: evalMeta.sourceFingerprint ?? void 0,
1048
- moduleIsolation: {
1049
- key: nextRegistryLoadIsolationKey("recompute-status", evalMeta.sourceFilePath),
1050
- workspaceRoot
1051
- },
1052
- runtimeScope: "env"
1053
- })).get(evalMeta.id);
1054
- if (!entry) return { updatedRuns: 0 };
1055
1043
  const scoreThresholds = /* @__PURE__ */ new Map();
1056
- entry.use((evalDef) => {
1057
- for (const [key, def] of Object.entries(evalDef.scores ?? {})) {
1058
- const threshold = normalizeScoreDef(def).passThreshold;
1059
- if (threshold !== void 0) scoreThresholds.set(key, threshold);
1060
- }
1061
- for (const [key, def] of Object.entries(evalDef.manualScores ?? {})) if (def.passThreshold !== void 0) scoreThresholds.set(key, def.passThreshold);
1062
- });
1044
+ for (const columnDef of evalMeta.columnDefs) {
1045
+ if (columnDef.isScore !== true || columnDef.passThreshold === void 0) continue;
1046
+ scoreThresholds.set(columnDef.key, columnDef.passThreshold);
1047
+ }
1063
1048
  const updatedRuns = await recomputeEvalStatusesInRuns({
1064
1049
  runs: runs.values(),
1065
1050
  evalKey: evalMeta.key,
@@ -1264,7 +1249,7 @@ function createRunner({ watchForChanges = true, loadEnv = true } = {}) {
1264
1249
  evalFilePath: filePath,
1265
1250
  sourceFingerprint,
1266
1251
  moduleIsolation: {
1267
- key: nextRegistryLoadIsolationKey("discovery", filePath),
1252
+ key: getDiscoveryModuleIsolationKey(filePath),
1268
1253
  workspaceRoot
1269
1254
  },
1270
1255
  runtimeScope: "env"
@@ -2258,8 +2243,8 @@ async function commandApp(args) {
2258
2243
  const { serve } = await import("@hono/node-server");
2259
2244
  const bundledWebDist = resolve(currentDir, "apps/web/dist");
2260
2245
  if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
2261
- const appModule = await import("./app-l3ynaNsb.mjs");
2262
- const runnerModule = await import("./runner-C9xNJHt3.mjs");
2246
+ const appModule = await import("./app-Dm_9ZTVa.mjs");
2247
+ const runnerModule = await import("./runner-B6UT1K7L.mjs");
2263
2248
  if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
2264
2249
  if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
2265
2250
  await runnerModule.initRunner({ loadEnv: args.loadEnv });