@ls-stack/agent-eval 0.58.1 → 0.58.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,8 +25,8 @@
25
25
  href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
26
26
  rel="stylesheet"
27
27
  />
28
- <script type="module" crossorigin src="/assets/index-Cz9p4l-t.js"></script>
29
- <link rel="stylesheet" crossorigin href="/assets/index-DtARRwsS.css">
28
+ <script type="module" crossorigin src="/assets/index-PTikBbhf.js"></script>
29
+ <link rel="stylesheet" crossorigin href="/assets/index-CHH7m5Cv.css">
30
30
  </head>
31
31
  <body>
32
32
  <div id="root"></div>
package/dist/bin.mjs CHANGED
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env node
2
- import { t as runCli } from "./cli-_g2qOMK6.mjs";
2
+ import { t as runCli } from "./cli-SP4kEtYL.mjs";
3
3
  import { spawn } from "node:child_process";
4
4
  //#region src/bin.ts
5
5
  const moduleMocksFlag = "--experimental-test-module-mocks";
@@ -1,4 +1,4 @@
1
- import { I as configureEvalRunLogs, Pt as runWithEvalRegistry, St as resolveLlmCallsConfig, _ as createBufferedCacheStore, a as isCaseChildParentMessage, d as loadEvalModule, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, q as runInEvalRuntimeScope, r as runCase, v as createFsCacheStore, xt as resolveApiCallsConfig } from "./runExecution-d42Lm0i5.mjs";
1
+ import { Ct as resolveLlmCallsConfig, It as runWithEvalRegistry, J as runInEvalRuntimeScope, L as configureEvalRunLogs, St as resolveApiCallsConfig, _ as createBufferedCacheStore, a as isCaseChildParentMessage, d as loadEvalModule, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, r as runCase, v as createFsCacheStore } from "./runExecution-CFw0MQFs.mjs";
2
2
  //#region ../runner/src/caseChild.ts
3
3
  let fatalErrorReported = false;
4
4
  let disconnectExpected = false;
@@ -74,6 +74,7 @@ async function executeCaseChild(context) {
74
74
  globalTraceDisplay: context.globalTraceDisplay,
75
75
  globalColumns: config.columns,
76
76
  globalDeriveFromTracing: config.deriveFromTracing,
77
+ globalTracingAssertions: config.tracingAssertions,
77
78
  llmCallsConfig,
78
79
  apiCallsConfig,
79
80
  globalRemoveDefaultConfig: config.removeDefaultConfig,
@@ -1,5 +1,5 @@
1
- import { Ct as buildEvalKey, Nt as getEvalRegistry, St as resolveLlmCallsConfig, bt as runSummarySchema, c as resolveArtifactPath, ct as applyDerivedCallAttributes, dt as getEvalDisplayStatus, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, gt as matchesTagsFilter, h as normalizeScoreDef, m as buildDeclaredColumnDefs, o as stripTerminalControlCodes, p as loadConfig, s as resolveTracePresentation, ut as getEvalTitle, v as createFsCacheStore, wt as getCaseRowCaseKey, xt as resolveApiCallsConfig } from "./runExecution-d42Lm0i5.mjs";
2
- import { C as validateCharts, S as parseEvalDiscovery, _ as runTouchesEval, a as validateTagsFilters, b as deriveEvalFreshness, c as getLatestRunInfos, d as nextShortIdFromSnapshots, f as persistCaseDetail, g as recomputePersistedCaseStatus, h as recomputeEvalStatusesInRuns, i as resolveEvalTags, l as loadPersistedRunSnapshot, m as persistRunState, n as getTargetEvalKeys, o as generateRunId, p as deleteTemporaryRuns, s as getLastRunStatuses, u as loadPersistedRunSnapshots, v as buildManualInputDescriptor, x as loadIsolatedEvalRegistry, y as parseManualInputValues } from "./runOrchestration-CvmFeOmT.mjs";
1
+ import { Ct as resolveLlmCallsConfig, Et as getCaseRowCaseKey, Ft as getEvalRegistry, Ot as caseRowSchema, St as resolveApiCallsConfig, Tt as buildEvalKey, _t as matchesTagsFilter, c as resolveArtifactPath, dt as getEvalTitle, f as resolveEvalDefaultConfig, ft as getEvalDisplayStatus, h as normalizeScoreDef, lt as applyDerivedCallAttributes, m as buildDeclaredColumnDefs, o as stripTerminalControlCodes, p as loadConfig, pt as deriveScopedSummaryFromCases, s as resolveTracePresentation, v as createFsCacheStore, xt as runSummarySchema } from "./runExecution-CFw0MQFs.mjs";
2
+ import { C as validateCharts, S as parseEvalDiscovery, _ as runTouchesEval, a as validateTagsFilters, b as deriveEvalFreshness, c as getLatestRunInfos, d as nextShortIdFromSnapshots, f as persistCaseDetail, g as recomputePersistedCaseStatus, h as recomputeEvalStatusesInRuns, i as resolveEvalTags, l as loadPersistedRunSnapshot, m as persistRunState, n as getTargetEvalKeys, o as generateRunId, p as deleteTemporaryRuns, s as getLastRunStatuses, u as loadPersistedRunSnapshots, v as buildManualInputDescriptor, x as loadIsolatedEvalRegistry, y as parseManualInputValues } from "./runOrchestration-CxjiQmof.mjs";
3
3
  import { copyFile, mkdir, readFile, rm, writeFile } from "node:fs/promises";
4
4
  import { basename, dirname, extname, isAbsolute, join, relative, resolve, sep } from "node:path";
5
5
  import { createHash, randomUUID } from "node:crypto";
@@ -733,10 +733,24 @@ function handleRunChildMessage(params) {
733
733
  handleRunChildEvent(runState, message.event, managerContext);
734
734
  }
735
735
  function upsertFinishedCase(runState, caseDetail, caseRow) {
736
+ removeLiveCaseRows(runState.cases, caseRow);
737
+ upsertCaseRow(runState, caseRow);
738
+ runState.caseDetails.set(caseDetail.caseKey ?? caseDetail.caseId, caseDetail);
739
+ }
740
+ function upsertCaseRow(runState, caseRow) {
736
741
  const existingIndex = runState.cases.findIndex((row) => getCaseRowCaseKey(row) === getCaseRowCaseKey(caseRow) && row.trial === caseRow.trial);
737
742
  if (existingIndex === -1) runState.cases.push(caseRow);
738
743
  else runState.cases[existingIndex] = caseRow;
739
- runState.caseDetails.set(caseDetail.caseKey ?? caseDetail.caseId, caseDetail);
744
+ }
745
+ function removeLiveCaseRows(caseRows, nextCaseRow) {
746
+ const caseKey = getCaseRowCaseKey(nextCaseRow);
747
+ for (let i = caseRows.length - 1; i >= 0; i--) {
748
+ const caseRow = caseRows[i];
749
+ if (caseRow === void 0) continue;
750
+ if (getCaseRowCaseKey(caseRow) !== caseKey) continue;
751
+ if (caseRow.status !== "pending" && caseRow.status !== "running") continue;
752
+ caseRows.splice(i, 1);
753
+ }
740
754
  }
741
755
  function applyChildEvalMetas(evals, childMetas) {
742
756
  for (const childMeta of childMetas) {
@@ -763,6 +777,12 @@ function handleRunChildEvent(runState, event, managerContext) {
763
777
  managerContext.emitEvent(runState, event);
764
778
  return;
765
779
  }
780
+ if (event.type === "case.started" || event.type === "case.updated") {
781
+ const parsed = caseRowSchema.safeParse(event.payload);
782
+ if (parsed.success) upsertCaseRow(runState, parsed.data);
783
+ managerContext.emitEvent(runState, event);
784
+ return;
785
+ }
766
786
  if (event.type === "run.finished") {
767
787
  runState.childTerminalReceived = true;
768
788
  runState.childProcess = void 0;
@@ -929,6 +949,10 @@ function createRunner({ watchForChanges = true } = {}) {
929
949
  function getSourceFingerprint(source) {
930
950
  return createHash("sha256").update(source).digest("hex");
931
951
  }
952
+ function getConfiguredConcurrency() {
953
+ if (typeof config.concurrency !== "number" || !Number.isFinite(config.concurrency)) return 1;
954
+ return Math.max(1, Math.floor(config.concurrency));
955
+ }
932
956
  function nextRegistryLoadIsolationKey(prefix, filePath) {
933
957
  registryLoadCounter++;
934
958
  return `${prefix}:${String(registryLoadCounter)}:${filePath}`;
@@ -1142,6 +1166,7 @@ function createRunner({ watchForChanges = true } = {}) {
1142
1166
  getConfigReloadState() {
1143
1167
  return configReload.currentState();
1144
1168
  },
1169
+ getConfiguredConcurrency,
1145
1170
  async refreshDiscovery() {
1146
1171
  const patterns = config.include;
1147
1172
  const discovered = [];
@@ -1385,6 +1410,7 @@ function createRunner({ watchForChanges = true } = {}) {
1385
1410
  if (!run) return;
1386
1411
  if (run.manifest.status !== "running") return;
1387
1412
  const endedAt = /* @__PURE__ */ new Date();
1413
+ run.cases = run.cases.filter((caseRow) => caseRow.status !== "pending" && caseRow.status !== "running");
1388
1414
  run.manifest.status = "cancelled";
1389
1415
  run.manifest.endedAt = endedAt.toISOString();
1390
1416
  run.summary.status = "cancelled";
@@ -2172,8 +2198,8 @@ async function commandApp(args) {
2172
2198
  const { serve } = await import("@hono/node-server");
2173
2199
  const bundledWebDist = resolve(currentDir, "apps/web/dist");
2174
2200
  if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
2175
- const appModule = await import("./app-DhMIbjlE.mjs");
2176
- const runnerModule = await import("./runner-BKogjiYd.mjs");
2201
+ const appModule = await import("./app-ROCEce9X.mjs");
2202
+ const runnerModule = await import("./runner-BlFQyvN2.mjs");
2177
2203
  if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
2178
2204
  if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
2179
2205
  await runnerModule.initRunner();