@ls-stack/agent-eval 0.54.0 → 0.55.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,8 +25,8 @@
25
25
  href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
26
26
  rel="stylesheet"
27
27
  />
28
- <script type="module" crossorigin src="/assets/index-Bx1VZctJ.js"></script>
29
- <link rel="stylesheet" crossorigin href="/assets/index-BcLH_Wme.css">
28
+ <script type="module" crossorigin src="/assets/index-CvsPmlHl.js"></script>
29
+ <link rel="stylesheet" crossorigin href="/assets/index-2I-eWzVL.css">
30
30
  </head>
31
31
  <body>
32
32
  <div id="root"></div>
package/dist/bin.mjs CHANGED
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env node
2
- import { t as runCli } from "./cli-BEtk5skO.mjs";
2
+ import { t as runCli } from "./cli-BR3wMZMx.mjs";
3
3
  import { spawn } from "node:child_process";
4
4
  //#region src/bin.ts
5
5
  const moduleMocksFlag = "--experimental-test-module-mocks";
@@ -0,0 +1,167 @@
1
+ import { I as configureEvalRunLogs, Pt as runWithEvalRegistry, St as resolveLlmCallsConfig, _ as createBufferedCacheStore, a as isCaseChildParentMessage, d as loadEvalModule, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, q as runInEvalRuntimeScope, r as runCase, v as createFsCacheStore, xt as resolveApiCallsConfig } from "./runExecution-Sw38bCaq.mjs";
2
+ //#region ../runner/src/caseChild.ts
3
+ let fatalErrorReported = false; // Flipped to true by handleFatalCaseChildError so a fatal error is reported at most once.
4
+ let disconnectExpected = false; // Flipped to true right before this process calls process.disconnect(); the "disconnect" listener exits with code 1 only while this is still false.
5
+ let runStarted = false; // Flipped to true by the first "message" event; every later message is ignored.
6
+ const pendingMessageSends = /* @__PURE__ */ new Set(); // Promises created by sendMessage that have not settled yet; flushMessageSends waits on them.
7
+ function sendMessage(message) { // Sends `message` through process.send and tracks the send in pendingMessageSends; failures are logged, never thrown.
8
+ if (process.send === void 0) return; // Nothing to do when process.send is not defined.
9
+ const sendPromise = new Promise((resolvePromise) => { // Always resolves (never rejects), on success and on failure alike.
10
+ try {
11
+ process.send?.(message, (error) => { // The callback receives a send error, if any.
12
+ if (error) {
13
+ console.error("Failed to send case child message:");
14
+ console.error(formatUnknownErrorDetails(error));
15
+ }
16
+ resolvePromise();
17
+ });
18
+ } catch (error) { // Handles a synchronous throw from process.send itself.
19
+ console.error("Failed to send case child message:");
20
+ console.error(formatUnknownErrorDetails(error));
21
+ resolvePromise();
22
+ }
23
+ });
24
+ pendingMessageSends.add(sendPromise);
25
+ sendPromise.finally(() => { // Stop tracking the send once it has settled.
26
+ pendingMessageSends.delete(sendPromise);
27
+ });
28
+ }
29
+ async function flushMessageSends() { // Resolves once pendingMessageSends is empty.
30
+ while (pendingMessageSends.size > 0) await Promise.allSettled([...pendingMessageSends]); // Loops because the set is re-checked after each wait; sends added meanwhile are awaited too.
31
+ }
32
+ function installFatalCaseChildErrorHandlers() { // Registers one-shot process listeners that report a fatal error and then exit.
33
+ process.once("uncaughtException", (error) => {
34
+ reportFatalCaseChildErrorAndExit(error);
35
+ });
36
+ process.once("unhandledRejection", (reason) => {
37
+ reportFatalCaseChildErrorAndExit(toUnhandledRejectionError(reason)); // Non-Error rejection reasons are wrapped in an Error first.
38
+ });
39
+ }
40
+ async function useEvalDefinition(params) { // Loads the eval file, looks up params.evalId in the resulting registry, and passes the eval definition to params.use; returns what params.use returns via entry.use.
41
+ const entry = (await runWithEvalRegistry(async (activeRegistry) => {
42
+ await runInEvalRuntimeScope("env", async () => {
43
+ await loadEvalModule(params.evalFilePath, params.sourceFingerprint);
44
+ });
45
+ return activeRegistry; // The registry is handed back so the lookup below can call .get on it.
46
+ })).get(params.evalId);
47
+ if (entry === void 0) throw new Error(`Eval "${params.evalId}" was not registered after importing ${params.evalFilePath}`); // Throws when the loaded file did not register the requested eval id.
48
+ return await entry.use(async (evalDef) => await params.use(evalDef));
49
+ }
50
+ async function executeCaseChild(context) { // Runs the single eval case described by `context` and returns { caseDetail, caseRow, pendingCacheWrites }.
51
+ process.chdir(context.workspaceRoot); // Switch the working directory to the workspace root before the config is loaded.
52
+ registerAgentEvalsPackageResolutionHooks();
53
+ const config = await loadConfig();
54
+ configureEvalRunLogs({ captureConsole: config.runLogs?.captureConsole !== false }); // Console capture stays on unless the config sets it to exactly false.
55
+ const cacheStore = createFsCacheStore({
56
+ workspaceRoot: context.workspaceRoot,
57
+ dir: config.cache?.dir,
58
+ maxEntriesPerNamespace: config.cache?.maxEntriesPerNamespace ?? config.cache?.maxEntriesPerEval, // Falls back to maxEntriesPerEval when maxEntriesPerNamespace is nullish. NOTE(review): looks like an older option name - confirm.
59
+ maxEntriesByNamespace: config.cache?.maxEntriesByNamespace
60
+ });
61
+ const bufferedCacheStore = context.cacheEnabled && context.cacheMode !== "bypass" ? createBufferedCacheStore(cacheStore) : null; // Buffered wrapper only when caching is enabled and the mode is not "bypass"; otherwise null.
62
+ const llmCallsConfig = resolveLlmCallsConfig(config.llmCalls);
63
+ const apiCallsConfig = resolveApiCallsConfig(config.apiCalls);
64
+ const { caseDetail, caseRowUpdate } = await useEvalDefinition({
65
+ evalId: context.evalId,
66
+ evalFilePath: context.evalFilePath,
67
+ sourceFingerprint: context.sourceFingerprint,
68
+ use: async (evalDef) => await runCase({ // The result of runCase is destructured above as { caseDetail, caseRowUpdate }.
69
+ evalDef,
70
+ evalId: context.evalId,
71
+ evalKey: context.evalKey,
72
+ evalCase: context.evalCase,
73
+ globalTraceDisplay: context.globalTraceDisplay,
74
+ globalColumns: config.columns,
75
+ globalDeriveFromTracing: config.deriveFromTracing,
76
+ llmCallsConfig,
77
+ apiCallsConfig,
78
+ globalRemoveDefaultConfig: config.removeDefaultConfig,
79
+ trial: context.trial,
80
+ startTime: context.startTime,
81
+ cacheAdapter: bufferedCacheStore ?? (context.cacheEnabled ? cacheStore : null), // Buffered store if one exists; else the plain fs store when caching is enabled (the "bypass" case); else null.
82
+ cacheMode: context.cacheMode,
83
+ moduleIsolation: void 0,
84
+ evalFilePath: context.evalFilePath,
85
+ evalFileRelativePath: context.evalFileRelativePath,
86
+ workspaceRoot: context.workspaceRoot,
87
+ artifactDir: context.artifactDir,
88
+ runId: context.runId
89
+ })
90
+ });
91
+ return {
92
+ caseDetail,
93
+ caseRow: { // Row built from the context, caseDetail, and caseRowUpdate, with defaults for nullish fields.
94
+ caseId: context.evalCase.id,
95
+ evalId: context.evalId,
96
+ evalKey: context.evalKey,
97
+ caseKey: caseDetail.caseKey,
98
+ tags: caseDetail.tags,
99
+ status: caseRowUpdate.status ?? "pending", // Defaults to "pending" when the update carries no status.
100
+ durationMs: caseRowUpdate.durationMs ?? null,
101
+ cacheHits: caseRowUpdate.cacheHits ?? 0,
102
+ cacheOperations: caseRowUpdate.cacheOperations ?? 0,
103
+ columns: caseRowUpdate.columns ?? {},
104
+ trial: context.trial
105
+ },
106
+ pendingCacheWrites: bufferedCacheStore?.getPendingWrites() ?? [] // Empty array when no buffered store was created.
107
+ };
108
+ }
109
+ async function handleFatalCaseChildError(error) { // Reports a fatal error once: sets exit code 1, logs it, sends an "error" message, and waits for pending sends. Does not exit the process itself.
110
+ if (fatalErrorReported) return; // Only the first fatal error is reported.
111
+ fatalErrorReported = true;
112
+ const message = formatUnknownErrorDetails(error);
113
+ process.exitCode = 1;
114
+ console.error(message);
115
+ sendMessage({
116
+ type: "error",
117
+ message
118
+ });
119
+ await flushMessageSends(); // Wait until the "error" message send has settled before returning.
120
+ }
121
+ function formatUnknownErrorDetails(error) { // Converts any thrown value into a string for logging and reporting.
122
+ if (error instanceof Error) return error.stack ?? error.message; // Uses the stack when present, otherwise the message.
123
+ if (typeof error === "string") return error;
124
+ return String(error); // Every other value goes through String().
125
+ }
126
+ function toUnhandledRejectionError(reason) { // Returns `reason` unchanged when it is an Error; otherwise wraps its formatted text in a new Error.
127
+ if (reason instanceof Error) return reason;
128
+ return /* @__PURE__ */ new Error(`Unhandled rejection: ${formatUnknownErrorDetails(reason)}`);
129
+ }
130
+ async function reportFatalCaseChildErrorAndExit(error) { // Reports `error` via handleFatalCaseChildError, then always terminates the process with exit code 1.
131
+ try {
132
+ await handleFatalCaseChildError(error);
133
+ } catch (reportError) { // A failure while reporting is only logged; it does not prevent the exit below.
134
+ console.error("Failed to report fatal case child error:");
135
+ console.error(formatUnknownErrorDetails(reportError));
136
+ } finally {
137
+ process.exit(1); // Runs whether or not reporting succeeded.
138
+ }
139
+ }
140
+ installFatalCaseChildErrorHandlers(); // Runs at module load, before the "disconnect" and "message" listeners below are registered.
141
+ process.on("disconnect", () => { // Exits with code 1 on a disconnect this process did not initiate itself.
142
+ if (disconnectExpected) return; // This process called process.disconnect() itself; nothing to do.
143
+ process.exit(1);
144
+ });
145
+ process.on("message", (message) => { // Starts the case run on the first message; the message must pass isCaseChildParentMessage and carry `context`.
146
+ if (runStarted) return; // Only the first message is acted on.
147
+ runStarted = true;
148
+ if (!isCaseChildParentMessage(message)) {
149
+ reportFatalCaseChildErrorAndExit(/* @__PURE__ */ new Error("Case child received an invalid start message"));
150
+ return;
151
+ }
152
+ executeCaseChild(message.context).then(async (result) => {
153
+ sendMessage({
154
+ type: "done",
155
+ result
156
+ });
157
+ await flushMessageSends(); // Make sure the "done" message send has settled before disconnecting.
158
+ disconnectExpected = true; // Set before disconnecting so the "disconnect" listener does not exit with code 1.
159
+ process.disconnect();
160
+ }).catch(async (error) => { // Also receives errors thrown inside the .then callback above.
161
+ await handleFatalCaseChildError(error);
162
+ disconnectExpected = true;
163
+ process.disconnect();
164
+ });
165
+ });
166
+ //#endregion
167
+ export {};
@@ -1,4 +1,5 @@
1
- import { A as validateCharts, At as buildEvalKey, C as deriveEvalFreshness, Ct as getEvalDisplayStatus, D as loadConfig, Dt as runSummarySchema, E as resolveEvalDefaultConfig, O as buildDeclaredColumnDefs, Ot as resolveApiCallsConfig, Rt as getEvalRegistry, S as parseManualInputValues, St as getEvalTitle, T as parseEvalDiscovery, Tt as matchesTagsFilter, _ as recomputePersistedCaseStatus, a as validateTagsFilters, b as resolveArtifactPath, bt as applyDerivedCallAttributes, c as getLastRunStatuses, d as loadPersistedRunSnapshots, f as nextShortIdFromSnapshots, g as recomputeEvalStatusesInRuns, h as persistRunState, i as resolveEvalTags, j as createFsCacheStore, jt as getCaseRowCaseKey, k as normalizeScoreDef, kt as resolveLlmCallsConfig, l as getLatestRunInfos, m as deleteTemporaryRuns, n as getTargetEvalKeys, o as stripTerminalControlCodes, p as persistCaseDetail, s as generateRunId, u as loadPersistedRunSnapshot, v as runTouchesEval, w as loadIsolatedEvalRegistry, wt as deriveScopedSummaryFromCases, x as buildManualInputDescriptor, y as resolveTracePresentation } from "./runOrchestration-C78U4Ir0.mjs";
1
+ import { Ct as buildEvalKey, Nt as getEvalRegistry, St as resolveLlmCallsConfig, bt as runSummarySchema, c as resolveArtifactPath, ct as applyDerivedCallAttributes, dt as getEvalDisplayStatus, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, gt as matchesTagsFilter, h as normalizeScoreDef, m as buildDeclaredColumnDefs, o as stripTerminalControlCodes, p as loadConfig, s as resolveTracePresentation, ut as getEvalTitle, v as createFsCacheStore, wt as getCaseRowCaseKey, xt as resolveApiCallsConfig } from "./runExecution-Sw38bCaq.mjs";
2
+ import { C as validateCharts, S as parseEvalDiscovery, _ as runTouchesEval, a as validateTagsFilters, b as deriveEvalFreshness, c as getLatestRunInfos, d as nextShortIdFromSnapshots, f as persistCaseDetail, g as recomputePersistedCaseStatus, h as recomputeEvalStatusesInRuns, i as resolveEvalTags, l as loadPersistedRunSnapshot, m as persistRunState, n as getTargetEvalKeys, o as generateRunId, p as deleteTemporaryRuns, s as getLastRunStatuses, u as loadPersistedRunSnapshots, v as buildManualInputDescriptor, x as loadIsolatedEvalRegistry, y as parseManualInputValues } from "./runOrchestration-DJsdLYeZ.mjs";
2
3
  import { copyFile, mkdir, readFile, rm, writeFile } from "node:fs/promises";
3
4
  import { basename, dirname, extname, isAbsolute, join, relative, resolve, sep } from "node:path";
4
5
  import { createHash, randomUUID } from "node:crypto";
@@ -2135,8 +2136,8 @@ async function commandApp(args) {
2135
2136
  const { serve } = await import("@hono/node-server");
2136
2137
  const bundledWebDist = resolve(currentDir, "apps/web/dist");
2137
2138
  if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
2138
- const appModule = await import("./app-CMJtftUB.mjs");
2139
- const runnerModule = await import("./runner-pmIYculz.mjs");
2139
+ const appModule = await import("./app-BD0D9-7k.mjs");
2140
+ const runnerModule = await import("./runner-dB69WsnM.mjs");
2140
2141
  if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
2141
2142
  if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
2142
2143
  await runnerModule.initRunner();