@ls-stack/agent-eval 0.55.0 → 0.55.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-CunZ8Dku.mjs → app-BD0D9-7k.mjs} +87 -4
- package/dist/bin.mjs +1 -1
- package/dist/caseChild.mjs +167 -0
- package/dist/{cli-rvPrUj6S.mjs → cli-BR3wMZMx.mjs} +4 -3
- package/dist/index.d.mts +115 -115
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +3 -1
- package/dist/{runOrchestration-BWyE5lRX.mjs → runExecution-Sw38bCaq.mjs} +31 -1495
- package/dist/runOrchestration-DJsdLYeZ.mjs +1596 -0
- package/dist/{runner-CFQ8LZmY.mjs → runner-72rsqJRq.mjs} +2 -2
- package/dist/{runner-C2fvjKZP.mjs → runner-dB69WsnM.mjs} +1 -1
- package/dist/{src-DEENkbkn.mjs → src-hBGtzWuA.mjs} +2 -2
- package/package.json +1 -1
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import { o as stageManualInputFile } from "./cli-
|
|
3
|
-
import "./src-
|
|
4
|
-
import { t as getRunnerInstance } from "./runner-
|
|
1
|
+
import { et as createRunRequestSchema, nt as extractCacheEntries, tt as updateManualScoreRequestSchema, ut as getEvalTitle } from "./runExecution-Sw38bCaq.mjs";
|
|
2
|
+
import { o as stageManualInputFile } from "./cli-BR3wMZMx.mjs";
|
|
3
|
+
import "./src-hBGtzWuA.mjs";
|
|
4
|
+
import { t as getRunnerInstance } from "./runner-72rsqJRq.mjs";
|
|
5
5
|
import { z } from "zod/v4";
|
|
6
6
|
import { readFile } from "node:fs/promises";
|
|
7
7
|
import { dirname, isAbsolute, join, relative, resolve, sep } from "node:path";
|
|
@@ -189,6 +189,77 @@ const openRunLocationRequestSchema = z.object({
|
|
|
189
189
|
column: z.number().int().min(1)
|
|
190
190
|
});
|
|
191
191
|
const importQuerySeparatorRegex = /[?#]/;
|
|
192
|
+
/**
 * Escapes regular-expression metacharacters in `value` so the result can be
 * embedded in a `RegExp` source and match the input literally.
 *
 * `*` is included in the escaped set: leaving it out would turn a literal
 * asterisk into an active quantifier whenever this helper is used outside the
 * glob translator (which consumes `*` itself before delegating here).
 *
 * @param {string} value - Raw text to escape.
 * @returns {string} `value` with each metacharacter prefixed by a backslash.
 */
function escapeRegex(value) {
  return value.replace(/[|\\{}()[\]^$+*?.]/g, "\\$&");
}
|
|
195
|
+
/**
 * Translates a glob pattern into an anchored regular expression.
 *
 * Backslashes in the pattern are first rewritten to forward slashes. Then:
 * - `**` becomes `.*` (may span `/`),
 * - `*` becomes `[^/]*` (stays within one path segment),
 * - `?` becomes `[^/]` (exactly one non-separator character),
 * - every other character is matched literally via `escapeRegex`.
 *
 * @param {string} pattern - Glob pattern to translate.
 * @returns {RegExp} Expression anchored with `^` and `$`.
 */
function globToRegex(pattern) {
  const glob = pattern.replaceAll("\\", "/");
  const pieces = ["^"];
  let index = 0;
  while (index < glob.length) {
    // A double star is consumed as a single token.
    if (glob.startsWith("**", index)) {
      pieces.push(".*");
      index += 2;
      continue;
    }
    const current = glob[index];
    index += 1;
    switch (current) {
      case "*":
        pieces.push("[^/]*");
        break;
      case "?":
        pieces.push("[^/]");
        break;
      default:
        pieces.push(escapeRegex(current));
    }
  }
  pieces.push("$");
  return new RegExp(pieces.join(""));
}
|
|
210
|
+
/**
 * Tells whether `filePath` is selected by `pattern`.
 *
 * The pattern's backslashes are rewritten to forward slashes; the file path is
 * compared as given. An exact string match wins immediately, otherwise the
 * pattern is interpreted as a glob.
 *
 * @param {string} pattern - Literal path or glob pattern.
 * @param {string} filePath - Path to test.
 * @returns {boolean} `true` when the path matches.
 */
function fileMatches(pattern, filePath) {
  const posixPattern = pattern.replaceAll("\\", "/");
  return posixPattern === filePath || globToRegex(posixPattern).test(filePath);
}
|
|
215
|
+
/**
 * Checks an eval summary against every filter present on a run target.
 *
 * A filter (`evalKeys`, `evalIds`, `files`) that is `undefined` or empty places
 * no restriction. All non-empty filters must accept the eval.
 *
 * @param {{ key: string, id: string, filePath: string }} ev - Eval summary.
 * @param {{ evalKeys?: string[], evalIds?: string[], files?: string[] }} target - Run target filters.
 * @returns {boolean} `true` when the eval passes all active filters.
 */
function matchesRunTarget(ev, target) {
  const { evalKeys, evalIds, files } = target;
  const isActive = (filter) => filter !== void 0 && filter.length > 0;
  if (isActive(evalKeys) && !evalKeys.includes(ev.key)) return false;
  if (isActive(evalIds) && !evalIds.includes(ev.id)) return false;
  if (isActive(files) && !files.some((file) => fileMatches(file, ev.filePath))) return false;
  return true;
}
|
|
227
|
+
/**
 * Selects the evals matched by a run target and orders them for display.
 *
 * @param {Array<{ key: string, id: string, filePath: string }>} evals - All known eval summaries.
 * @param {object} target - Run target filters, as accepted by `matchesRunTarget`.
 * @returns {Array} New array of matching evals sorted by file path, then by id.
 */
function getRunTargetEvalSummaries(evals, target) {
  const byFileThenId = (left, right) => {
    const fileOrder = left.filePath.localeCompare(right.filePath);
    return fileOrder !== 0 ? fileOrder : left.id.localeCompare(right.id);
  };
  const selected = evals.filter((candidate) => matchesRunTarget(candidate, target));
  return selected.toSorted(byFileThenId);
}
|
|
230
|
+
/**
 * Prints to the terminal which evals an app-triggered run is about to execute.
 *
 * Nothing is printed when the target selects no evals.
 *
 * @param {{ runId: string, shortId: string, evals: Array, target: object }} params
 *   Run identifiers, the eval summaries to choose from, and the run target.
 * @returns {void}
 */
function logStartedAppRunEvals(params) {
  const selected = getRunTargetEvalSummaries(params.evals, params.target);
  const count = selected.length;
  if (count === 0) return;
  const noun = count === 1 ? "eval" : "evals";
  console.info(`[agent-evals] Starting app run ${params.shortId} (${params.runId}) with ${String(count)} ${noun}:`);
  selected.forEach((entry) => {
    console.info(` - ${getEvalTitle(entry)} (${entry.filePath}#${entry.id})`);
  });
}
|
|
237
|
+
/**
 * Formats a duration as a log suffix such as " in 250ms" or " in 1.5s".
 *
 * Durations below one second are shown in milliseconds; anything longer is
 * shown in seconds with one decimal place.
 *
 * @param {number | null | undefined} durationMs - Duration in milliseconds.
 * @returns {string} The suffix (with a leading space), or an empty string when
 *   no usable duration is available.
 */
function formatDurationMs(durationMs) {
  // Treat a missing or non-finite duration like `null`, so the summary line
  // never ends in " in NaNs" or " in Infinitys".
  if (durationMs == null || !Number.isFinite(durationMs)) return "";
  if (durationMs < 1e3) return ` in ${String(durationMs)}ms`;
  return ` in ${(durationMs / 1e3).toFixed(1)}s`;
}
|
|
242
|
+
/**
 * Renders a run summary as one line: status, case counts, and elapsed time.
 *
 * The cancelled count is listed only when it is greater than zero.
 *
 * @param {{ status: string, totalCases: number, passedCases: number, failedCases: number, errorCases: number, cancelledCases: number, totalDurationMs: number | null }} summary
 * @returns {string} Human-readable summary line.
 */
function formatRunResultSummary(summary) {
  const counts = [
    `${String(summary.totalCases)} total`,
    `${String(summary.passedCases)} passed`,
    `${String(summary.failedCases)} failed`,
    `${String(summary.errorCases)} errors`
  ];
  if (summary.cancelledCases > 0) counts.push(`${String(summary.cancelledCases)} cancelled`);
  return `${summary.status}: ${counts.join(", ")}${formatDurationMs(summary.totalDurationMs)}`;
}
|
|
246
|
+
/**
 * Reports whether an event type marks the end of a run.
 *
 * @param {string} eventType - The `type` of a run event.
 * @returns {boolean} `true` for "run.finished", "run.error" and "run.cancelled".
 */
function isTerminalRunEvent(eventType) {
  switch (eventType) {
    case "run.finished":
    case "run.error":
    case "run.cancelled":
      return true;
    default:
      return false;
  }
}
|
|
249
|
+
/**
 * Subscribes to a run's events and prints a single result line to the terminal
 * once the run reaches a terminal event, then removes the subscription.
 *
 * @param {{ runner: { subscribe: Function, getRun: Function }, runId: string, shortId: string }} params
 *   The runner to observe and the identifiers of the run to report on.
 * @returns {void}
 */
function subscribeToAppRunResultLog(params) {
  let unsubscribe;
  // Guards against logging more than once when several terminal events arrive
  // before the subscription could be removed.
  let resultLogged = false;
  unsubscribe = params.runner.subscribe(params.runId, (event) => {
    if (resultLogged || !isTerminalRunEvent(event.type)) return;
    resultLogged = true;
    unsubscribe?.();
    unsubscribe = void 0;
    const run = params.runner.getRun(params.runId);
    if (run === void 0) {
      console.info(`[agent-evals] Run ${params.shortId} (${params.runId}) finished.`);
      return;
    }
    console.info(`[agent-evals] Run ${params.shortId} (${params.runId}) ${formatRunResultSummary(run.summary)}`);
  });
  // If subscribe() delivered the terminal event synchronously, the handle was
  // not assigned yet when the listener tried to unsubscribe; release it now.
  if (resultLogged) {
    unsubscribe?.();
    unsubscribe = void 0;
  }
}
|
|
192
263
|
/**
 * Checks whether `path` is the workspace root itself or lies beneath it.
 *
 * The comparison is purely textual: the root followed by the platform path
 * separator must be a prefix of `path`.
 *
 * @param {string} path - Path to check.
 * @param {string} workspaceRoot - Workspace root path.
 * @returns {boolean} `true` when `path` is inside the workspace.
 */
function isInsideWorkspace(path, workspaceRoot) {
  if (path === workspaceRoot) return true;
  const rootWithSeparator = `${workspaceRoot}${sep}`;
  return path.startsWith(rootWithSeparator);
}
|
|
@@ -229,11 +300,23 @@ const runsRoutes = new Hono().get("/", (c) => {
|
|
|
229
300
|
error: "Manual input validation failed",
|
|
230
301
|
failures: validation.failures
|
|
231
302
|
}, 400);
|
|
303
|
+
const evalsForTerminalLog = runner.getEvals();
|
|
232
304
|
const runResult = await resultify(() => runner.startRun(body));
|
|
233
305
|
if (runResult.error) return c.json({
|
|
234
306
|
error: "Failed to start run",
|
|
235
307
|
message: formatUnknownErrorDetails(runResult.error)
|
|
236
308
|
}, 500);
|
|
309
|
+
logStartedAppRunEvals({
|
|
310
|
+
runId: runResult.value.manifest.id,
|
|
311
|
+
shortId: runResult.value.manifest.shortId,
|
|
312
|
+
evals: evalsForTerminalLog,
|
|
313
|
+
target: body.target
|
|
314
|
+
});
|
|
315
|
+
subscribeToAppRunResultLog({
|
|
316
|
+
runner,
|
|
317
|
+
runId: runResult.value.manifest.id,
|
|
318
|
+
shortId: runResult.value.manifest.shortId
|
|
319
|
+
});
|
|
237
320
|
return c.json(runResult.value, 201);
|
|
238
321
|
}).post("/actions/open-location", zValidator("json", openRunLocationRequestSchema), (c) => {
|
|
239
322
|
const body = c.req.valid("json");
|
package/dist/bin.mjs
CHANGED
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
import { I as configureEvalRunLogs, Pt as runWithEvalRegistry, St as resolveLlmCallsConfig, _ as createBufferedCacheStore, a as isCaseChildParentMessage, d as loadEvalModule, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, q as runInEvalRuntimeScope, r as runCase, v as createFsCacheStore, xt as resolveApiCallsConfig } from "./runExecution-Sw38bCaq.mjs";
|
|
2
|
+
//#region ../runner/src/caseChild.ts
|
|
3
|
+
// Becomes true once a fatal error has been reported, so it is reported only once.
let fatalErrorReported = false;
// Set right before this process disconnects on purpose; the "disconnect"
// handler exits with code 1 when the flag is still false.
let disconnectExpected = false;
// Set when the first parent message is received; later messages are ignored.
let runStarted = false;
// Promises for messages currently being sent to the parent process; awaited by
// flushMessageSends().
const pendingMessageSends = new Set();
|
|
7
|
+
/**
 * Sends a message to the parent process and tracks the send until it settles.
 *
 * Does nothing when `process.send` is unavailable. Send failures are written
 * to stderr and never thrown. The tracking promise is kept in
 * `pendingMessageSends` until the send callback fires (or the send throws).
 *
 * @param {unknown} message - Payload to hand to `process.send`.
 * @returns {void}
 */
function sendMessage(message) {
  if (process.send === void 0) return;
  const logSendFailure = (failure) => {
    console.error("Failed to send case child message:");
    console.error(formatUnknownErrorDetails(failure));
  };
  const inFlight = new Promise((settle) => {
    try {
      process.send?.(message, (sendError) => {
        if (sendError) logSendFailure(sendError);
        settle();
      });
    } catch (thrown) {
      logSendFailure(thrown);
      settle();
    }
  });
  pendingMessageSends.add(inFlight);
  inFlight.finally(() => {
    pendingMessageSends.delete(inFlight);
  });
}
|
|
29
|
+
/**
 * Waits until no message sends are pending.
 *
 * Loops because new sends may be registered while earlier ones are awaited.
 *
 * @returns {Promise<void>}
 */
async function flushMessageSends() {
  while (pendingMessageSends.size !== 0) {
    const snapshot = Array.from(pendingMessageSends);
    await Promise.allSettled(snapshot);
  }
}
|
|
32
|
+
/**
 * Registers one-shot process handlers that report an uncaught exception or an
 * unhandled promise rejection as a fatal case-child error and exit.
 *
 * @returns {void}
 */
function installFatalCaseChildErrorHandlers() {
  const onUncaughtException = (error) => {
    reportFatalCaseChildErrorAndExit(error);
  };
  const onUnhandledRejection = (reason) => {
    reportFatalCaseChildErrorAndExit(toUnhandledRejectionError(reason));
  };
  process.once("uncaughtException", onUncaughtException);
  process.once("unhandledRejection", onUnhandledRejection);
}
|
|
40
|
+
/**
 * Loads an eval module inside a fresh registry, looks up the requested eval,
 * and runs `params.use` with its definition.
 *
 * @param {{ evalId: string, evalFilePath: string, sourceFingerprint: unknown, use: Function }} params
 *   Which eval to load and the callback that receives its definition.
 * @returns {Promise<unknown>} Whatever `params.use` resolves to.
 * @throws {Error} When the module did not register an eval under `params.evalId`.
 */
async function useEvalDefinition(params) {
  const { evalId, evalFilePath, sourceFingerprint } = params;
  const registry = await runWithEvalRegistry(async (activeRegistry) => {
    await runInEvalRuntimeScope("env", async () => {
      await loadEvalModule(evalFilePath, sourceFingerprint);
    });
    return activeRegistry;
  });
  const entry = registry.get(evalId);
  if (entry === void 0) {
    throw new Error(`Eval "${evalId}" was not registered after importing ${evalFilePath}`);
  }
  return await entry.use(async (evalDef) => await params.use(evalDef));
}
|
|
50
|
+
/**
 * Runs a single eval case in this child process and assembles the result that
 * is sent back to the parent.
 *
 * Steps: switch to the workspace root, register package resolution hooks, load
 * the config, configure run logs, set up the cache stores, then load the eval
 * definition and run the case.
 *
 * @param {object} context - Start context received from the parent (workspace
 *   root, eval and case identifiers, cache settings, artifact directory, ...).
 * @returns {Promise<{ caseDetail: object, caseRow: object, pendingCacheWrites: Array }>}
 *   The case detail, a row summary with defaults filled in, and any cache
 *   writes buffered during the run (empty when buffering was not used).
 */
async function executeCaseChild(context) {
  const { workspaceRoot, evalId, evalKey, evalCase, trial, cacheEnabled, cacheMode } = context;
  process.chdir(workspaceRoot);
  registerAgentEvalsPackageResolutionHooks();
  const config = await loadConfig();
  configureEvalRunLogs({ captureConsole: config.runLogs?.captureConsole !== false });

  const cacheConfig = config.cache;
  const cacheStore = createFsCacheStore({
    workspaceRoot,
    dir: cacheConfig?.dir,
    maxEntriesPerNamespace: cacheConfig?.maxEntriesPerNamespace ?? cacheConfig?.maxEntriesPerEval,
    maxEntriesByNamespace: cacheConfig?.maxEntriesByNamespace
  });
  // Writes are buffered only when caching is on and the mode is not "bypass".
  const shouldBufferWrites = cacheEnabled && cacheMode !== "bypass";
  const bufferedCacheStore = shouldBufferWrites ? createBufferedCacheStore(cacheStore) : null;
  const llmCallsConfig = resolveLlmCallsConfig(config.llmCalls);
  const apiCallsConfig = resolveApiCallsConfig(config.apiCalls);

  const outcome = await useEvalDefinition({
    evalId,
    evalFilePath: context.evalFilePath,
    sourceFingerprint: context.sourceFingerprint,
    use: async (evalDef) => await runCase({
      evalDef,
      evalId,
      evalKey,
      evalCase,
      globalTraceDisplay: context.globalTraceDisplay,
      globalColumns: config.columns,
      globalDeriveFromTracing: config.deriveFromTracing,
      llmCallsConfig,
      apiCallsConfig,
      globalRemoveDefaultConfig: config.removeDefaultConfig,
      trial,
      startTime: context.startTime,
      cacheAdapter: bufferedCacheStore ?? (cacheEnabled ? cacheStore : null),
      cacheMode,
      moduleIsolation: void 0,
      evalFilePath: context.evalFilePath,
      evalFileRelativePath: context.evalFileRelativePath,
      workspaceRoot,
      artifactDir: context.artifactDir,
      runId: context.runId
    })
  });
  const { caseDetail, caseRowUpdate } = outcome;

  const caseRow = {
    caseId: evalCase.id,
    evalId,
    evalKey,
    caseKey: caseDetail.caseKey,
    tags: caseDetail.tags,
    status: caseRowUpdate.status ?? "pending",
    durationMs: caseRowUpdate.durationMs ?? null,
    cacheHits: caseRowUpdate.cacheHits ?? 0,
    cacheOperations: caseRowUpdate.cacheOperations ?? 0,
    columns: caseRowUpdate.columns ?? {},
    trial
  };
  return {
    caseDetail,
    caseRow,
    pendingCacheWrites: bufferedCacheStore?.getPendingWrites() ?? []
  };
}
|
|
109
|
+
/**
 * Reports a fatal error once: marks the process exit code as failed, writes the
 * details to stderr, sends an "error" message to the parent, and waits for
 * pending sends to settle. Later calls are no-ops.
 *
 * @param {unknown} error - The failure to report.
 * @returns {Promise<void>}
 */
async function handleFatalCaseChildError(error) {
  if (!fatalErrorReported) {
    fatalErrorReported = true;
    const details = formatUnknownErrorDetails(error);
    process.exitCode = 1;
    console.error(details);
    sendMessage({ type: "error", message: details });
    await flushMessageSends();
  }
}
|
|
121
|
+
/**
 * Turns any thrown or rejected value into printable text.
 *
 * - `Error` instances yield their stack (or message when no stack exists).
 * - Strings are returned unchanged.
 * - Other values use their string conversion, except that objects whose
 *   conversion is the uninformative "[object Object]" (or whose conversion
 *   throws, e.g. null-prototype objects) are rendered as JSON when possible.
 *
 * This function never throws, so it is safe to call from fatal-error paths.
 *
 * @param {unknown} error - Value to describe.
 * @returns {string} Text describing the value.
 */
function formatUnknownErrorDetails(error) {
  if (error instanceof Error) return error.stack ?? error.message;
  if (typeof error === "string") return error;
  let text;
  try {
    text = String(error);
  } catch {
    // String() throws for objects that cannot be converted to a primitive.
    text = void 0;
  }
  if (text !== void 0 && text !== "[object Object]") return text;
  try {
    const json = JSON.stringify(error);
    if (typeof json === "string") return json;
  } catch {
    // Circular structures or BigInt members: fall back to the generic tag.
  }
  return text ?? Object.prototype.toString.call(error);
}
|
|
126
|
+
/**
 * Normalizes an unhandled-rejection reason to an `Error`.
 *
 * @param {unknown} reason - Rejection reason.
 * @returns {Error} `reason` itself when it already is an `Error`; otherwise a
 *   new `Error` whose message embeds the formatted reason.
 */
function toUnhandledRejectionError(reason) {
  return reason instanceof Error
    ? reason
    : new Error(`Unhandled rejection: ${formatUnknownErrorDetails(reason)}`);
}
|
|
130
|
+
/**
 * Reports a fatal error and then terminates the process with exit code 1.
 *
 * A failure while reporting is written to stderr; the process exits either way.
 *
 * @param {unknown} error - The failure to report.
 * @returns {Promise<void>} Does not resolve in practice because the process exits.
 */
async function reportFatalCaseChildErrorAndExit(error) {
  try {
    await handleFatalCaseChildError(error).catch((reportError) => {
      console.error("Failed to report fatal case child error:");
      console.error(formatUnknownErrorDetails(reportError));
    });
  } finally {
    process.exit(1);
  }
}
|
|
140
|
+
installFatalCaseChildErrorHandlers();

/** Marks the upcoming disconnect as intentional and closes the parent channel. */
function disconnectFromParent() {
  disconnectExpected = true;
  process.disconnect();
}

/**
 * Runs the case described by `context`, sends the "done" result to the parent,
 * and disconnects. Any failure along the way is reported as a fatal error
 * before disconnecting.
 */
async function runCaseAndReport(context) {
  try {
    const result = await executeCaseChild(context);
    sendMessage({
      type: "done",
      result
    });
    await flushMessageSends();
    disconnectFromParent();
  } catch (error) {
    await handleFatalCaseChildError(error);
    disconnectFromParent();
  }
}

// Losing the parent channel unexpectedly ends this process with a failure code.
process.on("disconnect", () => {
  if (!disconnectExpected) process.exit(1);
});

// Only the first message is acted on; it must be a valid start message.
process.on("message", (message) => {
  if (runStarted) return;
  runStarted = true;
  if (!isCaseChildParentMessage(message)) {
    reportFatalCaseChildErrorAndExit(new Error("Case child received an invalid start message"));
    return;
  }
  // Not awaited on purpose: a rejection here surfaces through the
  // "unhandledRejection" handler installed above.
  void runCaseAndReport(message.context);
});
|
|
166
|
+
//#endregion
|
|
167
|
+
export {};
|
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { Ct as buildEvalKey, Nt as getEvalRegistry, St as resolveLlmCallsConfig, bt as runSummarySchema, c as resolveArtifactPath, ct as applyDerivedCallAttributes, dt as getEvalDisplayStatus, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, gt as matchesTagsFilter, h as normalizeScoreDef, m as buildDeclaredColumnDefs, o as stripTerminalControlCodes, p as loadConfig, s as resolveTracePresentation, ut as getEvalTitle, v as createFsCacheStore, wt as getCaseRowCaseKey, xt as resolveApiCallsConfig } from "./runExecution-Sw38bCaq.mjs";
|
|
2
|
+
import { C as validateCharts, S as parseEvalDiscovery, _ as runTouchesEval, a as validateTagsFilters, b as deriveEvalFreshness, c as getLatestRunInfos, d as nextShortIdFromSnapshots, f as persistCaseDetail, g as recomputePersistedCaseStatus, h as recomputeEvalStatusesInRuns, i as resolveEvalTags, l as loadPersistedRunSnapshot, m as persistRunState, n as getTargetEvalKeys, o as generateRunId, p as deleteTemporaryRuns, s as getLastRunStatuses, u as loadPersistedRunSnapshots, v as buildManualInputDescriptor, x as loadIsolatedEvalRegistry, y as parseManualInputValues } from "./runOrchestration-DJsdLYeZ.mjs";
|
|
2
3
|
import { copyFile, mkdir, readFile, rm, writeFile } from "node:fs/promises";
|
|
3
4
|
import { basename, dirname, extname, isAbsolute, join, relative, resolve, sep } from "node:path";
|
|
4
5
|
import { createHash, randomUUID } from "node:crypto";
|
|
@@ -2135,8 +2136,8 @@ async function commandApp(args) {
|
|
|
2135
2136
|
const { serve } = await import("@hono/node-server");
|
|
2136
2137
|
const bundledWebDist = resolve(currentDir, "apps/web/dist");
|
|
2137
2138
|
if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
|
|
2138
|
-
const appModule = await import("./app-
|
|
2139
|
-
const runnerModule = await import("./runner-
|
|
2139
|
+
const appModule = await import("./app-BD0D9-7k.mjs");
|
|
2140
|
+
const runnerModule = await import("./runner-dB69WsnM.mjs");
|
|
2140
2141
|
if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
|
|
2141
2142
|
if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
|
|
2142
2143
|
await runnerModule.initRunner();
|