@ls-stack/agent-eval 0.27.1 → 0.28.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-CJj1yPPD.mjs → app-mBbAN-Gt.mjs} +15 -3
- package/dist/apps/web/dist/assets/index-8VE7b6RK.css +1 -0
- package/dist/apps/web/dist/assets/index-Czer_MdN.js +118 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/{cli-Clf8xUFa.mjs → cli-BQwRbqsL.mjs} +75 -4
- package/dist/index.d.mts +342 -90
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +1 -1
- package/dist/{runOrchestration-FEvBwwJI.mjs → runOrchestration-ClWYWPen.mjs} +428 -204
- package/dist/{runner-KbDKLSU4.mjs → runner-BQn_xf36.mjs} +1 -1
- package/dist/{runner-zqKwTlNj.mjs → runner-DbVB66h9.mjs} +2 -2
- package/dist/src-CuirVcPY.mjs +3 -0
- package/package.json +6 -4
- package/skills/agent-eval/SKILL.md +52 -20
- package/dist/apps/web/dist/assets/index-6YqV9t4k.js +0 -118
- package/dist/apps/web/dist/assets/index-C-OiMSQD.css +0 -1
- package/dist/bin.d.mts +0 -1
- package/dist/runChild.d.mts +0 -1
- package/dist/src-BBwT7_cy.mjs +0 -3
|
@@ -25,8 +25,8 @@
|
|
|
25
25
|
href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
|
|
26
26
|
rel="stylesheet"
|
|
27
27
|
/>
|
|
28
|
-
<script type="module" crossorigin src="/assets/index-6YqV9t4k.js"></script>
|
|
29
|
-
<link rel="stylesheet" crossorigin href="/assets/index-C-OiMSQD.css">
|
|
28
|
+
<script type="module" crossorigin src="/assets/index-Czer_MdN.js"></script>
|
|
29
|
+
<link rel="stylesheet" crossorigin href="/assets/index-8VE7b6RK.css">
|
|
30
30
|
</head>
|
|
31
31
|
<body>
|
|
32
32
|
<div id="root"></div>
|
package/dist/bin.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { C as normalizeScoreDef, F as getEvalTitle, I as getEvalDisplayStatus, L as deriveScopedSummaryFromCases, N as applyDerivedCallAttributes, S as buildDeclaredColumnDefs, T as createFsCacheStore, V as runSummarySchema, Yn as getEvalRegistry, _ as deriveEvalFreshness, a as getLastRunStatuses, b as resolveEvalDefaultConfig, c as loadPersistedRunSnapshots, d as persistRunState, dt as buildEvalKey, f as recomputeEvalStatusesInRuns, ft as getCaseRowCaseKey, g as resolveArtifactPath, h as resolveTracePresentation, i as generateRunId, l as nextShortIdFromSnapshots, m as runTouchesEval, n as getTargetEvalKeys, o as getLatestRunInfos, ot as resolveApiCallsConfig, p as recomputePersistedCaseStatus, pt as getCaseRowEvalKey, s as loadPersistedRunSnapshot, st as resolveLlmCallsConfig, u as persistCaseDetail, v as loadEvalModule, w as validateCharts, x as loadConfig, y as parseEvalDiscovery } from "./runOrchestration-ClWYWPen.mjs";
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
3
|
import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
|
|
4
4
|
import { dirname, join, relative, resolve } from "node:path";
|
|
@@ -60,6 +60,57 @@ function readGitWorktreeState(workspaceRoot) {
|
|
|
60
60
|
return { commitSha: commitResult.status === 0 ? commitResult.stdout : null };
|
|
61
61
|
}
|
|
62
62
|
//#endregion
|
|
63
|
+
//#region ../runner/src/recalculateDerivedAttributes.ts
|
|
64
|
+
/**
 * Pick the identifier used to name a case's artifact files in an existing run.
 *
 * Returns the row's case key when another row in `runState.cases` shares the
 * same `caseId` but resolves to a different case key; otherwise returns the
 * plain `caseId`.
 *
 * @param {{ cases: Array<{ caseId: string }> }} runState - Run whose case rows are scanned.
 * @param {{ caseId: string }} caseRow - Row whose artifact file id is wanted.
 * @returns {string} The case key on a `caseId` collision, else `caseRow.caseId`.
 */
function getCaseArtifactFileIdForExistingRun(runState, caseRow) {
	const ownKey = getCaseRowCaseKey(caseRow);
	// A collision is a row with the same caseId that maps to a different key.
	const collidesWithRow = (other) => other.caseId === caseRow.caseId && getCaseRowCaseKey(other) !== ownKey;
	if (runState.cases.some(collidesWithRow)) {
		return ownKey;
	}
	return caseRow.caseId;
}
|
|
68
|
+
/**
 * Re-derive call attributes and trace presentation for a single case of a
 * run that is no longer running, then persist the refreshed trace and detail.
 *
 * @param {object} params
 * @param {object} params.run - Run state; reads `manifest.status`, `cases`,
 *   `caseDetails` (a Map keyed by case key) and `runDir`.
 * @param {string} params.caseId - Case key, or plain case id, of the case to refresh.
 * @param {unknown} params.llmCallsConfig - Forwarded to `applyDerivedCallAttributes`.
 * @param {unknown} params.apiCallsConfig - Forwarded to `applyDerivedCallAttributes`.
 * @param {unknown} params.traceDisplayConfig - Forwarded to `resolveTracePresentation`.
 * @param {Map<string, { id: string }>} params.evals - Looked up with the row's eval key
 *   to find the registry entry whose `traceDisplay` is applied.
 * @param {(runDir: string, caseDetail: object, artifactFileId: string) => Promise<unknown>} params.persistCaseDetail
 *   Writes the updated case detail.
 * @returns {Promise<{ updated: false, reason: string } | { updated: true, caseDetail: object }>}
 *   Rejects if writing the trace file or persisting the case detail fails.
 */
async function recalculateDerivedAttributesForCase(params) {
	const { run, caseId } = params;
	if (run.manifest.status === "running") return {
		updated: false,
		reason: "Run is still running"
	};
	// Prefer an exact case-key match; only fall back to the plain caseId. A single
	// `find` with `||` could return an earlier row whose caseId merely happens to
	// equal the requested key, shadowing the row that actually owns that key.
	const caseRow = run.cases.find((row) => getCaseRowCaseKey(row) === caseId) ?? run.cases.find((row) => row.caseId === caseId);
	if (!caseRow) return {
		updated: false,
		reason: "Case not found"
	};
	const caseKey = getCaseRowCaseKey(caseRow);
	const caseDetail = run.caseDetails.get(caseKey);
	if (!caseDetail) return {
		updated: false,
		reason: "Case detail not found"
	};
	const spansWithDerivedAttributes = applyDerivedCallAttributes({
		spans: caseDetail.trace,
		llmCallsConfig: params.llmCallsConfig,
		apiCallsConfig: params.apiCallsConfig
	});
	// Defaults used when no registry entry exists for the case's eval.
	let nextTrace = spansWithDerivedAttributes;
	let nextTraceDisplay = caseDetail.traceDisplay;
	const evalMeta = params.evals.get(getCaseRowEvalKey(caseRow));
	const entry = evalMeta === void 0 ? void 0 : getEvalRegistry().get(evalMeta.id);
	if (entry !== void 0) entry.use((evalDef) => {
		const resolved = resolveTracePresentation(spansWithDerivedAttributes, params.traceDisplayConfig, evalDef.traceDisplay);
		nextTrace = resolved.trace;
		nextTraceDisplay = resolved.traceDisplay;
	});
	const nextCaseDetail = {
		...caseDetail,
		trace: nextTrace,
		traceDisplay: nextTraceDisplay
	};
	const artifactFileId = getCaseArtifactFileIdForExistingRun(run, caseRow);
	const traceFilePath = join(run.runDir, "traces", `${encodeURIComponent(artifactFileId)}.json`);
	await writeFile(traceFilePath, JSON.stringify(nextCaseDetail.trace, null, 2));
	await params.persistCaseDetail(run.runDir, nextCaseDetail, artifactFileId);
	// Commit to the in-memory run only after both writes succeeded, so a failed
	// write does not leave memory ahead of what is on disk.
	run.caseDetails.set(caseKey, nextCaseDetail);
	return {
		updated: true,
		caseDetail: nextCaseDetail
	};
}
|
|
113
|
+
//#endregion
|
|
63
114
|
//#region ../runner/src/runChildProtocol.ts
|
|
64
115
|
function isRunChildMessage(value) {
|
|
65
116
|
if (typeof value !== "object" || value === null) return false;
|
|
@@ -251,7 +302,7 @@ async function markRunTerminalFromChild(runState, event, managerContext) {
|
|
|
251
302
|
managerContext.emitDiscoveryEvent();
|
|
252
303
|
}
|
|
253
304
|
//#endregion
|
|
254
|
-
//#region ../runner/src/runner.ts
|
|
305
|
+
//#region ../runner/src/watchRoots.ts
|
|
255
306
|
const globMagicCharacters = new Set([
|
|
256
307
|
"*",
|
|
257
308
|
"?",
|
|
@@ -285,6 +336,8 @@ function getWatchRootsForIncludePatterns(params) {
|
|
|
285
336
|
if (roots.size === 0) return [params.workspaceRoot];
|
|
286
337
|
return [...roots];
|
|
287
338
|
}
|
|
339
|
+
//#endregion
|
|
340
|
+
//#region ../runner/src/runner.ts
|
|
288
341
|
/** Create an in-memory eval runner bound to the current workspace config. */
|
|
289
342
|
function createRunner({ watchForChanges = true } = {}) {
|
|
290
343
|
let config;
|
|
@@ -373,6 +426,22 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
373
426
|
emitDiscoveryEvent();
|
|
374
427
|
return { updatedRuns };
|
|
375
428
|
},
|
|
429
|
+
async recalculateDerivedAttributesForCase({ runId, caseId }) {
|
|
430
|
+
const run = runs.get(runId);
|
|
431
|
+
if (!run) return {
|
|
432
|
+
updated: false,
|
|
433
|
+
reason: "Run not found"
|
|
434
|
+
};
|
|
435
|
+
return recalculateDerivedAttributesForCase({
|
|
436
|
+
run,
|
|
437
|
+
caseId,
|
|
438
|
+
llmCallsConfig,
|
|
439
|
+
apiCallsConfig,
|
|
440
|
+
traceDisplayConfig: config.traceDisplay,
|
|
441
|
+
evals,
|
|
442
|
+
persistCaseDetail
|
|
443
|
+
});
|
|
444
|
+
},
|
|
376
445
|
async cleanRunsForEval(evalKey) {
|
|
377
446
|
const evalMeta = resolveEvalMeta(evalKey);
|
|
378
447
|
let deletedRuns = 0;
|
|
@@ -529,6 +598,8 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
529
598
|
discoveredEntry?.use((evalDef) => {
|
|
530
599
|
const defaultConfig = resolveEvalDefaultConfig({
|
|
531
600
|
evalDef,
|
|
601
|
+
globalColumns: config.columns,
|
|
602
|
+
globalStats: config.stats,
|
|
532
603
|
globalRemove: config.removeDefaultConfig
|
|
533
604
|
});
|
|
534
605
|
columnDefs = buildDeclaredColumnDefs(defaultConfig.columns, evalDef.scores, evalDef.manualScores);
|
|
@@ -1041,8 +1112,8 @@ async function commandApp(args) {
|
|
|
1041
1112
|
const { serve } = await import("@hono/node-server");
|
|
1042
1113
|
const bundledWebDist = resolve(currentDir, "apps/web/dist");
|
|
1043
1114
|
if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
|
|
1044
|
-
const appModule = await import("./app-CJj1yPPD.mjs");
|
|
1045
|
-
const runnerModule = await import("./runner-KbDKLSU4.mjs");
|
|
1115
|
+
const appModule = await import("./app-mBbAN-Gt.mjs");
|
|
1116
|
+
const runnerModule = await import("./runner-BQn_xf36.mjs");
|
|
1046
1117
|
if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
|
|
1047
1118
|
if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
|
|
1048
1119
|
await runnerModule.initRunner();
|