@ls-stack/agent-eval 0.53.0 → 0.55.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-D0a57pVo.mjs → app-CunZ8Dku.mjs} +12 -4
- package/dist/apps/web/dist/assets/index-2I-eWzVL.css +1 -0
- package/dist/apps/web/dist/assets/index-CvsPmlHl.js +377 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/{cli-3FrKBc9l.mjs → cli-rvPrUj6S.mjs} +51 -11
- package/dist/index.d.mts +305 -133
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +2 -1
- package/dist/{runOrchestration-Cn6fGL2s.mjs → runOrchestration-BWyE5lRX.mjs} +236 -134
- package/dist/{runner-Dsqj431i.mjs → runner-C2fvjKZP.mjs} +1 -1
- package/dist/{runner-C0qdoRSi.mjs → runner-CFQ8LZmY.mjs} +2 -2
- package/dist/{src-BNmtaqeC.mjs → src-DEENkbkn.mjs} +2 -2
- package/package.json +3 -3
- package/skills/agent-eval/SKILL.md +14 -3
- package/dist/apps/web/dist/assets/index-RNejIyap.js +0 -375
- package/dist/apps/web/dist/assets/index-vaLgWG8j.css +0 -1
|
@@ -25,8 +25,8 @@
|
|
|
25
25
|
href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
|
|
26
26
|
rel="stylesheet"
|
|
27
27
|
/>
|
|
28
|
-
<script type="module" crossorigin src="/assets/index-RNejIyap.js"></script>
|
|
29
|
-
<link rel="stylesheet" crossorigin href="/assets/index-vaLgWG8j.css">
|
|
28
|
+
<script type="module" crossorigin src="/assets/index-CvsPmlHl.js"></script>
|
|
29
|
+
<link rel="stylesheet" crossorigin href="/assets/index-2I-eWzVL.css">
|
|
30
30
|
</head>
|
|
31
31
|
<body>
|
|
32
32
|
<div id="root"></div>
|
package/dist/bin.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { A as validateCharts, At as buildEvalKey, C as deriveEvalFreshness, Ct as getEvalDisplayStatus, D as loadConfig, Dt as runSummarySchema, E as resolveEvalDefaultConfig,
|
|
1
|
+
import { A as validateCharts, At as buildEvalKey, C as deriveEvalFreshness, Ct as getEvalDisplayStatus, D as loadConfig, Dt as runSummarySchema, E as resolveEvalDefaultConfig, O as buildDeclaredColumnDefs, Ot as resolveApiCallsConfig, Rt as getEvalRegistry, S as parseManualInputValues, St as getEvalTitle, T as parseEvalDiscovery, Tt as matchesTagsFilter, _ as recomputePersistedCaseStatus, a as validateTagsFilters, b as resolveArtifactPath, bt as applyDerivedCallAttributes, c as getLastRunStatuses, d as loadPersistedRunSnapshots, f as nextShortIdFromSnapshots, g as recomputeEvalStatusesInRuns, h as persistRunState, i as resolveEvalTags, j as createFsCacheStore, jt as getCaseRowCaseKey, k as normalizeScoreDef, kt as resolveLlmCallsConfig, l as getLatestRunInfos, m as deleteTemporaryRuns, n as getTargetEvalKeys, o as stripTerminalControlCodes, p as persistCaseDetail, s as generateRunId, u as loadPersistedRunSnapshot, v as runTouchesEval, w as loadIsolatedEvalRegistry, wt as deriveScopedSummaryFromCases, x as buildManualInputDescriptor, y as resolveTracePresentation } from "./runOrchestration-BWyE5lRX.mjs";
|
|
2
2
|
import { copyFile, mkdir, readFile, rm, writeFile } from "node:fs/promises";
|
|
3
3
|
import { basename, dirname, extname, isAbsolute, join, relative, resolve, sep } from "node:path";
|
|
4
4
|
import { createHash, randomUUID } from "node:crypto";
|
|
@@ -749,6 +749,7 @@ function applyChildEvalMetas(evals, childMetas) {
|
|
|
749
749
|
evalMeta.caseCount = childMeta.caseCount;
|
|
750
750
|
evalMeta.caseIds = childMeta.caseIds;
|
|
751
751
|
evalMeta.stats = childMeta.stats;
|
|
752
|
+
evalMeta.defaultStatAggregate = childMeta.defaultStatAggregate;
|
|
752
753
|
evalMeta.charts = childMeta.charts;
|
|
753
754
|
evalMeta.sourceFingerprint = childMeta.sourceFingerprint;
|
|
754
755
|
}
|
|
@@ -901,6 +902,7 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
901
902
|
let runHistoryWatcher;
|
|
902
903
|
let discoveryRefreshTimer;
|
|
903
904
|
let runHistoryRefreshTimer;
|
|
905
|
+
let registryLoadCounter = 0;
|
|
904
906
|
const configReload = createConfigReloadController({
|
|
905
907
|
getActiveRunCount,
|
|
906
908
|
closeRunnerWatchers: closeWatchers,
|
|
@@ -922,6 +924,10 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
922
924
|
function getSourceFingerprint(source) {
|
|
923
925
|
return createHash("sha256").update(source).digest("hex");
|
|
924
926
|
}
|
|
927
|
+
function nextRegistryLoadIsolationKey(prefix, filePath) {
|
|
928
|
+
registryLoadCounter++;
|
|
929
|
+
return `${prefix}:${String(registryLoadCounter)}:${filePath}`;
|
|
930
|
+
}
|
|
925
931
|
const runner = {
|
|
926
932
|
async init() {
|
|
927
933
|
await loadRunnerState();
|
|
@@ -938,9 +944,15 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
938
944
|
async recomputeStatusesForEval(evalKey) {
|
|
939
945
|
const evalMeta = resolveEvalMeta(evalKey);
|
|
940
946
|
if (!evalMeta) return { updatedRuns: 0 };
|
|
941
|
-
const
|
|
942
|
-
|
|
943
|
-
|
|
947
|
+
const entry = (await loadIsolatedEvalRegistry({
|
|
948
|
+
evalFilePath: evalMeta.sourceFilePath,
|
|
949
|
+
sourceFingerprint: evalMeta.sourceFingerprint ?? void 0,
|
|
950
|
+
moduleIsolation: {
|
|
951
|
+
key: nextRegistryLoadIsolationKey("recompute-status", evalMeta.sourceFilePath),
|
|
952
|
+
workspaceRoot
|
|
953
|
+
},
|
|
954
|
+
runtimeScope: "env"
|
|
955
|
+
})).get(evalMeta.id);
|
|
944
956
|
if (!entry) return { updatedRuns: 0 };
|
|
945
957
|
const scoreThresholds = /* @__PURE__ */ new Map();
|
|
946
958
|
entry.use((evalDef) => {
|
|
@@ -1068,6 +1080,24 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
1068
1080
|
emitDiscoveryEvent();
|
|
1069
1081
|
return { deleted: true };
|
|
1070
1082
|
},
|
|
1083
|
+
async promoteRun(runId) {
|
|
1084
|
+
const run = runs.get(runId);
|
|
1085
|
+
if (!run) return { promoted: false };
|
|
1086
|
+
const wasTemporary = run.manifest.temporary === true;
|
|
1087
|
+
if (wasTemporary) {
|
|
1088
|
+
run.manifest.temporary = false;
|
|
1089
|
+
await persistRunState(run);
|
|
1090
|
+
emitDiscoveryEvent();
|
|
1091
|
+
}
|
|
1092
|
+
return {
|
|
1093
|
+
promoted: wasTemporary,
|
|
1094
|
+
run: {
|
|
1095
|
+
manifest: run.manifest,
|
|
1096
|
+
summary: run.summary,
|
|
1097
|
+
cases: run.cases
|
|
1098
|
+
}
|
|
1099
|
+
};
|
|
1100
|
+
},
|
|
1071
1101
|
validateManualInputs(request) {
|
|
1072
1102
|
return validateManualInputsForRequest({
|
|
1073
1103
|
evalMetas: getSortedEvalMetas(),
|
|
@@ -1127,17 +1157,24 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
1127
1157
|
message: `Duplicate eval id "${issue.evalId}" in ${toWorkspaceRelativePath(issue.filePath)}. Eval ids must be unique within one file.`
|
|
1128
1158
|
})));
|
|
1129
1159
|
const sourceFingerprint = getSourceFingerprint(content);
|
|
1130
|
-
|
|
1131
|
-
let moduleLoaded = false;
|
|
1160
|
+
let loadedRegistry;
|
|
1132
1161
|
try {
|
|
1133
|
-
await
|
|
1134
|
-
|
|
1162
|
+
loadedRegistry = await loadIsolatedEvalRegistry({
|
|
1163
|
+
evalFilePath: filePath,
|
|
1164
|
+
sourceFingerprint,
|
|
1165
|
+
moduleIsolation: {
|
|
1166
|
+
key: nextRegistryLoadIsolationKey("discovery", filePath),
|
|
1167
|
+
workspaceRoot
|
|
1168
|
+
},
|
|
1169
|
+
runtimeScope: "env"
|
|
1170
|
+
});
|
|
1135
1171
|
} catch {}
|
|
1136
1172
|
for (const meta of discoveredMetas) {
|
|
1137
|
-
const discoveredEntry =
|
|
1173
|
+
const discoveredEntry = loadedRegistry?.get(meta.id);
|
|
1138
1174
|
const title = meta.title;
|
|
1139
1175
|
let columnDefs = buildDeclaredColumnDefs(void 0, void 0, void 0);
|
|
1140
1176
|
let stats;
|
|
1177
|
+
let defaultStatAggregate;
|
|
1141
1178
|
let charts;
|
|
1142
1179
|
let tags = [];
|
|
1143
1180
|
let manualInputDescriptor;
|
|
@@ -1156,10 +1193,12 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
1156
1193
|
evalDef,
|
|
1157
1194
|
globalColumns: config.columns,
|
|
1158
1195
|
globalStats: config.stats,
|
|
1196
|
+
globalDefaultStatAggregate: config.defaultStatAggregate,
|
|
1159
1197
|
globalRemove: config.removeDefaultConfig
|
|
1160
1198
|
});
|
|
1161
1199
|
columnDefs = buildDeclaredColumnDefs(defaultConfig.columns, evalDef.scores, evalDef.manualScores);
|
|
1162
1200
|
stats = defaultConfig.stats;
|
|
1201
|
+
defaultStatAggregate = defaultConfig.defaultStatAggregate;
|
|
1163
1202
|
const validated = validateCharts({
|
|
1164
1203
|
charts: defaultConfig.charts,
|
|
1165
1204
|
columnDefs,
|
|
@@ -1202,6 +1241,7 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
1202
1241
|
caseCount: null,
|
|
1203
1242
|
caseIds: void 0,
|
|
1204
1243
|
stats,
|
|
1244
|
+
defaultStatAggregate,
|
|
1205
1245
|
charts,
|
|
1206
1246
|
manualInputDescriptor,
|
|
1207
1247
|
requiresManualInput
|
|
@@ -2095,8 +2135,8 @@ async function commandApp(args) {
|
|
|
2095
2135
|
const { serve } = await import("@hono/node-server");
|
|
2096
2136
|
const bundledWebDist = resolve(currentDir, "apps/web/dist");
|
|
2097
2137
|
if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
|
|
2098
|
-
const appModule = await import("./app-D0a57pVo.mjs");
|
|
2099
|
-
const runnerModule = await import("./runner-Dsqj431i.mjs");
|
|
2138
|
+
const appModule = await import("./app-CunZ8Dku.mjs");
|
|
2139
|
+
const runnerModule = await import("./runner-C2fvjKZP.mjs");
|
|
2100
2140
|
if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
|
|
2101
2141
|
if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
|
|
2102
2142
|
await runnerModule.initRunner();
|