@ls-stack/agent-eval 0.42.2 → 0.45.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-DPamBr5R.mjs → app-BZmhhSFZ.mjs} +10 -4
- package/dist/apps/web/dist/assets/index-BU3IqUso.css +1 -0
- package/dist/apps/web/dist/assets/index-am2Knit6.js +140 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/{cli-BeJCJMQo.mjs → cli-vdJYkEVk.mjs} +28 -5
- package/dist/index.d.mts +368 -75
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +1 -1
- package/dist/{runOrchestration-OVUFw1fL.mjs → runOrchestration-BFdxG9ws.mjs} +259 -250
- package/dist/{runner-BJQq7cpd.mjs → runner--aH0jO4Z.mjs} +1 -1
- package/dist/{runner-B1KygirW.mjs → runner-DJWn_7p0.mjs} +2 -2
- package/dist/{src-D7_xKo7h.mjs → src-BRqs3kSA.mjs} +2 -2
- package/package.json +3 -3
- package/skills/agent-eval/SKILL.md +14 -12
- package/dist/apps/web/dist/assets/index-DH0qYD76.js +0 -140
- package/dist/apps/web/dist/assets/index-eFM9VIsz.css +0 -1
|
@@ -25,8 +25,8 @@
|
|
|
25
25
|
href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
|
|
26
26
|
rel="stylesheet"
|
|
27
27
|
/>
|
|
28
|
-
<script type="module" crossorigin src="/assets/index-DH0qYD76.js"></script>
|
|
29
|
-
<link rel="stylesheet" crossorigin href="/assets/index-eFM9VIsz.css">
|
|
28
|
+
<script type="module" crossorigin src="/assets/index-am2Knit6.js"></script>
|
|
29
|
+
<link rel="stylesheet" crossorigin href="/assets/index-BU3IqUso.css">
|
|
30
30
|
</head>
|
|
31
31
|
<body>
|
|
32
32
|
<div id="root"></div>
|
package/dist/bin.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { A as validateCharts, At as buildEvalKey, C as deriveEvalFreshness, Ct as getEvalDisplayStatus, D as loadConfig, Dt as runSummarySchema, E as resolveEvalDefaultConfig, Lt as getEvalRegistry, O as buildDeclaredColumnDefs, Ot as resolveApiCallsConfig, S as parseManualInputValues, St as getEvalTitle, T as parseEvalDiscovery, Tt as matchesTagsFilter, _ as recomputePersistedCaseStatus, a as validateTagsFilters, b as resolveArtifactPath, bt as applyDerivedCallAttributes, c as getLastRunStatuses, d as loadPersistedRunSnapshots, f as nextShortIdFromSnapshots, g as recomputeEvalStatusesInRuns, h as persistRunState, i as resolveEvalTags, j as createFsCacheStore, jt as getCaseRowCaseKey, k as normalizeScoreDef, kt as resolveLlmCallsConfig, l as getLatestRunInfos, m as deleteTemporaryRuns, n as getTargetEvalKeys, o as stripTerminalControlCodes, p as persistCaseDetail, s as generateRunId, u as loadPersistedRunSnapshot, v as runTouchesEval, w as loadEvalModule, wt as deriveScopedSummaryFromCases, x as buildManualInputDescriptor, y as resolveTracePresentation } from "./runOrchestration-OVUFw1fL.mjs";
|
|
1
|
+
import { A as validateCharts, At as buildEvalKey, C as deriveEvalFreshness, Ct as getEvalDisplayStatus, D as loadConfig, Dt as runSummarySchema, E as resolveEvalDefaultConfig, Lt as getEvalRegistry, O as buildDeclaredColumnDefs, Ot as resolveApiCallsConfig, S as parseManualInputValues, St as getEvalTitle, T as parseEvalDiscovery, Tt as matchesTagsFilter, _ as recomputePersistedCaseStatus, a as validateTagsFilters, b as resolveArtifactPath, bt as applyDerivedCallAttributes, c as getLastRunStatuses, d as loadPersistedRunSnapshots, f as nextShortIdFromSnapshots, g as recomputeEvalStatusesInRuns, h as persistRunState, i as resolveEvalTags, j as createFsCacheStore, jt as getCaseRowCaseKey, k as normalizeScoreDef, kt as resolveLlmCallsConfig, l as getLatestRunInfos, m as deleteTemporaryRuns, n as getTargetEvalKeys, o as stripTerminalControlCodes, p as persistCaseDetail, s as generateRunId, u as loadPersistedRunSnapshot, v as runTouchesEval, w as loadEvalModule, wt as deriveScopedSummaryFromCases, x as buildManualInputDescriptor, y as resolveTracePresentation } from "./runOrchestration-BFdxG9ws.mjs";
|
|
2
2
|
import { copyFile, mkdir, readFile, rm, writeFile } from "node:fs/promises";
|
|
3
3
|
import { basename, dirname, extname, isAbsolute, join, relative, resolve, sep } from "node:path";
|
|
4
4
|
import { createHash, randomUUID } from "node:crypto";
|
|
@@ -8,6 +8,29 @@ import { existsSync } from "node:fs";
|
|
|
8
8
|
import { resultify } from "t-result";
|
|
9
9
|
import { fileURLToPath } from "node:url";
|
|
10
10
|
import { spawn, spawnSync } from "node:child_process";
|
|
11
|
+
//#region ../runner/src/caseDetailLookup.ts
|
|
12
|
+
function getCaseLookupIds(caseId) {
|
|
13
|
+
const decodedResult = resultify(() => decodeURIComponent(caseId));
|
|
14
|
+
const decodedCaseId = decodedResult.error ? caseId : decodedResult.value;
|
|
15
|
+
const lookupIds = new Set([caseId, decodedCaseId]);
|
|
16
|
+
for (const lookupId of [...lookupIds]) {
|
|
17
|
+
const segments = lookupId.split("#");
|
|
18
|
+
if (segments.length < 3) continue;
|
|
19
|
+
lookupIds.add(segments.map((segment) => encodeURIComponent(segment)).join("#"));
|
|
20
|
+
}
|
|
21
|
+
return [...lookupIds];
|
|
22
|
+
}
|
|
23
|
+
function resolveCaseDetailLookup(run, caseId) {
|
|
24
|
+
const lookupIds = new Set(getCaseLookupIds(caseId));
|
|
25
|
+
for (const lookupId of lookupIds) {
|
|
26
|
+
const caseDetail = run.caseDetails.get(lookupId);
|
|
27
|
+
if (caseDetail) return caseDetail;
|
|
28
|
+
}
|
|
29
|
+
const matchingCaseRow = run.cases.find((caseRow) => lookupIds.has(getCaseRowCaseKey(caseRow)) || lookupIds.has(caseRow.caseId));
|
|
30
|
+
if (matchingCaseRow === void 0) return void 0;
|
|
31
|
+
return run.caseDetails.get(getCaseRowCaseKey(matchingCaseRow));
|
|
32
|
+
}
|
|
33
|
+
//#endregion
|
|
11
34
|
//#region ../runner/src/configReload.ts
|
|
12
35
|
/** Coordinates idle-only reloads for `agent-evals.config.ts` in app mode. */
|
|
13
36
|
function createConfigReloadController({ getActiveRunCount, closeRunnerWatchers, loadRunnerState, emitToDiscoveryListeners }) {
|
|
@@ -1338,7 +1361,7 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
1338
1361
|
getCaseDetail(runId, caseId) {
|
|
1339
1362
|
const run = runs.get(runId);
|
|
1340
1363
|
if (!run) return void 0;
|
|
1341
|
-
return
|
|
1364
|
+
return resolveCaseDetailLookup(run, caseId);
|
|
1342
1365
|
},
|
|
1343
1366
|
subscribe(runId, listener) {
|
|
1344
1367
|
const run = runs.get(runId);
|
|
@@ -2072,8 +2095,8 @@ async function commandApp(args) {
|
|
|
2072
2095
|
const { serve } = await import("@hono/node-server");
|
|
2073
2096
|
const bundledWebDist = resolve(currentDir, "apps/web/dist");
|
|
2074
2097
|
if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
|
|
2075
|
-
const appModule = await import("./app-DPamBr5R.mjs");
|
|
2076
|
-
const runnerModule = await import("./runner-BJQq7cpd.mjs");
|
|
2098
|
+
const appModule = await import("./app-BZmhhSFZ.mjs");
|
|
2099
|
+
const runnerModule = await import("./runner--aH0jO4Z.mjs");
|
|
2077
2100
|
if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
|
|
2078
2101
|
if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
|
|
2079
2102
|
await runnerModule.initRunner();
|
|
@@ -2260,7 +2283,7 @@ async function commandCache(args) {
|
|
|
2260
2283
|
const evalIds = runner.getEvals().filter((ev) => (args.evalIds.length === 0 || args.evalIds.includes(ev.id)) && (args.files.length === 0 || args.files.some((file) => fileMatches(file, ev.filePath)))).map((ev) => ev.id);
|
|
2261
2284
|
for (const evalId of evalIds) {
|
|
2262
2285
|
const entries = await runner.listCache();
|
|
2263
|
-
const prefix = `${evalId}
|
|
2286
|
+
const prefix = `${evalId}.`;
|
|
2264
2287
|
const matching = entries.filter((entry) => entry.namespace.startsWith(prefix));
|
|
2265
2288
|
for (const entry of matching) await runner.clearCache({
|
|
2266
2289
|
namespace: entry.namespace,
|