@ls-stack/agent-eval 0.58.1 → 0.58.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-DhMIbjlE.mjs → app-ROCEce9X.mjs} +52 -7
- package/dist/apps/web/dist/assets/index-CHH7m5Cv.css +1 -0
- package/dist/apps/web/dist/assets/index-PTikBbhf.js +377 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/caseChild.mjs +2 -1
- package/dist/{cli-_g2qOMK6.mjs → cli-SP4kEtYL.mjs} +31 -5
- package/dist/index.d.mts +184 -129
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +2 -2
- package/dist/{runExecution-d42Lm0i5.mjs → runExecution-CFw0MQFs.mjs} +114 -21
- package/dist/{runOrchestration-CvmFeOmT.mjs → runOrchestration-CxjiQmof.mjs} +73 -6
- package/dist/{runner-BKogjiYd.mjs → runner-BlFQyvN2.mjs} +1 -1
- package/dist/{runner-MSr8sAWm.mjs → runner-CY3bgsjU.mjs} +2 -2
- package/dist/{src-CdZsOn6y.mjs → src-7GbQj1sb.mjs} +2 -2
- package/package.json +3 -3
- package/skills/agent-eval/SKILL.md +19 -3
- package/dist/apps/web/dist/assets/index-Cz9p4l-t.js +0 -377
- package/dist/apps/web/dist/assets/index-DtARRwsS.css +0 -1
|
@@ -25,8 +25,8 @@
|
|
|
25
25
|
href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
|
|
26
26
|
rel="stylesheet"
|
|
27
27
|
/>
|
|
28
|
-
<script type="module" crossorigin src="/assets/index-
|
|
29
|
-
<link rel="stylesheet" crossorigin href="/assets/index-
|
|
28
|
+
<script type="module" crossorigin src="/assets/index-PTikBbhf.js"></script>
|
|
29
|
+
<link rel="stylesheet" crossorigin href="/assets/index-CHH7m5Cv.css">
|
|
30
30
|
</head>
|
|
31
31
|
<body>
|
|
32
32
|
<div id="root"></div>
|
package/dist/bin.mjs
CHANGED
package/dist/caseChild.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { Ct as resolveLlmCallsConfig, It as runWithEvalRegistry, J as runInEvalRuntimeScope, L as configureEvalRunLogs, St as resolveApiCallsConfig, _ as createBufferedCacheStore, a as isCaseChildParentMessage, d as loadEvalModule, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, r as runCase, v as createFsCacheStore } from "./runExecution-CFw0MQFs.mjs";
|
|
2
2
|
//#region ../runner/src/caseChild.ts
|
|
3
3
|
let fatalErrorReported = false;
|
|
4
4
|
let disconnectExpected = false;
|
|
@@ -74,6 +74,7 @@ async function executeCaseChild(context) {
|
|
|
74
74
|
globalTraceDisplay: context.globalTraceDisplay,
|
|
75
75
|
globalColumns: config.columns,
|
|
76
76
|
globalDeriveFromTracing: config.deriveFromTracing,
|
|
77
|
+
globalTracingAssertions: config.tracingAssertions,
|
|
77
78
|
llmCallsConfig,
|
|
78
79
|
apiCallsConfig,
|
|
79
80
|
globalRemoveDefaultConfig: config.removeDefaultConfig,
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { Ct as
|
|
2
|
-
import { C as validateCharts, S as parseEvalDiscovery, _ as runTouchesEval, a as validateTagsFilters, b as deriveEvalFreshness, c as getLatestRunInfos, d as nextShortIdFromSnapshots, f as persistCaseDetail, g as recomputePersistedCaseStatus, h as recomputeEvalStatusesInRuns, i as resolveEvalTags, l as loadPersistedRunSnapshot, m as persistRunState, n as getTargetEvalKeys, o as generateRunId, p as deleteTemporaryRuns, s as getLastRunStatuses, u as loadPersistedRunSnapshots, v as buildManualInputDescriptor, x as loadIsolatedEvalRegistry, y as parseManualInputValues } from "./runOrchestration-
|
|
1
|
+
import { Ct as resolveLlmCallsConfig, Et as getCaseRowCaseKey, Ft as getEvalRegistry, Ot as caseRowSchema, St as resolveApiCallsConfig, Tt as buildEvalKey, _t as matchesTagsFilter, c as resolveArtifactPath, dt as getEvalTitle, f as resolveEvalDefaultConfig, ft as getEvalDisplayStatus, h as normalizeScoreDef, lt as applyDerivedCallAttributes, m as buildDeclaredColumnDefs, o as stripTerminalControlCodes, p as loadConfig, pt as deriveScopedSummaryFromCases, s as resolveTracePresentation, v as createFsCacheStore, xt as runSummarySchema } from "./runExecution-CFw0MQFs.mjs";
|
|
2
|
+
import { C as validateCharts, S as parseEvalDiscovery, _ as runTouchesEval, a as validateTagsFilters, b as deriveEvalFreshness, c as getLatestRunInfos, d as nextShortIdFromSnapshots, f as persistCaseDetail, g as recomputePersistedCaseStatus, h as recomputeEvalStatusesInRuns, i as resolveEvalTags, l as loadPersistedRunSnapshot, m as persistRunState, n as getTargetEvalKeys, o as generateRunId, p as deleteTemporaryRuns, s as getLastRunStatuses, u as loadPersistedRunSnapshots, v as buildManualInputDescriptor, x as loadIsolatedEvalRegistry, y as parseManualInputValues } from "./runOrchestration-CxjiQmof.mjs";
|
|
3
3
|
import { copyFile, mkdir, readFile, rm, writeFile } from "node:fs/promises";
|
|
4
4
|
import { basename, dirname, extname, isAbsolute, join, relative, resolve, sep } from "node:path";
|
|
5
5
|
import { createHash, randomUUID } from "node:crypto";
|
|
@@ -733,10 +733,24 @@ function handleRunChildMessage(params) {
|
|
|
733
733
|
handleRunChildEvent(runState, message.event, managerContext);
|
|
734
734
|
}
|
|
735
735
|
function upsertFinishedCase(runState, caseDetail, caseRow) {
|
|
736
|
+
removeLiveCaseRows(runState.cases, caseRow);
|
|
737
|
+
upsertCaseRow(runState, caseRow);
|
|
738
|
+
runState.caseDetails.set(caseDetail.caseKey ?? caseDetail.caseId, caseDetail);
|
|
739
|
+
}
|
|
740
|
+
function upsertCaseRow(runState, caseRow) {
|
|
736
741
|
const existingIndex = runState.cases.findIndex((row) => getCaseRowCaseKey(row) === getCaseRowCaseKey(caseRow) && row.trial === caseRow.trial);
|
|
737
742
|
if (existingIndex === -1) runState.cases.push(caseRow);
|
|
738
743
|
else runState.cases[existingIndex] = caseRow;
|
|
739
|
-
|
|
744
|
+
}
|
|
745
|
+
function removeLiveCaseRows(caseRows, nextCaseRow) {
|
|
746
|
+
const caseKey = getCaseRowCaseKey(nextCaseRow);
|
|
747
|
+
for (let i = caseRows.length - 1; i >= 0; i--) {
|
|
748
|
+
const caseRow = caseRows[i];
|
|
749
|
+
if (caseRow === void 0) continue;
|
|
750
|
+
if (getCaseRowCaseKey(caseRow) !== caseKey) continue;
|
|
751
|
+
if (caseRow.status !== "pending" && caseRow.status !== "running") continue;
|
|
752
|
+
caseRows.splice(i, 1);
|
|
753
|
+
}
|
|
740
754
|
}
|
|
741
755
|
function applyChildEvalMetas(evals, childMetas) {
|
|
742
756
|
for (const childMeta of childMetas) {
|
|
@@ -763,6 +777,12 @@ function handleRunChildEvent(runState, event, managerContext) {
|
|
|
763
777
|
managerContext.emitEvent(runState, event);
|
|
764
778
|
return;
|
|
765
779
|
}
|
|
780
|
+
if (event.type === "case.started" || event.type === "case.updated") {
|
|
781
|
+
const parsed = caseRowSchema.safeParse(event.payload);
|
|
782
|
+
if (parsed.success) upsertCaseRow(runState, parsed.data);
|
|
783
|
+
managerContext.emitEvent(runState, event);
|
|
784
|
+
return;
|
|
785
|
+
}
|
|
766
786
|
if (event.type === "run.finished") {
|
|
767
787
|
runState.childTerminalReceived = true;
|
|
768
788
|
runState.childProcess = void 0;
|
|
@@ -929,6 +949,10 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
929
949
|
function getSourceFingerprint(source) {
|
|
930
950
|
return createHash("sha256").update(source).digest("hex");
|
|
931
951
|
}
|
|
952
|
+
function getConfiguredConcurrency() {
|
|
953
|
+
if (typeof config.concurrency !== "number" || !Number.isFinite(config.concurrency)) return 1;
|
|
954
|
+
return Math.max(1, Math.floor(config.concurrency));
|
|
955
|
+
}
|
|
932
956
|
function nextRegistryLoadIsolationKey(prefix, filePath) {
|
|
933
957
|
registryLoadCounter++;
|
|
934
958
|
return `${prefix}:${String(registryLoadCounter)}:${filePath}`;
|
|
@@ -1142,6 +1166,7 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
1142
1166
|
getConfigReloadState() {
|
|
1143
1167
|
return configReload.currentState();
|
|
1144
1168
|
},
|
|
1169
|
+
getConfiguredConcurrency,
|
|
1145
1170
|
async refreshDiscovery() {
|
|
1146
1171
|
const patterns = config.include;
|
|
1147
1172
|
const discovered = [];
|
|
@@ -1385,6 +1410,7 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
1385
1410
|
if (!run) return;
|
|
1386
1411
|
if (run.manifest.status !== "running") return;
|
|
1387
1412
|
const endedAt = /* @__PURE__ */ new Date();
|
|
1413
|
+
run.cases = run.cases.filter((caseRow) => caseRow.status !== "pending" && caseRow.status !== "running");
|
|
1388
1414
|
run.manifest.status = "cancelled";
|
|
1389
1415
|
run.manifest.endedAt = endedAt.toISOString();
|
|
1390
1416
|
run.summary.status = "cancelled";
|
|
@@ -2172,8 +2198,8 @@ async function commandApp(args) {
|
|
|
2172
2198
|
const { serve } = await import("@hono/node-server");
|
|
2173
2199
|
const bundledWebDist = resolve(currentDir, "apps/web/dist");
|
|
2174
2200
|
if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
|
|
2175
|
-
const appModule = await import("./app-
|
|
2176
|
-
const runnerModule = await import("./runner-
|
|
2201
|
+
const appModule = await import("./app-ROCEce9X.mjs");
|
|
2202
|
+
const runnerModule = await import("./runner-BlFQyvN2.mjs");
|
|
2177
2203
|
if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
|
|
2178
2204
|
if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
|
|
2179
2205
|
await runnerModule.initRunner();
|