@ls-stack/agent-eval 0.59.2 → 0.60.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-B3PEtWqH.mjs → app-CxKEVlng.mjs} +5 -5
- package/dist/apps/web/dist/assets/{index-Dowobz-z.js → index-CM6MDNqo.js} +73 -73
- package/dist/apps/web/dist/index.html +1 -1
- package/dist/bin.mjs +1 -1
- package/dist/caseChild.mjs +1 -1
- package/dist/{cli-Dkp2-rBm.mjs → cli-CVBSlTD8.mjs} +72 -27
- package/dist/index.d.mts +2990 -2981
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +3 -3
- package/dist/{runExecution-C3XVZHRC.mjs → runExecution-CjWJUUZ5.mjs} +2 -2
- package/dist/{runOrchestration-B5An-AEi.mjs → runOrchestration-DE2TFAS6.mjs} +1 -1
- package/dist/{runner-BJXz_V_V.mjs → runner-Cu1CQPTB.mjs} +1 -1
- package/dist/runner-DzDRasWV.mjs +15 -0
- package/dist/{src-8dGXUULC.mjs → src-DjOTPnDz.mjs} +2 -2
- package/package.json +3 -3
- package/skills/agent-eval/SKILL.md +10 -6
- package/dist/runner-C9J-1fkp.mjs +0 -15
|
@@ -25,7 +25,7 @@
|
|
|
25
25
|
href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
|
|
26
26
|
rel="stylesheet"
|
|
27
27
|
/>
|
|
28
|
-
<script type="module" crossorigin src="/assets/index-Dowobz-z.js"></script>
|
|
28
|
+
<script type="module" crossorigin src="/assets/index-CM6MDNqo.js"></script>
|
|
29
29
|
<link rel="stylesheet" crossorigin href="/assets/index-CqWfzcFb.css">
|
|
30
30
|
</head>
|
|
31
31
|
<body>
|
package/dist/bin.mjs
CHANGED
package/dist/caseChild.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { Ct as
|
|
1
|
+
import { Ct as resolveLlmCallsConfig, It as runWithEvalRegistry, J as runInEvalRuntimeScope, L as configureEvalRunLogs, St as resolveApiCallsConfig, _ as createBufferedCacheStore, a as isCaseChildParentMessage, d as loadEvalModule, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, r as runCase, v as createFsCacheStore, y as getCacheRetentionOptions } from "./runExecution-CjWJUUZ5.mjs";
|
|
2
2
|
//#region ../runner/src/caseChild.ts
|
|
3
3
|
let fatalErrorReported = false;
|
|
4
4
|
let disconnectExpected = false;
|
|
@@ -1,5 +1,6 @@
|
|
|
1
|
-
import { Ct as
|
|
2
|
-
import { C as validateCharts, S as parseEvalDiscovery, _ as runTouchesEval, a as validateTagsFilters, b as deriveEvalFreshness, c as getLatestRunInfos, d as nextShortIdFromSnapshots, f as persistCaseDetail, g as recomputePersistedCaseStatus, h as recomputeEvalStatusesInRuns, i as resolveEvalTags, l as loadPersistedRunSnapshot, m as persistRunState, n as getTargetEvalKeys, o as generateRunId, p as deleteTemporaryRuns, s as getLastRunStatuses, u as loadPersistedRunSnapshots, v as buildManualInputDescriptor, x as loadIsolatedEvalRegistry, y as parseManualInputValues } from "./runOrchestration-B5An-AEi.mjs";
|
|
1
|
+
import { Ct as resolveLlmCallsConfig, Et as getCaseRowCaseKey, Ft as getEvalRegistry, Ot as caseRowSchema, St as resolveApiCallsConfig, Tt as buildEvalKey, _t as matchesTagsFilter, c as resolveArtifactPath, dt as getEvalTitle, f as resolveEvalDefaultConfig, ft as getEvalDisplayStatus, h as normalizeScoreDef, lt as applyDerivedCallAttributes, m as buildDeclaredColumnDefs, o as stripTerminalControlCodes, p as loadConfig, pt as deriveScopedSummaryFromCases, s as resolveTracePresentation, v as createFsCacheStore, xt as runSummarySchema, y as getCacheRetentionOptions } from "./runExecution-CjWJUUZ5.mjs";
|
|
2
|
+
import { C as validateCharts, S as parseEvalDiscovery, _ as runTouchesEval, a as validateTagsFilters, b as deriveEvalFreshness, c as getLatestRunInfos, d as nextShortIdFromSnapshots, f as persistCaseDetail, g as recomputePersistedCaseStatus, h as recomputeEvalStatusesInRuns, i as resolveEvalTags, l as loadPersistedRunSnapshot, m as persistRunState, n as getTargetEvalKeys, o as generateRunId, p as deleteTemporaryRuns, s as getLastRunStatuses, u as loadPersistedRunSnapshots, v as buildManualInputDescriptor, x as loadIsolatedEvalRegistry, y as parseManualInputValues } from "./runOrchestration-DE2TFAS6.mjs";
|
|
3
|
+
import { parseEnv } from "node:util";
|
|
3
4
|
import { resultify } from "t-result";
|
|
4
5
|
import { copyFile, mkdir, readFile, rm, writeFile } from "node:fs/promises";
|
|
5
6
|
import { basename, dirname, extname, isAbsolute, join, relative, resolve, sep } from "node:path";
|
|
@@ -33,7 +34,7 @@ function resolveCaseDetailLookup(run, caseId) {
|
|
|
33
34
|
}
|
|
34
35
|
//#endregion
|
|
35
36
|
//#region ../runner/src/configReload.ts
|
|
36
|
-
/** Coordinates idle-only reloads for
|
|
37
|
+
/** Coordinates idle-only reloads for workspace config and `.env` in app mode. */
|
|
37
38
|
function createConfigReloadController({ getActiveRunCount, closeRunnerWatchers, loadRunnerState, emitToDiscoveryListeners }) {
|
|
38
39
|
let watcher;
|
|
39
40
|
let reloadTimer;
|
|
@@ -125,7 +126,7 @@ function createConfigReloadController({ getActiveRunCount, closeRunnerWatchers,
|
|
|
125
126
|
await reloadConfig(state.lastChangedAt ?? (/* @__PURE__ */ new Date()).toISOString());
|
|
126
127
|
}
|
|
127
128
|
async function setupWatcher() {
|
|
128
|
-
const nextWatcher = watch(resolve(process.cwd(), "agent-evals.config.ts"), {
|
|
129
|
+
const nextWatcher = watch([resolve(process.cwd(), "agent-evals.config.ts"), resolve(process.cwd(), ".env")], {
|
|
129
130
|
awaitWriteFinish: {
|
|
130
131
|
stabilityThreshold: 100,
|
|
131
132
|
pollInterval: 20
|
|
@@ -904,10 +905,51 @@ function getWatchRootsForIncludePatterns(params) {
|
|
|
904
905
|
return [...roots];
|
|
905
906
|
}
|
|
906
907
|
//#endregion
|
|
908
|
+
//#region ../runner/src/workspaceEnv.ts
|
|
909
|
+
const shellEnvKeys = new Set(Object.keys(process.env));
|
|
910
|
+
const appliedWorkspaceEnvValues = /* @__PURE__ */ new Map();
|
|
911
|
+
async function loadWorkspaceEnv(workspaceRoot) {
|
|
912
|
+
const envPath = resolve(workspaceRoot, ".env");
|
|
913
|
+
if (!existsSync(envPath)) {
|
|
914
|
+
applyWorkspaceEnv(/* @__PURE__ */ new Map());
|
|
915
|
+
return { error: null };
|
|
916
|
+
}
|
|
917
|
+
const readResult = await resultify(() => readFile(envPath, "utf-8"));
|
|
918
|
+
if (readResult.error) return { error: `Failed to read .env at ${envPath}: ${readResult.error.message}` };
|
|
919
|
+
const parseResult = resultify(() => parseEnv(readResult.value));
|
|
920
|
+
if (parseResult.error) return { error: `Failed to parse .env at ${envPath}: ${parseResult.error.message}` };
|
|
921
|
+
applyWorkspaceEnv(new Map(getEnvEntries(parseResult.value)));
|
|
922
|
+
return { error: null };
|
|
923
|
+
}
|
|
924
|
+
function getEnvEntries(env) {
|
|
925
|
+
const entries = [];
|
|
926
|
+
for (const [key, value] of Object.entries(env)) if (value !== void 0) entries.push([key, value]);
|
|
927
|
+
return entries;
|
|
928
|
+
}
|
|
929
|
+
function applyWorkspaceEnv(nextValues) {
|
|
930
|
+
for (const [key, previousValue] of appliedWorkspaceEnvValues) {
|
|
931
|
+
if (nextValues.has(key)) continue;
|
|
932
|
+
if (process.env[key] === previousValue) delete process.env[key];
|
|
933
|
+
appliedWorkspaceEnvValues.delete(key);
|
|
934
|
+
}
|
|
935
|
+
for (const [key, value] of nextValues) {
|
|
936
|
+
if (shellEnvKeys.has(key)) continue;
|
|
937
|
+
process.env[key] = value;
|
|
938
|
+
appliedWorkspaceEnvValues.set(key, value);
|
|
939
|
+
}
|
|
940
|
+
}
|
|
941
|
+
//#endregion
|
|
907
942
|
//#region ../runner/src/runner.ts
|
|
908
943
|
const defaultCachePruneIdleDelayMs = 5e3;
|
|
909
|
-
/**
|
|
910
|
-
|
|
944
|
+
/**
|
|
945
|
+
* Create an in-memory eval runner bound to the current workspace config.
|
|
946
|
+
*
|
|
947
|
+
* @param options.watchForChanges Watch eval files, run history, config, and
|
|
948
|
+
* workspace `.env` for live reloads.
|
|
949
|
+
* @param options.loadEnv Load `.env` from the current workspace before config,
|
|
950
|
+
* discovery, and runs. Shell-provided values keep precedence.
|
|
951
|
+
*/
|
|
952
|
+
function createRunner({ watchForChanges = true, loadEnv = true } = {}) {
|
|
911
953
|
let config;
|
|
912
954
|
let workspaceRoot;
|
|
913
955
|
let localStateDir;
|
|
@@ -1475,6 +1517,10 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
1475
1517
|
}
|
|
1476
1518
|
};
|
|
1477
1519
|
async function loadRunnerState() {
|
|
1520
|
+
if (loadEnv) {
|
|
1521
|
+
const envResult = await loadWorkspaceEnv(process.cwd());
|
|
1522
|
+
if (envResult.error !== null) throw new Error(envResult.error);
|
|
1523
|
+
}
|
|
1478
1524
|
config = await loadConfig();
|
|
1479
1525
|
workspaceRoot = config.workspaceRoot ?? process.cwd();
|
|
1480
1526
|
localStateDir = resolve(workspaceRoot, ".agent-evals");
|
|
@@ -2079,7 +2125,6 @@ function parseArgs(argv) {
|
|
|
2079
2125
|
*/
|
|
2080
2126
|
async function runCli(argv) {
|
|
2081
2127
|
const args = parseArgs(argv);
|
|
2082
|
-
if (args.loadEnv && !loadWorkspaceEnv()) process.exit(1);
|
|
2083
2128
|
if (args.showHelp) {
|
|
2084
2129
|
if (args.unknownHelpTarget !== void 0) {
|
|
2085
2130
|
console.error(`No help found for "${args.unknownHelpTarget}".`);
|
|
@@ -2134,18 +2179,6 @@ function fileMatches(pattern, filePath) {
|
|
|
2134
2179
|
const normalized = pattern.replaceAll("\\", "/");
|
|
2135
2180
|
return normalized === filePath || globToRegex(normalized).test(filePath);
|
|
2136
2181
|
}
|
|
2137
|
-
function loadWorkspaceEnv() {
|
|
2138
|
-
const envPath = resolve(process.cwd(), ".env");
|
|
2139
|
-
if (!existsSync(envPath)) return true;
|
|
2140
|
-
const loadResult = resultify(() => {
|
|
2141
|
-
process.loadEnvFile(envPath);
|
|
2142
|
-
});
|
|
2143
|
-
if (loadResult.error) {
|
|
2144
|
-
console.error(`Failed to load .env at ${envPath}: ${loadResult.error.message}`);
|
|
2145
|
-
return false;
|
|
2146
|
-
}
|
|
2147
|
-
return true;
|
|
2148
|
-
}
|
|
2149
2182
|
function formatUnknownErrorDetails(error) {
|
|
2150
2183
|
if (error instanceof Error) return error.stack ?? error.message;
|
|
2151
2184
|
if (typeof error === "string") return error;
|
|
@@ -2199,19 +2232,22 @@ async function commandApp(args) {
|
|
|
2199
2232
|
const { serve } = await import("@hono/node-server");
|
|
2200
2233
|
const bundledWebDist = resolve(currentDir, "apps/web/dist");
|
|
2201
2234
|
if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
|
|
2202
|
-
const appModule = await import("./app-B3PEtWqH.mjs");
|
|
2203
|
-
const runnerModule = await import("./runner-BJXz_V_V.mjs");
|
|
2235
|
+
const appModule = await import("./app-CxKEVlng.mjs");
|
|
2236
|
+
const runnerModule = await import("./runner-Cu1CQPTB.mjs");
|
|
2204
2237
|
if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
|
|
2205
2238
|
if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
|
|
2206
|
-
await runnerModule.initRunner();
|
|
2239
|
+
await runnerModule.initRunner({ loadEnv: args.loadEnv });
|
|
2207
2240
|
console.info(`Agent Evals app: http://localhost:${String(args.port)}`);
|
|
2208
2241
|
serve({
|
|
2209
2242
|
fetch: appModule.app.fetch,
|
|
2210
2243
|
port: args.port
|
|
2211
2244
|
});
|
|
2212
2245
|
}
|
|
2213
|
-
async function commandList(args_) {
|
|
2214
|
-
const runner = createRunner({
|
|
2246
|
+
async function commandList(args) {
|
|
2247
|
+
const runner = createRunner({
|
|
2248
|
+
watchForChanges: false,
|
|
2249
|
+
loadEnv: args.loadEnv
|
|
2250
|
+
});
|
|
2215
2251
|
await runner.init();
|
|
2216
2252
|
const discoveryIssues = runner.getDiscoveryIssues();
|
|
2217
2253
|
if (discoveryIssues.length > 0) {
|
|
@@ -2244,7 +2280,10 @@ async function commandList(args_) {
|
|
|
2244
2280
|
if (discoveryIssues.length > 0) process.exit(1);
|
|
2245
2281
|
}
|
|
2246
2282
|
async function commandRun(args) {
|
|
2247
|
-
const runner = createRunner({
|
|
2283
|
+
const runner = createRunner({
|
|
2284
|
+
watchForChanges: false,
|
|
2285
|
+
loadEnv: args.loadEnv
|
|
2286
|
+
});
|
|
2248
2287
|
await runner.init();
|
|
2249
2288
|
if (args.evalIds.length === 0 && args.caseIds.length === 0 && args.files.length === 0 && args.tagsFilter.length === 0 && !runner.getAllowCliRunAll()) {
|
|
2250
2289
|
console.error("This workspace disables running all evals from the CLI. Pass --eval <id>, --file <path|glob>, --case <id>, or --tags-filter <expr> to run a targeted subset.");
|
|
@@ -2333,7 +2372,10 @@ async function commandRun(args) {
|
|
|
2333
2372
|
if (summary.status === "error" || summary.failedCases > 0 || summary.errorCases > 0) process.exit(1);
|
|
2334
2373
|
}
|
|
2335
2374
|
async function commandShowRuns(args) {
|
|
2336
|
-
const runner = createRunner({
|
|
2375
|
+
const runner = createRunner({
|
|
2376
|
+
watchForChanges: false,
|
|
2377
|
+
loadEnv: args.loadEnv
|
|
2378
|
+
});
|
|
2337
2379
|
await runner.init();
|
|
2338
2380
|
const runRef = args.positionals[0];
|
|
2339
2381
|
if (runRef !== void 0) {
|
|
@@ -2358,7 +2400,10 @@ async function commandShowRuns(args) {
|
|
|
2358
2400
|
printRunFileIndexes(indexes);
|
|
2359
2401
|
}
|
|
2360
2402
|
async function commandCache(args) {
|
|
2361
|
-
const runner = createRunner({
|
|
2403
|
+
const runner = createRunner({
|
|
2404
|
+
watchForChanges: false,
|
|
2405
|
+
loadEnv: args.loadEnv
|
|
2406
|
+
});
|
|
2362
2407
|
await runner.init();
|
|
2363
2408
|
if (args.subcommand === "list" || args.subcommand === void 0) {
|
|
2364
2409
|
const entries = await runner.listCache();
|