@ls-stack/agent-eval 0.37.0 → 0.39.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-C7ON9Wdh.mjs → app-DD-8kx5H.mjs} +15 -6
- package/dist/apps/web/dist/assets/index-C6PgBOfP.css +1 -0
- package/dist/apps/web/dist/assets/{index-BiwYbMem.js → index-CO86PsY-.js} +43 -43
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +14 -3
- package/dist/{cli-CwGcJYWe.mjs → cli-BUX6tr9J.mjs} +60 -28
- package/dist/index.d.mts +124 -121
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +56 -2
- package/dist/{runOrchestration-C4o5TcIu.mjs → runOrchestration-BhUFWvq9.mjs} +251 -114
- package/dist/{runner-BTH8m_Er.mjs → runner-B1wfPKNH.mjs} +2 -2
- package/dist/{runner-LqeHPID6.mjs → runner-CoRf7Vef.mjs} +1 -1
- package/dist/src-BwJ5tod2.mjs +3 -0
- package/package.json +1 -1
- package/skills/agent-eval/SKILL.md +9 -5
- package/dist/apps/web/dist/assets/index-CKdoOah2.css +0 -1
- package/dist/src--13_4uDG.mjs +0 -3
|
@@ -25,8 +25,8 @@
|
|
|
25
25
|
href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
|
|
26
26
|
rel="stylesheet"
|
|
27
27
|
/>
|
|
28
|
-
<script type="module" crossorigin src="/assets/index-BiwYbMem.js"></script>
|
|
29
|
-
<link rel="stylesheet" crossorigin href="/assets/index-CKdoOah2.css">
|
|
28
|
+
<script type="module" crossorigin src="/assets/index-CO86PsY-.js"></script>
|
|
29
|
+
<link rel="stylesheet" crossorigin href="/assets/index-C6PgBOfP.css">
|
|
30
30
|
</head>
|
|
31
31
|
<body>
|
|
32
32
|
<div id="root"></div>
|
package/dist/bin.mjs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
-
import { t as runCli } from "./cli-CwGcJYWe.mjs";
|
|
2
|
+
import { t as runCli } from "./cli-BUX6tr9J.mjs";
|
|
3
3
|
import { spawn } from "node:child_process";
|
|
4
4
|
//#region src/bin.ts
|
|
5
5
|
const moduleMocksFlag = "--experimental-test-module-mocks";
|
|
@@ -70,10 +70,21 @@ async function reexecWithNodeArgs(argv, execArgv) {
|
|
|
70
70
|
});
|
|
71
71
|
});
|
|
72
72
|
}
|
|
73
|
+
function formatUnknownErrorDetails(error) {
|
|
74
|
+
if (error instanceof Error) return error.stack ?? error.message;
|
|
75
|
+
if (typeof error === "string") return error;
|
|
76
|
+
return String(error);
|
|
77
|
+
}
|
|
73
78
|
const { argv, inspectArg } = parseDebugFlags(process.argv.slice(2));
|
|
74
79
|
setRunChildInspectArg(inspectArg);
|
|
75
80
|
const execArgv = buildExecArgv(inspectArg);
|
|
76
|
-
if (needsModuleMocksFlag() || !execArgvMatches(execArgv)) await reexecWithNodeArgs(argv, execArgv)
|
|
77
|
-
|
|
81
|
+
if (needsModuleMocksFlag() || !execArgvMatches(execArgv)) await reexecWithNodeArgs(argv, execArgv).catch((error) => {
|
|
82
|
+
console.error(formatUnknownErrorDetails(error));
|
|
83
|
+
process.exitCode = 1;
|
|
84
|
+
});
|
|
85
|
+
else await runCli(argv).catch((error) => {
|
|
86
|
+
console.error(formatUnknownErrorDetails(error));
|
|
87
|
+
process.exitCode = 1;
|
|
88
|
+
});
|
|
78
89
|
//#endregion
|
|
79
90
|
export {};
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { C as parseEvalDiscovery, D as normalizeScoreDef, E as buildDeclaredColumnDefs, G as deriveScopedSummaryFromCases, O as validateCharts, S as loadEvalModule, St as getCaseRowEvalKey, T as loadConfig, U as getEvalTitle, V as applyDerivedCallAttributes, W as getEvalDisplayStatus, Y as runSummarySchema, _ as resolveTracePresentation, a as generateRunId, b as parseManualInputValues, bt as buildEvalKey, c as loadPersistedRunSnapshot, d as persistCaseDetail, f as deleteTemporaryRuns, g as runTouchesEval, gt as resolveLlmCallsConfig, h as recomputePersistedCaseStatus, hr as getEvalRegistry, ht as resolveApiCallsConfig, i as stripTerminalControlCodes, k as createFsCacheStore, l as loadPersistedRunSnapshots, m as recomputeEvalStatusesInRuns, n as getTargetEvalKeys, o as getLastRunStatuses, p as persistRunState, s as getLatestRunInfos, u as nextShortIdFromSnapshots, v as resolveArtifactPath, w as resolveEvalDefaultConfig, x as deriveEvalFreshness, xt as getCaseRowCaseKey, y as buildManualInputDescriptor } from "./runOrchestration-C4o5TcIu.mjs";
|
|
1
|
+
import { C as parseEvalDiscovery, D as normalizeScoreDef, E as buildDeclaredColumnDefs, G as deriveScopedSummaryFromCases, O as validateCharts, S as loadEvalModule, St as getCaseRowEvalKey, T as loadConfig, U as getEvalTitle, V as applyDerivedCallAttributes, W as getEvalDisplayStatus, Y as runSummarySchema, _ as resolveTracePresentation, a as generateRunId, b as parseManualInputValues, bt as buildEvalKey, c as loadPersistedRunSnapshot, d as persistCaseDetail, f as deleteTemporaryRuns, g as runTouchesEval, gt as resolveLlmCallsConfig, h as recomputePersistedCaseStatus, hr as getEvalRegistry, ht as resolveApiCallsConfig, i as stripTerminalControlCodes, k as createFsCacheStore, l as loadPersistedRunSnapshots, m as recomputeEvalStatusesInRuns, n as getTargetEvalKeys, o as getLastRunStatuses, p as persistRunState, s as getLatestRunInfos, u as nextShortIdFromSnapshots, v as resolveArtifactPath, w as resolveEvalDefaultConfig, x as deriveEvalFreshness, xt as getCaseRowCaseKey, y as buildManualInputDescriptor } from "./runOrchestration-BhUFWvq9.mjs";
|
|
2
2
|
import { createHash, randomUUID } from "node:crypto";
|
|
3
3
|
import { copyFile, mkdir, readFile, rm, writeFile } from "node:fs/promises";
|
|
4
4
|
import { basename, dirname, extname, isAbsolute, join, relative, resolve, sep } from "node:path";
|
|
@@ -537,6 +537,7 @@ const runChildInspectArgEnv = "AGENT_EVALS_RUN_CHILD_INSPECT_ARG";
|
|
|
537
537
|
const inspectFlagPrefix = "--inspect";
|
|
538
538
|
const inspectBrkFlagPrefix = "--inspect-brk";
|
|
539
539
|
const childOutputTailMaxLength = 12e3;
|
|
540
|
+
const outputHeadlineMaxLength = 240;
|
|
540
541
|
function startRunChild(params) {
|
|
541
542
|
const child = spawn(process.execPath, [
|
|
542
543
|
...getRunChildExecArgv(),
|
|
@@ -554,6 +555,11 @@ function startRunChild(params) {
|
|
|
554
555
|
});
|
|
555
556
|
const outputTail = createRunChildOutputTail(child);
|
|
556
557
|
params.runState.childProcess = child;
|
|
558
|
+
child.once("error", (error) => {
|
|
559
|
+
if (params.runState.childProcess === child) params.runState.childProcess = void 0;
|
|
560
|
+
if (params.runState.manifest.status !== "running" || params.runState.childTerminalReceived) return;
|
|
561
|
+
markRunErrored(params.runState, `Failed to start run child: ${error.message}`, params.managerContext);
|
|
562
|
+
});
|
|
557
563
|
child.on("message", (message) => {
|
|
558
564
|
if (!isRunChildMessage(message)) return;
|
|
559
565
|
handleRunChildMessage({
|
|
@@ -562,25 +568,31 @@ function startRunChild(params) {
|
|
|
562
568
|
managerContext: params.managerContext
|
|
563
569
|
});
|
|
564
570
|
});
|
|
565
|
-
child.once("
|
|
571
|
+
child.once("close", (code, signal) => {
|
|
566
572
|
if (params.runState.childProcess === child) params.runState.childProcess = void 0;
|
|
567
573
|
if (params.runState.manifest.status !== "running" || params.runState.childTerminalReceived) return;
|
|
568
|
-
const reason =
|
|
574
|
+
const reason = formatChildExitReason(code, signal);
|
|
569
575
|
markRunErrored(params.runState, formatUnexpectedRunChildExit(reason, outputTail), params.managerContext);
|
|
570
576
|
});
|
|
571
577
|
}
|
|
572
578
|
function createRunChildOutputTail(child) {
|
|
573
579
|
const tail = {
|
|
574
580
|
stdout: "",
|
|
575
|
-
stderr: ""
|
|
581
|
+
stderr: "",
|
|
582
|
+
stdoutTruncated: false,
|
|
583
|
+
stderrTruncated: false
|
|
576
584
|
};
|
|
577
585
|
child.stdout?.on("data", (chunk) => {
|
|
578
586
|
process.stdout.write(chunk);
|
|
579
|
-
|
|
587
|
+
const nextTail = appendOutputTail(tail.stdout, chunkToText(chunk));
|
|
588
|
+
tail.stdout = nextTail.text;
|
|
589
|
+
tail.stdoutTruncated = tail.stdoutTruncated || nextTail.truncated;
|
|
580
590
|
});
|
|
581
591
|
child.stderr?.on("data", (chunk) => {
|
|
582
592
|
process.stderr.write(chunk);
|
|
583
|
-
|
|
593
|
+
const nextTail = appendOutputTail(tail.stderr, chunkToText(chunk));
|
|
594
|
+
tail.stderr = nextTail.text;
|
|
595
|
+
tail.stderrTruncated = tail.stderrTruncated || nextTail.truncated;
|
|
584
596
|
});
|
|
585
597
|
return tail;
|
|
586
598
|
}
|
|
@@ -589,17 +601,37 @@ function chunkToText(chunk) {
|
|
|
589
601
|
}
|
|
590
602
|
function appendOutputTail(current, next) {
|
|
591
603
|
const combined = current + next;
|
|
592
|
-
if (combined.length <= childOutputTailMaxLength) return
|
|
593
|
-
|
|
604
|
+
if (combined.length <= childOutputTailMaxLength) return {
|
|
605
|
+
text: combined,
|
|
606
|
+
truncated: false
|
|
607
|
+
};
|
|
608
|
+
return {
|
|
609
|
+
text: combined.slice(combined.length - childOutputTailMaxLength),
|
|
610
|
+
truncated: true
|
|
611
|
+
};
|
|
594
612
|
}
|
|
595
613
|
function formatUnexpectedRunChildExit(reason, outputTail) {
|
|
596
|
-
const sections = [reason];
|
|
597
614
|
const stderr = stripTerminalControlCodes(outputTail.stderr).trim();
|
|
598
615
|
const stdout = stripTerminalControlCodes(outputTail.stdout).trim();
|
|
599
|
-
|
|
600
|
-
|
|
616
|
+
const headline = getChildStderrHeadline(stderr);
|
|
617
|
+
const sections = [headline === null ? `${reason} before sending a structured run error.` : `Run child exited before sending a structured run error: ${headline}`, reason];
|
|
618
|
+
if (stderr.length > 0) sections.push(formatOutputSection("stderr", stderr, outputTail.stderrTruncated));
|
|
619
|
+
if (stdout.length > 0) sections.push(formatOutputSection("stdout", stdout, outputTail.stdoutTruncated));
|
|
601
620
|
return sections.join("\n\n");
|
|
602
621
|
}
|
|
622
|
+
function formatChildExitReason(code, signal) {
|
|
623
|
+
if (signal !== null) return `Run child exited with signal ${signal}`;
|
|
624
|
+
return `Run child exited with code ${String(code)}`;
|
|
625
|
+
}
|
|
626
|
+
function getChildStderrHeadline(stderr) {
|
|
627
|
+
const line = stderr.split("\n").map((entry) => entry.trim()).find((entry) => entry.length > 0);
|
|
628
|
+
if (line === void 0) return null;
|
|
629
|
+
if (line.length <= outputHeadlineMaxLength) return line;
|
|
630
|
+
return `${line.slice(0, outputHeadlineMaxLength)}...`;
|
|
631
|
+
}
|
|
632
|
+
function formatOutputSection(streamName, output, truncated) {
|
|
633
|
+
return `${truncated ? `Child ${streamName} (last ${String(output.length)} chars)` : `Child ${streamName}`}:\n${output}`;
|
|
634
|
+
}
|
|
603
635
|
function getRunChildExecArgv() {
|
|
604
636
|
const execArgv = [];
|
|
605
637
|
let skipNext = false;
|
|
@@ -1867,15 +1899,11 @@ function parseArgs(argv) {
|
|
|
1867
1899
|
*/
|
|
1868
1900
|
async function runCli(argv) {
|
|
1869
1901
|
const args = parseArgs(argv);
|
|
1870
|
-
if (args.loadEnv && !loadWorkspaceEnv()) {
|
|
1871
|
-
process.exit(1);
|
|
1872
|
-
return;
|
|
1873
|
-
}
|
|
1902
|
+
if (args.loadEnv && !loadWorkspaceEnv()) process.exit(1);
|
|
1874
1903
|
if (args.showHelp) {
|
|
1875
1904
|
if (args.unknownHelpTarget !== void 0) {
|
|
1876
1905
|
console.error(`No help found for "${args.unknownHelpTarget}".`);
|
|
1877
1906
|
process.exit(1);
|
|
1878
|
-
return;
|
|
1879
1907
|
}
|
|
1880
1908
|
printHelp(args.helpTopic);
|
|
1881
1909
|
return;
|
|
@@ -1938,6 +1966,11 @@ function loadWorkspaceEnv() {
|
|
|
1938
1966
|
}
|
|
1939
1967
|
return true;
|
|
1940
1968
|
}
|
|
1969
|
+
function formatUnknownErrorDetails(error) {
|
|
1970
|
+
if (error instanceof Error) return error.stack ?? error.message;
|
|
1971
|
+
if (typeof error === "string") return error;
|
|
1972
|
+
return String(error);
|
|
1973
|
+
}
|
|
1941
1974
|
const currentDir = dirname(fileURLToPath(import.meta.url));
|
|
1942
1975
|
const repoRoot = resolve(currentDir, "../../..");
|
|
1943
1976
|
const pnpmCommand = process.platform === "win32" ? "pnpm.cmd" : "pnpm";
|
|
@@ -1986,8 +2019,8 @@ async function commandApp(args) {
|
|
|
1986
2019
|
const { serve } = await import("@hono/node-server");
|
|
1987
2020
|
const bundledWebDist = resolve(currentDir, "apps/web/dist");
|
|
1988
2021
|
if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
|
|
1989
|
-
const appModule = await import("./app-C7ON9Wdh.mjs");
|
|
1990
|
-
const runnerModule = await import("./runner-LqeHPID6.mjs");
|
|
2022
|
+
const appModule = await import("./app-DD-8kx5H.mjs");
|
|
2023
|
+
const runnerModule = await import("./runner-CoRf7Vef.mjs");
|
|
1991
2024
|
if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
|
|
1992
2025
|
if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
|
|
1993
2026
|
await runnerModule.initRunner();
|
|
@@ -2036,7 +2069,6 @@ async function commandRun(args) {
|
|
|
2036
2069
|
if (args.evalIds.length === 0 && args.caseIds.length === 0 && args.files.length === 0 && !runner.getAllowCliRunAll()) {
|
|
2037
2070
|
console.error("This workspace disables running all evals from the CLI. Pass --eval <id>, --file <path|glob>, or --case <id> to run a targeted subset.");
|
|
2038
2071
|
process.exit(1);
|
|
2039
|
-
return;
|
|
2040
2072
|
}
|
|
2041
2073
|
if (args.clearCache) {
|
|
2042
2074
|
await runner.clearCache();
|
|
@@ -2071,15 +2103,20 @@ async function commandRun(args) {
|
|
|
2071
2103
|
if (manualInputsResult.error !== null) {
|
|
2072
2104
|
console.error(manualInputsResult.error);
|
|
2073
2105
|
process.exit(1);
|
|
2074
|
-
return;
|
|
2075
2106
|
}
|
|
2076
|
-
const
|
|
2107
|
+
const runResult = await resultify(() => runner.startRun({
|
|
2077
2108
|
target,
|
|
2078
2109
|
trials: args.trials,
|
|
2079
2110
|
temporary: args.temporary,
|
|
2080
2111
|
cache: { mode: args.cacheMode },
|
|
2081
2112
|
manualInputs: manualInputsResult.value
|
|
2082
|
-
});
|
|
2113
|
+
}));
|
|
2114
|
+
if (runResult.error) {
|
|
2115
|
+
console.error("Failed to start run:");
|
|
2116
|
+
console.error(formatUnknownErrorDetails(runResult.error));
|
|
2117
|
+
process.exit(1);
|
|
2118
|
+
}
|
|
2119
|
+
const run = runResult.value;
|
|
2083
2120
|
if (!args.json) {
|
|
2084
2121
|
console.info(`Run started: ${run.manifest.id}`);
|
|
2085
2122
|
console.info(`Trials: ${String(args.trials)}`);
|
|
@@ -2089,10 +2126,7 @@ async function commandRun(args) {
|
|
|
2089
2126
|
}
|
|
2090
2127
|
await waitForRunCompletion(runner, run.manifest.id);
|
|
2091
2128
|
const finalRun = runner.getRun(run.manifest.id);
|
|
2092
|
-
if (!finalRun) {
|
|
2093
|
-
process.exit(1);
|
|
2094
|
-
return;
|
|
2095
|
-
}
|
|
2129
|
+
if (!finalRun) process.exit(1);
|
|
2096
2130
|
const { summary } = finalRun;
|
|
2097
2131
|
if (args.json) console.info(JSON.stringify(summary, null, 2));
|
|
2098
2132
|
else {
|
|
@@ -2120,7 +2154,6 @@ async function commandShowRuns(args) {
|
|
|
2120
2154
|
if (!run) {
|
|
2121
2155
|
printMissingRun(runRef);
|
|
2122
2156
|
process.exit(1);
|
|
2123
|
-
return;
|
|
2124
2157
|
}
|
|
2125
2158
|
const index = buildRunFileIndex(runner.getWorkspaceRoot(), run);
|
|
2126
2159
|
if (args.json) {
|
|
@@ -2184,7 +2217,6 @@ async function commandCache(args) {
|
|
|
2184
2217
|
}
|
|
2185
2218
|
console.info("Refusing to clear cache without --eval <id> or --all. Use one of these flags to confirm.");
|
|
2186
2219
|
process.exit(1);
|
|
2187
|
-
return;
|
|
2188
2220
|
}
|
|
2189
2221
|
printHelp(args.helpTopic);
|
|
2190
2222
|
}
|