@ls-stack/agent-eval 0.36.0 → 0.38.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,8 +25,8 @@
25
25
  href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
26
26
  rel="stylesheet"
27
27
  />
28
- <script type="module" crossorigin src="/assets/index-BYtcGddU.js"></script>
29
- <link rel="stylesheet" crossorigin href="/assets/index-D0rC5MSS.css">
28
+ <script type="module" crossorigin src="/assets/index-CO86PsY-.js"></script>
29
+ <link rel="stylesheet" crossorigin href="/assets/index-C6PgBOfP.css">
30
30
  </head>
31
31
  <body>
32
32
  <div id="root"></div>
package/dist/bin.mjs CHANGED
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env node
2
- import { t as runCli } from "./cli-Dg3abrOv.mjs";
2
+ import { t as runCli } from "./cli-BUX6tr9J.mjs";
3
3
  import { spawn } from "node:child_process";
4
4
  //#region src/bin.ts
5
5
  const moduleMocksFlag = "--experimental-test-module-mocks";
@@ -70,10 +70,21 @@ async function reexecWithNodeArgs(argv, execArgv) {
70
70
  });
71
71
  });
72
72
  }
73
+ function formatUnknownErrorDetails(error) {
74
+ if (error instanceof Error) return error.stack ?? error.message;
75
+ if (typeof error === "string") return error;
76
+ return String(error);
77
+ }
73
78
  const { argv, inspectArg } = parseDebugFlags(process.argv.slice(2));
74
79
  setRunChildInspectArg(inspectArg);
75
80
  const execArgv = buildExecArgv(inspectArg);
76
- if (needsModuleMocksFlag() || !execArgvMatches(execArgv)) await reexecWithNodeArgs(argv, execArgv);
77
- else await runCli(argv);
81
+ if (needsModuleMocksFlag() || !execArgvMatches(execArgv)) await reexecWithNodeArgs(argv, execArgv).catch((error) => {
82
+ console.error(formatUnknownErrorDetails(error));
83
+ process.exitCode = 1;
84
+ });
85
+ else await runCli(argv).catch((error) => {
86
+ console.error(formatUnknownErrorDetails(error));
87
+ process.exitCode = 1;
88
+ });
78
89
  //#endregion
79
90
  export {};
@@ -1,4 +1,4 @@
1
- import { C as loadConfig, D as createFsCacheStore, E as validateCharts, H as getEvalDisplayStatus, S as resolveEvalDefaultConfig, T as normalizeScoreDef, U as deriveScopedSummaryFromCases, V as getEvalTitle, _ as buildManualInputDescriptor, a as getLastRunStatuses, b as loadEvalModule, bt as getCaseRowEvalKey, c as loadPersistedRunSnapshots, d as persistRunState, f as recomputeEvalStatusesInRuns, g as resolveArtifactPath, h as resolveTracePresentation, i as generateRunId, l as nextShortIdFromSnapshots, m as runTouchesEval, mt as resolveLlmCallsConfig, n as getTargetEvalKeys, o as getLatestRunInfos, p as recomputePersistedCaseStatus, pr as getEvalRegistry, pt as resolveApiCallsConfig, q as runSummarySchema, s as loadPersistedRunSnapshot, u as persistCaseDetail, v as parseManualInputValues, vt as buildEvalKey, w as buildDeclaredColumnDefs, x as parseEvalDiscovery, y as deriveEvalFreshness, yt as getCaseRowCaseKey, z as applyDerivedCallAttributes } from "./runOrchestration-V1TxX8es.mjs";
1
+ import { C as parseEvalDiscovery, D as normalizeScoreDef, E as buildDeclaredColumnDefs, G as deriveScopedSummaryFromCases, O as validateCharts, S as loadEvalModule, St as getCaseRowEvalKey, T as loadConfig, U as getEvalTitle, V as applyDerivedCallAttributes, W as getEvalDisplayStatus, Y as runSummarySchema, _ as resolveTracePresentation, a as generateRunId, b as parseManualInputValues, bt as buildEvalKey, c as loadPersistedRunSnapshot, d as persistCaseDetail, f as deleteTemporaryRuns, g as runTouchesEval, gt as resolveLlmCallsConfig, h as recomputePersistedCaseStatus, hr as getEvalRegistry, ht as resolveApiCallsConfig, i as stripTerminalControlCodes, k as createFsCacheStore, l as loadPersistedRunSnapshots, m as recomputeEvalStatusesInRuns, n as getTargetEvalKeys, o as getLastRunStatuses, p as persistRunState, s as getLatestRunInfos, u as nextShortIdFromSnapshots, v as resolveArtifactPath, w as resolveEvalDefaultConfig, x as deriveEvalFreshness, xt as getCaseRowCaseKey, y as buildManualInputDescriptor } from "./runOrchestration-BhUFWvq9.mjs";
2
2
  import { createHash, randomUUID } from "node:crypto";
3
3
  import { copyFile, mkdir, readFile, rm, writeFile } from "node:fs/promises";
4
4
  import { basename, dirname, extname, isAbsolute, join, relative, resolve, sep } from "node:path";
@@ -536,6 +536,8 @@ function isRunChildMessage(value) {
536
536
  const runChildInspectArgEnv = "AGENT_EVALS_RUN_CHILD_INSPECT_ARG";
537
537
  const inspectFlagPrefix = "--inspect";
538
538
  const inspectBrkFlagPrefix = "--inspect-brk";
539
+ const childOutputTailMaxLength = 12e3;
540
+ const outputHeadlineMaxLength = 240;
539
541
  function startRunChild(params) {
540
542
  const child = spawn(process.execPath, [
541
543
  ...getRunChildExecArgv(),
@@ -546,12 +548,18 @@ function startRunChild(params) {
546
548
  env: process.env,
547
549
  stdio: [
548
550
  "ignore",
549
- "inherit",
550
- "inherit",
551
+ "pipe",
552
+ "pipe",
551
553
  "ipc"
552
554
  ]
553
555
  });
556
+ const outputTail = createRunChildOutputTail(child);
554
557
  params.runState.childProcess = child;
558
+ child.once("error", (error) => {
559
+ if (params.runState.childProcess === child) params.runState.childProcess = void 0;
560
+ if (params.runState.manifest.status !== "running" || params.runState.childTerminalReceived) return;
561
+ markRunErrored(params.runState, `Failed to start run child: ${error.message}`, params.managerContext);
562
+ });
555
563
  child.on("message", (message) => {
556
564
  if (!isRunChildMessage(message)) return;
557
565
  handleRunChildMessage({
@@ -560,12 +568,69 @@ function startRunChild(params) {
560
568
  managerContext: params.managerContext
561
569
  });
562
570
  });
563
- child.once("exit", (code, signal) => {
571
+ child.once("close", (code, signal) => {
564
572
  if (params.runState.childProcess === child) params.runState.childProcess = void 0;
565
573
  if (params.runState.manifest.status !== "running" || params.runState.childTerminalReceived) return;
566
- const reason = signal !== null ? `Run child exited with signal ${signal}` : `Run child exited with code ${String(code)}`;
567
- markRunErrored(params.runState, reason, params.managerContext);
574
+ const reason = formatChildExitReason(code, signal);
575
+ markRunErrored(params.runState, formatUnexpectedRunChildExit(reason, outputTail), params.managerContext);
576
+ });
577
+ }
578
+ function createRunChildOutputTail(child) {
579
+ const tail = {
580
+ stdout: "",
581
+ stderr: "",
582
+ stdoutTruncated: false,
583
+ stderrTruncated: false
584
+ };
585
+ child.stdout?.on("data", (chunk) => {
586
+ process.stdout.write(chunk);
587
+ const nextTail = appendOutputTail(tail.stdout, chunkToText(chunk));
588
+ tail.stdout = nextTail.text;
589
+ tail.stdoutTruncated = tail.stdoutTruncated || nextTail.truncated;
590
+ });
591
+ child.stderr?.on("data", (chunk) => {
592
+ process.stderr.write(chunk);
593
+ const nextTail = appendOutputTail(tail.stderr, chunkToText(chunk));
594
+ tail.stderr = nextTail.text;
595
+ tail.stderrTruncated = tail.stderrTruncated || nextTail.truncated;
568
596
  });
597
+ return tail;
598
+ }
599
+ function chunkToText(chunk) {
600
+ return typeof chunk === "string" ? chunk : chunk.toString("utf-8");
601
+ }
602
+ function appendOutputTail(current, next) {
603
+ const combined = current + next;
604
+ if (combined.length <= childOutputTailMaxLength) return {
605
+ text: combined,
606
+ truncated: false
607
+ };
608
+ return {
609
+ text: combined.slice(combined.length - childOutputTailMaxLength),
610
+ truncated: true
611
+ };
612
+ }
613
+ function formatUnexpectedRunChildExit(reason, outputTail) {
614
+ const stderr = stripTerminalControlCodes(outputTail.stderr).trim();
615
+ const stdout = stripTerminalControlCodes(outputTail.stdout).trim();
616
+ const headline = getChildStderrHeadline(stderr);
617
+ const sections = [headline === null ? `${reason} before sending a structured run error.` : `Run child exited before sending a structured run error: ${headline}`, reason];
618
+ if (stderr.length > 0) sections.push(formatOutputSection("stderr", stderr, outputTail.stderrTruncated));
619
+ if (stdout.length > 0) sections.push(formatOutputSection("stdout", stdout, outputTail.stdoutTruncated));
620
+ return sections.join("\n\n");
621
+ }
622
+ function formatChildExitReason(code, signal) {
623
+ if (signal !== null) return `Run child exited with signal ${signal}`;
624
+ return `Run child exited with code ${String(code)}`;
625
+ }
626
+ function getChildStderrHeadline(stderr) {
627
+ const line = stderr.split("\n").map((entry) => entry.trim()).find((entry) => entry.length > 0);
628
+ if (line === void 0) return null;
629
+ if (line.length <= outputHeadlineMaxLength) return line;
630
+ return `${line.slice(0, outputHeadlineMaxLength)}...`;
631
+ }
632
+ function formatOutputSection(streamName, output, truncated) {
633
+ return `${truncated ? `Child ${streamName} (last ${String(output.length)} chars)` : `Child ${streamName}`}:\n${output}`;
569
634
  }
570
635
  function getRunChildExecArgv() {
571
636
  const execArgv = [];
@@ -647,6 +712,7 @@ function applyChildEvalMetas(evals, childMetas) {
647
712
  }
648
713
  evalMeta.columnDefs = childMeta.columnDefs;
649
714
  evalMeta.caseCount = childMeta.caseCount;
715
+ evalMeta.caseIds = childMeta.caseIds;
650
716
  evalMeta.stats = childMeta.stats;
651
717
  evalMeta.charts = childMeta.charts;
652
718
  evalMeta.sourceFingerprint = childMeta.sourceFingerprint;
@@ -1064,6 +1130,7 @@ function createRunner({ watchForChanges = true } = {}) {
1064
1130
  sourceFingerprint,
1065
1131
  columnDefs,
1066
1132
  caseCount: null,
1133
+ caseIds: void 0,
1067
1134
  stats,
1068
1135
  charts,
1069
1136
  manualInputDescriptor,
@@ -1074,18 +1141,24 @@ function createRunner({ watchForChanges = true } = {}) {
1074
1141
  emitDiscoveryEvent();
1075
1142
  },
1076
1143
  async startRun(request) {
1144
+ const deletedTemporaryRuns = await deleteTemporaryRuns({
1145
+ runs,
1146
+ cancelRunningRun: killRunChild
1147
+ });
1077
1148
  const runId = generateRunId();
1078
1149
  const shortId = `r${String(nextShortIdNum++)}`;
1079
1150
  const now = (/* @__PURE__ */ new Date()).toISOString();
1080
1151
  const cacheMode = request.cache?.mode ?? "use";
1081
1152
  const runDir = join(localStateDir, "runs", runId);
1153
+ const gitState = readGitWorktreeState(workspaceRoot);
1082
1154
  const manifest = {
1083
1155
  id: runId,
1084
1156
  shortId,
1085
1157
  status: "running",
1158
+ temporary: request.temporary === true,
1086
1159
  startedAt: now,
1087
1160
  endedAt: null,
1088
- commitSha: readGitWorktreeState(workspaceRoot).commitSha,
1161
+ commitSha: gitState.commitSha,
1089
1162
  evalSourceFingerprints: {},
1090
1163
  target: request.target,
1091
1164
  trials: request.trials,
@@ -1162,6 +1235,7 @@ function createRunner({ watchForChanges = true } = {}) {
1162
1235
  emitDiscoveryEvent
1163
1236
  }
1164
1237
  });
1238
+ if (deletedTemporaryRuns > 0) emitDiscoveryEvent();
1165
1239
  return {
1166
1240
  manifest,
1167
1241
  summary,
@@ -1445,6 +1519,7 @@ Flags:
1445
1519
  --no-cache Shortcut for --cache bypass
1446
1520
  --refresh-cache Shortcut for --cache refresh
1447
1521
  --clear-cache Clear the cache before starting the run
1522
+ --temporary Persist until the next run starts, then delete
1448
1523
  --input <json> Manual input value for a single targeted eval
1449
1524
  that declares manualInput
1450
1525
  --input-file <path> JSON object keyed by eval key (or eval id) with
@@ -1515,6 +1590,7 @@ Options:
1515
1590
  --no-cache Shortcut for --cache bypass
1516
1591
  --refresh-cache Shortcut for --cache refresh
1517
1592
  --clear-cache Clear the cache before starting the run
1593
+ --temporary Persist until the next run starts, then delete
1518
1594
  --no-env Disable automatic .env loading
1519
1595
  --help, -h Show help
1520
1596
  `);
@@ -1753,6 +1829,7 @@ function parseArgs(argv) {
1753
1829
  port: 4100,
1754
1830
  cacheMode: "use",
1755
1831
  clearCache: false,
1832
+ temporary: false,
1756
1833
  all: false,
1757
1834
  loadEnv: normalizedArgv.length === argv.length,
1758
1835
  inputJson: void 0,
@@ -1803,6 +1880,7 @@ function parseArgs(argv) {
1803
1880
  } else if (arg === "--no-cache") args.cacheMode = "bypass";
1804
1881
  else if (arg === "--refresh-cache") args.cacheMode = "refresh";
1805
1882
  else if (arg === "--clear-cache") args.clearCache = true;
1883
+ else if (arg === "--temporary") args.temporary = true;
1806
1884
  else if (arg === "--input" && next !== void 0) {
1807
1885
  args.inputJson = next;
1808
1886
  i++;
@@ -1821,15 +1899,11 @@ function parseArgs(argv) {
1821
1899
  */
1822
1900
  async function runCli(argv) {
1823
1901
  const args = parseArgs(argv);
1824
- if (args.loadEnv && !loadWorkspaceEnv()) {
1825
- process.exit(1);
1826
- return;
1827
- }
1902
+ if (args.loadEnv && !loadWorkspaceEnv()) process.exit(1);
1828
1903
  if (args.showHelp) {
1829
1904
  if (args.unknownHelpTarget !== void 0) {
1830
1905
  console.error(`No help found for "${args.unknownHelpTarget}".`);
1831
1906
  process.exit(1);
1832
- return;
1833
1907
  }
1834
1908
  printHelp(args.helpTopic);
1835
1909
  return;
@@ -1892,6 +1966,11 @@ function loadWorkspaceEnv() {
1892
1966
  }
1893
1967
  return true;
1894
1968
  }
1969
+ function formatUnknownErrorDetails(error) {
1970
+ if (error instanceof Error) return error.stack ?? error.message;
1971
+ if (typeof error === "string") return error;
1972
+ return String(error);
1973
+ }
1895
1974
  const currentDir = dirname(fileURLToPath(import.meta.url));
1896
1975
  const repoRoot = resolve(currentDir, "../../..");
1897
1976
  const pnpmCommand = process.platform === "win32" ? "pnpm.cmd" : "pnpm";
@@ -1940,8 +2019,8 @@ async function commandApp(args) {
1940
2019
  const { serve } = await import("@hono/node-server");
1941
2020
  const bundledWebDist = resolve(currentDir, "apps/web/dist");
1942
2021
  if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
1943
- const appModule = await import("./app-BlNzXWDM.mjs");
1944
- const runnerModule = await import("./runner-znY6PY1M.mjs");
2022
+ const appModule = await import("./app-DD-8kx5H.mjs");
2023
+ const runnerModule = await import("./runner-CoRf7Vef.mjs");
1945
2024
  if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
1946
2025
  if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
1947
2026
  await runnerModule.initRunner();
@@ -1990,7 +2069,6 @@ async function commandRun(args) {
1990
2069
  if (args.evalIds.length === 0 && args.caseIds.length === 0 && args.files.length === 0 && !runner.getAllowCliRunAll()) {
1991
2070
  console.error("This workspace disables running all evals from the CLI. Pass --eval <id>, --file <path|glob>, or --case <id> to run a targeted subset.");
1992
2071
  process.exit(1);
1993
- return;
1994
2072
  }
1995
2073
  if (args.clearCache) {
1996
2074
  await runner.clearCache();
@@ -2025,26 +2103,30 @@ async function commandRun(args) {
2025
2103
  if (manualInputsResult.error !== null) {
2026
2104
  console.error(manualInputsResult.error);
2027
2105
  process.exit(1);
2028
- return;
2029
2106
  }
2030
- const run = await runner.startRun({
2107
+ const runResult = await resultify(() => runner.startRun({
2031
2108
  target,
2032
2109
  trials: args.trials,
2110
+ temporary: args.temporary,
2033
2111
  cache: { mode: args.cacheMode },
2034
2112
  manualInputs: manualInputsResult.value
2035
- });
2113
+ }));
2114
+ if (runResult.error) {
2115
+ console.error("Failed to start run:");
2116
+ console.error(formatUnknownErrorDetails(runResult.error));
2117
+ process.exit(1);
2118
+ }
2119
+ const run = runResult.value;
2036
2120
  if (!args.json) {
2037
2121
  console.info(`Run started: ${run.manifest.id}`);
2038
2122
  console.info(`Trials: ${String(args.trials)}`);
2039
2123
  if (args.cacheMode !== "use") console.info(`Cache mode: ${args.cacheMode}`);
2124
+ if (args.temporary) console.info("Temporary: yes");
2040
2125
  console.info("");
2041
2126
  }
2042
2127
  await waitForRunCompletion(runner, run.manifest.id);
2043
2128
  const finalRun = runner.getRun(run.manifest.id);
2044
- if (!finalRun) {
2045
- process.exit(1);
2046
- return;
2047
- }
2129
+ if (!finalRun) process.exit(1);
2048
2130
  const { summary } = finalRun;
2049
2131
  if (args.json) console.info(JSON.stringify(summary, null, 2));
2050
2132
  else {
@@ -2072,7 +2154,6 @@ async function commandShowRuns(args) {
2072
2154
  if (!run) {
2073
2155
  printMissingRun(runRef);
2074
2156
  process.exit(1);
2075
- return;
2076
2157
  }
2077
2158
  const index = buildRunFileIndex(runner.getWorkspaceRoot(), run);
2078
2159
  if (args.json) {
@@ -2136,7 +2217,6 @@ async function commandCache(args) {
2136
2217
  }
2137
2218
  console.info("Refusing to clear cache without --eval <id> or --all. Use one of these flags to confirm.");
2138
2219
  process.exit(1);
2139
- return;
2140
2220
  }
2141
2221
  printHelp(args.helpTopic);
2142
2222
  }
@@ -2152,6 +2232,7 @@ function buildRunFileIndex(workspaceRoot, run) {
2152
2232
  id: run.manifest.id,
2153
2233
  shortId: run.manifest.shortId,
2154
2234
  status: run.manifest.status,
2235
+ temporary: run.manifest.temporary,
2155
2236
  startedAt: run.manifest.startedAt,
2156
2237
  endedAt: run.manifest.endedAt,
2157
2238
  target: run.manifest.target,
@@ -2212,7 +2293,7 @@ function printRunFileIndexes(indexes) {
2212
2293
  }
2213
2294
  }
2214
2295
  function printRunFileIndex(index) {
2215
- console.info(`${index.shortId} (${index.id}) ${index.status} ${formatCaseCounts(index.summary)}`);
2296
+ console.info(`${index.shortId} (${index.id}) ${index.status}${index.temporary ? " temporary" : ""} ${formatCaseCounts(index.summary)}`);
2216
2297
  console.info(` dir: ${index.files.dir}`);
2217
2298
  console.info(` run: ${index.files.run}`);
2218
2299
  console.info(` summary: ${index.files.summary}`);