@ls-stack/agent-eval 0.37.0 → 0.39.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,8 +25,8 @@
25
25
  href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
26
26
  rel="stylesheet"
27
27
  />
28
- <script type="module" crossorigin src="/assets/index-BiwYbMem.js"></script>
29
- <link rel="stylesheet" crossorigin href="/assets/index-CKdoOah2.css">
28
+ <script type="module" crossorigin src="/assets/index-CO86PsY-.js"></script>
29
+ <link rel="stylesheet" crossorigin href="/assets/index-C6PgBOfP.css">
30
30
  </head>
31
31
  <body>
32
32
  <div id="root"></div>
package/dist/bin.mjs CHANGED
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env node
2
- import { t as runCli } from "./cli-CwGcJYWe.mjs";
2
+ import { t as runCli } from "./cli-BUX6tr9J.mjs";
3
3
  import { spawn } from "node:child_process";
4
4
  //#region src/bin.ts
5
5
  const moduleMocksFlag = "--experimental-test-module-mocks";
@@ -70,10 +70,21 @@ async function reexecWithNodeArgs(argv, execArgv) {
70
70
  });
71
71
  });
72
72
  }
73
+ function formatUnknownErrorDetails(error) {
74
+ if (error instanceof Error) return error.stack ?? error.message;
75
+ if (typeof error === "string") return error;
76
+ return String(error);
77
+ }
73
78
  const { argv, inspectArg } = parseDebugFlags(process.argv.slice(2));
74
79
  setRunChildInspectArg(inspectArg);
75
80
  const execArgv = buildExecArgv(inspectArg);
76
- if (needsModuleMocksFlag() || !execArgvMatches(execArgv)) await reexecWithNodeArgs(argv, execArgv);
77
- else await runCli(argv);
81
+ if (needsModuleMocksFlag() || !execArgvMatches(execArgv)) await reexecWithNodeArgs(argv, execArgv).catch((error) => {
82
+ console.error(formatUnknownErrorDetails(error));
83
+ process.exitCode = 1;
84
+ });
85
+ else await runCli(argv).catch((error) => {
86
+ console.error(formatUnknownErrorDetails(error));
87
+ process.exitCode = 1;
88
+ });
78
89
  //#endregion
79
90
  export {};
@@ -1,4 +1,4 @@
1
- import { C as parseEvalDiscovery, D as normalizeScoreDef, E as buildDeclaredColumnDefs, G as deriveScopedSummaryFromCases, O as validateCharts, S as loadEvalModule, St as getCaseRowEvalKey, T as loadConfig, U as getEvalTitle, V as applyDerivedCallAttributes, W as getEvalDisplayStatus, Y as runSummarySchema, _ as resolveTracePresentation, a as generateRunId, b as parseManualInputValues, bt as buildEvalKey, c as loadPersistedRunSnapshot, d as persistCaseDetail, f as deleteTemporaryRuns, g as runTouchesEval, gt as resolveLlmCallsConfig, h as recomputePersistedCaseStatus, hr as getEvalRegistry, ht as resolveApiCallsConfig, i as stripTerminalControlCodes, k as createFsCacheStore, l as loadPersistedRunSnapshots, m as recomputeEvalStatusesInRuns, n as getTargetEvalKeys, o as getLastRunStatuses, p as persistRunState, s as getLatestRunInfos, u as nextShortIdFromSnapshots, v as resolveArtifactPath, w as resolveEvalDefaultConfig, x as deriveEvalFreshness, xt as getCaseRowCaseKey, y as buildManualInputDescriptor } from "./runOrchestration-C4o5TcIu.mjs";
1
+ import { C as parseEvalDiscovery, D as normalizeScoreDef, E as buildDeclaredColumnDefs, G as deriveScopedSummaryFromCases, O as validateCharts, S as loadEvalModule, St as getCaseRowEvalKey, T as loadConfig, U as getEvalTitle, V as applyDerivedCallAttributes, W as getEvalDisplayStatus, Y as runSummarySchema, _ as resolveTracePresentation, a as generateRunId, b as parseManualInputValues, bt as buildEvalKey, c as loadPersistedRunSnapshot, d as persistCaseDetail, f as deleteTemporaryRuns, g as runTouchesEval, gt as resolveLlmCallsConfig, h as recomputePersistedCaseStatus, hr as getEvalRegistry, ht as resolveApiCallsConfig, i as stripTerminalControlCodes, k as createFsCacheStore, l as loadPersistedRunSnapshots, m as recomputeEvalStatusesInRuns, n as getTargetEvalKeys, o as getLastRunStatuses, p as persistRunState, s as getLatestRunInfos, u as nextShortIdFromSnapshots, v as resolveArtifactPath, w as resolveEvalDefaultConfig, x as deriveEvalFreshness, xt as getCaseRowCaseKey, y as buildManualInputDescriptor } from "./runOrchestration-BhUFWvq9.mjs";
2
2
  import { createHash, randomUUID } from "node:crypto";
3
3
  import { copyFile, mkdir, readFile, rm, writeFile } from "node:fs/promises";
4
4
  import { basename, dirname, extname, isAbsolute, join, relative, resolve, sep } from "node:path";
@@ -537,6 +537,7 @@ const runChildInspectArgEnv = "AGENT_EVALS_RUN_CHILD_INSPECT_ARG";
537
537
  const inspectFlagPrefix = "--inspect";
538
538
  const inspectBrkFlagPrefix = "--inspect-brk";
539
539
  const childOutputTailMaxLength = 12e3;
540
+ const outputHeadlineMaxLength = 240;
540
541
  function startRunChild(params) {
541
542
  const child = spawn(process.execPath, [
542
543
  ...getRunChildExecArgv(),
@@ -554,6 +555,11 @@ function startRunChild(params) {
554
555
  });
555
556
  const outputTail = createRunChildOutputTail(child);
556
557
  params.runState.childProcess = child;
558
+ child.once("error", (error) => {
559
+ if (params.runState.childProcess === child) params.runState.childProcess = void 0;
560
+ if (params.runState.manifest.status !== "running" || params.runState.childTerminalReceived) return;
561
+ markRunErrored(params.runState, `Failed to start run child: ${error.message}`, params.managerContext);
562
+ });
557
563
  child.on("message", (message) => {
558
564
  if (!isRunChildMessage(message)) return;
559
565
  handleRunChildMessage({
@@ -562,25 +568,31 @@ function startRunChild(params) {
562
568
  managerContext: params.managerContext
563
569
  });
564
570
  });
565
- child.once("exit", (code, signal) => {
571
+ child.once("close", (code, signal) => {
566
572
  if (params.runState.childProcess === child) params.runState.childProcess = void 0;
567
573
  if (params.runState.manifest.status !== "running" || params.runState.childTerminalReceived) return;
568
- const reason = signal !== null ? `Run child exited with signal ${signal}` : `Run child exited with code ${String(code)}`;
574
+ const reason = formatChildExitReason(code, signal);
569
575
  markRunErrored(params.runState, formatUnexpectedRunChildExit(reason, outputTail), params.managerContext);
570
576
  });
571
577
  }
572
578
  function createRunChildOutputTail(child) {
573
579
  const tail = {
574
580
  stdout: "",
575
- stderr: ""
581
+ stderr: "",
582
+ stdoutTruncated: false,
583
+ stderrTruncated: false
576
584
  };
577
585
  child.stdout?.on("data", (chunk) => {
578
586
  process.stdout.write(chunk);
579
- tail.stdout = appendOutputTail(tail.stdout, chunkToText(chunk));
587
+ const nextTail = appendOutputTail(tail.stdout, chunkToText(chunk));
588
+ tail.stdout = nextTail.text;
589
+ tail.stdoutTruncated = tail.stdoutTruncated || nextTail.truncated;
580
590
  });
581
591
  child.stderr?.on("data", (chunk) => {
582
592
  process.stderr.write(chunk);
583
- tail.stderr = appendOutputTail(tail.stderr, chunkToText(chunk));
593
+ const nextTail = appendOutputTail(tail.stderr, chunkToText(chunk));
594
+ tail.stderr = nextTail.text;
595
+ tail.stderrTruncated = tail.stderrTruncated || nextTail.truncated;
584
596
  });
585
597
  return tail;
586
598
  }
@@ -589,17 +601,37 @@ function chunkToText(chunk) {
589
601
  }
590
602
  function appendOutputTail(current, next) {
591
603
  const combined = current + next;
592
- if (combined.length <= childOutputTailMaxLength) return combined;
593
- return combined.slice(combined.length - childOutputTailMaxLength);
604
+ if (combined.length <= childOutputTailMaxLength) return {
605
+ text: combined,
606
+ truncated: false
607
+ };
608
+ return {
609
+ text: combined.slice(combined.length - childOutputTailMaxLength),
610
+ truncated: true
611
+ };
594
612
  }
595
613
  function formatUnexpectedRunChildExit(reason, outputTail) {
596
- const sections = [reason];
597
614
  const stderr = stripTerminalControlCodes(outputTail.stderr).trim();
598
615
  const stdout = stripTerminalControlCodes(outputTail.stdout).trim();
599
- if (stderr.length > 0) sections.push(`Child stderr (last ${String(stderr.length)} chars):\n${stderr}`);
600
- if (stdout.length > 0) sections.push(`Child stdout (last ${String(stdout.length)} chars):\n${stdout}`);
616
+ const headline = getChildStderrHeadline(stderr);
617
+ const sections = [headline === null ? `${reason} before sending a structured run error.` : `Run child exited before sending a structured run error: ${headline}`, reason];
618
+ if (stderr.length > 0) sections.push(formatOutputSection("stderr", stderr, outputTail.stderrTruncated));
619
+ if (stdout.length > 0) sections.push(formatOutputSection("stdout", stdout, outputTail.stdoutTruncated));
601
620
  return sections.join("\n\n");
602
621
  }
622
+ function formatChildExitReason(code, signal) {
623
+ if (signal !== null) return `Run child exited with signal ${signal}`;
624
+ return `Run child exited with code ${String(code)}`;
625
+ }
626
+ function getChildStderrHeadline(stderr) {
627
+ const line = stderr.split("\n").map((entry) => entry.trim()).find((entry) => entry.length > 0);
628
+ if (line === void 0) return null;
629
+ if (line.length <= outputHeadlineMaxLength) return line;
630
+ return `${line.slice(0, outputHeadlineMaxLength)}...`;
631
+ }
632
+ function formatOutputSection(streamName, output, truncated) {
633
+ return `${truncated ? `Child ${streamName} (last ${String(output.length)} chars)` : `Child ${streamName}`}:\n${output}`;
634
+ }
603
635
  function getRunChildExecArgv() {
604
636
  const execArgv = [];
605
637
  let skipNext = false;
@@ -1867,15 +1899,11 @@ function parseArgs(argv) {
1867
1899
  */
1868
1900
  async function runCli(argv) {
1869
1901
  const args = parseArgs(argv);
1870
- if (args.loadEnv && !loadWorkspaceEnv()) {
1871
- process.exit(1);
1872
- return;
1873
- }
1902
+ if (args.loadEnv && !loadWorkspaceEnv()) process.exit(1);
1874
1903
  if (args.showHelp) {
1875
1904
  if (args.unknownHelpTarget !== void 0) {
1876
1905
  console.error(`No help found for "${args.unknownHelpTarget}".`);
1877
1906
  process.exit(1);
1878
- return;
1879
1907
  }
1880
1908
  printHelp(args.helpTopic);
1881
1909
  return;
@@ -1938,6 +1966,11 @@ function loadWorkspaceEnv() {
1938
1966
  }
1939
1967
  return true;
1940
1968
  }
1969
+ function formatUnknownErrorDetails(error) {
1970
+ if (error instanceof Error) return error.stack ?? error.message;
1971
+ if (typeof error === "string") return error;
1972
+ return String(error);
1973
+ }
1941
1974
  const currentDir = dirname(fileURLToPath(import.meta.url));
1942
1975
  const repoRoot = resolve(currentDir, "../../..");
1943
1976
  const pnpmCommand = process.platform === "win32" ? "pnpm.cmd" : "pnpm";
@@ -1986,8 +2019,8 @@ async function commandApp(args) {
1986
2019
  const { serve } = await import("@hono/node-server");
1987
2020
  const bundledWebDist = resolve(currentDir, "apps/web/dist");
1988
2021
  if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
1989
- const appModule = await import("./app-C7ON9Wdh.mjs");
1990
- const runnerModule = await import("./runner-LqeHPID6.mjs");
2022
+ const appModule = await import("./app-DD-8kx5H.mjs");
2023
+ const runnerModule = await import("./runner-CoRf7Vef.mjs");
1991
2024
  if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
1992
2025
  if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
1993
2026
  await runnerModule.initRunner();
@@ -2036,7 +2069,6 @@ async function commandRun(args) {
2036
2069
  if (args.evalIds.length === 0 && args.caseIds.length === 0 && args.files.length === 0 && !runner.getAllowCliRunAll()) {
2037
2070
  console.error("This workspace disables running all evals from the CLI. Pass --eval <id>, --file <path|glob>, or --case <id> to run a targeted subset.");
2038
2071
  process.exit(1);
2039
- return;
2040
2072
  }
2041
2073
  if (args.clearCache) {
2042
2074
  await runner.clearCache();
@@ -2071,15 +2103,20 @@ async function commandRun(args) {
2071
2103
  if (manualInputsResult.error !== null) {
2072
2104
  console.error(manualInputsResult.error);
2073
2105
  process.exit(1);
2074
- return;
2075
2106
  }
2076
- const run = await runner.startRun({
2107
+ const runResult = await resultify(() => runner.startRun({
2077
2108
  target,
2078
2109
  trials: args.trials,
2079
2110
  temporary: args.temporary,
2080
2111
  cache: { mode: args.cacheMode },
2081
2112
  manualInputs: manualInputsResult.value
2082
- });
2113
+ }));
2114
+ if (runResult.error) {
2115
+ console.error("Failed to start run:");
2116
+ console.error(formatUnknownErrorDetails(runResult.error));
2117
+ process.exit(1);
2118
+ }
2119
+ const run = runResult.value;
2083
2120
  if (!args.json) {
2084
2121
  console.info(`Run started: ${run.manifest.id}`);
2085
2122
  console.info(`Trials: ${String(args.trials)}`);
@@ -2089,10 +2126,7 @@ async function commandRun(args) {
2089
2126
  }
2090
2127
  await waitForRunCompletion(runner, run.manifest.id);
2091
2128
  const finalRun = runner.getRun(run.manifest.id);
2092
- if (!finalRun) {
2093
- process.exit(1);
2094
- return;
2095
- }
2129
+ if (!finalRun) process.exit(1);
2096
2130
  const { summary } = finalRun;
2097
2131
  if (args.json) console.info(JSON.stringify(summary, null, 2));
2098
2132
  else {
@@ -2120,7 +2154,6 @@ async function commandShowRuns(args) {
2120
2154
  if (!run) {
2121
2155
  printMissingRun(runRef);
2122
2156
  process.exit(1);
2123
- return;
2124
2157
  }
2125
2158
  const index = buildRunFileIndex(runner.getWorkspaceRoot(), run);
2126
2159
  if (args.json) {
@@ -2184,7 +2217,6 @@ async function commandCache(args) {
2184
2217
  }
2185
2218
  console.info("Refusing to clear cache without --eval <id> or --all. Use one of these flags to confirm.");
2186
2219
  process.exit(1);
2187
- return;
2188
2220
  }
2189
2221
  printHelp(args.helpTopic);
2190
2222
  }