@ls-stack/agent-eval 0.12.0 → 0.12.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,7 +25,7 @@
25
25
  href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
26
26
  rel="stylesheet"
27
27
  />
28
- <script type="module" crossorigin src="/assets/index-BMe8ZR3n.js"></script>
28
+ <script type="module" crossorigin src="/assets/index-Drw0IpOd.js"></script>
29
29
  <link rel="stylesheet" crossorigin href="/assets/index-BVnLr79e.css">
30
30
  </head>
31
31
  <body>
package/dist/bin.mjs CHANGED
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env node
2
- import { t as runCli } from "./cli-COzPxKg2.mjs";
2
+ import { t as runCli } from "./cli-BEPaYHmX.mjs";
3
3
  import { spawn } from "node:child_process";
4
4
  //#region src/bin.ts
5
5
  const moduleMocksFlag = "--experimental-test-module-mocks";
@@ -1,4 +1,4 @@
1
- import { A as getEvalDisplayStatus, F as runSummarySchema, J as resolveLlmCallsConfig, _ as loadEvalModule, a as loadPersistedRunSnapshot, b as normalizeScoreDef, c as persistCaseDetail, d as recomputePersistedCaseStatus, dn as getEvalRegistry, f as runTouchesEval, g as setLatestRunInfoMap, h as getTargetEvalIds, i as getLatestRunInfos, j as deriveScopedSummaryFromCases, k as getEvalTitle, l as persistRunState, m as buildEvalSummary, n as generateRunId, o as loadPersistedRunSnapshots, p as resolveArtifactPath, q as resolveApiCallsConfig, r as getLastRunStatuses, s as nextShortIdFromSnapshots, u as recomputeEvalStatusesInRuns, v as loadConfig, x as createFsCacheStore, y as buildDeclaredColumnDefs } from "./runOrchestration-COFhQvTJ.mjs";
1
+ import { A as getEvalDisplayStatus, F as runSummarySchema, J as resolveLlmCallsConfig, _ as loadEvalModule, a as loadPersistedRunSnapshot, b as normalizeScoreDef, c as persistCaseDetail, d as recomputePersistedCaseStatus, dn as getEvalRegistry, f as runTouchesEval, g as setLatestRunInfoMap, h as getTargetEvalIds, i as getLatestRunInfos, j as deriveScopedSummaryFromCases, k as getEvalTitle, l as persistRunState, m as buildEvalSummary, n as generateRunId, o as loadPersistedRunSnapshots, p as resolveArtifactPath, q as resolveApiCallsConfig, r as getLastRunStatuses, s as nextShortIdFromSnapshots, u as recomputeEvalStatusesInRuns, v as loadConfig, x as createFsCacheStore, y as buildDeclaredColumnDefs } from "./runOrchestration-DrgpaDaf.mjs";
2
2
  import { createHash } from "node:crypto";
3
3
  import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
4
4
  import { dirname, join, relative, resolve } from "node:path";
@@ -1050,8 +1050,8 @@ async function commandApp(args) {
1050
1050
  const { serve } = await import("@hono/node-server");
1051
1051
  const bundledWebDist = resolve(currentDir, "apps/web/dist");
1052
1052
  if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
1053
- const appModule = await import("./app-DI3IUGb_.mjs");
1054
- const runnerModule = await import("./runner-sMZXoDp3.mjs");
1053
+ const appModule = await import("./app-D3wtTfbu.mjs");
1054
+ const runnerModule = await import("./runner-BHCokR_t.mjs");
1055
1055
  if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
1056
1056
  if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
1057
1057
  await runnerModule.initRunner();
package/dist/index.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { $ as evalFreshnessStatusSchema, $t as evalAssert, A as getEvalDisplayStatus, At as traceDisplayInputConfigSchema, B as apiCallMetricPlacementSchema, Bt as jsonCellSchema, C as updateManualScoreRequestSchema, Ct as spanCacheOptionsSchema, D as extractLlmCalls, Dt as traceAttributeDisplayPlacementSchema, E as extractApiCalls, Et as traceAttributeDisplayInputSchema, F as runSummarySchema, Ft as cellValueSchema, G as llmCallMetricSchema, Gt as buildTraceTree, H as apiCallsConfigSchema, Ht as repoFileRefSchema, I as DEFAULT_API_CALLS_CONFIG, It as columnDefSchema, J as resolveLlmCallsConfig, Jt as evalTracer, K as llmCallsConfigSchema, Kt as captureEvalSpanError, L as DEFAULT_LLM_CALLS_CONFIG, Lt as columnFormatSchema, M as deriveStatusFromCaseRows, Mt as traceSpanKindSchema, N as deriveStatusFromChildStatuses, Nt as traceSpanSchema, O as getNestedAttribute, Ot as traceAttributeDisplaySchema, P as runManifestSchema, Pt as traceSpanWarningSchema, Q as caseRowSchema, Qt as appendToEvalOutput, R as agentEvalsConfigSchema, Rt as columnKindSchema, S as createRunRequestSchema, St as serializedCacheSpanSchema, T as extractCacheHits, Tt as traceAttributeDisplayFormatSchema, U as llmCallMetricFormatSchema, Ut as runArtifactRefSchema, V as apiCallMetricSchema, Vt as numberDisplayOptionsSchema, W as llmCallMetricPlacementSchema, Wt as z, X as assertionFailureSchema, Xt as hashCacheKeySync, Y as trialSelectionModeSchema, Yt as hashCacheKey, Z as caseDetailSchema, Zt as EvalAssertionError, _t as cacheModeSchema, an as nextEvalId, at as evalChartAggregateSchema, bt as cacheRecordingSchema, cn as setScopeCacheContext, ct as evalChartColorSchema, dn as getEvalRegistry, dt as evalChartTooltipExtraSchema, en as getCurrentScope, et as evalStatAggregateSchema, ft as evalChartTypeSchema, gt as cacheListItemSchema, ht as cacheFileSchema, in as mergeEvalOutput, it as scoreTraceSchema, j as deriveScopedSummaryFromCases, jt as traceSpanErrorSchema, k as getEvalTitle, kt as 
traceDisplayConfigSchema, ln as repoFile, lt as evalChartConfigSchema, mt as cacheEntrySchema, nn as incrementEvalOutput, nt as evalStatsConfigSchema, on as runInEvalScope, ot as evalChartAxisSchema, pt as evalChartsConfigSchema, q as resolveApiCallsConfig, qt as evalSpan, rn as isInEvalScope, rt as evalSummarySchema, sn as setEvalOutput, st as evalChartBuiltinMetricSchema, tn as getEvalCaseInput, tt as evalStatItemSchema, un as defineEval, ut as evalChartMetricSchema, vt as cacheOperationTypeSchema, w as sseEnvelopeSchema, wt as traceCacheRefSchema, xt as cacheStatusSchema, yt as cacheRecordingOpSchema, z as apiCallMetricFormatSchema, zt as fileRefSchema } from "./runOrchestration-COFhQvTJ.mjs";
2
- import { n as createRunner, t as runCli } from "./cli-COzPxKg2.mjs";
3
- import "./src-OZSs693X.mjs";
1
+ import { $ as evalFreshnessStatusSchema, $t as evalAssert, A as getEvalDisplayStatus, At as traceDisplayInputConfigSchema, B as apiCallMetricPlacementSchema, Bt as jsonCellSchema, C as updateManualScoreRequestSchema, Ct as spanCacheOptionsSchema, D as extractLlmCalls, Dt as traceAttributeDisplayPlacementSchema, E as extractApiCalls, Et as traceAttributeDisplayInputSchema, F as runSummarySchema, Ft as cellValueSchema, G as llmCallMetricSchema, Gt as buildTraceTree, H as apiCallsConfigSchema, Ht as repoFileRefSchema, I as DEFAULT_API_CALLS_CONFIG, It as columnDefSchema, J as resolveLlmCallsConfig, Jt as evalTracer, K as llmCallsConfigSchema, Kt as captureEvalSpanError, L as DEFAULT_LLM_CALLS_CONFIG, Lt as columnFormatSchema, M as deriveStatusFromCaseRows, Mt as traceSpanKindSchema, N as deriveStatusFromChildStatuses, Nt as traceSpanSchema, O as getNestedAttribute, Ot as traceAttributeDisplaySchema, P as runManifestSchema, Pt as traceSpanWarningSchema, Q as caseRowSchema, Qt as appendToEvalOutput, R as agentEvalsConfigSchema, Rt as columnKindSchema, S as createRunRequestSchema, St as serializedCacheSpanSchema, T as extractCacheHits, Tt as traceAttributeDisplayFormatSchema, U as llmCallMetricFormatSchema, Ut as runArtifactRefSchema, V as apiCallMetricSchema, Vt as numberDisplayOptionsSchema, W as llmCallMetricPlacementSchema, Wt as z, X as assertionFailureSchema, Xt as hashCacheKeySync, Y as trialSelectionModeSchema, Yt as hashCacheKey, Z as caseDetailSchema, Zt as EvalAssertionError, _t as cacheModeSchema, an as nextEvalId, at as evalChartAggregateSchema, bt as cacheRecordingSchema, cn as setScopeCacheContext, ct as evalChartColorSchema, dn as getEvalRegistry, dt as evalChartTooltipExtraSchema, en as getCurrentScope, et as evalStatAggregateSchema, ft as evalChartTypeSchema, gt as cacheListItemSchema, ht as cacheFileSchema, in as mergeEvalOutput, it as scoreTraceSchema, j as deriveScopedSummaryFromCases, jt as traceSpanErrorSchema, k as getEvalTitle, kt as 
traceDisplayConfigSchema, ln as repoFile, lt as evalChartConfigSchema, mt as cacheEntrySchema, nn as incrementEvalOutput, nt as evalStatsConfigSchema, on as runInEvalScope, ot as evalChartAxisSchema, pt as evalChartsConfigSchema, q as resolveApiCallsConfig, qt as evalSpan, rn as isInEvalScope, rt as evalSummarySchema, sn as setEvalOutput, st as evalChartBuiltinMetricSchema, tn as getEvalCaseInput, tt as evalStatItemSchema, un as defineEval, ut as evalChartMetricSchema, vt as cacheOperationTypeSchema, w as sseEnvelopeSchema, wt as traceCacheRefSchema, xt as cacheStatusSchema, yt as cacheRecordingOpSchema, z as apiCallMetricFormatSchema, zt as fileRefSchema } from "./runOrchestration-DrgpaDaf.mjs";
2
+ import { n as createRunner, t as runCli } from "./cli-BEPaYHmX.mjs";
3
+ import "./src-BU6ZtVIB.mjs";
4
4
  export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, 
traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
package/dist/runChild.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { F as runSummarySchema, It as columnDefSchema, P as runManifestSchema, S as createRunRequestSchema, nt as evalStatsConfigSchema, pt as evalChartsConfigSchema, t as executeRun, v as loadConfig, x as createFsCacheStore } from "./runOrchestration-COFhQvTJ.mjs";
1
+ import { F as runSummarySchema, It as columnDefSchema, P as runManifestSchema, S as createRunRequestSchema, nt as evalStatsConfigSchema, pt as evalChartsConfigSchema, t as executeRun, v as loadConfig, x as createFsCacheStore } from "./runOrchestration-DrgpaDaf.mjs";
2
2
  import { createHash } from "node:crypto";
3
3
  import { readFile } from "node:fs/promises";
4
4
  import { z } from "zod/v4";
@@ -5676,6 +5676,20 @@ function pickWinningTrial(params) {
5676
5676
  if (medianAttempt === void 0) throw new Error("Expected at least one trial attempt");
5677
5677
  return medianAttempt;
5678
5678
  }
5679
/**
 * Turn an arbitrary thrown value into a printable description.
 *
 * This runs while another failure is already being handled, so it must never
 * throw and should never return an empty description for a real Error.
 *
 * @param {unknown} error - Whatever was thrown or rejected.
 * @returns {string} The stack trace when a non-blank one exists, otherwise the
 *   error message, otherwise a string rendering of the value.
 */
function formatUnknownErrorDetails(error) {
  if (error instanceof Error) {
    // `stack` is non-standard: it can be missing, non-string, or empty.
    // `??` alone would let an empty stack hide the message.
    const stack = typeof error.stack === "string" ? error.stack : "";
    if (stack.trim().length > 0) return stack;
    const message = typeof error.message === "string" ? error.message : "";
    return message.length > 0 ? message : "Unknown error";
  }
  if (typeof error === "string") return error;
  try {
    return String(error);
  } catch {
    // String() throws for values that cannot be converted to a primitive
    // (e.g. Object.create(null)); fall back to the built-in type tag.
    return Object.prototype.toString.call(error);
  }
}
5684
/**
 * Render the collected per-eval errors as one multi-line message.
 *
 * Each entry becomes `[evalId] <headline>`, followed on the next lines by any
 * remaining detail text (typically stack frames). Entries are joined with
 * newlines.
 *
 * @param {Array<{ evalId: string, details?: unknown }>} errors - Collected
 *   errors; `details` is normally a string whose first line is the headline.
 * @returns {string} The combined message; an empty string for an empty list.
 */
function buildRunErrorMessage(errors) {
  return errors
    .map(({ evalId, details }) => {
      // Coerce defensively: a missing or non-string `details` must not throw
      // while an error report is being assembled.
      const text = details == null ? "" : String(details).trim();
      // Trimming first means leading blank lines cannot produce an empty
      // headline; the first non-blank line is used instead.
      const [firstLine = "", ...remainingLines] = text.split("\n");
      // `||` rather than `??`: trim() always returns a string, so only an
      // emptiness check can trigger the fallback.
      const headline = firstLine.trim() || "Unknown error";
      const remainder = remainingLines.join("\n").trim();
      return remainder.length === 0
        ? `[${evalId}] ${headline}`
        : `[${evalId}] ${headline}\n${remainder}`;
    })
    .join("\n");
}
5679
5693
  async function finalizePreparedCase(params) {
5680
5694
  const { runState, runDir, preparedEval, preparedCase, onCaseFinished, emitEvent } = params;
5681
5695
  if (preparedCase.finalized || preparedCase.trialResults.length === 0) return;
@@ -5754,7 +5768,7 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
5754
5768
  if (!entry) {
5755
5769
  evalErrors.push({
5756
5770
  evalId: evalMeta.id,
5757
- message: `Eval "${evalMeta.id}" was not registered after importing ${evalFilePath}`
5771
+ details: `Eval "${evalMeta.id}" was not registered after importing ${evalFilePath}`
5758
5772
  });
5759
5773
  continue;
5760
5774
  }
@@ -5846,7 +5860,7 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
5846
5860
  console.error(`Error running eval ${evalMeta.id}:`, error);
5847
5861
  evalErrors.push({
5848
5862
  evalId: evalMeta.id,
5849
- message: error instanceof Error ? error.message : String(error)
5863
+ details: formatUnknownErrorDetails(error)
5850
5864
  });
5851
5865
  lastRunStatusMap.set(evalMeta.id, "error");
5852
5866
  latestRunInfoMap.set(evalMeta.id, {
@@ -5890,7 +5904,7 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
5890
5904
  runState.manifest.status = finalStatus;
5891
5905
  const completedRunAt = endTime.toISOString();
5892
5906
  runState.manifest.endedAt = completedRunAt;
5893
- runState.summary.errorMessage = evalErrors.length > 0 ? evalErrors.map((entry) => `[${entry.evalId}] ${entry.message}`).join("\n") : null;
5907
+ runState.summary.errorMessage = evalErrors.length > 0 ? buildRunErrorMessage(evalErrors) : null;
5894
5908
  for (const evalId of getTargetEvalIds({
5895
5909
  request,
5896
5910
  sortedEvalIds: getSortedEvalMetas().map((meta) => meta.id),
@@ -5918,7 +5932,7 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
5918
5932
  type: "run.error",
5919
5933
  runId: runState.manifest.id,
5920
5934
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
5921
- payload: { message: evalErrors.map((entry) => `[${entry.evalId}] ${entry.message}`).join("\n") }
5935
+ payload: { message: buildRunErrorMessage(evalErrors) }
5922
5936
  });
5923
5937
  else emitEvent(runState, {
5924
5938
  type: "run.finished",
@@ -5928,7 +5942,7 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
5928
5942
  });
5929
5943
  emitDiscoveryEvent();
5930
5944
  } catch (error) {
5931
- const message = error instanceof Error ? error.message : String(error);
5945
+ const message = formatUnknownErrorDetails(error);
5932
5946
  runState.manifest.status = "error";
5933
5947
  runState.manifest.endedAt = (/* @__PURE__ */ new Date()).toISOString();
5934
5948
  runState.summary.status = "error";
@@ -1,2 +1,2 @@
1
- import { n as initRunner, t as getRunnerInstance } from "./runner-nQjuRZGC.mjs";
1
+ import { n as initRunner, t as getRunnerInstance } from "./runner-BVC9yBDu.mjs";
2
2
  export { getRunnerInstance, initRunner };
@@ -1,5 +1,5 @@
1
- import { n as createRunner } from "./cli-COzPxKg2.mjs";
2
- import "./src-OZSs693X.mjs";
1
+ import { n as createRunner } from "./cli-BEPaYHmX.mjs";
2
+ import "./src-BU6ZtVIB.mjs";
3
3
  //#region ../../apps/server/src/runner.ts
4
4
  let runnerInstance = null;
5
5
  function getRunnerInstance() {
@@ -0,0 +1,3 @@
1
+ import "./runOrchestration-DrgpaDaf.mjs";
2
+ import "./cli-BEPaYHmX.mjs";
3
+ export {};
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ls-stack/agent-eval",
3
- "version": "0.12.0",
3
+ "version": "0.12.2",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "agent-evals": "./dist/bin.mjs"
@@ -1,3 +0,0 @@
1
- import "./runOrchestration-COFhQvTJ.mjs";
2
- import "./cli-COzPxKg2.mjs";
3
- export {};