@ls-stack/agent-eval 0.27.0 → 0.28.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,8 +25,8 @@
25
25
  href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
26
26
  rel="stylesheet"
27
27
  />
28
- <script type="module" crossorigin src="/assets/index-DR2haqvV.js"></script>
29
- <link rel="stylesheet" crossorigin href="/assets/index-DOXT0Y9V.css">
28
+ <script type="module" crossorigin src="/assets/index-Czer_MdN.js"></script>
29
+ <link rel="stylesheet" crossorigin href="/assets/index-8VE7b6RK.css">
30
30
  </head>
31
31
  <body>
32
32
  <div id="root"></div>
package/dist/bin.mjs CHANGED
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env node
2
- import { t as runCli } from "./cli-DRkwWgTj.mjs";
2
+ import { t as runCli } from "./cli-BQwRbqsL.mjs";
3
3
  import { spawn } from "node:child_process";
4
4
  //#region src/bin.ts
5
5
  const moduleMocksFlag = "--experimental-test-module-mocks";
@@ -1,4 +1,4 @@
1
- import { B as runSummarySchema, C as validateCharts, F as getEvalDisplayStatus, I as deriveScopedSummaryFromCases, P as getEvalTitle, S as normalizeScoreDef, Wn as getEvalRegistry, _ as loadEvalModule, a as getLastRunStatuses, b as loadConfig, c as loadPersistedRunSnapshots, ct as getCaseRowEvalKey, d as persistRunState, f as recomputeEvalStatusesInRuns, g as deriveEvalFreshness, h as resolveArtifactPath, i as generateRunId, l as nextShortIdFromSnapshots, m as runTouchesEval, n as getTargetEvalKeys, nt as resolveLlmCallsConfig, o as getLatestRunInfos, ot as buildEvalKey, p as recomputePersistedCaseStatus, s as loadPersistedRunSnapshot, st as getCaseRowCaseKey, tt as resolveApiCallsConfig, u as persistCaseDetail, v as parseEvalDiscovery, w as createFsCacheStore, x as buildDeclaredColumnDefs, y as resolveEvalDefaultConfig } from "./runOrchestration-DB0dwGrd.mjs";
1
+ import { C as normalizeScoreDef, F as getEvalTitle, I as getEvalDisplayStatus, L as deriveScopedSummaryFromCases, N as applyDerivedCallAttributes, S as buildDeclaredColumnDefs, T as createFsCacheStore, V as runSummarySchema, Yn as getEvalRegistry, _ as deriveEvalFreshness, a as getLastRunStatuses, b as resolveEvalDefaultConfig, c as loadPersistedRunSnapshots, d as persistRunState, dt as buildEvalKey, f as recomputeEvalStatusesInRuns, ft as getCaseRowCaseKey, g as resolveArtifactPath, h as resolveTracePresentation, i as generateRunId, l as nextShortIdFromSnapshots, m as runTouchesEval, n as getTargetEvalKeys, o as getLatestRunInfos, ot as resolveApiCallsConfig, p as recomputePersistedCaseStatus, pt as getCaseRowEvalKey, s as loadPersistedRunSnapshot, st as resolveLlmCallsConfig, u as persistCaseDetail, v as loadEvalModule, w as validateCharts, x as loadConfig, y as parseEvalDiscovery } from "./runOrchestration-ClWYWPen.mjs";
2
2
  import { createHash } from "node:crypto";
3
3
  import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
4
4
  import { dirname, join, relative, resolve } from "node:path";
@@ -60,6 +60,57 @@ function readGitWorktreeState(workspaceRoot) {
60
60
  return { commitSha: commitResult.status === 0 ? commitResult.stdout : null };
61
61
  }
62
62
  //#endregion
63
+ //#region ../runner/src/recalculateDerivedAttributes.ts
64
+ function getCaseArtifactFileIdForExistingRun(runState, caseRow) {
65
+ const caseKey = getCaseRowCaseKey(caseRow);
66
+ return runState.cases.some((existing) => existing.caseId === caseRow.caseId && getCaseRowCaseKey(existing) !== caseKey) ? caseKey : caseRow.caseId;
67
+ }
68
+ async function recalculateDerivedAttributesForCase(params) {
69
+ const { run, caseId } = params;
70
+ if (run.manifest.status === "running") return {
71
+ updated: false,
72
+ reason: "Run is still running"
73
+ };
74
+ const caseRow = run.cases.find((row) => getCaseRowCaseKey(row) === caseId || row.caseId === caseId);
75
+ if (!caseRow) return {
76
+ updated: false,
77
+ reason: "Case not found"
78
+ };
79
+ const caseKey = getCaseRowCaseKey(caseRow);
80
+ const caseDetail = run.caseDetails.get(caseKey);
81
+ if (!caseDetail) return {
82
+ updated: false,
83
+ reason: "Case detail not found"
84
+ };
85
+ const spansWithDerivedAttributes = applyDerivedCallAttributes({
86
+ spans: caseDetail.trace,
87
+ llmCallsConfig: params.llmCallsConfig,
88
+ apiCallsConfig: params.apiCallsConfig
89
+ });
90
+ let nextTrace = spansWithDerivedAttributes;
91
+ let nextTraceDisplay = caseDetail.traceDisplay;
92
+ const evalMeta = params.evals.get(getCaseRowEvalKey(caseRow));
93
+ const entry = evalMeta === void 0 ? void 0 : getEvalRegistry().get(evalMeta.id);
94
+ if (entry !== void 0) entry.use((evalDef) => {
95
+ const resolved = resolveTracePresentation(spansWithDerivedAttributes, params.traceDisplayConfig, evalDef.traceDisplay);
96
+ nextTrace = resolved.trace;
97
+ nextTraceDisplay = resolved.traceDisplay;
98
+ });
99
+ const nextCaseDetail = {
100
+ ...caseDetail,
101
+ trace: nextTrace,
102
+ traceDisplay: nextTraceDisplay
103
+ };
104
+ run.caseDetails.set(caseKey, nextCaseDetail);
105
+ const artifactFileId = getCaseArtifactFileIdForExistingRun(run, caseRow);
106
+ await writeFile(join(run.runDir, "traces", `${encodeURIComponent(artifactFileId)}.json`), JSON.stringify(nextCaseDetail.trace, null, 2));
107
+ await params.persistCaseDetail(run.runDir, nextCaseDetail, artifactFileId);
108
+ return {
109
+ updated: true,
110
+ caseDetail: nextCaseDetail
111
+ };
112
+ }
113
+ //#endregion
63
114
  //#region ../runner/src/runChildProtocol.ts
64
115
  function isRunChildMessage(value) {
65
116
  if (typeof value !== "object" || value === null) return false;
@@ -251,7 +302,7 @@ async function markRunTerminalFromChild(runState, event, managerContext) {
251
302
  managerContext.emitDiscoveryEvent();
252
303
  }
253
304
  //#endregion
254
- //#region ../runner/src/runner.ts
305
+ //#region ../runner/src/watchRoots.ts
255
306
  const globMagicCharacters = new Set([
256
307
  "*",
257
308
  "?",
@@ -285,6 +336,8 @@ function getWatchRootsForIncludePatterns(params) {
285
336
  if (roots.size === 0) return [params.workspaceRoot];
286
337
  return [...roots];
287
338
  }
339
+ //#endregion
340
+ //#region ../runner/src/runner.ts
288
341
  /** Create an in-memory eval runner bound to the current workspace config. */
289
342
  function createRunner({ watchForChanges = true } = {}) {
290
343
  let config;
@@ -373,6 +426,22 @@ function createRunner({ watchForChanges = true } = {}) {
373
426
  emitDiscoveryEvent();
374
427
  return { updatedRuns };
375
428
  },
429
+ async recalculateDerivedAttributesForCase({ runId, caseId }) {
430
+ const run = runs.get(runId);
431
+ if (!run) return {
432
+ updated: false,
433
+ reason: "Run not found"
434
+ };
435
+ return recalculateDerivedAttributesForCase({
436
+ run,
437
+ caseId,
438
+ llmCallsConfig,
439
+ apiCallsConfig,
440
+ traceDisplayConfig: config.traceDisplay,
441
+ evals,
442
+ persistCaseDetail
443
+ });
444
+ },
376
445
  async cleanRunsForEval(evalKey) {
377
446
  const evalMeta = resolveEvalMeta(evalKey);
378
447
  let deletedRuns = 0;
@@ -529,6 +598,8 @@ function createRunner({ watchForChanges = true } = {}) {
529
598
  discoveredEntry?.use((evalDef) => {
530
599
  const defaultConfig = resolveEvalDefaultConfig({
531
600
  evalDef,
601
+ globalColumns: config.columns,
602
+ globalStats: config.stats,
532
603
  globalRemove: config.removeDefaultConfig
533
604
  });
534
605
  columnDefs = buildDeclaredColumnDefs(defaultConfig.columns, evalDef.scores, evalDef.manualScores);
@@ -1041,8 +1112,8 @@ async function commandApp(args) {
1041
1112
  const { serve } = await import("@hono/node-server");
1042
1113
  const bundledWebDist = resolve(currentDir, "apps/web/dist");
1043
1114
  if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
1044
- const appModule = await import("./app-ByMLOds2.mjs");
1045
- const runnerModule = await import("./runner-39KGoaM1.mjs");
1115
+ const appModule = await import("./app-mBbAN-Gt.mjs");
1116
+ const runnerModule = await import("./runner-BQn_xf36.mjs");
1046
1117
  if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
1047
1118
  if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
1048
1119
  await runnerModule.initRunner();