@ls-stack/agent-eval 0.42.1 → 0.42.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { $ as getCurrentScope, B as deserializeCacheValue, F as evalSpan, G as readManualInputFile, H as serializeCacheValue, I as evalTracer, J as appendToEvalOutput, K as evalExpect, L as hashCacheKey, M as z, N as buildTraceTree, P as captureEvalSpanError, Q as evalTime, R as hashCacheKeySync, Rt as getEvalRegistry, U as repoFile, V as serializeCacheRecording, W as manualInputFileValueSchema, X as evalAssert, Z as evalLog, _t as extractLlmCalls, at as nextEvalId, ct as runInExistingEvalScope, dt as startEvalBackgroundJob, et as getEvalCaseInput, gt as extractApiCalls, ht as extractCacheHits, it as mergeEvalOutput, lt as setEvalOutput, mt as extractCacheEntries, nt as isInEvalScope, ot as runInEvalRuntimeScope, q as EvalAssertionError, st as runInEvalScope, tt as incrementEvalOutput, ut as setScopeCacheContext, vt as simulateLlmCallCost, xt as getNestedAttribute, yt as simulateTokenAllocation, z as deserializeCacheRecording } from "./runOrchestration-SPaHx-SC.mjs";
2
- import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-DbVfkr9T.mjs";
3
- import { n as matchesEvalTags, t as defineEval } from "./src-CANi3gpd.mjs";
1
+ import { $ as getCurrentScope, B as deserializeCacheValue, F as evalSpan, G as readManualInputFile, H as serializeCacheValue, I as evalTracer, J as appendToEvalOutput, K as evalExpect, L as hashCacheKey, Lt as getEvalRegistry, M as z, N as buildTraceTree, P as captureEvalSpanError, Q as evalTime, R as hashCacheKeySync, U as repoFile, V as serializeCacheRecording, W as manualInputFileValueSchema, X as evalAssert, Z as evalLog, _t as extractLlmCalls, at as nextEvalId, ct as runInExistingEvalScope, dt as startEvalBackgroundJob, et as getEvalCaseInput, gt as extractApiCalls, ht as extractCacheHits, it as mergeEvalOutput, lt as setEvalOutput, mt as extractCacheEntries, nt as isInEvalScope, ot as runInEvalRuntimeScope, q as EvalAssertionError, st as runInEvalScope, tt as incrementEvalOutput, ut as setScopeCacheContext, vt as simulateLlmCallCost, xt as getNestedAttribute, yt as simulateTokenAllocation, z as deserializeCacheRecording } from "./runOrchestration-OVUFw1fL.mjs";
2
+ import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-BeJCJMQo.mjs";
3
+ import { n as matchesEvalTags, t as defineEval } from "./src-D7_xKo7h.mjs";
4
4
  export { EvalAssertionError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
package/dist/runChild.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { At as buildEvalKey, D as loadConfig, Dt as runSummarySchema, Et as runManifestSchema, Ft as evalChartsConfigSchema, It as columnDefSchema, Nt as evalStatsConfigSchema, Pt as manualInputDescriptorSchema, T as parseEvalDiscovery, Y as configureEvalRunLogs, ft as createRunRequestSchema, h as persistRunState, j as createFsCacheStore, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-SPaHx-SC.mjs";
1
+ import { At as buildEvalKey, D as loadConfig, Dt as runSummarySchema, Et as runManifestSchema, Ft as columnDefSchema, Mt as evalStatsConfigSchema, Nt as manualInputDescriptorSchema, Pt as evalChartsConfigSchema, T as parseEvalDiscovery, Y as configureEvalRunLogs, ft as createRunRequestSchema, h as persistRunState, j as createFsCacheStore, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-OVUFw1fL.mjs";
2
2
  import { z } from "zod/v4";
3
3
  import { readFile } from "node:fs/promises";
4
4
  import { relative } from "node:path";
@@ -866,10 +866,6 @@ function buildCaseKey(params) {
866
866
  encodeURIComponent(params.caseId)
867
867
  ].join("#");
868
868
  }
869
- /** Return the collision-safe eval key stored on a row, falling back for legacy data. */
870
- function getCaseRowEvalKey(row) {
871
- return row.evalKey ?? row.evalId;
872
- }
873
869
  /** Return the collision-safe case key stored on a row, falling back for legacy data. */
874
870
  function getCaseRowCaseKey(row) {
875
871
  return row.caseKey ?? row.caseId;
@@ -6586,9 +6582,9 @@ function recomputePersistedCaseStatus(caseRow, caseDetail, scoreThresholds) {
6586
6582
  return caseRow.status === "error" ? "error" : "pass";
6587
6583
  }
6588
6584
  function runTouchesEval(params) {
6589
- if (params.caseRows.some((caseRow) => getCaseRowEvalKey(caseRow) === params.evalKey || caseRow.evalKey === void 0 && caseRow.evalId === params.evalId)) return true;
6585
+ if (params.caseRows.some((caseRow) => caseRow.evalKey === params.evalKey)) return true;
6590
6586
  if (params.target.mode === "all") return params.evalExists;
6591
- if (params.target.mode === "evalIds") return params.target.evalKeys?.includes(params.evalKey) ?? params.target.evalIds?.includes(params.evalId ?? params.evalKey) ?? false;
6587
+ if (params.target.mode === "evalIds") return params.target.evalKeys?.includes(params.evalKey) ?? false;
6592
6588
  return false;
6593
6589
  }
6594
6590
  async function deleteTemporaryRuns(params) {
@@ -6619,13 +6615,12 @@ async function recomputeEvalStatusesInRuns(params) {
6619
6615
  target: run.manifest.target,
6620
6616
  caseRows: run.cases,
6621
6617
  evalKey: params.evalKey,
6622
- evalId: params.evalId,
6623
6618
  evalExists: params.evalExists
6624
6619
  })) continue;
6625
6620
  if (run.manifest.status === "running") continue;
6626
6621
  let changed = false;
6627
6622
  for (const caseRow of run.cases) {
6628
- if (getCaseRowEvalKey(caseRow) !== params.evalKey && !(caseRow.evalKey === void 0 && caseRow.evalId === params.evalId)) continue;
6623
+ if (caseRow.evalKey !== params.evalKey) continue;
6629
6624
  const caseDetail = run.caseDetails.get(getCaseRowCaseKey(caseRow));
6630
6625
  const nextStatus = recomputePersistedCaseStatus(caseRow, caseDetail, params.scoreThresholds);
6631
6626
  if (caseRow.status === nextStatus) continue;
@@ -6708,15 +6703,14 @@ function getLastRunStatuses(params) {
6708
6703
  function getLatestRunInfos(params) {
6709
6704
  const { runs, knownEvals } = params;
6710
6705
  const knownEvalMetas = [...knownEvals];
6711
- const evalIdByKey = new Map(knownEvalMetas.map((evalMeta) => [evalMeta.key, evalMeta.id]));
6712
6706
  const manualScoreKeysByEval = new Map(knownEvalMetas.map((evalMeta) => [evalMeta.key, evalMeta.columnDefs.filter((columnDef) => columnDef.isManualScore === true).map((columnDef) => columnDef.key)]));
6713
6707
  const orderedRuns = [...runs].toSorted((a, b) => new Date(getRunFreshnessTimestamp(a.manifest)).getTime() - new Date(getRunFreshnessTimestamp(b.manifest)).getTime());
6714
6708
  const latestRunInfos = /* @__PURE__ */ new Map();
6715
6709
  for (const run of orderedRuns) for (const evalKey of getRunEvalKeys(run, knownEvalMetas)) latestRunInfos.set(evalKey, {
6716
- status: getEvalStatusForRun(run, evalKey, evalIdByKey.get(evalKey), manualScoreKeysByEval.get(evalKey) ?? []),
6710
+ status: getEvalStatusForRun(run, evalKey, manualScoreKeysByEval.get(evalKey) ?? []),
6717
6711
  startedAt: getRunFreshnessTimestamp(run.manifest),
6718
6712
  commitSha: run.manifest.commitSha ?? null,
6719
- evalSourceFingerprint: run.manifest.evalSourceFingerprints[evalKey] ?? run.manifest.evalSourceFingerprints[evalIdByKey.get(evalKey) ?? ""] ?? null
6713
+ evalSourceFingerprint: run.manifest.evalSourceFingerprints[evalKey] ?? null
6720
6714
  });
6721
6715
  return latestRunInfos;
6722
6716
  }
@@ -6775,19 +6769,14 @@ async function readCaseDetails(runDir) {
6775
6769
  }
6776
6770
  function getRunEvalKeys(run, knownEvals) {
6777
6771
  const knownEvalMetas = [...knownEvals];
6778
- const evalKeys = new Set(run.cases.map(getCaseRowEvalKey));
6779
- for (const caseRow of run.cases) {
6780
- if (caseRow.evalKey !== void 0) continue;
6781
- for (const evalMeta of knownEvalMetas) if (evalMeta.id === caseRow.evalId) evalKeys.add(evalMeta.key);
6782
- }
6783
- if (run.manifest.target.mode === "evalIds") {
6784
- for (const evalKey of run.manifest.target.evalKeys ?? []) evalKeys.add(evalKey);
6785
- for (const evalId of run.manifest.target.evalIds ?? []) for (const evalMeta of knownEvalMetas) if (evalMeta.id === evalId) evalKeys.add(evalMeta.key);
6786
- } else if (run.manifest.target.mode === "all" && evalKeys.size === 0) for (const evalMeta of knownEvalMetas) evalKeys.add(evalMeta.key);
6772
+ const evalKeys = /* @__PURE__ */ new Set();
6773
+ for (const caseRow of run.cases) if (caseRow.evalKey !== void 0) evalKeys.add(caseRow.evalKey);
6774
+ if (run.manifest.target.mode === "evalIds") for (const evalKey of run.manifest.target.evalKeys ?? []) evalKeys.add(evalKey);
6775
+ else if (run.manifest.target.mode === "all" && evalKeys.size === 0) for (const evalMeta of knownEvalMetas) evalKeys.add(evalMeta.key);
6787
6776
  return [...evalKeys];
6788
6777
  }
6789
- function getEvalStatusForRun(run, evalKey, evalId, manualScoreKeys) {
6790
- const evalCases = run.cases.filter((caseRow) => getCaseRowEvalKey(caseRow) === evalKey || caseRow.evalKey === void 0 && caseRow.evalId === evalId);
6778
+ function getEvalStatusForRun(run, evalKey, manualScoreKeys) {
6779
+ const evalCases = run.cases.filter((caseRow) => caseRow.evalKey === evalKey);
6791
6780
  if (evalCases.length > 0) {
6792
6781
  if (hasPendingManualScores(evalCases, manualScoreKeys)) return "unscored";
6793
6782
  return toLastRunStatus$1(deriveStatusFromCaseRows({ caseRows: evalCases }));
@@ -7809,4 +7798,4 @@ function toLastRunStatus(status) {
7809
7798
  return status === "pending" ? null : status;
7810
7799
  }
7811
7800
  //#endregion
7812
- export { getCurrentScope as $, validateCharts as A, buildEvalKey as At, deserializeCacheValue as B, deriveEvalFreshness as C, getEvalDisplayStatus as Ct, loadConfig as D, runSummarySchema as Dt, resolveEvalDefaultConfig as E, runManifestSchema as Et, evalSpan as F, evalChartsConfigSchema as Ft, readManualInputFile as G, serializeCacheValue as H, evalTracer as I, columnDefSchema as It, appendToEvalOutput as J, evalExpect as K, hashCacheKey as L, defineEval as Lt, z$1 as M, getCaseRowEvalKey as Mt, buildTraceTree as N, evalStatsConfigSchema as Nt, buildDeclaredColumnDefs as O, resolveApiCallsConfig as Ot, captureEvalSpanError as P, manualInputDescriptorSchema as Pt, evalTime as Q, hashCacheKeySync as R, getEvalRegistry as Rt, parseManualInputValues as S, getEvalTitle as St, parseEvalDiscovery as T, matchesTagsFilter as Tt, repoFile as U, serializeCacheRecording as V, manualInputFileValueSchema as W, evalAssert as X, configureEvalRunLogs as Y, evalLog as Z, recomputePersistedCaseStatus as _, extractLlmCalls as _t, validateTagsFilters as a, nextEvalId as at, resolveArtifactPath as b, applyDerivedCallAttributes as bt, getLastRunStatuses as c, runInExistingEvalScope as ct, loadPersistedRunSnapshots as d, startEvalBackgroundJob as dt, getEvalCaseInput as et, nextShortIdFromSnapshots as f, createRunRequestSchema as ft, recomputeEvalStatusesInRuns as g, extractApiCalls as gt, persistRunState as h, extractCacheHits as ht, resolveEvalTags as i, mergeEvalOutput as it, createFsCacheStore as j, getCaseRowCaseKey as jt, normalizeScoreDef as k, resolveLlmCallsConfig as kt, getLatestRunInfos as l, setEvalOutput as lt, deleteTemporaryRuns as m, extractCacheEntries as mt, getTargetEvalKeys as n, isInEvalScope as nt, stripTerminalControlCodes as o, runInEvalRuntimeScope as ot, persistCaseDetail as p, updateManualScoreRequestSchema as pt, EvalAssertionError as q, getTargetEvals as r, matchesEvalTags as rt, generateRunId as s, runInEvalScope as st, executeRun as t, incrementEvalOutput as tt, loadPersistedRunSnapshot as u, setScopeCacheContext as ut, runTouchesEval as v, simulateLlmCallCost as vt, loadEvalModule as w, deriveScopedSummaryFromCases as wt, buildManualInputDescriptor as x, getNestedAttribute as xt, resolveTracePresentation as y, simulateTokenAllocation as yt, deserializeCacheRecording as z };
7801
+ export { getCurrentScope as $, validateCharts as A, buildEvalKey as At, deserializeCacheValue as B, deriveEvalFreshness as C, getEvalDisplayStatus as Ct, loadConfig as D, runSummarySchema as Dt, resolveEvalDefaultConfig as E, runManifestSchema as Et, evalSpan as F, columnDefSchema as Ft, readManualInputFile as G, serializeCacheValue as H, evalTracer as I, defineEval as It, appendToEvalOutput as J, evalExpect as K, hashCacheKey as L, getEvalRegistry as Lt, z$1 as M, evalStatsConfigSchema as Mt, buildTraceTree as N, manualInputDescriptorSchema as Nt, buildDeclaredColumnDefs as O, resolveApiCallsConfig as Ot, captureEvalSpanError as P, evalChartsConfigSchema as Pt, evalTime as Q, hashCacheKeySync as R, parseManualInputValues as S, getEvalTitle as St, parseEvalDiscovery as T, matchesTagsFilter as Tt, repoFile as U, serializeCacheRecording as V, manualInputFileValueSchema as W, evalAssert as X, configureEvalRunLogs as Y, evalLog as Z, recomputePersistedCaseStatus as _, extractLlmCalls as _t, validateTagsFilters as a, nextEvalId as at, resolveArtifactPath as b, applyDerivedCallAttributes as bt, getLastRunStatuses as c, runInExistingEvalScope as ct, loadPersistedRunSnapshots as d, startEvalBackgroundJob as dt, getEvalCaseInput as et, nextShortIdFromSnapshots as f, createRunRequestSchema as ft, recomputeEvalStatusesInRuns as g, extractApiCalls as gt, persistRunState as h, extractCacheHits as ht, resolveEvalTags as i, mergeEvalOutput as it, createFsCacheStore as j, getCaseRowCaseKey as jt, normalizeScoreDef as k, resolveLlmCallsConfig as kt, getLatestRunInfos as l, setEvalOutput as lt, deleteTemporaryRuns as m, extractCacheEntries as mt, getTargetEvalKeys as n, isInEvalScope as nt, stripTerminalControlCodes as o, runInEvalRuntimeScope as ot, persistCaseDetail as p, updateManualScoreRequestSchema as pt, EvalAssertionError as q, getTargetEvals as r, matchesEvalTags as rt, generateRunId as s, runInEvalScope as st, executeRun as t, incrementEvalOutput as tt, loadPersistedRunSnapshot as u, setScopeCacheContext as ut, runTouchesEval as v, simulateLlmCallCost as vt, loadEvalModule as w, deriveScopedSummaryFromCases as wt, buildManualInputDescriptor as x, getNestedAttribute as xt, resolveTracePresentation as y, simulateTokenAllocation as yt, deserializeCacheRecording as z };
@@ -1,5 +1,5 @@
1
- import { n as createRunner } from "./cli-DbVfkr9T.mjs";
2
- import "./src-CANi3gpd.mjs";
1
+ import { n as createRunner } from "./cli-BeJCJMQo.mjs";
2
+ import "./src-D7_xKo7h.mjs";
3
3
  //#region ../../apps/server/src/runner.ts
4
4
  let runnerInstance = null;
5
5
  function getRunnerInstance() {
@@ -1,2 +1,2 @@
1
- import { n as initRunner, t as getRunnerInstance } from "./runner-DiCQ57JQ.mjs";
1
+ import { n as initRunner, t as getRunnerInstance } from "./runner-B1KygirW.mjs";
2
2
  export { getRunnerInstance, initRunner };
@@ -1,5 +1,5 @@
1
- import { Lt as defineEval$1, rt as matchesEvalTags$1 } from "./runOrchestration-SPaHx-SC.mjs";
2
- import "./cli-DbVfkr9T.mjs";
1
+ import { It as defineEval$1, rt as matchesEvalTags$1 } from "./runOrchestration-OVUFw1fL.mjs";
2
+ import "./cli-BeJCJMQo.mjs";
3
3
  //#region src/index.ts
4
4
  /** Register an eval definition with typed tag support. */
5
5
  function defineEval(definition) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ls-stack/agent-eval",
3
- "version": "0.42.1",
3
+ "version": "0.42.2",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "agent-evals": "./dist/bin.mjs"
@@ -490,10 +490,10 @@ Mental model:
490
490
  - Cache identity is the namespace plus the authored key. Source-file
491
491
  fingerprints are tracked for run freshness separately, but do not participate
492
492
  in cache-key hashing.
493
- - Cached spans require an explicit `cache.namespace`; value caches default to
494
- `${evalId}__${name}` and can be overridden with `namespace`. Matching
495
- namespaces share entries across operations/evals that use the same authored
496
- key.
493
+ - Cached spans require an explicit `cache.namespace`. Value caches can also set
494
+ an explicit `namespace`; prefer doing that when the cache is part of a
495
+ documented workflow. Matching namespaces share entries across operations/evals
496
+ that use the same authored key.
497
497
  - Per eval, `cache: { read?: boolean; store?: boolean }` controls whether
498
498
  authored cached operations may read or persist entries. Both default to
499
499
  `true`. Use `read: false` to always execute instead of replaying hits, and
@@ -536,10 +536,17 @@ Mental model:
536
536
 
537
537
  ## Artifacts
538
538
 
539
- Run output lives under `.agent-evals/runs/<run-id>/` and cache entries under
540
- `.agent-evals/cache/<eval-id>.json`. Files in a run directory include run
541
- metadata, a run summary, per-case results, and per-case trace JSON. Inspect
542
- these when debugging persisted output, costs, columns, traces, or failures.
539
+ Run output lives under `.agent-evals/runs/<run-id>/`. Cache metadata lives under
540
+ `.agent-evals/cache/`, grouped into runner-managed owner files. Do not rely on a
541
+ specific cache filename when authoring evals; configure cache namespaces
542
+ manually in eval code, then use `agent-evals cache list` or the UI Cache tab to
543
+ inspect the persisted namespace/key entries. Files in a run directory include
544
+ run metadata, a run summary, per-case results, and per-case trace JSON. Inspect
545
+ run files when debugging persisted output, costs, columns, traces, or failures;
546
+ inspect cache entries when debugging replayed span/value-cache results.
547
+ Targeted evals in `run.json` are recorded by exact `evalKeys`
548
+ (`filePath + evalId`) rather than authored eval ids, so duplicate eval ids stay
549
+ unambiguous in saved history.
543
550
  Temporary runs use the same directory layout, but are removed before the next
544
551
  run of any kind starts.
545
552