@ls-stack/agent-eval 0.58.1 → 0.58.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { $ as startEvalBackgroundJob, A as repoFile, B as getCurrentScope, C as evalTracer, D as deserializeCacheValue, E as deserializeCacheRecording, F as appendToEvalOutput, G as mergeEvalOutput, H as incrementEvalOutput, J as runInEvalScope, K as nextEvalId, L as evalAssert, M as readManualInputFile, N as evalExpect, Nt as getEvalRegistry, O as serializeCacheRecording, P as EvalAssertionError, Q as setScopeCacheContext, R as evalLog, S as evalSpan, T as hashCacheKeySync, U as isInEvalScope, V as getEvalCaseInput, Y as runInExistingEvalScope, Z as setEvalOutput, at as extractLlmCalls, b as buildTraceTree, it as extractApiCalls, j as manualInputFileValueSchema, k as serializeCacheValue, lt as getNestedAttribute, nt as extractCacheEntries, ot as simulateLlmCallCost, q as runInEvalRuntimeScope, rt as extractCacheHits, st as simulateTokenAllocation, w as hashCacheKey, x as captureEvalSpanError, y as z, z as evalTime } from "./runExecution-d42Lm0i5.mjs";
2
- import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-_g2qOMK6.mjs";
3
- import { n as matchesEvalTags, t as defineEval } from "./src-CdZsOn6y.mjs";
4
- export { EvalAssertionError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
1
+ import { $ as setScopeCacheContext, A as repoFile, B as evalTime, C as evalTracer, D as deserializeCacheValue, E as deserializeCacheRecording, F as EvalRuntimeUsageError, Ft as getEvalRegistry, H as getEvalCaseInput, I as appendToEvalOutput, J as runInEvalRuntimeScope, K as mergeEvalOutput, M as readManualInputFile, N as evalExpect, O as serializeCacheRecording, P as EvalAssertionError, Q as setEvalOutput, R as evalAssert, S as evalSpan, T as hashCacheKeySync, U as incrementEvalOutput, V as getCurrentScope, W as isInEvalScope, X as runInExistingEvalScope, Y as runInEvalScope, at as extractApiCalls, b as buildTraceTree, ct as simulateTokenAllocation, et as startEvalBackgroundJob, it as extractCacheHits, j as manualInputFileValueSchema, k as serializeCacheValue, ot as extractLlmCalls, q as nextEvalId, rt as extractCacheEntries, st as simulateLlmCallCost, ut as getNestedAttribute, w as hashCacheKey, x as captureEvalSpanError, y as z, z as evalLog } from "./runExecution-CFw0MQFs.mjs";
2
+ import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-SP4kEtYL.mjs";
3
+ import { n as matchesEvalTags, t as defineEval } from "./src-7GbQj1sb.mjs";
4
+ export { EvalAssertionError, EvalRuntimeUsageError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
package/dist/runChild.mjs CHANGED
@@ -1,5 +1,5 @@
1
- import { At as evalChartsConfigSchema, Ct as buildEvalKey, Dt as evalStatAggregateSchema, I as configureEvalRunLogs, Ot as evalStatsConfigSchema, bt as runSummarySchema, et as createRunRequestSchema, jt as columnDefSchema, kt as manualInputDescriptorSchema, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, v as createFsCacheStore, yt as runManifestSchema } from "./runExecution-d42Lm0i5.mjs";
2
- import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-CvmFeOmT.mjs";
1
+ import { At as evalStatsConfigSchema, L as configureEvalRunLogs, Mt as evalChartsConfigSchema, Nt as columnDefSchema, Tt as buildEvalKey, bt as runManifestSchema, jt as manualInputDescriptorSchema, kt as evalStatAggregateSchema, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, tt as createRunRequestSchema, v as createFsCacheStore, xt as runSummarySchema } from "./runExecution-CFw0MQFs.mjs";
2
+ import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-CxjiQmof.mjs";
3
3
  import { z } from "zod/v4";
4
4
  import { readFile } from "node:fs/promises";
5
5
  import { relative } from "node:path";
@@ -851,6 +851,7 @@ const runLogLevelSchema = z.enum([
851
851
  const runLogPhaseSchema = z.enum([
852
852
  "eval",
853
853
  "derive",
854
+ "tracingAssertions",
854
855
  "outputsSchema",
855
856
  "scorer"
856
857
  ]);
@@ -1008,6 +1009,8 @@ const removeDefaultConfigSchema = z.union([z.literal(true), z.array(defaultConfi
1008
1009
  const evalDeriveValueFnSchema = z.custom((value) => typeof value === "function", { message: "Expected a derive output function" });
1009
1010
  /** Schema for keyed or object-returning trace-derived output config. */
1010
1011
  const evalDeriveConfigSchema = z.union([z.custom((value) => typeof value === "function", { message: "Expected a deriveFromTracing function" }), z.record(z.string().min(1), evalDeriveValueFnSchema)]);
1012
+ /** Schema for trace-derived assertion config. */
1013
+ const evalTracingAssertionsConfigSchema = z.custom((value) => typeof value === "function", { message: "Expected a tracingAssertions function" });
1011
1014
  /** Schema for UI overrides on derived or scored columns. */
1012
1015
  const evalColumnOverrideSchema = z.object({
1013
1016
  label: z.string().optional(),
@@ -1411,6 +1414,7 @@ const agentEvalsConfigSchema = z.object({
1411
1414
  traceDisplay: traceDisplayInputConfigSchema.optional(),
1412
1415
  columns: evalColumnsSchema.optional(),
1413
1416
  deriveFromTracing: evalDeriveConfigSchema.optional(),
1417
+ tracingAssertions: evalTracingAssertionsConfigSchema.optional(),
1414
1418
  stats: evalStatsConfigSchema.optional(),
1415
1419
  defaultStatAggregate: evalStatAggregateSchema.optional(),
1416
1420
  llmCalls: llmCallsConfigSchema.optional(),
@@ -1847,8 +1851,9 @@ function deriveScopedSummaryFromCases(params) {
1847
1851
  * freshness state.
1848
1852
  */
1849
1853
  function getEvalDisplayStatus(params) {
1850
- const { stale, outdated, lastRunStatus, isRunning = false } = params;
1854
+ const { stale, outdated, lastRunStatus, isRunning = false, isEnqueued = false } = params;
1851
1855
  if (isRunning || lastRunStatus === "running") return "running";
1856
+ if (isEnqueued) return "enqueued";
1852
1857
  if (lastRunStatus === "pass") {
1853
1858
  if (stale) return "stale";
1854
1859
  if (outdated) return "outdated";
@@ -2718,6 +2723,17 @@ var EvalAssertionError = class extends Error {
2718
2723
  this.name = "EvalAssertionError";
2719
2724
  }
2720
2725
  };
2726
+ /** Error thrown when an SDK helper is used in an unsupported runner phase. */
2727
+ var EvalRuntimeUsageError = class extends Error {
2728
+ constructor(message) {
2729
+ super(message);
2730
+ this.name = "EvalRuntimeUsageError";
2731
+ }
2732
+ };
2733
+ /** Throw when assertion helpers are used in a runner phase that forbids them. */
2734
+ function assertEvalAssertionsAllowed(apiName) {
2735
+ if (getCurrentScope() && runtimeScopeStorage.getStore() === "derive") throw new EvalRuntimeUsageError(`${apiName} cannot be used inside deriveFromTracing. Use tracingAssertions for trace-derived assertions.`);
2736
+ }
2721
2737
  function getEvalClockStateNowMs(state) {
2722
2738
  const elapsedMs = state.frozen ? 0 : realDate.now() - state.realStartMs;
2723
2739
  return state.startMs + elapsedMs + state.offsetMs;
@@ -2823,8 +2839,10 @@ function recordSpanForActiveCacheRecording(scope, spanId) {
2823
2839
  *
2824
2840
  * Returns `null` outside eval-owned work, `env` while the runner is loading
2825
2841
  * eval modules for a run, `cases` while generating cases, `eval` while running
2826
- * case `execute`, `derive` while deriving outputs from traces, `outputsSchema`
2827
- * while validating outputs, and `scorer` while computing scores.
2842
+ * case `execute`, `derive` while deriving outputs from traces,
2843
+ * `tracingAssertions` while checking trace-derived assertions,
2844
+ * `outputsSchema` while validating outputs, and `scorer` while computing
2845
+ * scores.
2828
2846
  */
2829
2847
  function isInEvalScope() {
2830
2848
  if (activeEvalRuntimeScopeCount === 0) return null;
@@ -2845,7 +2863,7 @@ function normalizeLogLevel(level) {
2845
2863
  }
2846
2864
  function getCurrentLogPhase() {
2847
2865
  const runtimeScope = runtimeScopeStorage.getStore();
2848
- if (runtimeScope === "eval" || runtimeScope === "derive" || runtimeScope === "outputsSchema" || runtimeScope === "scorer") return runtimeScope;
2866
+ if (runtimeScope === "eval" || runtimeScope === "derive" || runtimeScope === "tracingAssertions" || runtimeScope === "outputsSchema" || runtimeScope === "scorer") return runtimeScope;
2849
2867
  return null;
2850
2868
  }
2851
2869
  function formatLogArgs(args) {
@@ -3300,10 +3318,12 @@ function incrementEvalOutput(key, delta) {
3300
3318
  * Calls made outside `runInEvalScope(...)` are ignored so shared workflow code
3301
3319
  * can safely reuse `evalAssert(...)` when it also runs outside an eval. The
3302
3320
  * TypeScript assertion signature still narrows the checked value after the
3303
- * call.
3321
+ * call. Calls inside `deriveFromTracing` throw because derivations must only
3322
+ * write outputs; use `tracingAssertions` for trace-derived pass/fail checks.
3304
3323
  */
3305
3324
  function evalAssert(condition, message) {
3306
3325
  const scope = getCurrentScope();
3326
+ assertEvalAssertionsAllowed("evalAssert(...)");
3307
3327
  if (condition) {
3308
3328
  if (scope) scope.assertions.push({
3309
3329
  message,
@@ -3454,6 +3474,7 @@ var EvalExpectationImpl = class EvalExpectationImpl {
3454
3474
  * case scope is active, matching `evalAssert(...)`.
3455
3475
  */
3456
3476
  function evalExpect(value) {
3477
+ assertEvalAssertionsAllowed("evalExpect(...)");
3457
3478
  return new EvalExpectationImpl(value, false);
3458
3479
  }
3459
3480
  //#endregion
@@ -5005,25 +5026,53 @@ const evalTracer = {
5005
5026
  };
5006
5027
  /** Build a queryable trace tree helper from a flat span list and checkpoints. */
5007
5028
  function buildTraceTree(spans, checkpoints) {
5029
+ const rootSpans = spans.filter((s) => s.parentId === null);
5030
+ const flattenDfs = () => {
5031
+ const result = [];
5032
+ function visit(parentId) {
5033
+ for (const childSpan of spans) if (childSpan.parentId === parentId) {
5034
+ result.push(childSpan);
5035
+ visit(childSpan.id);
5036
+ }
5037
+ }
5038
+ visit(null);
5039
+ return result;
5040
+ };
5041
+ const filterSpanNames = (sourceSpans, kind) => {
5042
+ return sourceSpans.filter((span) => kind === void 0 || span.kind === kind).map((span) => span.name);
5043
+ };
5008
5044
  return {
5009
5045
  spans,
5010
- rootSpans: spans.filter((s) => s.parentId === null),
5046
+ rootSpans,
5011
5047
  findSpan(name) {
5012
5048
  return spans.find((s) => s.name === name);
5013
5049
  },
5050
+ findSpans(name) {
5051
+ return spans.filter((s) => s.name === name);
5052
+ },
5053
+ hasSpan(name) {
5054
+ return spans.some((s) => s.name === name);
5055
+ },
5014
5056
  findSpansByKind(kind) {
5015
5057
  return spans.filter((s) => s.kind === kind);
5016
5058
  },
5059
+ findToolCallSpans() {
5060
+ return spans.filter((s) => s.kind === "tool");
5061
+ },
5062
+ listToolCallSpanNames() {
5063
+ return filterSpanNames(spans, "tool");
5064
+ },
5065
+ hasToolCallSpan(name) {
5066
+ return spans.some((s) => s.kind === "tool" && s.name === name);
5067
+ },
5068
+ listSpanNames(kind) {
5069
+ return filterSpanNames(spans, kind);
5070
+ },
5071
+ listSpanNamesDfs(kind) {
5072
+ return filterSpanNames(flattenDfs(), kind);
5073
+ },
5017
5074
  flattenDfs() {
5018
- const result = [];
5019
- function visit(parentId) {
5020
- for (const childSpan of spans) if (childSpan.parentId === parentId) {
5021
- result.push(childSpan);
5022
- visit(childSpan.id);
5023
- }
5024
- }
5025
- visit(null);
5026
- return result;
5075
+ return flattenDfs();
5027
5076
  },
5028
5077
  checkpoints
5029
5078
  };
@@ -6665,7 +6714,7 @@ async function resolveDeriveFromTracingConfig(params) {
6665
6714
  return derived;
6666
6715
  }
6667
6716
  async function runDeriveFromTracingConfig(params) {
6668
- if (params.deriveFromTracing === void 0) return;
6717
+ if (params.deriveFromTracing === void 0) return null;
6669
6718
  const { deriveFromTracing } = params;
6670
6719
  try {
6671
6720
  const derived = await runInExistingEvalScope(params.scope, "derive", async () => await resolveDeriveFromTracingConfig({
@@ -6677,13 +6726,43 @@ async function runDeriveFromTracingConfig(params) {
6677
6726
  outputs: params.scope.outputs,
6678
6727
  derived
6679
6728
  });
6729
+ return null;
6680
6730
  } catch (e) {
6731
+ if (e instanceof EvalRuntimeUsageError) return e;
6681
6732
  const message = `deriveFromTracing threw: ${e instanceof Error ? e.message : String(e)}`;
6682
6733
  recordAssertionFailure(params.scope, toAssertionFailure(message, e instanceof Error ? e : void 0));
6734
+ return null;
6683
6735
  }
6684
6736
  }
6737
+ async function runOneTracingAssertion(params) {
6738
+ const { label, tracingAssertion, scope, traceTree, evalCase } = params;
6739
+ const failureCountBefore = scope.assertionFailures.length;
6740
+ const ctx = {
6741
+ trace: traceTree,
6742
+ input: evalCase.input,
6743
+ case: evalCase
6744
+ };
6745
+ try {
6746
+ await runInExistingEvalScope(scope, "tracingAssertions", async () => {
6747
+ await callUnknownFunction(tracingAssertion, [ctx]);
6748
+ });
6749
+ } catch (e) {
6750
+ if (e instanceof EvalAssertionError && scope.assertionFailures.length > failureCountBefore) return;
6751
+ recordAssertionFailure(scope, toAssertionFailure(`${label} threw: ${e instanceof Error ? e.message : String(e)}`, e instanceof Error ? e : void 0));
6752
+ }
6753
+ }
6754
+ async function runTracingAssertionsConfig(params) {
6755
+ if (params.tracingAssertions === void 0) return;
6756
+ await runOneTracingAssertion({
6757
+ label: "tracingAssertions",
6758
+ tracingAssertion: params.tracingAssertions,
6759
+ scope: params.scope,
6760
+ traceTree: params.traceTree,
6761
+ evalCase: params.evalCase
6762
+ });
6763
+ }
6685
6764
  async function runCase(params) {
6686
- const { evalDef, evalId, evalKey = evalId, evalCase, globalTraceDisplay, globalColumns, globalDeriveFromTracing, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode, moduleIsolation, evalFilePath, evalFileRelativePath = evalFilePath, workspaceRoot, artifactDir, runId } = params;
6765
+ const { evalDef, evalId, evalKey = evalId, evalCase, globalTraceDisplay, globalColumns, globalDeriveFromTracing, globalTracingAssertions, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode, moduleIsolation, evalFilePath, evalFileRelativePath = evalFilePath, workspaceRoot, artifactDir, runId } = params;
6687
6766
  const scopedIdPrefix = buildScopedEvalIdPrefix({
6688
6767
  evalId,
6689
6768
  evalFilePath,
@@ -6728,22 +6807,36 @@ async function runCase(params) {
6728
6807
  apiCallsConfig
6729
6808
  });
6730
6809
  const traceTree = buildTraceTree(spansWithDerivedAttributes, scope.checkpoints);
6731
- const nonAssertError = executeError && !(executeError instanceof EvalAssertionError) ? executeError : null;
6810
+ let nonAssertError = executeError && !(executeError instanceof EvalAssertionError) ? executeError : null;
6732
6811
  if (executeError instanceof EvalAssertionError && scope.assertionFailures.length === 0) recordAssertionFailure(scope, toAssertionFailure(executeError.message, executeError));
6733
6812
  if (!nonAssertError) {
6734
- await runDeriveFromTracingConfig({
6813
+ nonAssertError = await runDeriveFromTracingConfig({
6735
6814
  deriveFromTracing: globalDeriveFromTracing,
6736
6815
  scope,
6737
6816
  traceTree,
6738
6817
  evalCase
6739
6818
  });
6740
- await runDeriveFromTracingConfig({
6819
+ if (!nonAssertError) nonAssertError = await runDeriveFromTracingConfig({
6741
6820
  deriveFromTracing: evalDef.deriveFromTracing,
6742
6821
  scope,
6743
6822
  traceTree,
6744
6823
  evalCase
6745
6824
  });
6746
6825
  }
6826
+ if (!nonAssertError) {
6827
+ await runTracingAssertionsConfig({
6828
+ tracingAssertions: globalTracingAssertions,
6829
+ scope,
6830
+ traceTree,
6831
+ evalCase
6832
+ });
6833
+ await runTracingAssertionsConfig({
6834
+ tracingAssertions: evalDef.tracingAssertions,
6835
+ scope,
6836
+ traceTree,
6837
+ evalCase
6838
+ });
6839
+ }
6747
6840
  if (!nonAssertError) addDefaultOutputs({
6748
6841
  outputs: scope.outputs,
6749
6842
  spans: spansWithDerivedAttributes,
@@ -6933,4 +7026,4 @@ function recordAssertionFailure(scope, failure) {
6933
7026
  });
6934
7027
  }
6935
7028
  //#endregion
6936
- export { startEvalBackgroundJob as $, repoFile as A, evalChartsConfigSchema as At, getCurrentScope as B, evalTracer as C, buildEvalKey as Ct, deserializeCacheValue as D, evalStatAggregateSchema as Dt, deserializeCacheRecording as E, caseRowSchema as Et, appendToEvalOutput as F, mergeEvalOutput as G, incrementEvalOutput as H, configureEvalRunLogs as I, runInEvalScope as J, nextEvalId as K, evalAssert as L, readManualInputFile as M, defineEval as Mt, evalExpect as N, getEvalRegistry as Nt, serializeCacheRecording as O, evalStatsConfigSchema as Ot, EvalAssertionError as P, runWithEvalRegistry as Pt, setScopeCacheContext as Q, evalLog as R, evalSpan as S, resolveLlmCallsConfig as St, hashCacheKeySync as T, caseDetailSchema as Tt, isInEvalScope as U, getEvalCaseInput as V, matchesEvalTags as W, runWithEvalClock as X, runInExistingEvalScope as Y, setEvalOutput as Z, createBufferedCacheStore as _, validateEvalTagName as _t, isCaseChildParentMessage as a, extractLlmCalls as at, buildTraceTree as b, runSummarySchema as bt, resolveArtifactPath as c, applyDerivedCallAttributes as ct, loadEvalModule as d, getEvalDisplayStatus as dt, createRunRequestSchema as et, resolveEvalDefaultConfig as f, deriveScopedSummaryFromCases as ft, commitPendingCacheWrites as g, matchesTagsFilter as gt, normalizeScoreDef as h, dedupeEvalTags as ht, isCaseChildMessage as i, extractApiCalls as it, manualInputFileValueSchema as j, columnDefSchema as jt, serializeCacheValue as k, manualInputDescriptorSchema as kt, registerAgentEvalsPackageResolutionHooks as l, getNestedAttribute as lt, buildDeclaredColumnDefs as m, deriveStatusFromChildStatuses as mt, resolveRunnableEvalCases as n, extractCacheEntries as nt, stripTerminalControlCodes as o, simulateLlmCallCost as ot, loadConfig as p, deriveStatusFromCaseRows as pt, runInEvalRuntimeScope as q, runCase as r, extractCacheHits as rt, resolveTracePresentation as s, simulateTokenAllocation as st, filterEvalCases as t, updateManualScoreRequestSchema as tt, runWithModuleIsolation as u, getEvalTitle as ut, createFsCacheStore as v, validateTagsFilterExpression as vt, hashCacheKey as w, getCaseRowCaseKey as wt, captureEvalSpanError as x, resolveApiCallsConfig as xt, z$1 as y, runManifestSchema as yt, evalTime as z };
7029
+ export { setScopeCacheContext as $, repoFile as A, evalStatsConfigSchema as At, evalTime as B, evalTracer as C, resolveLlmCallsConfig as Ct, deserializeCacheValue as D, caseDetailSchema as Dt, deserializeCacheRecording as E, getCaseRowCaseKey as Et, EvalRuntimeUsageError as F, getEvalRegistry as Ft, matchesEvalTags as G, getEvalCaseInput as H, appendToEvalOutput as I, runWithEvalRegistry as It, runInEvalRuntimeScope as J, mergeEvalOutput as K, configureEvalRunLogs as L, readManualInputFile as M, evalChartsConfigSchema as Mt, evalExpect as N, columnDefSchema as Nt, serializeCacheRecording as O, caseRowSchema as Ot, EvalAssertionError as P, defineEval as Pt, setEvalOutput as Q, evalAssert as R, evalSpan as S, resolveApiCallsConfig as St, hashCacheKeySync as T, buildEvalKey as Tt, incrementEvalOutput as U, getCurrentScope as V, isInEvalScope as W, runInExistingEvalScope as X, runInEvalScope as Y, runWithEvalClock as Z, createBufferedCacheStore as _, matchesTagsFilter as _t, isCaseChildParentMessage as a, extractApiCalls as at, buildTraceTree as b, runManifestSchema as bt, resolveArtifactPath as c, simulateTokenAllocation as ct, loadEvalModule as d, getEvalTitle as dt, startEvalBackgroundJob as et, resolveEvalDefaultConfig as f, getEvalDisplayStatus as ft, commitPendingCacheWrites as g, dedupeEvalTags as gt, normalizeScoreDef as h, deriveStatusFromChildStatuses as ht, isCaseChildMessage as i, extractCacheHits as it, manualInputFileValueSchema as j, manualInputDescriptorSchema as jt, serializeCacheValue as k, evalStatAggregateSchema as kt, registerAgentEvalsPackageResolutionHooks as l, applyDerivedCallAttributes as lt, buildDeclaredColumnDefs as m, deriveStatusFromCaseRows as mt, resolveRunnableEvalCases as n, updateManualScoreRequestSchema as nt, stripTerminalControlCodes as o, extractLlmCalls as ot, loadConfig as p, deriveScopedSummaryFromCases as pt, nextEvalId as q, runCase as r, extractCacheEntries as rt, resolveTracePresentation as s, simulateLlmCallCost as st, filterEvalCases as t, createRunRequestSchema as tt, runWithModuleIsolation as u, getNestedAttribute as ut, createFsCacheStore as v, validateEvalTagName as vt, hashCacheKey as w, buildCaseKey as wt, captureEvalSpanError as x, runSummarySchema as xt, z$1 as y, validateTagsFilterExpression as yt, evalLog as z };
@@ -1,4 +1,4 @@
1
- import { Et as caseRowSchema, Pt as runWithEvalRegistry, Tt as caseDetailSchema, X as runWithEvalClock, _t as validateEvalTagName, bt as runSummarySchema, d as loadEvalModule, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, g as commitPendingCacheWrites, gt as matchesTagsFilter, ht as dedupeEvalTags, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromChildStatuses, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveStatusFromCaseRows, q as runInEvalRuntimeScope, t as filterEvalCases, u as runWithModuleIsolation, vt as validateTagsFilterExpression, wt as getCaseRowCaseKey, yt as runManifestSchema } from "./runExecution-d42Lm0i5.mjs";
1
+ import { Dt as caseDetailSchema, Et as getCaseRowCaseKey, It as runWithEvalRegistry, J as runInEvalRuntimeScope, Ot as caseRowSchema, Z as runWithEvalClock, _t as matchesTagsFilter, bt as runManifestSchema, d as loadEvalModule, f as resolveEvalDefaultConfig, g as commitPendingCacheWrites, gt as dedupeEvalTags, ht as deriveStatusFromChildStatuses, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromCaseRows, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveScopedSummaryFromCases, t as filterEvalCases, u as runWithModuleIsolation, vt as validateEvalTagName, wt as buildCaseKey, xt as runSummarySchema, yt as validateTagsFilterExpression } from "./runExecution-CFw0MQFs.mjs";
2
2
  import { readFile, readdir, rm, writeFile } from "node:fs/promises";
3
3
  import { dirname, join } from "node:path";
4
4
  import { existsSync } from "node:fs";
@@ -1030,6 +1030,7 @@ async function executeQueuedCases(params) {
1030
1030
  async function executeQueuedCase(params) {
1031
1031
  const { queuedCase, globalTraceDisplay } = params;
1032
1032
  const startTime = Date.now();
1033
+ await queuedCase.onStart?.();
1033
1034
  const result = await queuedCase.execute({
1034
1035
  globalTraceDisplay,
1035
1036
  startTime
@@ -1249,6 +1250,32 @@ function buildRunErrorMessage(errors) {
1249
1250
  return `[${entry.evalId}] ${messageLine}\n${details}`;
1250
1251
  }).join("\n");
1251
1252
  }
1253
+ function upsertCaseRow(caseRows, nextCaseRow) {
1254
+ const existingIndex = caseRows.findIndex((caseRow) => getCaseRowCaseKey(caseRow) === getCaseRowCaseKey(nextCaseRow) && caseRow.trial === nextCaseRow.trial);
1255
+ if (existingIndex === -1) {
1256
+ caseRows.push(nextCaseRow);
1257
+ return;
1258
+ }
1259
+ caseRows[existingIndex] = nextCaseRow;
1260
+ }
1261
+ function removeLiveCaseRows(caseRows, nextCaseRow) {
1262
+ const caseKey = getCaseRowCaseKey(nextCaseRow);
1263
+ for (let i = caseRows.length - 1; i >= 0; i--) {
1264
+ const caseRow = caseRows[i];
1265
+ if (caseRow === void 0) continue;
1266
+ if (getCaseRowCaseKey(caseRow) !== caseKey) continue;
1267
+ if (caseRow.status !== "pending" && caseRow.status !== "running") continue;
1268
+ caseRows.splice(i, 1);
1269
+ }
1270
+ }
1271
+ function emitCaseRowEvent(params) {
1272
+ params.emitEvent(params.runState, {
1273
+ type: params.type,
1274
+ runId: params.runState.manifest.id,
1275
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
1276
+ payload: params.caseRow
1277
+ });
1278
+ }
1252
1279
  async function finalizePreparedCase(params) {
1253
1280
  const { runState, runDir, preparedEval, preparedCase, onCaseFinished, emitEvent } = params;
1254
1281
  if (preparedCase.finalized || preparedCase.trialResults.length === 0) return;
@@ -1263,7 +1290,8 @@ async function finalizePreparedCase(params) {
1263
1290
  pendingWrites: winningTrial.pendingCacheWrites
1264
1291
  });
1265
1292
  const artifactFileId = getCaseArtifactFileId(runState, winningTrial.caseRow);
1266
- runState.cases.push(winningTrial.caseRow);
1293
+ removeLiveCaseRows(runState.cases, winningTrial.caseRow);
1294
+ upsertCaseRow(runState.cases, winningTrial.caseRow);
1267
1295
  runState.caseDetails.set(getCaseRowCaseKey(winningTrial.caseRow), winningTrial.caseDetail);
1268
1296
  if (winningTrial.caseRow.status === "pass") runState.summary.passedCases++;
1269
1297
  else if (winningTrial.caseRow.status === "error") runState.summary.errorCases++;
@@ -1271,11 +1299,11 @@ async function finalizePreparedCase(params) {
1271
1299
  await writeFile(join(runDir, "traces", `${encodeURIComponent(artifactFileId)}.json`), JSON.stringify(winningTrial.caseDetail.trace, null, 2));
1272
1300
  await persistCaseDetail(runDir, winningTrial.caseDetail, artifactFileId);
1273
1301
  onCaseFinished?.(winningTrial.caseDetail, winningTrial.caseRow);
1274
- emitEvent(runState, {
1302
+ emitCaseRowEvent({
1303
+ runState,
1304
+ emitEvent,
1275
1305
  type: "case.finished",
1276
- runId: runState.manifest.id,
1277
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
1278
- payload: winningTrial.caseRow
1306
+ caseRow: winningTrial.caseRow
1279
1307
  });
1280
1308
  preparedEval.evalCaseRows.push(winningTrial.caseRow);
1281
1309
  }
@@ -1437,13 +1465,52 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
1437
1465
  preparedEvals.push(preparedEval);
1438
1466
  for (const evalCase of cases) {
1439
1467
  const trialResults = [];
1468
+ const liveCaseRow = {
1469
+ caseId: evalCase.id,
1470
+ evalId: evalMeta.id,
1471
+ evalKey: evalMeta.key,
1472
+ caseKey: buildCaseKey({
1473
+ filePath: evalMeta.filePath,
1474
+ evalId: evalMeta.id,
1475
+ caseId: evalCase.id
1476
+ }),
1477
+ tags: evalCase.tags,
1478
+ status: "pending",
1479
+ durationMs: null,
1480
+ cacheHits: 0,
1481
+ cacheOperations: 0,
1482
+ columns: {},
1483
+ trial: 0
1484
+ };
1440
1485
  const preparedCase = {
1441
1486
  caseId: evalCase.id,
1487
+ liveCaseRow,
1442
1488
  trialResults,
1443
1489
  finalized: false
1444
1490
  };
1445
1491
  preparedCases.push(preparedCase);
1492
+ upsertCaseRow(runState.cases, liveCaseRow);
1493
+ emitCaseRowEvent({
1494
+ runState,
1495
+ emitEvent,
1496
+ type: "case.updated",
1497
+ caseRow: liveCaseRow
1498
+ });
1446
1499
  for (let trial = 0; trial < request.trials; trial++) queuedCases.push({
1500
+ onStart: () => {
1501
+ if (preparedCase.finalized) return;
1502
+ preparedCase.liveCaseRow = {
1503
+ ...preparedCase.liveCaseRow,
1504
+ status: "running"
1505
+ };
1506
+ upsertCaseRow(runState.cases, preparedCase.liveCaseRow);
1507
+ emitCaseRowEvent({
1508
+ runState,
1509
+ emitEvent,
1510
+ type: "case.started",
1511
+ caseRow: preparedCase.liveCaseRow
1512
+ });
1513
+ },
1447
1514
  execute: async ({ startTime, globalTraceDisplay }) => await executeCaseChild({
1448
1515
  evalId: evalMeta.id,
1449
1516
  evalKey: evalMeta.key,
@@ -1,2 +1,2 @@
1
- import { n as initRunner, t as getRunnerInstance } from "./runner-MSr8sAWm.mjs";
1
+ import { n as initRunner, t as getRunnerInstance } from "./runner-CY3bgsjU.mjs";
2
2
  export { getRunnerInstance, initRunner };
@@ -1,5 +1,5 @@
1
- import { n as createRunner } from "./cli-_g2qOMK6.mjs";
2
- import "./src-CdZsOn6y.mjs";
1
+ import { n as createRunner } from "./cli-SP4kEtYL.mjs";
2
+ import "./src-7GbQj1sb.mjs";
3
3
  //#region ../../apps/server/src/runner.ts
4
4
  let runnerInstance = null;
5
5
  function getRunnerInstance() {
@@ -1,5 +1,5 @@
1
- import { Mt as defineEval$1, W as matchesEvalTags$1 } from "./runExecution-d42Lm0i5.mjs";
2
- import "./cli-_g2qOMK6.mjs";
1
+ import { G as matchesEvalTags$1, Pt as defineEval$1 } from "./runExecution-CFw0MQFs.mjs";
2
+ import "./cli-SP4kEtYL.mjs";
3
3
  //#region src/index.ts
4
4
  /** Register an eval definition with typed tag support. */
5
5
  function defineEval(definition) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ls-stack/agent-eval",
3
- "version": "0.58.1",
3
+ "version": "0.58.3",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "agent-evals": "./dist/bin.mjs"
@@ -33,8 +33,8 @@
33
33
  "@types/node": "^24.7.2",
34
34
  "typescript": "^5.9.2",
35
35
  "@agent-evals/runner": "0.0.1",
36
- "@agent-evals/shared": "0.0.1",
37
- "@agent-evals/sdk": "0.0.1"
36
+ "@agent-evals/sdk": "0.0.1",
37
+ "@agent-evals/shared": "0.0.1"
38
38
  },
39
39
  "scripts": {
40
40
  "build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",
@@ -35,6 +35,9 @@ display rules), read the TypeScript declarations shipped with the package:
35
35
  - `agent-evals app` watches `agent-evals.config.ts` and reloads config in
36
36
  place when the runner is idle. If config changes during an active run, the
37
37
  reload applies after the current run reaches a terminal state.
38
+ - App-triggered runs log the queued target evals, resolved case concurrency,
39
+ each case start for evals that are actually running, and the terminal run
40
+ summary in the server terminal.
38
41
 
39
42
  Assume that enumerated tables in this document may lag behind the types —
40
43
  treat the types as source of truth when they disagree.
@@ -360,7 +363,19 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
360
363
  The older object-returning function form remains supported. Global
361
364
  derivations run first; runtime outputs are never overwritten, and eval-level
362
365
  derivations only fill keys still missing after global derivations. In keyed
363
- form, return `undefined` to omit one output for that case.
366
+ form, return `undefined` to omit one output for that case. Do not call
367
+ `evalAssert(...)` or `evalExpect(...)` from `deriveFromTracing`; use
368
+ `tracingAssertions` for trace-derived pass/fail checks.
369
+ - `tracingAssertions` is a single function that can be authored globally or
370
+ locally on one eval when a finished-trace invariant should pass or fail the
371
+ case without creating a fake score column. It receives the same
372
+ `{ trace, input, case }` context as `deriveFromTracing`; call
373
+ `evalAssert(...)` or `evalExpect(...)` inside it.
374
+ Useful trace helpers include `trace.findSpan(name)`, `trace.findSpans(name)`,
375
+ `trace.hasSpan(name)`, `trace.findSpansByKind(kind)`,
376
+ `trace.findToolCallSpans()`, `trace.listToolCallSpanNames()`,
377
+ `trace.hasToolCallSpan(name)`, `trace.listSpanNames(kind?)`,
378
+ `trace.listSpanNamesDfs(kind?)`, and `trace.flattenDfs()`.
364
379
  - `traceDisplay` promotes selected span attributes into the trace tree and
365
380
  detail pane; it supports aggregation across subtrees (`scope`, `mode`) and
366
381
  user-defined `transform(...)` for derived views (e.g. currency conversion).
@@ -629,8 +644,9 @@ When adding or changing evals:
629
644
  3. `evalAssert` for hard invariants and truthy type narrowing. It records
630
645
  pass/fail entries in case-detail `assertions`; failed entries are also kept
631
646
  in `assertionFailures` and fail the case. Use `evalExpect` for non-trivial
632
- comparisons, `scores` for graded signals, and `passThreshold` only on
633
- scores that should gate pass/fail.
647
+ comparisons, `tracingAssertions` for invariants derived from the finished
648
+ trace, `scores` for graded signals, and `passThreshold` only on scores that
649
+ should gate pass/fail.
634
650
  4. Surface reviewable values through execute-context `setOutput` or ambient
635
651
  `setEvalOutput` in shared workflow code, and shape them with `columns`
636
652
  formats from the `ColumnFormat` type.