@ls-stack/agent-eval 0.55.2 → 0.56.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { $ as startEvalBackgroundJob, A as repoFile, B as getCurrentScope, C as evalTracer, D as deserializeCacheValue, E as deserializeCacheRecording, F as appendToEvalOutput, G as mergeEvalOutput, H as incrementEvalOutput, J as runInEvalScope, K as nextEvalId, L as evalAssert, M as readManualInputFile, N as evalExpect, Nt as getEvalRegistry, O as serializeCacheRecording, P as EvalAssertionError, Q as setScopeCacheContext, R as evalLog, S as evalSpan, T as hashCacheKeySync, U as isInEvalScope, V as getEvalCaseInput, Y as runInExistingEvalScope, Z as setEvalOutput, at as extractLlmCalls, b as buildTraceTree, it as extractApiCalls, j as manualInputFileValueSchema, k as serializeCacheValue, lt as getNestedAttribute, nt as extractCacheEntries, ot as simulateLlmCallCost, q as runInEvalRuntimeScope, rt as extractCacheHits, st as simulateTokenAllocation, w as hashCacheKey, x as captureEvalSpanError, y as z, z as evalTime } from "./runExecution-C31dpemR.mjs";
2
- import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-Bu9347r1.mjs";
3
- import { n as matchesEvalTags, t as defineEval } from "./src-FR60ZR_4.mjs";
1
+ import { $ as startEvalBackgroundJob, A as repoFile, B as getCurrentScope, C as evalTracer, D as deserializeCacheValue, E as deserializeCacheRecording, F as appendToEvalOutput, G as mergeEvalOutput, H as incrementEvalOutput, J as runInEvalScope, K as nextEvalId, L as evalAssert, M as readManualInputFile, N as evalExpect, Nt as getEvalRegistry, O as serializeCacheRecording, P as EvalAssertionError, Q as setScopeCacheContext, R as evalLog, S as evalSpan, T as hashCacheKeySync, U as isInEvalScope, V as getEvalCaseInput, Y as runInExistingEvalScope, Z as setEvalOutput, at as extractLlmCalls, b as buildTraceTree, it as extractApiCalls, j as manualInputFileValueSchema, k as serializeCacheValue, lt as getNestedAttribute, nt as extractCacheEntries, ot as simulateLlmCallCost, q as runInEvalRuntimeScope, rt as extractCacheHits, st as simulateTokenAllocation, w as hashCacheKey, x as captureEvalSpanError, y as z, z as evalTime } from "./runExecution-6lrtj48K.mjs";
2
+ import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-DQO2Fpt2.mjs";
3
+ import { n as matchesEvalTags, t as defineEval } from "./src-DCGrFAmO.mjs";
4
4
  export { EvalAssertionError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
package/dist/runChild.mjs CHANGED
@@ -1,5 +1,5 @@
1
- import { At as evalChartsConfigSchema, Ct as buildEvalKey, Dt as evalStatAggregateSchema, I as configureEvalRunLogs, Ot as evalStatsConfigSchema, bt as runSummarySchema, et as createRunRequestSchema, jt as columnDefSchema, kt as manualInputDescriptorSchema, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, v as createFsCacheStore, yt as runManifestSchema } from "./runExecution-C31dpemR.mjs";
2
- import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-3RoHLW4U.mjs";
1
+ import { At as evalChartsConfigSchema, Ct as buildEvalKey, Dt as evalStatAggregateSchema, I as configureEvalRunLogs, Ot as evalStatsConfigSchema, bt as runSummarySchema, et as createRunRequestSchema, jt as columnDefSchema, kt as manualInputDescriptorSchema, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, v as createFsCacheStore, yt as runManifestSchema } from "./runExecution-6lrtj48K.mjs";
2
+ import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-BYaN2mzS.mjs";
3
3
  import { z } from "zod/v4";
4
4
  import { readFile } from "node:fs/promises";
5
5
  import { relative } from "node:path";
@@ -220,6 +220,19 @@ const traceSpanSchema = z.object({
220
220
  });
221
221
  //#endregion
222
222
  //#region ../shared/src/schemas/cache.ts
223
+ const outputColumnOverrideSchema = z.object({
224
+ label: z.string().optional(),
225
+ format: columnFormatSchema.optional(),
226
+ numberFormat: numberDisplayOptionsSchema.optional(),
227
+ hideInTable: z.boolean().optional(),
228
+ hideIfNoValue: z.boolean().optional(),
229
+ align: z.enum([
230
+ "left",
231
+ "center",
232
+ "right"
233
+ ]).optional(),
234
+ maxStars: z.number().int().min(2).optional()
235
+ });
223
236
  /**
224
237
  * Mode that controls how the cache is consulted for a given run.
225
238
  *
@@ -311,7 +324,8 @@ const cacheRecordingOpSchema = z.discriminatedUnion("kind", [
311
324
  z.object({
312
325
  kind: z.literal("setOutput"),
313
326
  key: z.string(),
314
- value: z.unknown()
327
+ value: z.unknown(),
328
+ column: outputColumnOverrideSchema.optional()
315
329
  }),
316
330
  z.object({
317
331
  kind: z.literal("appendOutput"),
@@ -792,6 +806,11 @@ const caseRowSchema = z.object({
792
806
  cacheOperations: z.number().optional(),
793
807
  costUsd: z.number().nullable().optional(),
794
808
  columns: z.record(z.string(), cellValueSchema),
809
+ /**
810
+ * Runtime column definitions authored by output helpers for this case.
811
+ * These complement eval-level `columns` without changing discovery metadata.
812
+ */
813
+ outputColumnDefs: z.array(columnDefSchema).optional(),
795
814
  /** Winning trial index for the persisted case result. */
796
815
  trial: z.number()
797
816
  });
@@ -898,6 +917,11 @@ const caseDetailSchema = z.object({
898
917
  */
899
918
  scoringTraces: z.record(z.string(), scoreTraceSchema).optional(),
900
919
  columns: z.record(z.string(), cellValueSchema),
920
+ /**
921
+ * Runtime column definitions authored by output helpers for this case.
922
+ * These complement eval-level `columns` without changing discovery metadata.
923
+ */
924
+ outputColumnDefs: z.array(columnDefSchema).optional(),
901
925
  assertionFailures: z.array(z.union([assertionFailureSchema, legacyAssertionFailureSchema])),
902
926
  /** Logs captured from manual `evalLog(...)` calls and enabled console calls. */
903
927
  logs: z.array(runLogEntrySchema).default([]),
@@ -2140,17 +2164,34 @@ function computeTokensPerSecond({ outputTokens, durationMs }) {
2140
2164
  if (durationMs <= 0) return null;
2141
2165
  return outputTokens / (durationMs / 1e3);
2142
2166
  }
2143
- function readSteps(attributes, path) {
2167
+ function readSteps(attributes, path, childModelSteps) {
2144
2168
  const raw = getNestedAttribute(attributes, path);
2145
2169
  if (Array.isArray(raw)) return {
2146
2170
  stepCount: raw.length,
2147
2171
  stepDetails: raw
2148
2172
  };
2173
+ if (childModelSteps.length > 0) return {
2174
+ stepCount: childModelSteps.length,
2175
+ stepDetails: childModelSteps
2176
+ };
2149
2177
  return {
2150
2178
  stepCount: null,
2151
2179
  stepDetails: null
2152
2180
  };
2153
2181
  }
2182
+ function buildModelStepsByParent(spans) {
2183
+ const stepsByParent = /* @__PURE__ */ new Map();
2184
+ for (const span of spans) {
2185
+ if (span.kind !== "model_step" || span.parentId === null) continue;
2186
+ const current = stepsByParent.get(span.parentId);
2187
+ if (current === void 0) {
2188
+ stepsByParent.set(span.parentId, [span]);
2189
+ continue;
2190
+ }
2191
+ current.push(span);
2192
+ }
2193
+ return stepsByParent;
2194
+ }
2154
2195
  function collectWarnings$1(span) {
2155
2196
  const out = [];
2156
2197
  if (span.warning) out.push(span.warning);
@@ -2182,6 +2223,9 @@ function pickError$1(span) {
2182
2223
  * charged twice. Cache read/write costs still contribute to the total USD cost
2183
2224
  * at their configured rates. The `steps` attribute path may resolve to an array
2184
2225
  * of per-step detail objects, with `stepCount` derived from the array length.
2226
+ * When a matching LLM span does not expose that array, direct child spans with
2227
+ * `kind: 'model_step'` are used as the step details instead. This preserves
2228
+ * Mastra/OpenTelemetry traces where model steps are emitted as child spans.
2185
2229
  * `durationMs` and `tokensPerSecond` are `null` while the span is still
2186
2230
  * running. User-defined `metrics` whose path resolves to
2187
2231
  * `undefined` are dropped, but `null`, `0`, and `false` are preserved as
@@ -2190,6 +2234,7 @@ function pickError$1(span) {
2190
2234
  */
2191
2235
  function extractLlmCalls(spans, config) {
2192
2236
  const kindSet = new Set(config.kinds);
2237
+ const modelStepsByParent = buildModelStepsByParent(spans);
2193
2238
  const result = [];
2194
2239
  for (const span of spans) {
2195
2240
  if (!kindSet.has(span.kind)) continue;
@@ -2275,7 +2320,7 @@ function extractLlmCalls(spans, config) {
2275
2320
  cachedInputCostUsd,
2276
2321
  cacheCreationInputCostUsd,
2277
2322
  reasoningCostUsd,
2278
- ...readSteps(attrs, config.attributes.steps),
2323
+ ...readSteps(attrs, config.attributes.steps, modelStepsByParent.get(span.id) ?? []),
2279
2324
  finishReason: readString$2(attrs, config.attributes.finishReason),
2280
2325
  durationMs,
2281
2326
  input: getNestedAttribute(attrs, config.attributes.input),
@@ -3040,6 +3085,7 @@ async function runInEvalScope(caseId, fn, options = {}) {
3040
3085
  input: options.input,
3041
3086
  tags: options.tags ?? [],
3042
3087
  outputs: {},
3088
+ outputColumnOverrides: {},
3043
3089
  assertionFailures: [],
3044
3090
  logs: [],
3045
3091
  spans: [],
@@ -3089,6 +3135,11 @@ function recordOpIfActive(scope, op) {
3089
3135
  const top = scope.recordingStack.at(-1);
3090
3136
  if (top) top.ops.push(op);
3091
3137
  }
3138
+ function normalizeEvalOutputOptions(options) {
3139
+ if (options === void 0) return void 0;
3140
+ if (typeof options === "string") return { format: options };
3141
+ return options;
3142
+ }
3092
3143
  function toAssertionFailure$1(message, error = void 0) {
3093
3144
  const name = error?.name;
3094
3145
  const stack = error?.stack ? stripTerminalControlCodes$1(error.stack) : void 0;
@@ -3103,15 +3154,22 @@ function toAssertionFailure$1(message, error = void 0) {
3103
3154
  *
3104
3155
  * Supported values include scalars, JSON-safe objects/arrays, explicit file
3105
3156
  * refs, and native `Blob`/`File` instances for media or file columns.
3157
+ *
3158
+ * Pass the optional third argument to persist a display format or full column
3159
+ * override with this runtime output, for example `'markdown'` or
3160
+ * `{ label: 'Receipt', format: 'image', hideInTable: true }`.
3106
3161
  */
3107
- function setEvalOutput(key, value) {
3162
+ function setEvalOutput(key, value, options = void 0) {
3108
3163
  const scope = getCurrentScope();
3109
3164
  if (!scope) return;
3110
3165
  scope.outputs[key] = value;
3166
+ const column = normalizeEvalOutputOptions(options);
3167
+ if (column !== void 0) scope.outputColumnOverrides[key] = column;
3111
3168
  recordOpIfActive(scope, {
3112
3169
  kind: "setOutput",
3113
3170
  key,
3114
- value
3171
+ value,
3172
+ column
3115
3173
  });
3116
3174
  }
3117
3175
  /**
@@ -4110,6 +4168,7 @@ function replayRecording(scope, parentSpan, recording, options) {
4110
4168
  function applyRecordingOp(scope, parentSpan, op, options) {
4111
4169
  if (op.kind === "setOutput") {
4112
4170
  scope.outputs[op.key] = op.value;
4171
+ if (op.column !== void 0) scope.outputColumnOverrides[op.key] = op.column;
4113
4172
  return;
4114
4173
  }
4115
4174
  if (op.kind === "appendOutput") {
@@ -5758,6 +5817,27 @@ function buildDeclaredColumnDefs(overrides, scores, manualScores) {
5758
5817
  return [...declaredDefs.values()];
5759
5818
  }
5760
5819
  /**
5820
+ * Build runtime column definitions from output-level display overrides.
5821
+ *
5822
+ * These definitions are persisted on case rows/details so `setOutput(...)`
5823
+ * can format one-off outputs without adding them to eval discovery metadata.
5824
+ */
5825
+ function buildRuntimeOutputColumnDefs(columns, overrides, configuredColumnKeys = /* @__PURE__ */ new Set()) {
5826
+ return Object.entries(overrides).filter(([key]) => columns[key] !== void 0 && !configuredColumnKeys.has(key)).map(([key, override]) => createColumnDef({
5827
+ key,
5828
+ override,
5829
+ inferredKind: inferKindFromFormat(override.format) ?? (override.numberFormat === void 0 ? inferKind(columns[key]) : "number"),
5830
+ isScore: false,
5831
+ isManualScore: false
5832
+ }));
5833
+ }
5834
+ /** Infer a `ColumnKind` from a runtime value when no override is set. */
5835
+ function inferKind(value) {
5836
+ if (typeof value === "number") return "number";
5837
+ if (typeof value === "boolean") return "boolean";
5838
+ return "string";
5839
+ }
5840
+ /**
5761
5841
  * Coerce an arbitrary runtime value into a serializable `CellValue`.
5762
5842
  * Runtime values use the SDK's tagged serializer so saved run artifacts keep
5763
5843
  * structured data instead of storing JSON strings. Native binary/file root
@@ -6526,7 +6606,7 @@ async function runDeriveFromTracingConfig(params) {
6526
6606
  }
6527
6607
  }
6528
6608
  async function runCase(params) {
6529
- const { evalDef, evalId, evalKey = evalId, evalCase, globalTraceDisplay, globalDeriveFromTracing, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode, moduleIsolation, evalFilePath, evalFileRelativePath = evalFilePath, workspaceRoot, artifactDir, runId } = params;
6609
+ const { evalDef, evalId, evalKey = evalId, evalCase, globalTraceDisplay, globalColumns, globalDeriveFromTracing, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode, moduleIsolation, evalFilePath, evalFileRelativePath = evalFilePath, workspaceRoot, artifactDir, runId } = params;
6530
6610
  const scopedIdPrefix = buildScopedEvalIdPrefix({
6531
6611
  evalId,
6532
6612
  evalFilePath,
@@ -6694,6 +6774,12 @@ async function runCase(params) {
6694
6774
  if (cell !== void 0) columns[key] = cell;
6695
6775
  }
6696
6776
  for (const key of Object.keys(evalDef.manualScores ?? {})) columns[key] = null;
6777
+ const outputColumnDefs = buildRuntimeOutputColumnDefs(columns, scope.outputColumnOverrides, new Set(Object.keys(mergeDefaultColumns({
6778
+ globalColumns,
6779
+ columns: evalDef.columns,
6780
+ globalRemove: globalRemoveDefaultConfig,
6781
+ evalRemove: evalDef.removeDefaultConfig
6782
+ }) ?? {})));
6697
6783
  const errorInfo = nonAssertError ? {
6698
6784
  name: nonAssertError.name,
6699
6785
  message: nonAssertError.message,
@@ -6710,6 +6796,7 @@ async function runCase(params) {
6710
6796
  trace: displayTrace,
6711
6797
  traceDisplay,
6712
6798
  columns,
6799
+ ...outputColumnDefs.length > 0 ? { outputColumnDefs } : {},
6713
6800
  assertionFailures: scope.assertionFailures,
6714
6801
  logs: scope.logs,
6715
6802
  error: errorInfo,
@@ -6728,7 +6815,8 @@ async function runCase(params) {
6728
6815
  durationMs: elapsedMs,
6729
6816
  cacheHits: cacheHits.length,
6730
6817
  cacheOperations: cacheEntries.length,
6731
- columns
6818
+ columns,
6819
+ ...outputColumnDefs.length > 0 ? { outputColumnDefs } : {}
6732
6820
  }
6733
6821
  };
6734
6822
  }
@@ -1,4 +1,4 @@
1
- import { Et as caseRowSchema, Pt as runWithEvalRegistry, Tt as caseDetailSchema, X as runWithEvalClock, _t as validateEvalTagName, bt as runSummarySchema, d as loadEvalModule, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, g as commitPendingCacheWrites, gt as matchesTagsFilter, ht as dedupeEvalTags, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromChildStatuses, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveStatusFromCaseRows, q as runInEvalRuntimeScope, t as filterEvalCases, u as runWithModuleIsolation, vt as validateTagsFilterExpression, wt as getCaseRowCaseKey, yt as runManifestSchema } from "./runExecution-C31dpemR.mjs";
1
+ import { Et as caseRowSchema, Pt as runWithEvalRegistry, Tt as caseDetailSchema, X as runWithEvalClock, _t as validateEvalTagName, bt as runSummarySchema, d as loadEvalModule, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, g as commitPendingCacheWrites, gt as matchesTagsFilter, ht as dedupeEvalTags, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromChildStatuses, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveStatusFromCaseRows, q as runInEvalRuntimeScope, t as filterEvalCases, u as runWithModuleIsolation, vt as validateTagsFilterExpression, wt as getCaseRowCaseKey, yt as runManifestSchema } from "./runExecution-6lrtj48K.mjs";
2
2
  import { readFile, readdir, rm, writeFile } from "node:fs/promises";
3
3
  import { dirname, join } from "node:path";
4
4
  import { existsSync } from "node:fs";
@@ -1,2 +1,2 @@
1
- import { n as initRunner, t as getRunnerInstance } from "./runner-B4EfMn1d.mjs";
1
+ import { n as initRunner, t as getRunnerInstance } from "./runner-DYlwuAT3.mjs";
2
2
  export { getRunnerInstance, initRunner };
@@ -1,5 +1,5 @@
1
- import { n as createRunner } from "./cli-Bu9347r1.mjs";
2
- import "./src-FR60ZR_4.mjs";
1
+ import { n as createRunner } from "./cli-DQO2Fpt2.mjs";
2
+ import "./src-DCGrFAmO.mjs";
3
3
  //#region ../../apps/server/src/runner.ts
4
4
  let runnerInstance = null;
5
5
  function getRunnerInstance() {
@@ -1,5 +1,5 @@
1
- import { Mt as defineEval$1, W as matchesEvalTags$1 } from "./runExecution-C31dpemR.mjs";
2
- import "./cli-Bu9347r1.mjs";
1
+ import { Mt as defineEval$1, W as matchesEvalTags$1 } from "./runExecution-6lrtj48K.mjs";
2
+ import "./cli-DQO2Fpt2.mjs";
3
3
  //#region src/index.ts
4
4
  /** Register an eval definition with typed tag support. */
5
5
  function defineEval(definition) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ls-stack/agent-eval",
3
- "version": "0.55.2",
3
+ "version": "0.56.0",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "agent-evals": "./dist/bin.mjs"
@@ -318,6 +318,11 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
318
318
  - `setEvalOutput(key, value)` writes reviewable data for the case. Values are
319
319
  stored as received: primitives, objects/arrays, explicit file refs, and
320
320
  native `Blob`/`File` values. `columns.format` only controls visualization.
321
+ Inside `execute`, `setOutput(key, value, formatOrOverride)` can attach a
322
+ display hint directly to a runtime output, e.g. `'markdown'` or
323
+ `{ label: 'Receipt', format: 'image', hideInTable: true }`. Authored
324
+ global/eval `columns` for the same key take precedence over that runtime
325
+ hint.
321
326
  Non-JSON runtime values such as `Date`, `Map`, `Set`, `BigInt`, typed arrays,
322
327
  and class instances use the tagged value serializer instead of a string
323
328
  fallback. Native `Blob`/`File` values are copied to run artifacts because
@@ -363,8 +368,11 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
363
368
  - `llmCalls` (in `agent-evals.config.ts`) configures how LLM-call spans are
364
369
  summarized for review. Defaults to `kind: 'llm'` spans with `model`,
365
370
  `usage.*`, `latencyMs`, `input`, `output`, etc. read from conventional
366
- attribute paths. `latencyMs` is time to first token; duration, total tokens,
367
- output tokens/sec, and USD costs are derived. Override `kinds` to broaden the filter,
371
+ attribute paths. The default `steps` path reads an array from
372
+ `span.attributes.steps`; if it is missing, direct child `model_step` spans are
373
+ shown as that call's steps. `latencyMs` is time to first token; duration,
374
+ total tokens, output tokens/sec, and USD costs are derived. Override `kinds`
375
+ to broaden the filter,
368
376
  override `attributes.<field>` for non-default primitive span shapes, configure
369
377
  model-keyed `pricing` to derive USD costs from token counts, with nested
370
378
  `providers` entries for provider-specific rates, add `costCurrencies` to show