npm - @ls-stack/agent-eval - Versions diffs - 0.55.2 → 0.56.0 - Mend

@ls-stack/agent-eval 0.55.2 → 0.56.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

package/dist/{app-NI4to6lp.mjs → app-Bpe6Monh.mjs} +4 -4
package/dist/apps/web/dist/assets/index-CfSiAVmi.js +377 -0
package/dist/apps/web/dist/assets/{index-2I-eWzVL.css → index-Xa_7PteQ.css} +1 -1
package/dist/apps/web/dist/index.html +2 -2
package/dist/bin.mjs +1 -1
package/dist/caseChild.mjs +2 -1
package/dist/{cli-Bu9347r1.mjs → cli-DQO2Fpt2.mjs} +4 -4
package/dist/index.d.mts +573 -38
package/dist/index.mjs +3 -3
package/dist/runChild.mjs +2 -2
package/dist/{runExecution-C31dpemR.mjs → runExecution-6lrtj48K.mjs} +95 -7
package/dist/{runOrchestration-3RoHLW4U.mjs → runOrchestration-BYaN2mzS.mjs} +1 -1
package/dist/{runner-CTp9zHbM.mjs → runner-C3CiS2o7.mjs} +1 -1
package/dist/{runner-B4EfMn1d.mjs → runner-DYlwuAT3.mjs} +2 -2
package/dist/{src-FR60ZR_4.mjs → src-DCGrFAmO.mjs} +2 -2
package/package.json +1 -1
package/skills/agent-eval/SKILL.md +10 -2
package/dist/apps/web/dist/assets/index-C7QjETk8.js +0 -377

package/dist/index.mjs CHANGED Viewed

@@ -1,4 +1,4 @@
-import { $ as startEvalBackgroundJob, A as repoFile, B as getCurrentScope, C as evalTracer, D as deserializeCacheValue, E as deserializeCacheRecording, F as appendToEvalOutput, G as mergeEvalOutput, H as incrementEvalOutput, J as runInEvalScope, K as nextEvalId, L as evalAssert, M as readManualInputFile, N as evalExpect, Nt as getEvalRegistry, O as serializeCacheRecording, P as EvalAssertionError, Q as setScopeCacheContext, R as evalLog, S as evalSpan, T as hashCacheKeySync, U as isInEvalScope, V as getEvalCaseInput, Y as runInExistingEvalScope, Z as setEvalOutput, at as extractLlmCalls, b as buildTraceTree, it as extractApiCalls, j as manualInputFileValueSchema, k as serializeCacheValue, lt as getNestedAttribute, nt as extractCacheEntries, ot as simulateLlmCallCost, q as runInEvalRuntimeScope, rt as extractCacheHits, st as simulateTokenAllocation, w as hashCacheKey, x as captureEvalSpanError, y as z, z as evalTime } from "./runExecution-C31dpemR.mjs";
-import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-Bu9347r1.mjs";
-import { n as matchesEvalTags, t as defineEval } from "./src-FR60ZR_4.mjs";
+import { $ as startEvalBackgroundJob, A as repoFile, B as getCurrentScope, C as evalTracer, D as deserializeCacheValue, E as deserializeCacheRecording, F as appendToEvalOutput, G as mergeEvalOutput, H as incrementEvalOutput, J as runInEvalScope, K as nextEvalId, L as evalAssert, M as readManualInputFile, N as evalExpect, Nt as getEvalRegistry, O as serializeCacheRecording, P as EvalAssertionError, Q as setScopeCacheContext, R as evalLog, S as evalSpan, T as hashCacheKeySync, U as isInEvalScope, V as getEvalCaseInput, Y as runInExistingEvalScope, Z as setEvalOutput, at as extractLlmCalls, b as buildTraceTree, it as extractApiCalls, j as manualInputFileValueSchema, k as serializeCacheValue, lt as getNestedAttribute, nt as extractCacheEntries, ot as simulateLlmCallCost, q as runInEvalRuntimeScope, rt as extractCacheHits, st as simulateTokenAllocation, w as hashCacheKey, x as captureEvalSpanError, y as z, z as evalTime } from "./runExecution-6lrtj48K.mjs";
+import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-DQO2Fpt2.mjs";
+import { n as matchesEvalTags, t as defineEval } from "./src-DCGrFAmO.mjs";
 export { EvalAssertionError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };

package/dist/runChild.mjs CHANGED Viewed

@@ -1,5 +1,5 @@
-import { At as evalChartsConfigSchema, Ct as buildEvalKey, Dt as evalStatAggregateSchema, I as configureEvalRunLogs, Ot as evalStatsConfigSchema, bt as runSummarySchema, et as createRunRequestSchema, jt as columnDefSchema, kt as manualInputDescriptorSchema, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, v as createFsCacheStore, yt as runManifestSchema } from "./runExecution-C31dpemR.mjs";
-import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-3RoHLW4U.mjs";
+import { At as evalChartsConfigSchema, Ct as buildEvalKey, Dt as evalStatAggregateSchema, I as configureEvalRunLogs, Ot as evalStatsConfigSchema, bt as runSummarySchema, et as createRunRequestSchema, jt as columnDefSchema, kt as manualInputDescriptorSchema, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, v as createFsCacheStore, yt as runManifestSchema } from "./runExecution-6lrtj48K.mjs";
+import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-BYaN2mzS.mjs";
 import { z } from "zod/v4";
 import { readFile } from "node:fs/promises";
 import { relative } from "node:path";

package/dist/{runExecution-C31dpemR.mjs → runExecution-6lrtj48K.mjs} RENAMED Viewed

@@ -220,6 +220,19 @@ const traceSpanSchema = z.object({
 });
 //#endregion
 //#region ../shared/src/schemas/cache.ts
+const outputColumnOverrideSchema = z.object({
+	label: z.string().optional(),
+	format: columnFormatSchema.optional(),
+	numberFormat: numberDisplayOptionsSchema.optional(),
+	hideInTable: z.boolean().optional(),
+	hideIfNoValue: z.boolean().optional(),
+	align: z.enum([
+		"left",
+		"center",
+		"right"
+	]).optional(),
+	maxStars: z.number().int().min(2).optional()
+});
 /**
 * Mode that controls how the cache is consulted for a given run.
 *
@@ -311,7 +324,8 @@ const cacheRecordingOpSchema = z.discriminatedUnion("kind", [
 	z.object({
 		kind: z.literal("setOutput"),
 		key: z.string(),
-		value: z.unknown()
+		value: z.unknown(),
+		column: outputColumnOverrideSchema.optional()
 	}),
 	z.object({
 		kind: z.literal("appendOutput"),
@@ -792,6 +806,11 @@ const caseRowSchema = z.object({
 	cacheOperations: z.number().optional(),
 	costUsd: z.number().nullable().optional(),
 	columns: z.record(z.string(), cellValueSchema),
+	/**
+	* Runtime column definitions authored by output helpers for this case.
+	* These complement eval-level `columns` without changing discovery metadata.
+	*/
+	outputColumnDefs: z.array(columnDefSchema).optional(),
 	/** Winning trial index for the persisted case result. */
 	trial: z.number()
 });
@@ -898,6 +917,11 @@ const caseDetailSchema = z.object({
 	*/
 	scoringTraces: z.record(z.string(), scoreTraceSchema).optional(),
 	columns: z.record(z.string(), cellValueSchema),
+	/**
+	* Runtime column definitions authored by output helpers for this case.
+	* These complement eval-level `columns` without changing discovery metadata.
+	*/
+	outputColumnDefs: z.array(columnDefSchema).optional(),
 	assertionFailures: z.array(z.union([assertionFailureSchema, legacyAssertionFailureSchema])),
 	/** Logs captured from manual `evalLog(...)` calls and enabled console calls. */
 	logs: z.array(runLogEntrySchema).default([]),
@@ -2140,17 +2164,34 @@ function computeTokensPerSecond({ outputTokens, durationMs }) {
 	if (durationMs <= 0) return null;
 	return outputTokens / (durationMs / 1e3);
 }
-function readSteps(attributes, path) {
+function readSteps(attributes, path, childModelSteps) {
 	const raw = getNestedAttribute(attributes, path);
 	if (Array.isArray(raw)) return {
 		stepCount: raw.length,
 		stepDetails: raw
 	};
+	if (childModelSteps.length > 0) return {
+		stepCount: childModelSteps.length,
+		stepDetails: childModelSteps
+	};
 	return {
 		stepCount: null,
 		stepDetails: null
 	};
 }
+function buildModelStepsByParent(spans) {
+	const stepsByParent = /* @__PURE__ */ new Map();
+	for (const span of spans) {
+		if (span.kind !== "model_step" || span.parentId === null) continue;
+		const current = stepsByParent.get(span.parentId);
+		if (current === void 0) {
+			stepsByParent.set(span.parentId, [span]);
+			continue;
+		}
+		current.push(span);
+	}
+	return stepsByParent;
+}
 function collectWarnings$1(span) {
 	const out = [];
 	if (span.warning) out.push(span.warning);
@@ -2182,6 +2223,9 @@ function pickError$1(span) {
 * charged twice. Cache read/write costs still contribute to the total USD cost
 * at their configured rates. The `steps` attribute path may resolve to an array
 * of per-step detail objects, with `stepCount` derived from the array length.
+* When a matching LLM span does not expose that array, direct child spans with
+* `kind: 'model_step'` are used as the step details instead. This preserves
+* Mastra/OpenTelemetry traces where model steps are emitted as child spans.
 * `durationMs` and `tokensPerSecond` are `null` while the span is still
 * running. User-defined `metrics` whose path resolves to
 * `undefined` are dropped, but `null`, `0`, and `false` are preserved as
@@ -2190,6 +2234,7 @@ function pickError$1(span) {
 */
 function extractLlmCalls(spans, config) {
 	const kindSet = new Set(config.kinds);
+	const modelStepsByParent = buildModelStepsByParent(spans);
 	const result = [];
 	for (const span of spans) {
 		if (!kindSet.has(span.kind)) continue;
@@ -2275,7 +2320,7 @@ function extractLlmCalls(spans, config) {
 			cachedInputCostUsd,
 			cacheCreationInputCostUsd,
 			reasoningCostUsd,
-			...readSteps(attrs, config.attributes.steps),
+			...readSteps(attrs, config.attributes.steps, modelStepsByParent.get(span.id) ?? []),
 			finishReason: readString$2(attrs, config.attributes.finishReason),
 			durationMs,
 			input: getNestedAttribute(attrs, config.attributes.input),
@@ -3040,6 +3085,7 @@ async function runInEvalScope(caseId, fn, options = {}) {
 		input: options.input,
 		tags: options.tags ?? [],
 		outputs: {},
+		outputColumnOverrides: {},
 		assertionFailures: [],
 		logs: [],
 		spans: [],
@@ -3089,6 +3135,11 @@ function recordOpIfActive(scope, op) {
 	const top = scope.recordingStack.at(-1);
 	if (top) top.ops.push(op);
 }
+function normalizeEvalOutputOptions(options) {
+	if (options === void 0) return void 0;
+	if (typeof options === "string") return { format: options };
+	return options;
+}
 function toAssertionFailure$1(message, error = void 0) {
 	const name = error?.name;
 	const stack = error?.stack ? stripTerminalControlCodes$1(error.stack) : void 0;
@@ -3103,15 +3154,22 @@ function toAssertionFailure$1(message, error = void 0) {
 *
 * Supported values include scalars, JSON-safe objects/arrays, explicit file
 * refs, and native `Blob`/`File` instances for media or file columns.
+*
+* Pass the optional third argument to persist a display format or full column
+* override with this runtime output, for example `'markdown'` or
+* `{ label: 'Receipt', format: 'image', hideInTable: true }`.
 */
-function setEvalOutput(key, value) {
+function setEvalOutput(key, value, options = void 0) {
 	const scope = getCurrentScope();
 	if (!scope) return;
 	scope.outputs[key] = value;
+	const column = normalizeEvalOutputOptions(options);
+	if (column !== void 0) scope.outputColumnOverrides[key] = column;
 	recordOpIfActive(scope, {
 		kind: "setOutput",
 		key,
-		value
+		value,
+		column
 	});
 }
 /**
@@ -4110,6 +4168,7 @@ function replayRecording(scope, parentSpan, recording, options) {
 function applyRecordingOp(scope, parentSpan, op, options) {
 	if (op.kind === "setOutput") {
 		scope.outputs[op.key] = op.value;
+		if (op.column !== void 0) scope.outputColumnOverrides[op.key] = op.column;
 		return;
 	}
 	if (op.kind === "appendOutput") {
@@ -5758,6 +5817,27 @@ function buildDeclaredColumnDefs(overrides, scores, manualScores) {
 	return [...declaredDefs.values()];
 }
 /**
+* Build runtime column definitions from output-level display overrides.
+*
+* These definitions are persisted on case rows/details so `setOutput(...)`
+* can format one-off outputs without adding them to eval discovery metadata.
+*/
+function buildRuntimeOutputColumnDefs(columns, overrides, configuredColumnKeys = /* @__PURE__ */ new Set()) {
+	return Object.entries(overrides).filter(([key]) => columns[key] !== void 0 && !configuredColumnKeys.has(key)).map(([key, override]) => createColumnDef({
+		key,
+		override,
+		inferredKind: inferKindFromFormat(override.format) ?? (override.numberFormat === void 0 ? inferKind(columns[key]) : "number"),
+		isScore: false,
+		isManualScore: false
+	}));
+}
+/** Infer a `ColumnKind` from a runtime value when no override is set. */
+function inferKind(value) {
+	if (typeof value === "number") return "number";
+	if (typeof value === "boolean") return "boolean";
+	return "string";
+}
+/**
 * Coerce an arbitrary runtime value into a serializable `CellValue`.
 * Runtime values use the SDK's tagged serializer so saved run artifacts keep
 * structured data instead of storing JSON strings. Native binary/file root
@@ -6526,7 +6606,7 @@ async function runDeriveFromTracingConfig(params) {
 	}
 }
 async function runCase(params) {
-	const { evalDef, evalId, evalKey = evalId, evalCase, globalTraceDisplay, globalDeriveFromTracing, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode, moduleIsolation, evalFilePath, evalFileRelativePath = evalFilePath, workspaceRoot, artifactDir, runId } = params;
+	const { evalDef, evalId, evalKey = evalId, evalCase, globalTraceDisplay, globalColumns, globalDeriveFromTracing, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode, moduleIsolation, evalFilePath, evalFileRelativePath = evalFilePath, workspaceRoot, artifactDir, runId } = params;
 	const scopedIdPrefix = buildScopedEvalIdPrefix({
 		evalId,
 		evalFilePath,
@@ -6694,6 +6774,12 @@ async function runCase(params) {
 		if (cell !== void 0) columns[key] = cell;
 	}
 	for (const key of Object.keys(evalDef.manualScores ?? {})) columns[key] = null;
+	const outputColumnDefs = buildRuntimeOutputColumnDefs(columns, scope.outputColumnOverrides, new Set(Object.keys(mergeDefaultColumns({
+		globalColumns,
+		columns: evalDef.columns,
+		globalRemove: globalRemoveDefaultConfig,
+		evalRemove: evalDef.removeDefaultConfig
+	}) ?? {})));
 	const errorInfo = nonAssertError ? {
 		name: nonAssertError.name,
 		message: nonAssertError.message,
@@ -6710,6 +6796,7 @@ async function runCase(params) {
 		trace: displayTrace,
 		traceDisplay,
 		columns,
+		...outputColumnDefs.length > 0 ? { outputColumnDefs } : {},
 		assertionFailures: scope.assertionFailures,
 		logs: scope.logs,
 		error: errorInfo,
@@ -6728,7 +6815,8 @@ async function runCase(params) {
 			durationMs: elapsedMs,
 			cacheHits: cacheHits.length,
 			cacheOperations: cacheEntries.length,
-			columns
+			columns,
+			...outputColumnDefs.length > 0 ? { outputColumnDefs } : {}
 		}
 	};
 }

package/dist/{runOrchestration-3RoHLW4U.mjs → runOrchestration-BYaN2mzS.mjs} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { Et as caseRowSchema, Pt as runWithEvalRegistry, Tt as caseDetailSchema, X as runWithEvalClock, _t as validateEvalTagName, bt as runSummarySchema, d as loadEvalModule, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, g as commitPendingCacheWrites, gt as matchesTagsFilter, ht as dedupeEvalTags, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromChildStatuses, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveStatusFromCaseRows, q as runInEvalRuntimeScope, t as filterEvalCases, u as runWithModuleIsolation, vt as validateTagsFilterExpression, wt as getCaseRowCaseKey, yt as runManifestSchema } from "./runExecution-C31dpemR.mjs";
+import { Et as caseRowSchema, Pt as runWithEvalRegistry, Tt as caseDetailSchema, X as runWithEvalClock, _t as validateEvalTagName, bt as runSummarySchema, d as loadEvalModule, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, g as commitPendingCacheWrites, gt as matchesTagsFilter, ht as dedupeEvalTags, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromChildStatuses, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveStatusFromCaseRows, q as runInEvalRuntimeScope, t as filterEvalCases, u as runWithModuleIsolation, vt as validateTagsFilterExpression, wt as getCaseRowCaseKey, yt as runManifestSchema } from "./runExecution-6lrtj48K.mjs";
 import { readFile, readdir, rm, writeFile } from "node:fs/promises";
 import { dirname, join } from "node:path";
 import { existsSync } from "node:fs";

package/dist/{runner-CTp9zHbM.mjs → runner-C3CiS2o7.mjs} RENAMED Viewed

@@ -1,2 +1,2 @@
-import { n as initRunner, t as getRunnerInstance } from "./runner-B4EfMn1d.mjs";
+import { n as initRunner, t as getRunnerInstance } from "./runner-DYlwuAT3.mjs";
 export { getRunnerInstance, initRunner };

package/dist/{runner-B4EfMn1d.mjs → runner-DYlwuAT3.mjs} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { n as createRunner } from "./cli-Bu9347r1.mjs";
-import "./src-FR60ZR_4.mjs";
+import { n as createRunner } from "./cli-DQO2Fpt2.mjs";
+import "./src-DCGrFAmO.mjs";
 //#region ../../apps/server/src/runner.ts
 let runnerInstance = null;
 function getRunnerInstance() {

package/dist/{src-FR60ZR_4.mjs → src-DCGrFAmO.mjs} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { Mt as defineEval$1, W as matchesEvalTags$1 } from "./runExecution-C31dpemR.mjs";
-import "./cli-Bu9347r1.mjs";
+import { Mt as defineEval$1, W as matchesEvalTags$1 } from "./runExecution-6lrtj48K.mjs";
+import "./cli-DQO2Fpt2.mjs";
 //#region src/index.ts
 /** Register an eval definition with typed tag support. */
 function defineEval(definition) {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@ls-stack/agent-eval",
-  "version": "0.55.2",
+  "version": "0.56.0",
   "type": "module",
   "bin": {
     "agent-evals": "./dist/bin.mjs"

package/skills/agent-eval/SKILL.md CHANGED Viewed

@@ -318,6 +318,11 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
 - `setEvalOutput(key, value)` writes reviewable data for the case. Values are
   stored as received: primitives, objects/arrays, explicit file refs, and
   native `Blob`/`File` values. `columns.format` only controls visualization.
+  Inside `execute`, `setOutput(key, value, formatOrOverride)` can attach a
+  display hint directly to a runtime output, e.g. `'markdown'` or
+  `{ label: 'Receipt', format: 'image', hideInTable: true }`. Authored
+  global/eval `columns` for the same key take precedence over that runtime
+  hint.
   Non-JSON runtime values such as `Date`, `Map`, `Set`, `BigInt`, typed arrays,
   and class instances use the tagged value serializer instead of a string
   fallback. Native `Blob`/`File` values are copied to run artifacts because
@@ -363,8 +368,11 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
 - `llmCalls` (in `agent-evals.config.ts`) configures how LLM-call spans are
   summarized for review. Defaults to `kind: 'llm'` spans with `model`,
   `usage.*`, `latencyMs`, `input`, `output`, etc. read from conventional
-  attribute paths. `latencyMs` is time to first token; duration, total tokens,
-  output tokens/sec, and USD costs are derived. Override `kinds` to broaden the filter,
+  attribute paths. The default `steps` path reads an array from
+  `span.attributes.steps`; if it is missing, direct child `model_step` spans are
+  shown as that call's steps. `latencyMs` is time to first token; duration,
+  total tokens, output tokens/sec, and USD costs are derived. Override `kinds`
+  to broaden the filter,
   override `attributes.<field>` for non-default primitive span shapes, configure
   model-keyed `pricing` to derive USD costs from token counts, with nested
   `providers` entries for provider-specific rates, add `costCurrencies` to show