npm - @ls-stack/agent-eval - Versions diffs - 0.61.1 → 0.62.0 - Mend

@ls-stack/agent-eval 0.61.1 → 0.62.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

package/dist/{app-Dm_9ZTVa.mjs → app-CByWi7LX.mjs} +4 -4
package/dist/apps/web/dist/assets/{index-CwSehYad.js → index-CE1teCsp.js} +70 -70
package/dist/apps/web/dist/assets/{index-CM_zUhl_.css → index-zWPuRQmP.css} +1 -1
package/dist/apps/web/dist/index.html +2 -2
package/dist/bin.mjs +1 -1
package/dist/caseChild.mjs +1 -1
package/dist/{cli-CPBIcMP-.mjs → cli-CCHcjbC1.mjs} +4 -4
package/dist/index.d.mts +70 -58
package/dist/index.mjs +3 -3
package/dist/runChild.mjs +2 -2
package/dist/{runExecution-D-CnSRYy.mjs → runExecution-C24aYsk3.mjs} +97 -15
package/dist/{runOrchestration-Basvyp4u.mjs → runOrchestration-9XKoYcP9.mjs} +1 -1
package/dist/{runner-B6UT1K7L.mjs → runner-5eU-FLHV.mjs} +1 -1
package/dist/{runner-DwNb5TCb.mjs → runner-BEQGkHF0.mjs} +2 -2
package/dist/{src-SixIk0b7.mjs → src-C8n7QANC.mjs} +2 -2
package/package.json +3 -3
package/skills/agent-eval/SKILL.md +3 -3

package/dist/{runExecution-D-CnSRYy.mjs → runExecution-C24aYsk3.mjs} RENAMED Viewed

@@ -991,7 +991,12 @@ function getCaseRowCaseKey(row) {
 //#region ../shared/src/schemas/config.ts
 /** Strategy used to collapse repeated trials into one stored case result. */
 const trialSelectionModeSchema = z.enum(["lowestScore", "median"]);
-/** Built-in eval-level output/column keys. */
+/**
+* Built-in eval-level output/column keys.
+*
+* `costUsd` controls the default LLM cost family: actual billed cost plus the
+* normalized `costUsdWithoutCache` and `costUsdWarmedCache` chart outputs.
+*/
 const defaultConfigKeySchema = z.enum([
 	"apiCalls",
 	"costUsd",
@@ -6301,21 +6306,40 @@ const costNumberFormat = {
 	prefix: "$",
 	maxDecimalPlaces: 4
 };
-const DEFAULT_COLUMNS = {
-	apiCalls: {
-		label: "API Calls",
+const DEFAULT_COST_COLUMNS = {
+	costUsd: {
+		label: "Cost",
 		format: "number",
-		numberFormat: countNumberFormat,
+		numberFormat: costNumberFormat,
 		align: "right",
 		hideIfNoValue: true
 	},
-	costUsd: {
-		label: "Cost",
+	costUsdWithoutCache: {
+		label: "Cost Without Cache",
 		format: "number",
 		numberFormat: costNumberFormat,
 		align: "right",
+		hideInTable: true,
 		hideIfNoValue: true
 	},
+	costUsdWarmedCache: {
+		label: "Cost Warmed Cache",
+		format: "number",
+		numberFormat: costNumberFormat,
+		align: "right",
+		hideInTable: true,
+		hideIfNoValue: true
+	}
+};
+const DEFAULT_COLUMNS = {
+	apiCalls: {
+		label: "API Calls",
+		format: "number",
+		numberFormat: countNumberFormat,
+		align: "right",
+		hideIfNoValue: true
+	},
+	costUsd: { ...DEFAULT_COST_COLUMNS.costUsd },
 	llmTurns: {
 		label: "LLM Turns",
 		format: "number",
@@ -6389,8 +6413,16 @@ function mergeDefaultColumns(params) {
 		};
 		return Object.keys(merged).length > 0 ? merged : void 0;
 	}
+	const defaults = {};
+	for (const key of activeKeys) {
+		defaults[key] = DEFAULT_COLUMNS[key];
+		if (key === "costUsd") {
+			defaults.costUsdWithoutCache = DEFAULT_COST_COLUMNS.costUsdWithoutCache;
+			defaults.costUsdWarmedCache = DEFAULT_COST_COLUMNS.costUsdWarmedCache;
+		}
+	}
 	return {
-		...Object.fromEntries(activeKeys.map((key) => [key, DEFAULT_COLUMNS[key]])),
+		...defaults,
 		...params.globalColumns,
 		...params.columns
 	};
@@ -6445,13 +6477,29 @@ function appendDefaultCharts(params) {
 		hideIfNoValue: true,
 		dedupeConsecutiveValues: true,
 		type: "area",
-		metrics: [{
-			source: "column",
-			key: "costUsd",
-			aggregate: "avg",
-			label: "Cost",
-			color: "warning"
-		}]
+		metrics: [
+			{
+				source: "column",
+				key: "costUsd",
+				aggregate: "avg",
+				label: "Actual",
+				color: "warning"
+			},
+			{
+				source: "column",
+				key: "costUsdWithoutCache",
+				aggregate: "avg",
+				label: "Without Cache",
+				color: "error"
+			},
+			{
+				source: "column",
+				key: "costUsdWarmedCache",
+				aggregate: "avg",
+				label: "Warmed Cache",
+				color: "success"
+			}
+		]
 	});
 	const inputTokenMetrics = [
 		activeKeys.has("inputTokens") ? {
@@ -6538,10 +6586,24 @@ function getMaxLlmTurns(calls) {
 }
 function assignIfMissing(params) {
 	if (!params.activeKeys.has(params.key)) return;
+	assignOutputIfMissing({
+		outputs: params.outputs,
+		key: params.key,
+		value: params.value
+	});
+}
+function assignOutputIfMissing(params) {
 	if (params.key in params.outputs) return;
 	if (params.value === void 0) return;
 	params.outputs[params.key] = params.value;
 }
+function sumSimulatedCost(params) {
+	return sumNullable(params.calls.map((call) => simulateLlmCallCost({
+		entry: call,
+		pricing: params.llmCallsConfig.pricing,
+		scenario: params.scenario
+	}).totalCostUsd));
+}
 function addDefaultOutputs(params) {
 	const activeKeys = new Set(getActiveDefaultConfigKeys(params));
 	if (activeKeys.size === 0) return;
@@ -6566,6 +6628,26 @@ function addDefaultOutputs(params) {
 		value: sumNullable(calls.map((call) => call.costUsd)),
 		activeKeys
 	});
+	if (activeKeys.has("costUsd")) {
+		assignOutputIfMissing({
+			outputs: params.outputs,
+			key: "costUsdWithoutCache",
+			value: sumSimulatedCost({
+				calls,
+				llmCallsConfig: params.llmCallsConfig,
+				scenario: "noCache"
+			})
+		});
+		assignOutputIfMissing({
+			outputs: params.outputs,
+			key: "costUsdWarmedCache",
+			value: sumSimulatedCost({
+				calls,
+				llmCallsConfig: params.llmCallsConfig,
+				scenario: "withBaseCaching"
+			})
+		});
+	}
 	assignIfMissing({
 		outputs: params.outputs,
 		key: "inputTokens",

package/dist/{runOrchestration-Basvyp4u.mjs → runOrchestration-9XKoYcP9.mjs} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { Ct as buildCaseKey, Dt as caseRowSchema, Et as caseDetailSchema, Ft as runWithEvalRegistry, Tt as getCaseRowCaseKey, X as runWithEvalClock, _t as validateEvalTagName, bt as runSummarySchema, d as loadEvalModule, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, gt as matchesTagsFilter, h as commitPendingCacheWrites, ht as dedupeEvalTags, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromChildStatuses, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveStatusFromCaseRows, q as runInEvalRuntimeScope, t as filterEvalCases, u as runWithModuleIsolation, vt as validateTagsFilterExpression, yt as runManifestSchema } from "./runExecution-D-CnSRYy.mjs";
+import { Ct as buildCaseKey, Dt as caseRowSchema, Et as caseDetailSchema, Ft as runWithEvalRegistry, Tt as getCaseRowCaseKey, X as runWithEvalClock, _t as validateEvalTagName, bt as runSummarySchema, d as loadEvalModule, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, gt as matchesTagsFilter, h as commitPendingCacheWrites, ht as dedupeEvalTags, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromChildStatuses, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveStatusFromCaseRows, q as runInEvalRuntimeScope, t as filterEvalCases, u as runWithModuleIsolation, vt as validateTagsFilterExpression, yt as runManifestSchema } from "./runExecution-C24aYsk3.mjs";
 import { Result, resultify } from "t-result";
 import { readFile, readdir, rm, writeFile } from "node:fs/promises";
 import { dirname, join } from "node:path";

package/dist/{runner-B6UT1K7L.mjs → runner-5eU-FLHV.mjs} RENAMED Viewed

@@ -1,2 +1,2 @@
-import { n as initRunner, t as getRunnerInstance } from "./runner-DwNb5TCb.mjs";
+import { n as initRunner, t as getRunnerInstance } from "./runner-BEQGkHF0.mjs";
 export { getRunnerInstance, initRunner };

package/dist/{runner-DwNb5TCb.mjs → runner-BEQGkHF0.mjs} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { n as createRunner } from "./cli-CPBIcMP-.mjs";
-import "./src-SixIk0b7.mjs";
+import { n as createRunner } from "./cli-CCHcjbC1.mjs";
+import "./src-C8n7QANC.mjs";
 //#region ../../apps/server/src/runner.ts
 let runnerInstance = null;
 function getRunnerInstance({ loadEnv = true } = {}) {

package/dist/{src-SixIk0b7.mjs → src-C8n7QANC.mjs} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { Nt as defineEval$1, W as matchesEvalTags$1 } from "./runExecution-D-CnSRYy.mjs";
-import "./cli-CPBIcMP-.mjs";
+import { Nt as defineEval$1, W as matchesEvalTags$1 } from "./runExecution-C24aYsk3.mjs";
+import "./cli-CCHcjbC1.mjs";
 //#region src/index.ts
 /** Register an eval definition with typed tag support. */
 function defineEval(definition) {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@ls-stack/agent-eval",
-  "version": "0.61.1",
+  "version": "0.62.0",
   "type": "module",
   "bin": {
     "agent-evals": "./dist/bin.mjs"
@@ -33,8 +33,8 @@
     "@types/node": "^24.7.2",
     "typescript": "^5.9.2",
     "@agent-evals/runner": "0.0.1",
-    "@agent-evals/sdk": "0.0.1",
-    "@agent-evals/shared": "0.0.1"
+    "@agent-evals/shared": "0.0.1",
+    "@agent-evals/sdk": "0.0.1"
   },
   "scripts": {
     "build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",

package/skills/agent-eval/SKILL.md CHANGED Viewed

@@ -109,7 +109,7 @@ export async function runRefundWorkflow(input: RefundInput) {
 }
 ```
-Span `kind` values are open-ended strings. Use familiar kinds such as `agent`, `tool`, `llm`, `api`, `retrieval`, `scorer`, or `checkpoint` when they fit, and preserve external tracer kinds such as `mastra.workflow.step` when they are more specific. Only the `input` and `output` span attributes are promoted automatically in the trace tree; use `traceDisplay` for other span attributes such as `model` or `usage`. Eval-level LLM usage outputs, columns, stats, and charts are derived from matching LLM spans by default. Prefer `llmCalls.pricing` for LLM-call cost display; built-in costs ignore span `costUsd` attributes.
+Span `kind` values are open-ended strings. Use familiar kinds such as `agent`, `tool`, `llm`, `api`, `retrieval`, `scorer`, or `checkpoint` when they fit, and preserve external tracer kinds such as `mastra.workflow.step` when they are more specific. Only the `input` and `output` span attributes are promoted automatically in the trace tree; use `traceDisplay` for other span attributes such as `model` or `usage`. Eval-level LLM usage outputs, columns, stats, and charts are derived from matching LLM spans by default. Prefer `llmCalls.pricing` for LLM-call cost display; built-in costs ignore span `costUsd` attributes and derive normalized cost outputs for no-cache and warmed-cache chart comparisons.
 Use `captureEvalSpanError(error)` for recoverable errors on the active `evalTracer.span(...)`, such as optional model/tool failures that fall back and continue. You can pass one error, multiple error arguments, or an array. The span is still marked `error`. Pass `'warning'` or `{ level: 'warning' }` as the final argument for diagnostics that should not change an otherwise successful span's status.
@@ -212,11 +212,11 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape (forma
 - `tracingAssertions` is a single function that can be authored globally or locally on one eval when a finished-trace invariant should pass or fail the case without creating a fake score column. It receives the same `{ trace, input, case }` context as `deriveFromTracing`; call `evalAssert(...)` or `evalExpect(...)` inside it. Useful trace helpers include `trace.findSpan(name)`, `trace.findSpans(name)`, `trace.hasSpan(name)`, `trace.findSpansByKind(kind)`, `trace.findToolCallSpans()`, `trace.listToolCallSpanNames()`, `trace.hasToolCallSpan(name)`, `trace.getToolCallSpans(name)`, `trace.getToolCallSpanCount(toolName)`, `trace.hasToolCallSpanCount(toolName, expectedCalls)`, `trace.listSpanNames(kind?)`, `trace.listSpanNamesDfs(kind?)`, and `trace.flattenDfs()`. The tool-call helpers include both `kind: 'tool'` spans and imported execution spans recorded as `kind: 'tool_call'`. Tool-name checks and counts match the span `name` as well as GenAI/Mastra identity attributes such as `genAI["gen_ai.tool.name"]` and `mastra.entityName`; list helpers prefer those tool identity attributes when present. `getToolCallSpans(name)` returns one normalized object per matching call, including parsed `arguments`, parsed `result`, `description`, `toolType`, `attributes`, and the original `span`.
 - `traceDisplay` promotes selected span attributes into the trace tree and detail pane; it supports aggregation across subtrees (`scope`, `mode`) and user-defined `transform(...)` for derived views (e.g. currency conversion). See the `TraceDisplayInputConfig` type.
 - `llmCalls` (in `agent-evals.config.ts`) configures how LLM-call spans are summarized for review. Defaults to `kind: 'llm'` spans with `model`, `usage.*`, `latencyMs`, `input`, `output`, etc. read from conventional attribute paths. The default `steps` path reads an array from `span.attributes.steps`; if it is missing, direct child `model_step` spans are shown as that call's steps. Tool calls are aggregated from the configured `toolCalls` path plus step-level `toolCalls` on authored step arrays or direct `model_step` child spans, including Mastra's serialized `mastra.model_step.output` format, and child `tool_call` execution spans under each model step. `latencyMs` is time to first token; duration, total tokens, output tokens/sec, and USD costs are derived. Override `kinds` to broaden the filter, override `attributes.<field>` for non-default primitive span shapes, configure model-keyed `pricing` to derive USD costs from token counts, with nested `providers` entries for provider-specific rates, add `costCurrencies` to show converted cost columns in the expanded breakdown table only, add `derivedAttributes` to persist computed values back onto matching LLM spans before trace consumers run, and add entries to `metrics` to surface arbitrary user metrics (`format: 'string' | 'number' | 'duration' | 'json' | 'boolean'`, `placements: ['header' | 'body']`). `derivedAttributes` can be a keyed map for one-off fields or one callback that returns multiple path/value pairs. Derived keys are dot-paths under `span.attributes`; return `undefined` to skip one span or one returned key.
-- Default usage config derives missing eval outputs from matching LLM/API spans before `outputsSchema` and scores run: `apiCalls`, `costUsd`, `llmTurns`, `inputTokens`, `outputTokens`, `totalTokens`, `cachedInputTokens`, `cacheCreationInputTokens`, `reasoningTokens`, and `llmDurationMs`. Authored outputs and column overrides win. Default usage columns, stats, and charts use `hideIfNoValue: true`. Default LLM usage charts configure cost, input tokens, and output tokens separately and use `dedupeConsecutiveValues: true` to skip repeated adjacent chart values. `totalTokens` is input + output only; cache read/write tokens stay separate and affect `costUsd` at their own rates. `llmTurns` is the maximum per-call turn count in the case run, using configured steps when available and otherwise one turn per matched LLM call span. Derived base input cost uses `inputTokens - cachedInputTokens - cacheCreationInputTokens` so cache details are not double-counted. `cacheCreationInputTokens` is the total cache-write count; optional `cacheCreationInput1hTokens` only splits that total for 1-hour write pricing via `cacheCreationInput1hUsdPerMillion`. `llmDurationMs` sums elapsed matched LLM span durations; it is not time-to-first-token latency. Remove defaults globally or per eval with `removeDefaultConfig: true` or a key list such as `removeDefaultConfig: ['apiCalls', 'reasoningTokens']`.
+- Default usage config derives missing eval outputs from matching LLM/API spans before `outputsSchema` and scores run: `apiCalls`, `costUsd`, `costUsdWithoutCache`, `costUsdWarmedCache`, `llmTurns`, `inputTokens`, `outputTokens`, `totalTokens`, `cachedInputTokens`, `cacheCreationInputTokens`, `reasoningTokens`, and `llmDurationMs`. Authored outputs and column overrides win. The web UI fills in baseline run-health stats (`cases`, `passRate`, `duration`) and a pass-rate/duration history chart when an eval has not already authored equivalent run-health UI. If discovery metadata is missing but saved runs contain runtime columns such as `costUsd`, `inputTokens`, or `apiCalls`, the single-eval page can infer the standard usage stats and charts from those saved run values. Default usage columns, stats, and charts use `hideIfNoValue: true`. Default LLM usage charts configure cost, input tokens, and output tokens separately and use `dedupeConsecutiveValues: true` to skip repeated adjacent chart values. The default LLM cost chart compares actual billed cost, no-cache normalized cost, and warmed-cache normalized cost. `totalTokens` is input + output only; cache read/write tokens stay separate and affect `costUsd` at their own rates. `costUsdWithoutCache` bills all input tokens at the base input rate; `costUsdWarmedCache` treats cacheable tokens as warmed cache reads. `llmTurns` is the maximum per-call turn count in the case run, using configured steps when available and otherwise one turn per matched LLM call span. Derived base input cost uses `inputTokens - cachedInputTokens - cacheCreationInputTokens` so cache details are not double-counted. `cacheCreationInputTokens` is the total cache-write count; optional `cacheCreationInput1hTokens` only splits that total for 1-hour write pricing via `cacheCreationInput1hUsdPerMillion`. `llmDurationMs` sums elapsed matched LLM span durations; it is not time-to-first-token latency. Remove defaults globally or per eval with `removeDefaultConfig: true` or a key list such as `removeDefaultConfig: ['apiCalls', 'reasoningTokens']`; removing `costUsd` removes the default cost family.
 - `apiCalls` (in `agent-evals.config.ts`) configures how API-call spans are summarized for review. Defaults to `kind: 'api'`, `'http'`, `'http.client'`, and `'fetch'` spans with `method`, `url`, `statusCode`, `request`, `routeAlias`, `response`, `requestBody`, `responseBody`, `headers`, `durationMs`, and `error` read from conventional attribute paths. Override `kinds` or `attributes.<field>` for external tracers. Set a per-span `routeAlias` attribute such as `/v3/tabs/:id` to group dynamic URL paths in API-call route labels and endpoint charts while preserving original URLs in row details. Add `derivedAttributes` as a keyed map or object-returning callback for computed persisted API span attributes, and add `metrics` with the same formats and placements as LLM-call metrics.
 - `runLogs` (in `agent-evals.config.ts`) controls case log capture. Use `runLogs: { captureConsole: false }` to keep console output in the terminal without persisting console calls to case details. Manual `evalLog(...)` calls are still captured. Captured log locations store the selected user-facing source frame and the full JavaScript stack so agents can inspect additional frames in persisted artifacts when diagnosing where a log came from.
-Stats rows and history charts can be authored via `stats` / `charts` on the eval definition. Global `stats` in `agent-evals.config.ts` combine with eval-level stats. Native stat kinds include `cases`, `passRate`, `duration`, and `cacheHits`; `cacheHits` shows Agent Eval operation-level cache hits over total cache operations (`hits/total`) from spans and `evalTracer.cache(...)` refs, not LLM provider prompt-cache read tokens such as `cachedInputTokens`. Cache-hit stats use a separate aggregate control and default to `sum`; `avg` is average per-case hit rate, and min/max/best/worst select cases by hit rate. `duration` aggregates per-case durations using the same modes as column stats. Usage stats and LLM usage charts are added by default unless removed with `removeDefaultConfig`. Column stats can override `format` and `numberFormat`, otherwise they inherit from the matching column. Duration and column stat aggregates support `avg`, `min`, `max`, `sum`, `best` (highest finite value), and `worst` (lowest finite value). Use `defaultStatAggregate` in `agent-evals.config.ts` to set the workspace-wide initial duration/column stat mode, or on an eval definition to override it for that eval. Number formats use `maxDecimalPlaces` to cap decimals and `minDecimalPlaces` to pad trailing zeroes. Without `maxDecimalPlaces`, the default cap is 3 decimal places. Stats and charts support `hideIfNoValue: true`. Charts support `dedupeConsecutiveValues: true` to omit consecutive points whose plotted metrics and tooltip extras match the previous kept point. Their shapes live in the types; no need to memorize the option set.
+Stats rows and history charts can be authored via `stats` / `charts` on the eval definition. Global `stats` in `agent-evals.config.ts` combine with eval-level stats. The web UI automatically supplies missing `cases`, `passRate`, and `duration` stats plus a pass-rate/duration history chart, including for a single completed run. Native stat kinds include `cases`, `passRate`, `duration`, and `cacheHits`; `cacheHits` shows Agent Eval operation-level cache hits over total cache operations (`hits/total`) from spans and `evalTracer.cache(...)` refs, not LLM provider prompt-cache read tokens such as `cachedInputTokens`. Cache-hit stats use a separate aggregate control and default to `sum`; `avg` is average per-case hit rate, and min/max/best/worst select cases by hit rate. `duration` aggregates per-case durations using the same modes as column stats. Usage stats and LLM usage charts are added by default unless removed with `removeDefaultConfig`. Column stats can override `format` and `numberFormat`, otherwise they inherit from the matching column. Duration and column stat aggregates support `avg`, `min`, `max`, `sum`, `best` (highest finite value), and `worst` (lowest finite value). Use `defaultStatAggregate` in `agent-evals.config.ts` to set the workspace-wide initial duration/column stat mode, or on an eval definition to override it for that eval. Number formats use `maxDecimalPlaces` to cap decimals and `minDecimalPlaces` to pad trailing zeroes. Without `maxDecimalPlaces`, the default cap is 3 decimal places. Stats and charts support `hideIfNoValue: true`. Charts support `dedupeConsecutiveValues: true` to omit consecutive points whose plotted metrics and tooltip extras match the previous kept point. Rendered charts with no plottable values show an unavailable state instead of a blank frame. Their shapes live in the types; no need to memorize the option set.
 ## Cached operations