npm - @ls-stack/agent-eval - Versions diffs - 0.21.0 → 0.22.0 - Mend

@ls-stack/agent-eval 0.21.0 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

package/dist/{app-CmwmcUgG.mjs → app-moDHbg1O.mjs} +3 -3
package/dist/apps/web/dist/assets/index-AUDD3rNB.js +118 -0
package/dist/apps/web/dist/index.html +1 -1
package/dist/bin.mjs +1 -1
package/dist/{cli-DumvanQI.mjs → cli-C0EtHhEO.mjs} +3 -3
package/dist/index.d.mts +46 -53
package/dist/index.mjs +3 -3
package/dist/runChild.mjs +1 -1
package/dist/{runOrchestration-zYAcAPtS.mjs → runOrchestration-D1edUDhp.mjs} +109 -78
package/dist/{runner-Dy_PECaf.mjs → runner-C9nP2VKL.mjs} +2 -2
package/dist/{runner-BcwyX9CO.mjs → runner-CyRhIzci.mjs} +1 -1
package/dist/src-D-HuV8I-.mjs +3 -0
package/package.json +1 -1
package/skills/agent-eval/SKILL.md +23 -17
package/dist/apps/web/dist/assets/index-EXO08yya.js +0 -118
package/dist/src-BoAJb4wC.mjs +0 -3

package/dist/{runOrchestration-zYAcAPtS.mjs → runOrchestration-D1edUDhp.mjs} RENAMED

@@ -2468,7 +2468,8 @@ const caseRowSchema = z.object({
 		"error",
 		"cancelled"
 	]),
-	latencyMs: z.number().nullable(),
+	/** Elapsed case execution duration in milliseconds, or null before completion. */
+	durationMs: z.number().nullable(),
 	costUsd: z.number().nullable().optional(),
 	columns: z.record(z.string(), cellValueSchema),
 	/** Winning trial index for the persisted case result. */
@@ -2586,7 +2587,7 @@ const defaultConfigKeySchema = z.enum([
 	"cachedInputTokens",
 	"cacheCreationInputTokens",
 	"reasoningTokens",
-	"llmLatencyMs"
+	"llmDurationMs"
 ]);
 /** Removal config for built-in eval-level outputs and UI metadata. */
 const removeDefaultConfigSchema = z.union([z.literal(true), z.array(defaultConfigKeySchema)]);
@@ -2663,7 +2664,7 @@ const apiCallMetricSchema = z.object({
 });
 /**
 * Schema for one model/provider pricing entry used to derive LLM-call costs
-* from token counts when a span does not already record explicit USD costs.
+* from token counts.
 */
 const llmCallPricingSchema = z.object({
 	/** Exact model name read from the configured `attributes.model` path. */
@@ -2682,6 +2683,8 @@ const llmCallPricingSchema = z.object({
 	cachedInputUsdPerMillion: z.number().nonnegative().optional(),
 	/** USD per one million prompt-cache write tokens. */
 	cacheCreationInputUsdPerMillion: z.number().nonnegative().optional(),
+	/** USD per one million one-hour prompt-cache write tokens. */
+	cacheCreationInput1hUsdPerMillion: z.number().nonnegative().optional(),
 	/** USD per one million reasoning tokens when reported separately. */
 	reasoningUsdPerMillion: z.number().nonnegative().optional()
 });
@@ -2692,12 +2695,9 @@ const llmCallsConfigSchema = z.object({
 	/**
 	* Attribute paths used to extract structured per-call fields. Each entry is
 	* a dot-path inside `span.attributes`. Missing paths fall back to the
-	* built-in defaults (e.g. `usage.inputTokens`, `costUsd`).
-	*
-	* Per-token-type cost paths (`inputCost`, `outputCost`, `cachedInputCost`,
-	* `reasoningCost`) feed the cost breakdown table in the expanded row when
-	* spans provide explicit USD cost overrides. Prefer `pricing` for deriving
-	* costs from token counts globally.
+	* built-in defaults (e.g. `usage.inputTokens`). Derived fields such as
+	* total tokens, tokens/sec, duration, and USD costs are intentionally not
+	* configurable as attribute paths.
 	*/
 	attributes: z.object({
 		model: z.string().optional(),
@@ -2706,15 +2706,9 @@ const llmCallsConfigSchema = z.object({
 		outputTokens: z.string().optional(),
 		cachedInputTokens: z.string().optional(),
 		cacheCreationInputTokens: z.string().optional(),
+		cacheCreationInput1hTokens: z.string().optional(),
 		reasoningTokens: z.string().optional(),
-		totalTokens: z.string().optional(),
-		tokensPerSecond: z.string().optional(),
-		cost: z.string().optional(),
-		inputCost: z.string().optional(),
-		outputCost: z.string().optional(),
-		cachedInputCost: z.string().optional(),
-		cacheCreationInputCost: z.string().optional(),
-		reasoningCost: z.string().optional(),
+		latencyMs: z.string().optional(),
 		steps: z.string().optional(),
 		finishReason: z.string().optional(),
 		input: z.string().optional(),
@@ -2723,9 +2717,8 @@ const llmCallsConfigSchema = z.object({
 		toolCalls: z.string().optional()
 	}).optional(),
 	/**
-	* Model/provider pricing registry used to calculate missing LLM-call costs
-	* from token counts. Explicit span attributes (`costUsd`, `cost.inputUsd`,
-	* etc.) take precedence over derived prices.
+	* Model/provider pricing registry used to calculate LLM-call costs from
+	* token counts. Built-in LLM cost fields are only derived from this registry.
 	*/
 	pricing: z.array(llmCallPricingSchema).optional(),
 	/** Custom user-defined metrics surfaced on each LLM call. */
@@ -2773,15 +2766,9 @@ const DEFAULT_LLM_CALLS_CONFIG = {
 		outputTokens: "usage.outputTokens",
 		cachedInputTokens: "usage.cachedInputTokens",
 		cacheCreationInputTokens: "usage.cacheCreationInputTokens",
+		cacheCreationInput1hTokens: "usage.cacheCreationInput1hTokens",
 		reasoningTokens: "usage.reasoningTokens",
-		totalTokens: "usage.totalTokens",
-		tokensPerSecond: "tokensPerSecond",
-		cost: "costUsd",
-		inputCost: "cost.inputUsd",
-		outputCost: "cost.outputUsd",
-		cachedInputCost: "cost.cachedInputUsd",
-		cacheCreationInputCost: "cost.cacheCreationInputUsd",
-		reasoningCost: "cost.reasoningUsd",
+		latencyMs: "latencyMs",
 		steps: "steps",
 		finishReason: "finishReason",
 		input: "input",
@@ -2823,8 +2810,8 @@ const DEFAULT_API_CALLS_CONFIG = {
 *   attribute path.
 * - Missing `metrics[].format` defaults to `'string'`.
 * - Missing `metrics[].placements` defaults to `['body']`.
-* - Missing `pricing` defaults to an empty registry; explicit span costs still
-*   take precedence over derived costs.
+* - Missing `pricing` defaults to an empty registry; built-in costs are only
+*   derived from configured pricing and token counts.
 */
 function resolveLlmCallsConfig(input) {
 	return {
@@ -2848,6 +2835,7 @@ function resolveLlmCallsConfig(input) {
 			outputUsdPerMillion: p.outputUsdPerMillion,
 			cachedInputUsdPerMillion: p.cachedInputUsdPerMillion,
 			cacheCreationInputUsdPerMillion: p.cacheCreationInputUsdPerMillion,
+			cacheCreationInput1hUsdPerMillion: p.cacheCreationInput1hUsdPerMillion,
 			reasoningUsdPerMillion: p.reasoningUsdPerMillion
 		}))
 	};
@@ -3037,8 +3025,8 @@ function deriveScopedSummaryFromCases(params) {
 		else if (caseRow.status === "cancelled") cancelledCases += 1;
 		else if (caseRow.status === "running") runningCases += 1;
 		else pendingCases += 1;
-		if (caseRow.latencyMs !== null) {
-			totalDurationMs += caseRow.latencyMs;
+		if (caseRow.durationMs !== null) {
+			totalDurationMs += caseRow.durationMs;
 			hasDuration = true;
 		}
 	}
@@ -3132,6 +3120,21 @@ function computeTokenCost(tokens, usdPerMillion) {
 	if (usdPerMillion === void 0) return null;
 	return tokens / 1e6 * usdPerMillion;
 }
+function computeCacheCreationInputCost({ cacheCreationInputTokens, cacheCreationInput1hTokens, usdPerMillion, oneHourUsdPerMillion }) {
+	if (cacheCreationInputTokens === null) return null;
+	if (cacheCreationInputTokens === 0) return 0;
+	if (cacheCreationInput1hTokens === null) return computeTokenCost(cacheCreationInputTokens, usdPerMillion);
+	const oneHourTokens = Math.min(cacheCreationInput1hTokens, cacheCreationInputTokens);
+	const shortLivedCost = computeTokenCost(cacheCreationInputTokens - oneHourTokens, usdPerMillion);
+	const oneHourCost = computeTokenCost(oneHourTokens, oneHourUsdPerMillion);
+	if (shortLivedCost === null || oneHourCost === null) return null;
+	return shortLivedCost + oneHourCost;
+}
+function computeBaseInputTokens({ inputTokens, cachedInputTokens, cacheCreationInputTokens }) {
+	if (inputTokens === null) return null;
+	const cachedTokens = (cachedInputTokens ?? 0) + (cacheCreationInputTokens ?? 0);
+	return Math.max(inputTokens - cachedTokens, 0);
+}
 function pickPricingEntry({ pricing, model, provider }) {
 	if (model === null) return null;
 	let fallback = null;
@@ -3145,7 +3148,7 @@ function pickPricingEntry({ pricing, model, provider }) {
 	}
 	return fallback;
 }
-function computeFallbackTotalCost({ inputTokens, inputCostUsd, outputTokens, outputCostUsd, cachedInputTokens, cachedInputCostUsd, cacheCreationInputTokens, cacheCreationInputCostUsd, reasoningTokens, reasoningCostUsd }) {
+function computeTotalCost({ inputTokens, inputCostUsd, outputTokens, outputCostUsd, cachedInputTokens, cachedInputCostUsd, cacheCreationInputTokens, cacheCreationInputCostUsd, reasoningTokens, reasoningCostUsd }) {
 	const parts = [
 		{
 			tokens: inputTokens,
@@ -3182,7 +3185,7 @@ function computeFallbackTotalCost({ inputTokens, inputCostUsd, outputTokens, out
 	if (hasCost) return total;
 	return hasReportedTokens ? 0 : null;
 }
-function computeLatencyMs$1(span) {
+function computeDurationMs$1(span) {
 	if (span.endedAt === null) return null;
 	const started = Date.parse(span.startedAt);
 	const ended = Date.parse(span.endedAt);
@@ -3190,10 +3193,16 @@ function computeLatencyMs$1(span) {
 	const delta = ended - started;
 	return delta >= 0 ? delta : null;
 }
-function computeTotalTokens({ declared, input, output, cached, cacheCreation }) {
-	if (declared !== null) return declared;
-	if (input === null && output === null && cached === null && cacheCreation === null) return null;
-	return (input ?? 0) + (output ?? 0) + (cached ?? 0) + (cacheCreation ?? 0);
+function computeTotalTokens({ input, output }) {
+	if (input === null && output === null) return null;
+	return (input ?? 0) + (output ?? 0);
+}
+function computeTokensPerSecond({ outputTokens, durationMs, latencyMs }) {
+	if (outputTokens === null || durationMs === null) return null;
+	if (outputTokens === 0) return 0;
+	const generationMs = latencyMs === null ? durationMs : durationMs - latencyMs;
+	if (generationMs <= 0) return null;
+	return outputTokens / (generationMs / 1e3);
 }
 function readSteps(attributes, path) {
 	const raw = getNestedAttribute(attributes, path);
@@ -3201,10 +3210,6 @@ function readSteps(attributes, path) {
 		stepCount: raw.length,
 		stepDetails: raw
 	};
-	if (typeof raw === "number" && Number.isFinite(raw)) return {
-		stepCount: raw,
-		stepDetails: null
-	};
 	return {
 		stepCount: null,
 		stepDetails: null
@@ -3226,16 +3231,22 @@ function pickError$1(span) {
 * shape consumed by the LLM calls tab.
 *
 * Spans whose `kind` is not in `config.kinds` are dropped. Structured fields
-* (`model`, token counts, explicit cost, etc.) are read via
+* (`model`, token counts, latency, etc.) are read via
 * `getNestedAttribute` from the configured paths, with safe coercion to
-* `string | null` / `number | null`. When explicit USD costs are absent,
-* configured model pricing derives per-token-type costs from token counts.
-* `totalTokens` falls back to a sum of input + output + cached when no
-* explicit total attribute is present. The `steps` attribute path may resolve
-* to either a number (rendered as the inference-round count) or an array of
-* per-step detail objects (rendered as a Steps section in the body, with
-* `stepCount` derived from the array length). `latencyMs` is `null` while the
-* span is still running. User-defined `metrics` whose path resolves to
+* `string | null` / `number | null`. `latencyMs` is an explicit
+* time-to-first-token attribute; full span elapsed time is reported separately
+* as `durationMs`. Built-in USD costs are derived only from configured model
+* pricing and token counts. `totalTokens` is always derived from input +
+* output tokens. Cached input and cache creation tokens are reported
+* separately because they are subsets of input/output usage. The main cache
+* creation token field is treated as the total write count; optional one-hour
+* cache creation tokens only split that total for cost calculation. Base input
+* cost uses input minus cache read/write tokens so cached tokens are not
+* charged twice. Cache read/write costs still contribute to the total USD cost
+* at their configured rates. The `steps` attribute path may resolve to an array
+* of per-step detail objects, with `stepCount` derived from the array length.
+* `durationMs` and `tokensPerSecond` are `null` while the span is still
+* running. User-defined `metrics` whose path resolves to
 * `undefined` are dropped, but `null`, `0`, and `false` are preserved as
 * legitimate values worth displaying. Original span order is preserved so the
 * LLM calls tab matches the ordering in the Trace tab.
@@ -3252,19 +3263,30 @@ function extractLlmCalls(spans, config) {
 		const outputTokens = readNumber$2(attrs, config.attributes.outputTokens);
 		const cachedInputTokens = readNumber$2(attrs, config.attributes.cachedInputTokens);
 		const cacheCreationInputTokens = readNumber$2(attrs, config.attributes.cacheCreationInputTokens);
+		const cacheCreationInput1hTokens = readNumber$2(attrs, config.attributes.cacheCreationInput1hTokens);
 		const reasoningTokens = readNumber$2(attrs, config.attributes.reasoningTokens);
-		const declaredTotalTokens = readNumber$2(attrs, config.attributes.totalTokens);
+		const latencyMs = readNumber$2(attrs, config.attributes.latencyMs);
+		const durationMs = computeDurationMs$1(span);
 		const pricing = pickPricingEntry({
 			pricing: config.pricing,
 			model,
 			provider
 		});
-		const inputCostUsd = readNumber$2(attrs, config.attributes.inputCost) ?? computeTokenCost(inputTokens, pricing?.inputUsdPerMillion);
-		const outputCostUsd = readNumber$2(attrs, config.attributes.outputCost) ?? computeTokenCost(outputTokens, pricing?.outputUsdPerMillion);
-		const cachedInputCostUsd = readNumber$2(attrs, config.attributes.cachedInputCost) ?? computeTokenCost(cachedInputTokens, pricing?.cachedInputUsdPerMillion);
-		const cacheCreationInputCostUsd = readNumber$2(attrs, config.attributes.cacheCreationInputCost) ?? computeTokenCost(cacheCreationInputTokens, pricing?.cacheCreationInputUsdPerMillion);
-		const reasoningCostUsd = readNumber$2(attrs, config.attributes.reasoningCost) ?? computeTokenCost(reasoningTokens, pricing?.reasoningUsdPerMillion);
-		const costUsd = readNumber$2(attrs, config.attributes.cost) ?? computeFallbackTotalCost({
+		const inputCostUsd = computeTokenCost(computeBaseInputTokens({
+			inputTokens,
+			cachedInputTokens,
+			cacheCreationInputTokens
+		}), pricing?.inputUsdPerMillion);
+		const outputCostUsd = computeTokenCost(outputTokens, pricing?.outputUsdPerMillion);
+		const cachedInputCostUsd = computeTokenCost(cachedInputTokens, pricing?.cachedInputUsdPerMillion);
+		const cacheCreationInputCostUsd = computeCacheCreationInputCost({
+			cacheCreationInputTokens,
+			cacheCreationInput1hTokens,
+			usdPerMillion: pricing?.cacheCreationInputUsdPerMillion,
+			oneHourUsdPerMillion: pricing?.cacheCreationInput1hUsdPerMillion
+		});
+		const reasoningCostUsd = computeTokenCost(reasoningTokens, pricing?.reasoningUsdPerMillion);
+		const costUsd = computeTotalCost({
 			inputTokens,
 			inputCostUsd,
 			outputTokens,
@@ -3302,13 +3324,15 @@ function extractLlmCalls(spans, config) {
 			cacheCreationInputTokens,
 			reasoningTokens,
 			totalTokens: computeTotalTokens({
-				declared: declaredTotalTokens,
 				input: inputTokens,
-				output: outputTokens,
-				cached: cachedInputTokens,
-				cacheCreation: cacheCreationInputTokens
+				output: outputTokens
+			}),
+			latencyMs,
+			tokensPerSecond: computeTokensPerSecond({
+				outputTokens,
+				durationMs,
+				latencyMs
 			}),
-			tokensPerSecond: readNumber$2(attrs, config.attributes.tokensPerSecond),
 			costUsd,
 			inputCostUsd,
 			outputCostUsd,
@@ -3317,7 +3341,7 @@ function extractLlmCalls(spans, config) {
 			reasoningCostUsd,
 			...readSteps(attrs, config.attributes.steps),
 			finishReason: readString$2(attrs, config.attributes.finishReason),
-			latencyMs: computeLatencyMs$1(span),
+			durationMs,
 			input: getNestedAttribute(attrs, config.attributes.input),
 			output: getNestedAttribute(attrs, config.attributes.output),
 			reasoning: getNestedAttribute(attrs, config.attributes.reasoning),
@@ -3342,7 +3366,7 @@ function readString$1(attributes, path) {
 	const raw = getNestedAttribute(attributes, path);
 	return typeof raw === "string" && raw.length > 0 ? raw : null;
 }
-function computeLatencyMs(span) {
+function computeDurationMs(span) {
 	if (span.endedAt === null) return null;
 	const started = Date.parse(span.startedAt);
 	const ended = Date.parse(span.endedAt);
@@ -3367,10 +3391,10 @@ function pickError(span) {
 *
 * Spans whose `kind` is not in `config.kinds` are dropped. Structured fields
 * (`method`, `url`, `statusCode`, etc.) are read via `getNestedAttribute` from
-* the configured paths. `durationMs` takes precedence for latency, with a
-* fallback to the span start/end timestamps. User-defined `metrics` whose path
-* resolves to `undefined` are dropped, but `null`, `0`, and `false` are
-* preserved as legitimate values worth displaying. Original span order is
+* the configured paths. An explicit `durationMs` attribute takes precedence,
+* with a fallback to the span start/end timestamps. User-defined `metrics`
+* whose path resolves to `undefined` are dropped, but `null`, `0`, and `false`
+* are preserved as legitimate values worth displaying. Original span order is
 * preserved so the API calls tab matches the ordering in the Trace tab.
 */
 function extractApiCalls(spans, config) {
@@ -3400,7 +3424,7 @@ function extractApiCalls(spans, config) {
 			method: readString$1(attrs, config.attributes.method),
 			url: readString$1(attrs, config.attributes.url),
 			statusCode: readNumber$1(attrs, config.attributes.statusCode),
-			latencyMs: readNumber$1(attrs, config.attributes.durationMs) ?? computeLatencyMs(span),
+			durationMs: readNumber$1(attrs, config.attributes.durationMs) ?? computeDurationMs(span),
 			request: getNestedAttribute(attrs, config.attributes.request),
 			response: getNestedAttribute(attrs, config.attributes.response),
 			requestBody: getNestedAttribute(attrs, config.attributes.requestBody),
@@ -3798,7 +3822,7 @@ async function writeCacheFile(cacheDir, cacheFile) {
 	await mkdir(cacheDir, { recursive: true });
 	const filePath = ownerPath(cacheDir, cacheFile.owner);
 	const tmpPath = `${filePath}.${process.pid.toString()}.tmp`;
-	await writeFile(tmpPath, JSON.stringify(cacheFile));
+	await writeFile(tmpPath, JSON.stringify(cacheFile, null, 2));
 	await rename(tmpPath, filePath);
 }
 async function readDebugKeyFile(debugDir, owner) {
@@ -4237,7 +4261,8 @@ const DEFAULT_CONFIG_KEYS = [
 	"totalTokens",
 	"cachedInputTokens",
 	"cacheCreationInputTokens",
-	"llmLatencyMs"
+	"reasoningTokens",
+	"llmDurationMs"
 ];
 const tokenNumberFormat = { notation: "compact" };
 const countNumberFormat = {
@@ -4303,8 +4328,8 @@ const DEFAULT_COLUMNS = {
 		numberFormat: tokenNumberFormat,
 		align: "right"
 	},
-	llmLatencyMs: {
-		label: "LLM Latency",
+	llmDurationMs: {
+		label: "LLM Duration",
 		format: "duration",
 		align: "right"
 	}
@@ -4509,8 +4534,14 @@ function addDefaultOutputs(params) {
 	});
 	assignIfMissing({
 		outputs: params.outputs,
-		key: "llmLatencyMs",
-		value: sumNullable(calls.map((call) => call.latencyMs)),
+		key: "reasoningTokens",
+		value: sumNullable(calls.map((call) => call.reasoningTokens)),
+		activeKeys
+	});
+	assignIfMissing({
+		outputs: params.outputs,
+		key: "llmDurationMs",
+		value: sumNullable(calls.map((call) => call.durationMs)),
 		activeKeys
 	});
 }
@@ -5356,7 +5387,7 @@ async function runCase(params) {
 		caseDetail,
 		caseRowUpdate: {
 			status,
-			latencyMs: Date.now() - startTime,
+			durationMs: Date.now() - startTime,
 			columns
 		}
 	};
@@ -5647,7 +5678,7 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
 													caseId: evalCase.id,
 													evalId: evalMeta.id,
 													status: caseRowUpdate.status ?? "pending",
-													latencyMs: caseRowUpdate.latencyMs ?? null,
+													durationMs: caseRowUpdate.durationMs ?? null,
 													columns: caseRowUpdate.columns ?? {},
 													trial
 												}

package/dist/{runner-Dy_PECaf.mjs → runner-C9nP2VKL.mjs} RENAMED

@@ -1,5 +1,5 @@
-import { n as createRunner } from "./cli-DumvanQI.mjs";
-import "./src-BoAJb4wC.mjs";
+import { n as createRunner } from "./cli-C0EtHhEO.mjs";
+import "./src-D-HuV8I-.mjs";
 //#region ../../apps/server/src/runner.ts
 let runnerInstance = null;
 function getRunnerInstance() {

package/dist/{runner-BcwyX9CO.mjs → runner-CyRhIzci.mjs} RENAMED

@@ -1,2 +1,2 @@
-import { n as initRunner, t as getRunnerInstance } from "./runner-Dy_PECaf.mjs";
+import { n as initRunner, t as getRunnerInstance } from "./runner-C9nP2VKL.mjs";
 export { getRunnerInstance, initRunner };

package/dist/src-D-HuV8I-.mjs ADDED

@@ -0,0 +1,3 @@
+import "./runOrchestration-D1edUDhp.mjs";
+import "./cli-C0EtHhEO.mjs";
+export {};

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@ls-stack/agent-eval",
-  "version": "0.21.0",
+  "version": "0.22.0",
   "type": "module",
   "bin": {
     "agent-evals": "./dist/bin.mjs"

package/skills/agent-eval/SKILL.md CHANGED

@@ -92,20 +92,16 @@ export async function runRefundWorkflow(input: RefundInput) {
         async () => {
           let text: string;
           let usage: { inputTokens: number; outputTokens: number };
-          let costUsd: number;
           try {
-            ({ text, usage, costUsd } = await llm.complete(input.message));
+            ({ text, usage } = await llm.complete(input.message));
           } catch (error) {
             captureEvalSpanError(error);
-            ({ text, usage, costUsd } = await llm.completeWithFallback(
-              input.message,
-            ));
+            ({ text, usage } = await llm.completeWithFallback(input.message));
           }
           evalSpan.setAttributes({
             model: 'gpt-4o-mini',
             provider: 'openai',
             usage,
-            costUsd,
           });
           const expectedLocale = getEvalCaseInput('locale');
           if (typeof expectedLocale === 'string') {
@@ -137,8 +133,8 @@ are more specific. Only the `input` and `output` span attributes are promoted
 automatically in the trace tree; use `traceDisplay` for other span attributes
 such as `model` or `usage`. Eval-level LLM usage outputs, columns, stats, and
 charts are derived from matching LLM spans by default. Prefer
-`llmCalls.pricing` for LLM-call cost display instead of writing `costUsd` on
-each span.
+`llmCalls.pricing` for LLM-call cost display; built-in costs ignore span
+`costUsd` attributes.
 Use `captureEvalSpanError(error)` for recoverable errors on the active
 `evalTracer.span(...)`, such as optional model/tool failures that fall back and
@@ -261,18 +257,28 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
   See the `TraceDisplayInputConfig` type.
 - `llmCalls` (in `agent-evals.config.ts`) configures how LLM-call spans are
   summarized for review. Defaults to `kind: 'llm'` spans with `model`,
-  `usage.*`, `tokensPerSecond`, `input`, `output`, etc. read from conventional
-  attribute paths. Override `kinds` to broaden the filter, override
-  `attributes.<field>` for non-default span shapes, configure `pricing` to
-  derive USD costs from token counts by model/provider, and add entries to
-  `metrics` to surface arbitrary user metrics (`format: 'string' | 'number' |
-'duration' | 'json' | 'boolean'`, `placements: ['header' | 'body']`).
+  `usage.*`, `latencyMs`, `input`, `output`, etc. read from conventional
+  attribute paths. `latencyMs` is time to first token; duration, total tokens,
+  tokens/sec, and USD costs are derived. Override `kinds` to broaden the filter,
+  override `attributes.<field>` for non-default primitive span shapes, configure
+  `pricing` to derive USD costs from token counts by model/provider, and add
+  entries to `metrics` to surface arbitrary user metrics (`format: 'string' |
+'number' | 'duration' | 'json' | 'boolean'`, `placements: ['header' |
+'body']`).
 - Default usage config derives missing eval outputs from matching LLM/API spans
   before `outputsSchema` and scores run: `apiCalls`, `costUsd`, `llmTurns`,
   `inputTokens`, `outputTokens`, `totalTokens`, `cachedInputTokens`,
-  `cacheCreationInputTokens`, `reasoningTokens`, and `llmLatencyMs`. Authored
-  outputs and column overrides win. Remove defaults globally or per eval with
-  `removeDefaultConfig: true` or a key list such as
+  `cacheCreationInputTokens`, `reasoningTokens`, and `llmDurationMs`. Authored
+  outputs and column overrides win. `totalTokens` is input + output only; cache
+  read/write tokens stay separate and affect `costUsd` at their own rates.
+  Derived base input cost uses `inputTokens - cachedInputTokens -
+cacheCreationInputTokens` so cache details are not double-counted.
+  `cacheCreationInputTokens` is the total cache-write count; optional
+  `cacheCreationInput1hTokens` only splits that total for 1-hour write pricing
+  via `cacheCreationInput1hUsdPerMillion`. `llmDurationMs` sums elapsed matched
+  LLM span durations; it is not time-to-first-token latency.
+  Remove defaults globally or per eval with `removeDefaultConfig: true` or a
+  key list such as
   `removeDefaultConfig: ['apiCalls', 'reasoningTokens']`.
 - `apiCalls` (in `agent-evals.config.ts`) configures how API-call spans are
   summarized for review. Defaults to `kind: 'api'`, `'http'`, `'http.client'`,