npm - @ls-stack/agent-eval - Versions diffs - 0.21.0 → 0.23.0 - Mend

@ls-stack/agent-eval 0.21.0 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

package/dist/{app-CmwmcUgG.mjs → app-Cw79dJDr.mjs} +3 -3
package/dist/apps/web/dist/assets/index-AUDD3rNB.js +118 -0
package/dist/apps/web/dist/index.html +1 -1
package/dist/bin.mjs +1 -1
package/dist/{cli-DumvanQI.mjs → cli-D3QNOcPN.mjs} +3 -3
package/dist/index.d.mts +102 -56
package/dist/index.mjs +4 -4
package/dist/runChild.mjs +1 -1
package/dist/{runOrchestration-zYAcAPtS.mjs → runOrchestration-CimthgI7.mjs} +248 -90
package/dist/{runner-Dy_PECaf.mjs → runner-4yNYRvmF.mjs} +2 -2
package/dist/{runner-BcwyX9CO.mjs → runner-B-SYzW8w.mjs} +1 -1
package/dist/src-CcXfWT4M.mjs +3 -0
package/package.json +1 -1
package/skills/agent-eval/SKILL.md +36 -17
package/dist/apps/web/dist/assets/index-EXO08yya.js +0 -118
package/dist/src-BoAJb4wC.mjs +0 -3

package/dist/{runOrchestration-zYAcAPtS.mjs → runOrchestration-CimthgI7.mjs} RENAMED Viewed

@@ -49,9 +49,46 @@ function repoFile(path, mimeType) {
 //#region ../sdk/src/runtime.ts
 const scopeStorage = new AsyncLocalStorage();
 const runtimeScopeStorage = new AsyncLocalStorage();
+const evalClockStorage = new AsyncLocalStorage();
 let activeEvalScopeCount = 0;
 let activeEvalRuntimeScopeCount = 0;
 let consoleCaptureEnabled = true;
+const defaultEvalStartTimeMs = Date.parse("2026-04-10T00:00:00.000Z");
+const realDate = globalThis.__agentEvalsRealDate ?? Date;
+globalThis.__agentEvalsRealDate = realDate;
+function toDateConstructorArg(value) {
+	if (typeof value === "string" || typeof value === "number" || value instanceof realDate) return value;
+	return Number(value);
+}
+function toDateNumberArg(value) {
+	return typeof value === "number" ? value : Number(value);
+}
+function constructDateFromArgs(args) {
+	if (args.length === 0) return new realDate();
+	if (args.length === 1) return new realDate(toDateConstructorArg(args[0]));
+	return new realDate(toDateNumberArg(args[0]), toDateNumberArg(args[1]), args[2] === void 0 ? 1 : toDateNumberArg(args[2]), args[3] === void 0 ? 0 : toDateNumberArg(args[3]), args[4] === void 0 ? 0 : toDateNumberArg(args[4]), args[5] === void 0 ? 0 : toDateNumberArg(args[5]), args[6] === void 0 ? 0 : toDateNumberArg(args[6]));
+}
+const evalDate = new Proxy(realDate, {
+	apply(target, thisArg, argArray_) {
+		const nowMs = getEvalClockNowMs();
+		if (nowMs !== null) return new target(nowMs).toString();
+		return target.call(thisArg);
+	},
+	construct(target, argArray, newTarget_) {
+		const nowMs = getEvalClockNowMs();
+		if (argArray.length === 0 && nowMs !== null) return new target(nowMs);
+		return constructDateFromArgs(Array.from(argArray));
+	},
+	get(target, property) {
+		if (property === "now") return getEvalDateNow;
+		if (property === "parse") return target.parse;
+		if (property === "UTC") return target.UTC;
+		if (property === "prototype") return target.prototype;
+		if (property === "name") return target.name;
+		if (property === "length") return target.length;
+	}
+});
+globalThis.Date = evalDate;
 const maxLogMessageLength = 2e4;
 const maxLogStringLength = 1e4;
 const maxLogArrayLength = 100;
@@ -79,6 +116,82 @@ var EvalAssertionError = class extends Error {
 		this.name = "EvalAssertionError";
 	}
 };
+function getEvalClockStateNowMs(state) {
+	const elapsedMs = state.frozen ? 0 : realDate.now() - state.realStartMs;
+	return state.startMs + elapsedMs + state.offsetMs;
+}
+function getEvalClockNowMs() {
+	const state = evalClockStorage.getStore();
+	if (state?.shifted !== true) return null;
+	return getEvalClockStateNowMs(state);
+}
+function getEvalDateNow() {
+	return getEvalClockNowMs() ?? realDate.now();
+}
+/** Return the host process clock, bypassing the eval Date shim. */
+function getRealDateNowMs() {
+	return realDate.now();
+}
+/** Return the shifted wall-clock time for a stored eval clock state. */
+function getEvalClockStateTimeMs(state) {
+	if (!state.shifted) return null;
+	return getEvalClockStateNowMs(state);
+}
+/**
+* Return the wall-clock start time captured for the active eval.
+*
+* For `startTime: 'now'`, this is the real time captured when the eval clock
+* context was created.
+*/
+function getEvalStartTime() {
+	const state = evalClockStorage.getStore();
+	if (state === void 0) throw new Error("getEvalStartTime() must be called inside an active eval");
+	return new realDate(state.startMs);
+}
+function resolveEvalStartTimeMs(startTime) {
+	if (startTime === void 0) return defaultEvalStartTimeMs;
+	if (startTime === "now") return realDate.now();
+	const ms = startTime instanceof realDate ? startTime.getTime() : typeof startTime === "number" ? startTime : realDate.parse(startTime);
+	if (Number.isFinite(ms)) return ms;
+	throw new Error(`Invalid eval startTime "${String(startTime)}". Use a Date, timestamp, ISO date string, or "now".`);
+}
+function createEvalClockState(startTime, freezeTime) {
+	const nowMs = realDate.now();
+	return {
+		startMs: startTime === "now" ? nowMs : resolveEvalStartTimeMs(startTime),
+		realStartMs: nowMs,
+		offsetMs: 0,
+		frozen: freezeTime,
+		shifted: startTime !== "now" || freezeTime
+	};
+}
+/** Execute a callback with the eval Date clock shifted from `startTime`. */
+async function runWithEvalClock(startTime, fn, options = {}) {
+	return await evalClockStorage.run(createEvalClockState(startTime, options.freezeTime === true), fn);
+}
+function getEvalTimeUnitMs(unit) {
+	if (unit === "millisecond" || unit === "milliseconds") return 1;
+	if (unit === "second" || unit === "seconds") return 1e3;
+	if (unit === "minute" || unit === "minutes") return 6e4;
+	if (unit === "hour" || unit === "hours") return 36e5;
+	if (unit === "day" || unit === "days") return 864e5;
+	throw new Error(`Unsupported eval time unit "${unit}"`);
+}
+/**
+* Advance the active eval's shifted Date clock and return the new time.
+*
+* Throws outside an active shifted eval clock. Evals that set
+* `startTime: 'now'` use the real current clock unless `freezeTime: true` is
+* also set.
+*/
+function advanceEvalTime(unit, amount) {
+	const state = evalClockStorage.getStore();
+	if (state === void 0) throw new Error("advanceEvalTime() must be called inside an active eval");
+	if (!state.shifted) throw new Error("advanceEvalTime() requires a shifted eval clock. Remove startTime: \"now\" or set freezeTime: true to use it.");
+	if (!Number.isFinite(amount)) throw new Error("advanceEvalTime() amount must be a finite number");
+	state.offsetMs += getEvalTimeUnitMs(unit) * amount;
+	return new realDate(getEvalClockStateNowMs(state));
+}
 /** Return the current eval scope for the active async context, if any. */
 function getCurrentScope() {
 	if (activeEvalScopeCount === 0) return void 0;
@@ -349,7 +462,9 @@ async function runInExistingEvalScope(scope, runtimeScope, fn) {
 	activeEvalScopeCount++;
 	try {
 		return await scopeStorage.run(scope, async () => {
-			return await runInEvalRuntimeScope(runtimeScope, fn);
+			return await evalClockStorage.run(scope.evalClockState, async () => {
+				return await runInEvalRuntimeScope(runtimeScope, fn);
+			});
 		});
 	} finally {
 		activeEvalScopeCount--;
@@ -362,6 +477,8 @@ async function runInExistingEvalScope(scope, runtimeScope, fn) {
 async function runInEvalScope(caseId, fn, options = {}) {
 	const scope = {
 		caseId,
+		startTime: options.startTime,
+		evalClockState: createEvalClockState(options.startTime, options.freezeTime === true),
 		idPrefix: options.idPrefix,
 		nextEvalIdCounter: 0,
 		input: options.input,
@@ -1493,9 +1610,12 @@ function mergeSpanAttribute(span, key, patch) {
 		...patch
 	} });
 }
-function finishSpanWithoutThrownError(span) {
+function addElapsedMsToTimestamp(isoTimestamp, elapsedMs) {
+	return new Date(new Date(isoTimestamp).getTime() + elapsedMs).toISOString();
+}
+function finishSpanWithoutThrownError(span, realStartedAt) {
 	span.status = hasSpanError(span) ? "error" : "ok";
-	span.endedAt = (/* @__PURE__ */ new Date()).toISOString();
+	span.endedAt = addElapsedMsToTimestamp(span.startedAt, getRealDateNowMs() - realStartedAt);
 }
 function createSpanHandle(span) {
 	return {
@@ -1737,9 +1857,11 @@ async function traceSpanInternal(info, fn) {
 	const scope = getCurrentScope();
 	if (!scope) return await fn(noopActiveSpan());
 	const id = generateSpanId();
+	const parentId = scope.activeSpanStack.at(-1)?.id ?? null;
+	const realStartedAt = getRealDateNowMs();
 	const spanRecord = {
 		id,
-		parentId: scope.activeSpanStack.at(-1)?.id ?? null,
+		parentId,
 		caseId: scope.caseId,
 		kind: info.kind,
 		name: info.name,
@@ -1779,7 +1901,7 @@ async function traceSpanInternal(info, fn) {
 					const recording = deserializeCacheRecording(hit.recording);
 					replayRecording(scope, spanRecord, recording, { generateSpanId });
 					spanRecord.status = recording.finalStatus ?? (hasSpanError(spanRecord) ? "error" : "ok");
-					spanRecord.endedAt = (/* @__PURE__ */ new Date()).toISOString();
+					spanRecord.endedAt = addElapsedMsToTimestamp(spanRecord.startedAt, getRealDateNowMs() - realStartedAt);
 					return recording.returnValue;
 				}
 				mergeSpanAttributes(spanRecord, { "cache.status": "miss" });
@@ -1798,7 +1920,7 @@ async function traceSpanInternal(info, fn) {
 				scope.recordingStack.pop();
 			}
 			appendSubSpanOps(scope, frame);
-			finishSpanWithoutThrownError(spanRecord);
+			finishSpanWithoutThrownError(spanRecord, realStartedAt);
 			if (ctx.mode !== "bypass") {
 				const recording = {
 					returnValue: bodyResult,
@@ -1832,11 +1954,11 @@ async function traceSpanInternal(info, fn) {
 			return bodyResult;
 		}
 		const result = await fn(activeSpan);
-		finishSpanWithoutThrownError(spanRecord);
+		finishSpanWithoutThrownError(spanRecord, realStartedAt);
 		return result;
 	} catch (error) {
 		spanRecord.status = "error";
-		spanRecord.endedAt = (/* @__PURE__ */ new Date()).toISOString();
+		spanRecord.endedAt = addElapsedMsToTimestamp(spanRecord.startedAt, getRealDateNowMs() - realStartedAt);
 		spanRecord.error = normalizeTraceError(error);
 		throw error;
 	} finally {
@@ -2468,7 +2590,8 @@ const caseRowSchema = z.object({
 		"error",
 		"cancelled"
 	]),
-	latencyMs: z.number().nullable(),
+	/** Elapsed case execution duration in milliseconds, or null before completion. */
+	durationMs: z.number().nullable(),
 	costUsd: z.number().nullable().optional(),
 	columns: z.record(z.string(), cellValueSchema),
 	/** Winning trial index for the persisted case result. */
@@ -2586,7 +2709,7 @@ const defaultConfigKeySchema = z.enum([
 	"cachedInputTokens",
 	"cacheCreationInputTokens",
 	"reasoningTokens",
-	"llmLatencyMs"
+	"llmDurationMs"
 ]);
 /** Removal config for built-in eval-level outputs and UI metadata. */
 const removeDefaultConfigSchema = z.union([z.literal(true), z.array(defaultConfigKeySchema)]);
@@ -2663,7 +2786,7 @@ const apiCallMetricSchema = z.object({
 });
 /**
 * Schema for one model/provider pricing entry used to derive LLM-call costs
-* from token counts when a span does not already record explicit USD costs.
+* from token counts.
 */
 const llmCallPricingSchema = z.object({
 	/** Exact model name read from the configured `attributes.model` path. */
@@ -2682,6 +2805,8 @@ const llmCallPricingSchema = z.object({
 	cachedInputUsdPerMillion: z.number().nonnegative().optional(),
 	/** USD per one million prompt-cache write tokens. */
 	cacheCreationInputUsdPerMillion: z.number().nonnegative().optional(),
+	/** USD per one million one-hour prompt-cache write tokens. */
+	cacheCreationInput1hUsdPerMillion: z.number().nonnegative().optional(),
 	/** USD per one million reasoning tokens when reported separately. */
 	reasoningUsdPerMillion: z.number().nonnegative().optional()
 });
@@ -2692,12 +2817,9 @@ const llmCallsConfigSchema = z.object({
 	/**
 	* Attribute paths used to extract structured per-call fields. Each entry is
 	* a dot-path inside `span.attributes`. Missing paths fall back to the
-	* built-in defaults (e.g. `usage.inputTokens`, `costUsd`).
-	*
-	* Per-token-type cost paths (`inputCost`, `outputCost`, `cachedInputCost`,
-	* `reasoningCost`) feed the cost breakdown table in the expanded row when
-	* spans provide explicit USD cost overrides. Prefer `pricing` for deriving
-	* costs from token counts globally.
+	* built-in defaults (e.g. `usage.inputTokens`). Derived fields such as
+	* total tokens, tokens/sec, duration, and USD costs are intentionally not
+	* configurable as attribute paths.
 	*/
 	attributes: z.object({
 		model: z.string().optional(),
@@ -2706,15 +2828,9 @@ const llmCallsConfigSchema = z.object({
 		outputTokens: z.string().optional(),
 		cachedInputTokens: z.string().optional(),
 		cacheCreationInputTokens: z.string().optional(),
+		cacheCreationInput1hTokens: z.string().optional(),
 		reasoningTokens: z.string().optional(),
-		totalTokens: z.string().optional(),
-		tokensPerSecond: z.string().optional(),
-		cost: z.string().optional(),
-		inputCost: z.string().optional(),
-		outputCost: z.string().optional(),
-		cachedInputCost: z.string().optional(),
-		cacheCreationInputCost: z.string().optional(),
-		reasoningCost: z.string().optional(),
+		latencyMs: z.string().optional(),
 		steps: z.string().optional(),
 		finishReason: z.string().optional(),
 		input: z.string().optional(),
@@ -2723,9 +2839,8 @@ const llmCallsConfigSchema = z.object({
 		toolCalls: z.string().optional()
 	}).optional(),
 	/**
-	* Model/provider pricing registry used to calculate missing LLM-call costs
-	* from token counts. Explicit span attributes (`costUsd`, `cost.inputUsd`,
-	* etc.) take precedence over derived prices.
+	* Model/provider pricing registry used to calculate LLM-call costs from
+	* token counts. Built-in LLM cost fields are only derived from this registry.
 	*/
 	pricing: z.array(llmCallPricingSchema).optional(),
 	/** Custom user-defined metrics surfaced on each LLM call. */
@@ -2773,15 +2888,9 @@ const DEFAULT_LLM_CALLS_CONFIG = {
 		outputTokens: "usage.outputTokens",
 		cachedInputTokens: "usage.cachedInputTokens",
 		cacheCreationInputTokens: "usage.cacheCreationInputTokens",
+		cacheCreationInput1hTokens: "usage.cacheCreationInput1hTokens",
 		reasoningTokens: "usage.reasoningTokens",
-		totalTokens: "usage.totalTokens",
-		tokensPerSecond: "tokensPerSecond",
-		cost: "costUsd",
-		inputCost: "cost.inputUsd",
-		outputCost: "cost.outputUsd",
-		cachedInputCost: "cost.cachedInputUsd",
-		cacheCreationInputCost: "cost.cacheCreationInputUsd",
-		reasoningCost: "cost.reasoningUsd",
+		latencyMs: "latencyMs",
 		steps: "steps",
 		finishReason: "finishReason",
 		input: "input",
@@ -2823,8 +2932,8 @@ const DEFAULT_API_CALLS_CONFIG = {
 *   attribute path.
 * - Missing `metrics[].format` defaults to `'string'`.
 * - Missing `metrics[].placements` defaults to `['body']`.
-* - Missing `pricing` defaults to an empty registry; explicit span costs still
-*   take precedence over derived costs.
+* - Missing `pricing` defaults to an empty registry; built-in costs are only
+*   derived from configured pricing and token counts.
 */
 function resolveLlmCallsConfig(input) {
 	return {
@@ -2848,6 +2957,7 @@ function resolveLlmCallsConfig(input) {
 			outputUsdPerMillion: p.outputUsdPerMillion,
 			cachedInputUsdPerMillion: p.cachedInputUsdPerMillion,
 			cacheCreationInputUsdPerMillion: p.cacheCreationInputUsdPerMillion,
+			cacheCreationInput1hUsdPerMillion: p.cacheCreationInput1hUsdPerMillion,
 			reasoningUsdPerMillion: p.reasoningUsdPerMillion
 		}))
 	};
@@ -3037,8 +3147,8 @@ function deriveScopedSummaryFromCases(params) {
 		else if (caseRow.status === "cancelled") cancelledCases += 1;
 		else if (caseRow.status === "running") runningCases += 1;
 		else pendingCases += 1;
-		if (caseRow.latencyMs !== null) {
-			totalDurationMs += caseRow.latencyMs;
+		if (caseRow.durationMs !== null) {
+			totalDurationMs += caseRow.durationMs;
 			hasDuration = true;
 		}
 	}
@@ -3132,6 +3242,21 @@ function computeTokenCost(tokens, usdPerMillion) {
 	if (usdPerMillion === void 0) return null;
 	return tokens / 1e6 * usdPerMillion;
 }
+function computeCacheCreationInputCost({ cacheCreationInputTokens, cacheCreationInput1hTokens, usdPerMillion, oneHourUsdPerMillion }) {
+	if (cacheCreationInputTokens === null) return null;
+	if (cacheCreationInputTokens === 0) return 0;
+	if (cacheCreationInput1hTokens === null) return computeTokenCost(cacheCreationInputTokens, usdPerMillion);
+	const oneHourTokens = Math.min(cacheCreationInput1hTokens, cacheCreationInputTokens);
+	const shortLivedCost = computeTokenCost(cacheCreationInputTokens - oneHourTokens, usdPerMillion);
+	const oneHourCost = computeTokenCost(oneHourTokens, oneHourUsdPerMillion);
+	if (shortLivedCost === null || oneHourCost === null) return null;
+	return shortLivedCost + oneHourCost;
+}
+function computeBaseInputTokens({ inputTokens, cachedInputTokens, cacheCreationInputTokens }) {
+	if (inputTokens === null) return null;
+	const cachedTokens = (cachedInputTokens ?? 0) + (cacheCreationInputTokens ?? 0);
+	return Math.max(inputTokens - cachedTokens, 0);
+}
 function pickPricingEntry({ pricing, model, provider }) {
 	if (model === null) return null;
 	let fallback = null;
@@ -3145,7 +3270,7 @@ function pickPricingEntry({ pricing, model, provider }) {
 	}
 	return fallback;
 }
-function computeFallbackTotalCost({ inputTokens, inputCostUsd, outputTokens, outputCostUsd, cachedInputTokens, cachedInputCostUsd, cacheCreationInputTokens, cacheCreationInputCostUsd, reasoningTokens, reasoningCostUsd }) {
+function computeTotalCost({ inputTokens, inputCostUsd, outputTokens, outputCostUsd, cachedInputTokens, cachedInputCostUsd, cacheCreationInputTokens, cacheCreationInputCostUsd, reasoningTokens, reasoningCostUsd }) {
 	const parts = [
 		{
 			tokens: inputTokens,
@@ -3182,7 +3307,7 @@ function computeFallbackTotalCost({ inputTokens, inputCostUsd, outputTokens, out
 	if (hasCost) return total;
 	return hasReportedTokens ? 0 : null;
 }
-function computeLatencyMs$1(span) {
+function computeDurationMs$1(span) {
 	if (span.endedAt === null) return null;
 	const started = Date.parse(span.startedAt);
 	const ended = Date.parse(span.endedAt);
@@ -3190,10 +3315,16 @@ function computeLatencyMs$1(span) {
 	const delta = ended - started;
 	return delta >= 0 ? delta : null;
 }
-function computeTotalTokens({ declared, input, output, cached, cacheCreation }) {
-	if (declared !== null) return declared;
-	if (input === null && output === null && cached === null && cacheCreation === null) return null;
-	return (input ?? 0) + (output ?? 0) + (cached ?? 0) + (cacheCreation ?? 0);
+function computeTotalTokens({ input, output }) {
+	if (input === null && output === null) return null;
+	return (input ?? 0) + (output ?? 0);
+}
+function computeTokensPerSecond({ outputTokens, durationMs, latencyMs }) {
+	if (outputTokens === null || durationMs === null) return null;
+	if (outputTokens === 0) return 0;
+	const generationMs = latencyMs === null ? durationMs : durationMs - latencyMs;
+	if (generationMs <= 0) return null;
+	return outputTokens / (generationMs / 1e3);
 }
 function readSteps(attributes, path) {
 	const raw = getNestedAttribute(attributes, path);
@@ -3201,10 +3332,6 @@ function readSteps(attributes, path) {
 		stepCount: raw.length,
 		stepDetails: raw
 	};
-	if (typeof raw === "number" && Number.isFinite(raw)) return {
-		stepCount: raw,
-		stepDetails: null
-	};
 	return {
 		stepCount: null,
 		stepDetails: null
@@ -3226,16 +3353,22 @@ function pickError$1(span) {
 * shape consumed by the LLM calls tab.
 *
 * Spans whose `kind` is not in `config.kinds` are dropped. Structured fields
-* (`model`, token counts, explicit cost, etc.) are read via
+* (`model`, token counts, latency, etc.) are read via
 * `getNestedAttribute` from the configured paths, with safe coercion to
-* `string | null` / `number | null`. When explicit USD costs are absent,
-* configured model pricing derives per-token-type costs from token counts.
-* `totalTokens` falls back to a sum of input + output + cached when no
-* explicit total attribute is present. The `steps` attribute path may resolve
-* to either a number (rendered as the inference-round count) or an array of
-* per-step detail objects (rendered as a Steps section in the body, with
-* `stepCount` derived from the array length). `latencyMs` is `null` while the
-* span is still running. User-defined `metrics` whose path resolves to
+* `string | null` / `number | null`. `latencyMs` is an explicit
+* time-to-first-token attribute; full span elapsed time is reported separately
+* as `durationMs`. Built-in USD costs are derived only from configured model
+* pricing and token counts. `totalTokens` is always derived from input +
+* output tokens. Cached input and cache creation tokens are reported
+* separately because they are subsets of input/output usage. The main cache
+* creation token field is treated as the total write count; optional one-hour
+* cache creation tokens only split that total for cost calculation. Base input
+* cost uses input minus cache read/write tokens so cached tokens are not
+* charged twice. Cache read/write costs still contribute to the total USD cost
+* at their configured rates. The `steps` attribute path may resolve to an array
+* of per-step detail objects, with `stepCount` derived from the array length.
+* `durationMs` and `tokensPerSecond` are `null` while the span is still
+* running. User-defined `metrics` whose path resolves to
 * `undefined` are dropped, but `null`, `0`, and `false` are preserved as
 * legitimate values worth displaying. Original span order is preserved so the
 * LLM calls tab matches the ordering in the Trace tab.
@@ -3252,19 +3385,30 @@ function extractLlmCalls(spans, config) {
 		const outputTokens = readNumber$2(attrs, config.attributes.outputTokens);
 		const cachedInputTokens = readNumber$2(attrs, config.attributes.cachedInputTokens);
 		const cacheCreationInputTokens = readNumber$2(attrs, config.attributes.cacheCreationInputTokens);
+		const cacheCreationInput1hTokens = readNumber$2(attrs, config.attributes.cacheCreationInput1hTokens);
 		const reasoningTokens = readNumber$2(attrs, config.attributes.reasoningTokens);
-		const declaredTotalTokens = readNumber$2(attrs, config.attributes.totalTokens);
+		const latencyMs = readNumber$2(attrs, config.attributes.latencyMs);
+		const durationMs = computeDurationMs$1(span);
 		const pricing = pickPricingEntry({
 			pricing: config.pricing,
 			model,
 			provider
 		});
-		const inputCostUsd = readNumber$2(attrs, config.attributes.inputCost) ?? computeTokenCost(inputTokens, pricing?.inputUsdPerMillion);
-		const outputCostUsd = readNumber$2(attrs, config.attributes.outputCost) ?? computeTokenCost(outputTokens, pricing?.outputUsdPerMillion);
-		const cachedInputCostUsd = readNumber$2(attrs, config.attributes.cachedInputCost) ?? computeTokenCost(cachedInputTokens, pricing?.cachedInputUsdPerMillion);
-		const cacheCreationInputCostUsd = readNumber$2(attrs, config.attributes.cacheCreationInputCost) ?? computeTokenCost(cacheCreationInputTokens, pricing?.cacheCreationInputUsdPerMillion);
-		const reasoningCostUsd = readNumber$2(attrs, config.attributes.reasoningCost) ?? computeTokenCost(reasoningTokens, pricing?.reasoningUsdPerMillion);
-		const costUsd = readNumber$2(attrs, config.attributes.cost) ?? computeFallbackTotalCost({
+		const inputCostUsd = computeTokenCost(computeBaseInputTokens({
+			inputTokens,
+			cachedInputTokens,
+			cacheCreationInputTokens
+		}), pricing?.inputUsdPerMillion);
+		const outputCostUsd = computeTokenCost(outputTokens, pricing?.outputUsdPerMillion);
+		const cachedInputCostUsd = computeTokenCost(cachedInputTokens, pricing?.cachedInputUsdPerMillion);
+		const cacheCreationInputCostUsd = computeCacheCreationInputCost({
+			cacheCreationInputTokens,
+			cacheCreationInput1hTokens,
+			usdPerMillion: pricing?.cacheCreationInputUsdPerMillion,
+			oneHourUsdPerMillion: pricing?.cacheCreationInput1hUsdPerMillion
+		});
+		const reasoningCostUsd = computeTokenCost(reasoningTokens, pricing?.reasoningUsdPerMillion);
+		const costUsd = computeTotalCost({
 			inputTokens,
 			inputCostUsd,
 			outputTokens,
@@ -3302,13 +3446,15 @@ function extractLlmCalls(spans, config) {
 			cacheCreationInputTokens,
 			reasoningTokens,
 			totalTokens: computeTotalTokens({
-				declared: declaredTotalTokens,
 				input: inputTokens,
-				output: outputTokens,
-				cached: cachedInputTokens,
-				cacheCreation: cacheCreationInputTokens
+				output: outputTokens
+			}),
+			latencyMs,
+			tokensPerSecond: computeTokensPerSecond({
+				outputTokens,
+				durationMs,
+				latencyMs
 			}),
-			tokensPerSecond: readNumber$2(attrs, config.attributes.tokensPerSecond),
 			costUsd,
 			inputCostUsd,
 			outputCostUsd,
@@ -3317,7 +3463,7 @@ function extractLlmCalls(spans, config) {
 			reasoningCostUsd,
 			...readSteps(attrs, config.attributes.steps),
 			finishReason: readString$2(attrs, config.attributes.finishReason),
-			latencyMs: computeLatencyMs$1(span),
+			durationMs,
 			input: getNestedAttribute(attrs, config.attributes.input),
 			output: getNestedAttribute(attrs, config.attributes.output),
 			reasoning: getNestedAttribute(attrs, config.attributes.reasoning),
@@ -3342,7 +3488,7 @@ function readString$1(attributes, path) {
 	const raw = getNestedAttribute(attributes, path);
 	return typeof raw === "string" && raw.length > 0 ? raw : null;
 }
-function computeLatencyMs(span) {
+function computeDurationMs(span) {
 	if (span.endedAt === null) return null;
 	const started = Date.parse(span.startedAt);
 	const ended = Date.parse(span.endedAt);
@@ -3367,10 +3513,10 @@ function pickError(span) {
 *
 * Spans whose `kind` is not in `config.kinds` are dropped. Structured fields
 * (`method`, `url`, `statusCode`, etc.) are read via `getNestedAttribute` from
-* the configured paths. `durationMs` takes precedence for latency, with a
-* fallback to the span start/end timestamps. User-defined `metrics` whose path
-* resolves to `undefined` are dropped, but `null`, `0`, and `false` are
-* preserved as legitimate values worth displaying. Original span order is
+* the configured paths. An explicit `durationMs` attribute takes precedence,
+* with a fallback to the span start/end timestamps. User-defined `metrics`
+* whose path resolves to `undefined` are dropped, but `null`, `0`, and `false`
+* are preserved as legitimate values worth displaying. Original span order is
 * preserved so the API calls tab matches the ordering in the Trace tab.
 */
 function extractApiCalls(spans, config) {
@@ -3400,7 +3546,7 @@ function extractApiCalls(spans, config) {
 			method: readString$1(attrs, config.attributes.method),
 			url: readString$1(attrs, config.attributes.url),
 			statusCode: readNumber$1(attrs, config.attributes.statusCode),
-			latencyMs: readNumber$1(attrs, config.attributes.durationMs) ?? computeLatencyMs(span),
+			durationMs: readNumber$1(attrs, config.attributes.durationMs) ?? computeDurationMs(span),
 			request: getNestedAttribute(attrs, config.attributes.request),
 			response: getNestedAttribute(attrs, config.attributes.response),
 			requestBody: getNestedAttribute(attrs, config.attributes.requestBody),
@@ -3798,7 +3944,7 @@ async function writeCacheFile(cacheDir, cacheFile) {
 	await mkdir(cacheDir, { recursive: true });
 	const filePath = ownerPath(cacheDir, cacheFile.owner);
 	const tmpPath = `${filePath}.${process.pid.toString()}.tmp`;
-	await writeFile(tmpPath, JSON.stringify(cacheFile));
+	await writeFile(tmpPath, JSON.stringify(cacheFile, null, 2));
 	await rename(tmpPath, filePath);
 }
 async function readDebugKeyFile(debugDir, owner) {
@@ -4237,7 +4383,8 @@ const DEFAULT_CONFIG_KEYS = [
 	"totalTokens",
 	"cachedInputTokens",
 	"cacheCreationInputTokens",
-	"llmLatencyMs"
+	"reasoningTokens",
+	"llmDurationMs"
 ];
 const tokenNumberFormat = { notation: "compact" };
 const countNumberFormat = {
@@ -4303,8 +4450,8 @@ const DEFAULT_COLUMNS = {
 		numberFormat: tokenNumberFormat,
 		align: "right"
 	},
-	llmLatencyMs: {
-		label: "LLM Latency",
+	llmDurationMs: {
+		label: "LLM Duration",
 		format: "duration",
 		align: "right"
 	}
@@ -4509,8 +4656,14 @@ function addDefaultOutputs(params) {
 	});
 	assignIfMissing({
 		outputs: params.outputs,
-		key: "llmLatencyMs",
-		value: sumNullable(calls.map((call) => call.latencyMs)),
+		key: "reasoningTokens",
+		value: sumNullable(calls.map((call) => call.reasoningTokens)),
+		activeKeys
+	});
+	assignIfMissing({
+		outputs: params.outputs,
+		key: "llmDurationMs",
+		value: sumNullable(calls.map((call) => call.durationMs)),
 		activeKeys
 	});
 }
@@ -5204,7 +5357,9 @@ async function runCase(params) {
 			mode: cacheMode,
 			evalId,
 			codeFingerprint
-		} : void 0
+		} : void 0,
+		startTime: evalDef.startTime,
+		freezeTime: evalDef.freezeTime
 	});
 	const traceTree = buildTraceTree(scope.spans, scope.checkpoints);
 	const nonAssertError = executeError && !(executeError instanceof EvalAssertionError) ? executeError : null;
@@ -5245,6 +5400,7 @@ async function runCase(params) {
 	}
 	const scoreResults = /* @__PURE__ */ new Map();
 	const scoringTraces = {};
+	const scoreStartTime = getEvalClockStateTimeMs(scope.evalClockState) ?? evalDef.startTime;
 	if (!nonAssertError && scope.assertionFailures.length === 0 && evalDef.scores) for (const [key, def] of Object.entries(evalDef.scores)) {
 		const { compute, passThreshold, label } = normalizeScoreDef(def);
 		const scoreRun = await runInEvalScope(evalCase.id, async () => {
@@ -5264,7 +5420,9 @@ async function runCase(params) {
 				mode: cacheMode,
 				evalId: `${evalId}__score__${key}`,
 				codeFingerprint
-			} : void 0
+			} : void 0,
+			startTime: scoreStartTime,
+			freezeTime: evalDef.freezeTime
 		});
 		const { trace, traceDisplay } = resolveTracePresentation(scoreRun.scope.spans, globalTraceDisplay, evalDef.traceDisplay);
 		scope.logs.push(...scoreRun.scope.logs.map((entry) => ({
@@ -5356,7 +5514,7 @@ async function runCase(params) {
 		caseDetail,
 		caseRowUpdate: {
 			status,
-			latencyMs: Date.now() - startTime,
+			durationMs: Date.now() - startTime,
 			columns
 		}
 	};
@@ -5577,7 +5735,7 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
 					await runInEvalRuntimeScope("cases", async () => {
 						await entry.use(async (evalDef) => {
 							const cases = filterEvalCases(resolveRunnableEvalCases({
-								cases: typeof evalDef.cases === "function" ? await evalDef.cases() : evalDef.cases ?? [],
+								cases: await runWithEvalClock(evalDef.startTime, async () => typeof evalDef.cases === "function" ? await evalDef.cases() : evalDef.cases ?? [], { freezeTime: evalDef.freezeTime }),
 								evalId: evalMeta.id
 							}), request.target.evalIds, request.target.caseIds, evalMeta.id);
 							runState.summary.totalCases += cases.length;
@@ -5647,7 +5805,7 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
 													caseId: evalCase.id,
 													evalId: evalMeta.id,
 													status: caseRowUpdate.status ?? "pending",
-													latencyMs: caseRowUpdate.latencyMs ?? null,
+													durationMs: caseRowUpdate.durationMs ?? null,
 													columns: caseRowUpdate.columns ?? {},
 													trial
 												}
@@ -5780,4 +5938,4 @@ function toLastRunStatus(status) {
 	return status === "pending" ? null : status;
 }
 //#endregion
-export { removeDefaultConfigSchema as $, columnKindSchema as $t, extractApiCalls as A, setEvalOutput as An, cacheFileSchema as At, DEFAULT_API_CALLS_CONFIG as B, traceAttributeDisplayFormatSchema as Bt, validateCharts as C, incrementEvalOutput as Cn, evalChartTooltipExtraSchema as Ct, sseEnvelopeSchema as D, runInEvalRuntimeScope as Dn, cacheDebugKeyFileSchema as Dt, updateManualScoreRequestSchema as E, nextEvalId as En, cacheDebugKeyEntrySchema as Et, deriveScopedSummaryFromCases as F, getEvalRegistry as Fn, cacheRecordingSchema as Ft, apiCallMetricSchema as G, traceDisplayInputConfigSchema as Gt, agentEvalsConfigSchema as H, traceAttributeDisplayPlacementSchema as Ht, deriveStatusFromCaseRows as I, cacheStatusSchema as It, llmCallMetricFormatSchema as J, traceSpanSchema as Jt, apiCallsConfigSchema as K, traceSpanErrorSchema as Kt, deriveStatusFromChildStatuses as L, serializedCacheSpanSchema as Lt, getNestedAttribute as M, startEvalBackgroundJob as Mn, cacheModeSchema as Mt, getEvalTitle as N, repoFile as Nn, cacheOperationTypeSchema as Nt, extractCacheEntries as O, runInEvalScope as On, cacheEntrySchema as Ot, getEvalDisplayStatus as P, defineEval as Pn, cacheRecordingOpSchema as Pt, llmCallsConfigSchema as Q, columnFormatSchema as Qt, runManifestSchema as R, spanCacheOptionsSchema as Rt, normalizeScoreDef as S, getEvalCaseInput as Sn, evalChartMetricSchema as St, createRunRequestSchema as T, mergeEvalOutput as Tn, evalChartsConfigSchema as Tt, apiCallMetricFormatSchema as U, traceAttributeDisplaySchema as Ut, DEFAULT_LLM_CALLS_CONFIG as V, traceAttributeDisplayInputSchema as Vt, apiCallMetricPlacementSchema as W, traceDisplayConfigSchema as Wt, llmCallMetricSchema as X, cellValueSchema as Xt, llmCallMetricPlacementSchema as Y, traceSpanWarningSchema as Yt, llmCallPricingSchema as Z, columnDefSchema as Zt, loadEvalModule as _, appendToEvalOutput as _n, evalChartAggregateSchema as _t, loadPersistedRunSnapshot as a, z$1 as an, caseDetailSchema as at, loadConfig as b, evalLog as bn, evalChartColorSchema as bt, persistCaseDetail as c, evalSpan as cn, evalStatAggregateSchema as ct, recomputePersistedCaseStatus as d, hashCacheKeySync as dn, evalSummarySchema as dt, fileRefSchema as en, resolveApiCallsConfig as et, runTouchesEval as f, deserializeCacheRecording as fn, runLogEntrySchema as ft, setLatestRunInfoMap as g, EvalAssertionError as gn, scoreTraceSchema as gt, getTargetEvalIds as h, serializeCacheValue as hn, runLogPhaseSchema as ht, getLatestRunInfos as i, runArtifactRefSchema as in, assertionFailureSchema as it, extractLlmCalls as j, setScopeCacheContext as jn, cacheListItemSchema as jt, extractCacheHits as k, runInExistingEvalScope as kn, cacheEntryWithDebugKeySchema as kt, persistRunState as l, evalTracer as ln, evalStatItemSchema as lt, buildEvalSummary as m, serializeCacheRecording as mn, runLogLocationSchema as mt, generateRunId as n, numberDisplayOptionsSchema as nn, runLogsConfigSchema as nt, loadPersistedRunSnapshots as o, buildTraceTree as on, caseRowSchema as ot, resolveArtifactPath as p, deserializeCacheValue as pn, runLogLevelSchema as pt, defaultConfigKeySchema as q, traceSpanKindSchema as qt, getLastRunStatuses as r, repoFileRefSchema as rn, trialSelectionModeSchema as rt, nextShortIdFromSnapshots as s, captureEvalSpanError as sn, evalFreshnessStatusSchema as st, executeRun as t, jsonCellSchema as tn, resolveLlmCallsConfig as tt, recomputeEvalStatusesInRuns as u, hashCacheKey as un, evalStatsConfigSchema as ut, parseEvalMetas as v, configureEvalRunLogs as vn, evalChartAxisSchema as vt, createFsCacheStore as w, isInEvalScope as wn, evalChartTypeSchema as wt, buildDeclaredColumnDefs as x, getCurrentScope as xn, evalChartConfigSchema as xt, resolveEvalDefaultConfig as y, evalAssert as yn, evalChartBuiltinMetricSchema as yt, runSummarySchema as z, traceCacheRefSchema as zt };
+export { removeDefaultConfigSchema as $, columnKindSchema as $t, extractApiCalls as A, runInEvalScope as An, cacheFileSchema as At, DEFAULT_API_CALLS_CONFIG as B, traceAttributeDisplayFormatSchema as Bt, validateCharts as C, getEvalCaseInput as Cn, evalChartTooltipExtraSchema as Ct, sseEnvelopeSchema as D, mergeEvalOutput as Dn, cacheDebugKeyFileSchema as Dt, updateManualScoreRequestSchema as E, isInEvalScope as En, cacheDebugKeyEntrySchema as Et, deriveScopedSummaryFromCases as F, repoFile as Fn, cacheRecordingSchema as Ft, apiCallMetricSchema as G, traceDisplayInputConfigSchema as Gt, agentEvalsConfigSchema as H, traceAttributeDisplayPlacementSchema as Ht, deriveStatusFromCaseRows as I, defineEval as In, cacheStatusSchema as It, llmCallMetricFormatSchema as J, traceSpanSchema as Jt, apiCallsConfigSchema as K, traceSpanErrorSchema as Kt, deriveStatusFromChildStatuses as L, getEvalRegistry as Ln, serializedCacheSpanSchema as Lt, getNestedAttribute as M, setEvalOutput as Mn, cacheModeSchema as Mt, getEvalTitle as N, setScopeCacheContext as Nn, cacheOperationTypeSchema as Nt, extractCacheEntries as O, nextEvalId as On, cacheEntrySchema as Ot, getEvalDisplayStatus as P, startEvalBackgroundJob as Pn, cacheRecordingOpSchema as Pt, llmCallsConfigSchema as Q, columnFormatSchema as Qt, runManifestSchema as R, spanCacheOptionsSchema as Rt, normalizeScoreDef as S, getCurrentScope as Sn, evalChartMetricSchema as St, createRunRequestSchema as T, incrementEvalOutput as Tn, evalChartsConfigSchema as Tt, apiCallMetricFormatSchema as U, traceAttributeDisplaySchema as Ut, DEFAULT_LLM_CALLS_CONFIG as V, traceAttributeDisplayInputSchema as Vt, apiCallMetricPlacementSchema as W, traceDisplayConfigSchema as Wt, llmCallMetricSchema as X, cellValueSchema as Xt, llmCallMetricPlacementSchema as Y, traceSpanWarningSchema as Yt, llmCallPricingSchema as Z, columnDefSchema as Zt, loadEvalModule as _, advanceEvalTime as _n, evalChartAggregateSchema as _t, loadPersistedRunSnapshot as a, z$1 as an, caseDetailSchema as at, loadConfig as b, evalAssert as bn, evalChartColorSchema as bt, persistCaseDetail as c, evalSpan as cn, evalStatAggregateSchema as ct, recomputePersistedCaseStatus as d, hashCacheKeySync as dn, evalSummarySchema as dt, fileRefSchema as en, resolveApiCallsConfig as et, runTouchesEval as f, deserializeCacheRecording as fn, runLogEntrySchema as ft, setLatestRunInfoMap as g, EvalAssertionError as gn, scoreTraceSchema as gt, getTargetEvalIds as h, serializeCacheValue as hn, runLogPhaseSchema as ht, getLatestRunInfos as i, runArtifactRefSchema as in, assertionFailureSchema as it, extractLlmCalls as j, runInExistingEvalScope as jn, cacheListItemSchema as jt, extractCacheHits as k, runInEvalRuntimeScope as kn, cacheEntryWithDebugKeySchema as kt, persistRunState as l, evalTracer as ln, evalStatItemSchema as lt, buildEvalSummary as m, serializeCacheRecording as mn, runLogLocationSchema as mt, generateRunId as n, numberDisplayOptionsSchema as nn, runLogsConfigSchema as nt, loadPersistedRunSnapshots as o, buildTraceTree as on, caseRowSchema as ot, resolveArtifactPath as p, deserializeCacheValue as pn, runLogLevelSchema as pt, defaultConfigKeySchema as q, traceSpanKindSchema as qt, getLastRunStatuses as r, repoFileRefSchema as rn, trialSelectionModeSchema as rt, nextShortIdFromSnapshots as s, captureEvalSpanError as sn, evalFreshnessStatusSchema as st, executeRun as t, jsonCellSchema as tn, resolveLlmCallsConfig as tt, recomputeEvalStatusesInRuns as u, hashCacheKey as un, evalStatsConfigSchema as ut, parseEvalMetas as v, appendToEvalOutput as vn, evalChartAxisSchema as vt, createFsCacheStore as w, getEvalStartTime as wn, evalChartTypeSchema as wt, buildDeclaredColumnDefs as x, evalLog as xn, evalChartConfigSchema as xt, resolveEvalDefaultConfig as y, configureEvalRunLogs as yn, evalChartBuiltinMetricSchema as yt, runSummarySchema as z, traceCacheRefSchema as zt };

package/dist/{runner-Dy_PECaf.mjs → runner-4yNYRvmF.mjs} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { n as createRunner } from "./cli-DumvanQI.mjs";
-import "./src-BoAJb4wC.mjs";
+import { n as createRunner } from "./cli-D3QNOcPN.mjs";
+import "./src-CcXfWT4M.mjs";
 //#region ../../apps/server/src/runner.ts
 let runnerInstance = null;
 function getRunnerInstance() {