npm - @ls-stack/agent-eval - Versions diffs - 0.22.0 → 0.24.0 - Mend

@ls-stack/agent-eval 0.22.0 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

package/dist/{app-moDHbg1O.mjs → app-DYRmucgj.mjs} +3 -3
package/dist/apps/web/dist/assets/{index-AUDD3rNB.js → index-KbbX3NYr.js} +35 -35
package/dist/apps/web/dist/index.html +1 -1
package/dist/bin.mjs +1 -1
package/dist/{cli-C0EtHhEO.mjs → cli-Be0x8CS3.mjs} +3 -3
package/dist/index.d.mts +106 -9
package/dist/index.mjs +4 -4
package/dist/runChild.mjs +1 -1
package/dist/{runOrchestration-D1edUDhp.mjs → runOrchestration-D697g6Qe.mjs} +281 -42
package/dist/{runner-C9nP2VKL.mjs → runner-B4SosWgD.mjs} +2 -2
package/dist/{runner-CyRhIzci.mjs → runner-jSujaSKt.mjs} +1 -1
package/dist/src-D6cettg0.mjs +3 -0
package/package.json +3 -3
package/skills/agent-eval/SKILL.md +21 -5
package/dist/src-D-HuV8I-.mjs +0 -3

package/dist/{runOrchestration-D1edUDhp.mjs → runOrchestration-D697g6Qe.mjs} RENAMED Viewed

@@ -49,9 +49,46 @@ function repoFile(path, mimeType) {
 //#region ../sdk/src/runtime.ts
 const scopeStorage = new AsyncLocalStorage();
 const runtimeScopeStorage = new AsyncLocalStorage();
+const evalClockStorage = new AsyncLocalStorage();
 let activeEvalScopeCount = 0;
 let activeEvalRuntimeScopeCount = 0;
 let consoleCaptureEnabled = true;
+const defaultEvalStartTimeMs = Date.parse("2026-04-10T00:00:00.000Z");
+const realDate = globalThis.__agentEvalsRealDate ?? Date;
+globalThis.__agentEvalsRealDate = realDate;
+function toDateConstructorArg(value) {
+	if (typeof value === "string" || typeof value === "number" || value instanceof realDate) return value;
+	return Number(value);
+}
+function toDateNumberArg(value) {
+	return typeof value === "number" ? value : Number(value);
+}
+function constructDateFromArgs(args) {
+	if (args.length === 0) return new realDate();
+	if (args.length === 1) return new realDate(toDateConstructorArg(args[0]));
+	return new realDate(toDateNumberArg(args[0]), toDateNumberArg(args[1]), args[2] === void 0 ? 1 : toDateNumberArg(args[2]), args[3] === void 0 ? 0 : toDateNumberArg(args[3]), args[4] === void 0 ? 0 : toDateNumberArg(args[4]), args[5] === void 0 ? 0 : toDateNumberArg(args[5]), args[6] === void 0 ? 0 : toDateNumberArg(args[6]));
+}
+const evalDate = new Proxy(realDate, {
+	apply(target, thisArg, argArray_) {
+		const nowMs = getEvalClockNowMs();
+		if (nowMs !== null) return new target(nowMs).toString();
+		return target.call(thisArg);
+	},
+	construct(target, argArray, newTarget_) {
+		const nowMs = getEvalClockNowMs();
+		if (argArray.length === 0 && nowMs !== null) return new target(nowMs);
+		return constructDateFromArgs(Array.from(argArray));
+	},
+	get(target, property) {
+		if (property === "now") return getEvalDateNow;
+		if (property === "parse") return target.parse;
+		if (property === "UTC") return target.UTC;
+		if (property === "prototype") return target.prototype;
+		if (property === "name") return target.name;
+		if (property === "length") return target.length;
+	}
+});
+globalThis.Date = evalDate;
 const maxLogMessageLength = 2e4;
 const maxLogStringLength = 1e4;
 const maxLogArrayLength = 100;
@@ -79,6 +116,82 @@ var EvalAssertionError = class extends Error {
 		this.name = "EvalAssertionError";
 	}
 };
+function getEvalClockStateNowMs(state) {
+	const elapsedMs = state.frozen ? 0 : realDate.now() - state.realStartMs;
+	return state.startMs + elapsedMs + state.offsetMs;
+}
+function getEvalClockNowMs() {
+	const state = evalClockStorage.getStore();
+	if (state?.shifted !== true) return null;
+	return getEvalClockStateNowMs(state);
+}
+function getEvalDateNow() {
+	return getEvalClockNowMs() ?? realDate.now();
+}
+/** Return the host process clock, bypassing the eval Date shim. */
+function getRealDateNowMs() {
+	return realDate.now();
+}
+/** Return the shifted wall-clock time for a stored eval clock state. */
+function getEvalClockStateTimeMs(state) {
+	if (!state.shifted) return null;
+	return getEvalClockStateNowMs(state);
+}
+/**
+* Return the wall-clock start time captured for the active eval.
+*
+* For `startTime: 'now'`, this is the real time captured when the eval clock
+* context was created.
+*/
+function getEvalStartTime() {
+	const state = evalClockStorage.getStore();
+	if (state === void 0) throw new Error("getEvalStartTime() must be called inside an active eval");
+	return new realDate(state.startMs);
+}
+function resolveEvalStartTimeMs(startTime) {
+	if (startTime === void 0) return defaultEvalStartTimeMs;
+	if (startTime === "now") return realDate.now();
+	const ms = startTime instanceof realDate ? startTime.getTime() : typeof startTime === "number" ? startTime : realDate.parse(startTime);
+	if (Number.isFinite(ms)) return ms;
+	throw new Error(`Invalid eval startTime "${String(startTime)}". Use a Date, timestamp, ISO date string, or "now".`);
+}
+function createEvalClockState(startTime, freezeTime) {
+	const nowMs = realDate.now();
+	return {
+		startMs: startTime === "now" ? nowMs : resolveEvalStartTimeMs(startTime),
+		realStartMs: nowMs,
+		offsetMs: 0,
+		frozen: freezeTime,
+		shifted: startTime !== "now" || freezeTime
+	};
+}
+/** Execute a callback with the eval Date clock shifted from `startTime`. */
+async function runWithEvalClock(startTime, fn, options = {}) {
+	return await evalClockStorage.run(createEvalClockState(startTime, options.freezeTime === true), fn);
+}
+function getEvalTimeUnitMs(unit) {
+	if (unit === "millisecond" || unit === "milliseconds") return 1;
+	if (unit === "second" || unit === "seconds") return 1e3;
+	if (unit === "minute" || unit === "minutes") return 6e4;
+	if (unit === "hour" || unit === "hours") return 36e5;
+	if (unit === "day" || unit === "days") return 864e5;
+	throw new Error(`Unsupported eval time unit "${unit}"`);
+}
+/**
+* Advance the active eval's shifted Date clock and return the new time.
+*
+* Throws outside an active shifted eval clock. Evals that set
+* `startTime: 'now'` use the real current clock unless `freezeTime: true` is
+* also set.
+*/
+function advanceEvalTime(unit, amount) {
+	const state = evalClockStorage.getStore();
+	if (state === void 0) throw new Error("advanceEvalTime() must be called inside an active eval");
+	if (!state.shifted) throw new Error("advanceEvalTime() requires a shifted eval clock. Remove startTime: \"now\" or set freezeTime: true to use it.");
+	if (!Number.isFinite(amount)) throw new Error("advanceEvalTime() amount must be a finite number");
+	state.offsetMs += getEvalTimeUnitMs(unit) * amount;
+	return new realDate(getEvalClockStateNowMs(state));
+}
 /** Return the current eval scope for the active async context, if any. */
 function getCurrentScope() {
 	if (activeEvalScopeCount === 0) return void 0;
@@ -349,7 +462,9 @@ async function runInExistingEvalScope(scope, runtimeScope, fn) {
 	activeEvalScopeCount++;
 	try {
 		return await scopeStorage.run(scope, async () => {
-			return await runInEvalRuntimeScope(runtimeScope, fn);
+			return await evalClockStorage.run(scope.evalClockState, async () => {
+				return await runInEvalRuntimeScope(runtimeScope, fn);
+			});
 		});
 	} finally {
 		activeEvalScopeCount--;
@@ -362,6 +477,8 @@ async function runInExistingEvalScope(scope, runtimeScope, fn) {
 async function runInEvalScope(caseId, fn, options = {}) {
 	const scope = {
 		caseId,
+		startTime: options.startTime,
+		evalClockState: createEvalClockState(options.startTime, options.freezeTime === true),
 		idPrefix: options.idPrefix,
 		nextEvalIdCounter: 0,
 		input: options.input,
@@ -1213,7 +1330,7 @@ const errorCoreFields = new Set([
 	"stack",
 	"capturedAt"
 ]);
-function isRecord$4(value) {
+function isRecord$5(value) {
 	return typeof value === "object" && value !== null && !Array.isArray(value);
 }
 function formatUnknownErrorMessage(error) {
@@ -1241,7 +1358,7 @@ function normalizeTraceError(error, capturedAt = void 0) {
 		stack: error.stack,
 		capturedAt
 	};
-	if (isRecord$4(error)) {
+	if (isRecord$5(error)) {
 		const extraFields = getErrorExtraFields(error);
 		const name = typeof error.name === "string" ? error.name : void 0;
 		const stack = typeof error.stack === "string" ? error.stack : void 0;
@@ -1266,7 +1383,7 @@ function normalizeTraceWarnings(warningOrWarnings, additionalWarnings, capturedA
 	return (additionalWarnings.length > 0 ? [warningOrWarnings, ...additionalWarnings] : Array.isArray(warningOrWarnings) ? warningOrWarnings : [warningOrWarnings]).map((warning) => normalizeTraceError(warning, capturedAt));
 }
 function isCaptureEvalSpanErrorOptions(value) {
-	if (!isRecord$4(value)) return false;
+	if (!isRecord$5(value)) return false;
 	const keys = Object.keys(value);
 	if (keys.length === 0) return false;
 	if (!keys.every((key) => key === "level")) return false;
@@ -1493,9 +1610,12 @@ function mergeSpanAttribute(span, key, patch) {
 		...patch
 	} });
 }
-function finishSpanWithoutThrownError(span) {
+function addElapsedMsToTimestamp(isoTimestamp, elapsedMs) {
+	return new Date(new Date(isoTimestamp).getTime() + elapsedMs).toISOString();
+}
+function finishSpanWithoutThrownError(span, realStartedAt) {
 	span.status = hasSpanError(span) ? "error" : "ok";
-	span.endedAt = (/* @__PURE__ */ new Date()).toISOString();
+	span.endedAt = addElapsedMsToTimestamp(span.startedAt, getRealDateNowMs() - realStartedAt);
 }
 function createSpanHandle(span) {
 	return {
@@ -1737,9 +1857,11 @@ async function traceSpanInternal(info, fn) {
 	const scope = getCurrentScope();
 	if (!scope) return await fn(noopActiveSpan());
 	const id = generateSpanId();
+	const parentId = scope.activeSpanStack.at(-1)?.id ?? null;
+	const realStartedAt = getRealDateNowMs();
 	const spanRecord = {
 		id,
-		parentId: scope.activeSpanStack.at(-1)?.id ?? null,
+		parentId,
 		caseId: scope.caseId,
 		kind: info.kind,
 		name: info.name,
@@ -1779,7 +1901,7 @@ async function traceSpanInternal(info, fn) {
 					const recording = deserializeCacheRecording(hit.recording);
 					replayRecording(scope, spanRecord, recording, { generateSpanId });
 					spanRecord.status = recording.finalStatus ?? (hasSpanError(spanRecord) ? "error" : "ok");
-					spanRecord.endedAt = (/* @__PURE__ */ new Date()).toISOString();
+					spanRecord.endedAt = addElapsedMsToTimestamp(spanRecord.startedAt, getRealDateNowMs() - realStartedAt);
 					return recording.returnValue;
 				}
 				mergeSpanAttributes(spanRecord, { "cache.status": "miss" });
@@ -1798,7 +1920,7 @@ async function traceSpanInternal(info, fn) {
 				scope.recordingStack.pop();
 			}
 			appendSubSpanOps(scope, frame);
-			finishSpanWithoutThrownError(spanRecord);
+			finishSpanWithoutThrownError(spanRecord, realStartedAt);
 			if (ctx.mode !== "bypass") {
 				const recording = {
 					returnValue: bodyResult,
@@ -1832,11 +1954,11 @@ async function traceSpanInternal(info, fn) {
 			return bodyResult;
 		}
 		const result = await fn(activeSpan);
-		finishSpanWithoutThrownError(spanRecord);
+		finishSpanWithoutThrownError(spanRecord, realStartedAt);
 		return result;
 	} catch (error) {
 		spanRecord.status = "error";
-		spanRecord.endedAt = (/* @__PURE__ */ new Date()).toISOString();
+		spanRecord.endedAt = addElapsedMsToTimestamp(spanRecord.startedAt, getRealDateNowMs() - realStartedAt);
 		spanRecord.error = normalizeTraceError(error);
 		throw error;
 	} finally {
@@ -2605,13 +2727,16 @@ const apiCallMetricFormatSchema = llmCallMetricFormatSchema;
 const llmCallMetricPlacementSchema = z.enum(["header", "body"]);
 /** Where an API-call metric is rendered inside the API calls tab. */
 const apiCallMetricPlacementSchema = llmCallMetricPlacementSchema;
+const callDerivedAttributeSchema = z.custom((value) => typeof value === "function", { message: "Expected a derived attribute function" });
 /**
 * Schema for a single user-defined metric attached to LLM call rows.
 *
 * Each metric reads `path` from the span's `attributes` and renders the value
-* with the configured `format` and `numberFormat`. `placements` controls
-* whether the metric appears as a chip on the collapsed row header, as a row
-* inside the expanded body, or both. Defaults to `['body']` when omitted.
+* with the configured `format` and `numberFormat`. Use
+* `llmCalls.derivedAttributes` when a metric should read a value computed from
+* other attributes. `placements` controls whether the metric appears as a chip
+* on the collapsed row header, as a row inside the expanded body, or both.
+* Defaults to `['body']` when omitted.
 */
 const llmCallMetricSchema = z.object({
 	/** Display label for the metric row or header chip. */
@@ -2638,9 +2763,11 @@ const llmCallMetricSchema = z.object({
 * Schema for a single user-defined metric attached to API call rows.
 *
 * Each metric reads `path` from the span's `attributes` and renders the value
-* with the configured `format` and `numberFormat`. `placements` controls
-* whether the metric appears as a chip on the collapsed row header, as a row
-* inside the expanded body, or both. Defaults to `['body']` when omitted.
+* with the configured `format` and `numberFormat`. Use
+* `apiCalls.derivedAttributes` when a metric should read a value computed from
+* other attributes. `placements` controls whether the metric appears as a chip
+* on the collapsed row header, as a row inside the expanded body, or both.
+* Defaults to `['body']` when omitted.
 */
 const apiCallMetricSchema = z.object({
 	/** Display label for the metric row or header chip. */
@@ -2717,6 +2844,13 @@ const llmCallsConfigSchema = z.object({
 		toolCalls: z.string().optional()
 	}).optional(),
 	/**
+	* Derived attributes persisted onto every matching LLM span before
+	* `deriveFromTracing`, default outputs, trace display, and call metrics read
+	* the trace. Keys are dot-paths under `span.attributes`; return `undefined`
+	* to skip writing the attribute for one span.
+	*/
+	derivedAttributes: z.record(z.string().min(1), callDerivedAttributeSchema).optional(),
+	/**
 	* Model/provider pricing registry used to calculate LLM-call costs from
 	* token counts. Built-in LLM cost fields are only derived from this registry.
 	*/
@@ -2745,6 +2879,13 @@ const apiCallsConfigSchema = z.object({
 		durationMs: z.string().optional(),
 		error: z.string().optional()
 	}).optional(),
+	/**
+	* Derived attributes persisted onto every matching API span before trace
+	* display and call metrics read the trace. Keys are dot-paths under
+	* `span.attributes`; return `undefined` to skip writing the attribute for
+	* one span.
+	*/
+	derivedAttributes: z.record(z.string().min(1), callDerivedAttributeSchema).optional(),
 	/** Custom user-defined metrics surfaced on each API call. */
 	metrics: z.array(apiCallMetricSchema).optional()
 });
@@ -2776,6 +2917,7 @@ const DEFAULT_LLM_CALLS_CONFIG = {
 		reasoning: "reasoning",
 		toolCalls: "toolCalls"
 	},
+	derivedAttributes: [],
 	metrics: [],
 	pricing: []
 };
@@ -2799,8 +2941,35 @@ const DEFAULT_API_CALLS_CONFIG = {
 		durationMs: "durationMs",
 		error: "error"
 	},
+	derivedAttributes: [],
 	metrics: []
 };
+function resolveDerivedAttributes(input) {
+	return Object.entries(input ?? {}).map(([path, compute]) => ({
+		path,
+		compute
+	}));
+}
+function resolveLlmCallMetric(metric) {
+	return {
+		label: metric.label,
+		tooltip: metric.tooltip,
+		path: metric.path,
+		format: metric.format ?? "string",
+		numberFormat: metric.numberFormat,
+		placements: metric.placements ? [...metric.placements] : ["body"]
+	};
+}
+function resolveApiCallMetric(metric) {
+	return {
+		label: metric.label,
+		tooltip: metric.tooltip,
+		path: metric.path,
+		format: metric.format ?? "string",
+		numberFormat: metric.numberFormat,
+		placements: metric.placements ? [...metric.placements] : ["body"]
+	};
+}
 /**
 * Resolve the user-authored LLM-calls config to a fully-defaulted shape used
 * by the UI to derive the LLM calls tab.
@@ -2820,14 +2989,8 @@ function resolveLlmCallsConfig(input) {
 			...DEFAULT_LLM_CALLS_CONFIG.attributes,
 			...input?.attributes
 		},
-		metrics: (input?.metrics ?? []).map((m) => ({
-			label: m.label,
-			tooltip: m.tooltip,
-			path: m.path,
-			format: m.format ?? "string",
-			numberFormat: m.numberFormat,
-			placements: m.placements ? [...m.placements] : ["body"]
-		})),
+		derivedAttributes: resolveDerivedAttributes(input?.derivedAttributes),
+		metrics: (input?.metrics ?? []).map(resolveLlmCallMetric),
 		pricing: (input?.pricing ?? []).map((p) => ({
 			model: p.model,
 			provider: p.provider,
@@ -2857,14 +3020,8 @@ function resolveApiCallsConfig(input) {
 			...DEFAULT_API_CALLS_CONFIG.attributes,
 			...input?.attributes
 		},
-		metrics: (input?.metrics ?? []).map((m) => ({
-			label: m.label,
-			tooltip: m.tooltip,
-			path: m.path,
-			format: m.format ?? "string",
-			numberFormat: m.numberFormat,
-			placements: m.placements ? [...m.placements] : ["body"]
-		}))
+		derivedAttributes: resolveDerivedAttributes(input?.derivedAttributes),
+		metrics: (input?.metrics ?? []).map(resolveApiCallMetric)
 	};
 }
 /** Zod schema for validating `agent-evals.config.ts` input. */
@@ -3084,7 +3241,7 @@ function getEvalTitle(evalLike) {
 }
 //#endregion
 //#region ../shared/src/utils/getNestedAttribute.ts
-function isRecord$3(value) {
+function isRecord$4(value) {
 	return typeof value === "object" && value !== null;
 }
 /**
@@ -3099,12 +3256,84 @@ function getNestedAttribute(value, path) {
 	const parts = path.split(".");
 	let current = value;
 	for (const part of parts) {
-		if (!isRecord$3(current) || !(part in current)) return;
+		if (!isRecord$4(current) || !(part in current)) return;
 		current = current[part];
 	}
 	return current;
 }
 //#endregion
+//#region ../shared/src/utils/deriveCallAttributes.ts
+function isRecord$3(value) {
+	return typeof value === "object" && value !== null;
+}
+function mergeNestedAttribute$1(value, path, attributeValue) {
+	const root = value === void 0 ? {} : { ...value };
+	const parts = path.split(".");
+	let current = root;
+	for (const [index, part] of parts.entries()) {
+		if (index === parts.length - 1) {
+			current[part] = attributeValue;
+			continue;
+		}
+		const nextValue = current[part];
+		const nextRecord = isRecord$3(nextValue) ? { ...nextValue } : {};
+		current[part] = nextRecord;
+		current = nextRecord;
+	}
+	return root;
+}
+function applyDerivedAttributesForKind(params) {
+	let attributes = params.span.attributes;
+	for (const derivedAttribute of params.derivedAttributes) {
+		if (derivedAttribute.compute === void 0) continue;
+		const span = {
+			...params.span,
+			attributes
+		};
+		const value = (() => {
+			try {
+				return derivedAttribute.compute({
+					attributes,
+					span,
+					get: (path) => getNestedAttribute(attributes, path)
+				});
+			} catch {
+				return;
+			}
+		})();
+		if (value === void 0) continue;
+		attributes = mergeNestedAttribute$1(attributes, derivedAttribute.path, value);
+	}
+	if (attributes === params.span.attributes) return params.span;
+	return {
+		...params.span,
+		attributes
+	};
+}
+/**
+* Persist configured derived attributes onto matching LLM/API spans.
+*
+* These derived attributes are applied before trace consumers run, so
+* `deriveFromTracing`, default usage extraction, trace display, and call
+* metrics can all read them by normal dot-path lookup.
+*/
+function applyDerivedCallAttributes(params) {
+	const llmKinds = new Set(params.llmCallsConfig.kinds);
+	const apiKinds = new Set(params.apiCallsConfig.kinds);
+	return params.spans.map((span) => {
+		let nextSpan = span;
+		if (llmKinds.has(span.kind)) nextSpan = applyDerivedAttributesForKind({
+			span: nextSpan,
+			derivedAttributes: params.llmCallsConfig.derivedAttributes
+		});
+		if (apiKinds.has(span.kind)) nextSpan = applyDerivedAttributesForKind({
+			span: nextSpan,
+			derivedAttributes: params.apiCallsConfig.derivedAttributes
+		});
+		return nextSpan;
+	});
+}
+//#endregion
 //#region ../shared/src/utils/extractLlmCalls.ts
 function readNumber$2(attributes, path) {
 	const raw = getNestedAttribute(attributes, path);
@@ -5235,9 +5464,16 @@ async function runCase(params) {
 			mode: cacheMode,
 			evalId,
 			codeFingerprint
-		} : void 0
+		} : void 0,
+		startTime: evalDef.startTime,
+		freezeTime: evalDef.freezeTime
 	});
-	const traceTree = buildTraceTree(scope.spans, scope.checkpoints);
+	const spansWithDerivedAttributes = applyDerivedCallAttributes({
+		spans: scope.spans,
+		llmCallsConfig,
+		apiCallsConfig
+	});
+	const traceTree = buildTraceTree(spansWithDerivedAttributes, scope.checkpoints);
 	const nonAssertError = executeError && !(executeError instanceof EvalAssertionError) ? executeError : null;
 	if (executeError instanceof EvalAssertionError && scope.assertionFailures.length === 0) scope.assertionFailures.push(toAssertionFailure(executeError.message, executeError));
 	if (!nonAssertError && evalDef.deriveFromTracing) {
@@ -5259,7 +5495,7 @@ async function runCase(params) {
 	}
 	if (!nonAssertError) addDefaultOutputs({
 		outputs: scope.outputs,
-		spans: scope.spans,
+		spans: spansWithDerivedAttributes,
 		llmCallsConfig,
 		apiCallsConfig,
 		globalRemove: globalRemoveDefaultConfig,
@@ -5276,6 +5512,7 @@ async function runCase(params) {
 	}
 	const scoreResults = /* @__PURE__ */ new Map();
 	const scoringTraces = {};
+	const scoreStartTime = getEvalClockStateTimeMs(scope.evalClockState) ?? evalDef.startTime;
 	if (!nonAssertError && scope.assertionFailures.length === 0 && evalDef.scores) for (const [key, def] of Object.entries(evalDef.scores)) {
 		const { compute, passThreshold, label } = normalizeScoreDef(def);
 		const scoreRun = await runInEvalScope(evalCase.id, async () => {
@@ -5295,7 +5532,9 @@ async function runCase(params) {
 				mode: cacheMode,
 				evalId: `${evalId}__score__${key}`,
 				codeFingerprint
-			} : void 0
+			} : void 0,
+			startTime: scoreStartTime,
+			freezeTime: evalDef.freezeTime
 		});
 		const { trace, traceDisplay } = resolveTracePresentation(scoreRun.scope.spans, globalTraceDisplay, evalDef.traceDisplay);
 		scope.logs.push(...scoreRun.scope.logs.map((entry) => ({
@@ -5344,7 +5583,7 @@ async function runCase(params) {
 		}
 	}
 	const status = nonAssertError ? "error" : passed ? "pass" : "fail";
-	const { trace: displayTrace, traceDisplay } = resolveTracePresentation(scope.spans, globalTraceDisplay, evalDef.traceDisplay);
+	const { trace: displayTrace, traceDisplay } = resolveTracePresentation(spansWithDerivedAttributes, globalTraceDisplay, evalDef.traceDisplay);
 	const columns = {};
 	const columnOverrides = mergeDefaultColumns({
 		columns: evalDef.columns,
@@ -5608,7 +5847,7 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
 					await runInEvalRuntimeScope("cases", async () => {
 						await entry.use(async (evalDef) => {
 							const cases = filterEvalCases(resolveRunnableEvalCases({
-								cases: typeof evalDef.cases === "function" ? await evalDef.cases() : evalDef.cases ?? [],
+								cases: await runWithEvalClock(evalDef.startTime, async () => typeof evalDef.cases === "function" ? await evalDef.cases() : evalDef.cases ?? [], { freezeTime: evalDef.freezeTime }),
 								evalId: evalMeta.id
 							}), request.target.evalIds, request.target.caseIds, evalMeta.id);
 							runState.summary.totalCases += cases.length;
@@ -5811,4 +6050,4 @@ function toLastRunStatus(status) {
 	return status === "pending" ? null : status;
 }
 //#endregion
-export { removeDefaultConfigSchema as $, columnKindSchema as $t, extractApiCalls as A, setEvalOutput as An, cacheFileSchema as At, DEFAULT_API_CALLS_CONFIG as B, traceAttributeDisplayFormatSchema as Bt, validateCharts as C, incrementEvalOutput as Cn, evalChartTooltipExtraSchema as Ct, sseEnvelopeSchema as D, runInEvalRuntimeScope as Dn, cacheDebugKeyFileSchema as Dt, updateManualScoreRequestSchema as E, nextEvalId as En, cacheDebugKeyEntrySchema as Et, deriveScopedSummaryFromCases as F, getEvalRegistry as Fn, cacheRecordingSchema as Ft, apiCallMetricSchema as G, traceDisplayInputConfigSchema as Gt, agentEvalsConfigSchema as H, traceAttributeDisplayPlacementSchema as Ht, deriveStatusFromCaseRows as I, cacheStatusSchema as It, llmCallMetricFormatSchema as J, traceSpanSchema as Jt, apiCallsConfigSchema as K, traceSpanErrorSchema as Kt, deriveStatusFromChildStatuses as L, serializedCacheSpanSchema as Lt, getNestedAttribute as M, startEvalBackgroundJob as Mn, cacheModeSchema as Mt, getEvalTitle as N, repoFile as Nn, cacheOperationTypeSchema as Nt, extractCacheEntries as O, runInEvalScope as On, cacheEntrySchema as Ot, getEvalDisplayStatus as P, defineEval as Pn, cacheRecordingOpSchema as Pt, llmCallsConfigSchema as Q, columnFormatSchema as Qt, runManifestSchema as R, spanCacheOptionsSchema as Rt, normalizeScoreDef as S, getEvalCaseInput as Sn, evalChartMetricSchema as St, createRunRequestSchema as T, mergeEvalOutput as Tn, evalChartsConfigSchema as Tt, apiCallMetricFormatSchema as U, traceAttributeDisplaySchema as Ut, DEFAULT_LLM_CALLS_CONFIG as V, traceAttributeDisplayInputSchema as Vt, apiCallMetricPlacementSchema as W, traceDisplayConfigSchema as Wt, llmCallMetricSchema as X, cellValueSchema as Xt, llmCallMetricPlacementSchema as Y, traceSpanWarningSchema as Yt, llmCallPricingSchema as Z, columnDefSchema as Zt, loadEvalModule as _, appendToEvalOutput as _n, evalChartAggregateSchema as _t, loadPersistedRunSnapshot as a, z$1 as an, caseDetailSchema as at, loadConfig 
as b, evalLog as bn, evalChartColorSchema as bt, persistCaseDetail as c, evalSpan as cn, evalStatAggregateSchema as ct, recomputePersistedCaseStatus as d, hashCacheKeySync as dn, evalSummarySchema as dt, fileRefSchema as en, resolveApiCallsConfig as et, runTouchesEval as f, deserializeCacheRecording as fn, runLogEntrySchema as ft, setLatestRunInfoMap as g, EvalAssertionError as gn, scoreTraceSchema as gt, getTargetEvalIds as h, serializeCacheValue as hn, runLogPhaseSchema as ht, getLatestRunInfos as i, runArtifactRefSchema as in, assertionFailureSchema as it, extractLlmCalls as j, setScopeCacheContext as jn, cacheListItemSchema as jt, extractCacheHits as k, runInExistingEvalScope as kn, cacheEntryWithDebugKeySchema as kt, persistRunState as l, evalTracer as ln, evalStatItemSchema as lt, buildEvalSummary as m, serializeCacheRecording as mn, runLogLocationSchema as mt, generateRunId as n, numberDisplayOptionsSchema as nn, runLogsConfigSchema as nt, loadPersistedRunSnapshots as o, buildTraceTree as on, caseRowSchema as ot, resolveArtifactPath as p, deserializeCacheValue as pn, runLogLevelSchema as pt, defaultConfigKeySchema as q, traceSpanKindSchema as qt, getLastRunStatuses as r, repoFileRefSchema as rn, trialSelectionModeSchema as rt, nextShortIdFromSnapshots as s, captureEvalSpanError as sn, evalFreshnessStatusSchema as st, executeRun as t, jsonCellSchema as tn, resolveLlmCallsConfig as tt, recomputeEvalStatusesInRuns as u, hashCacheKey as un, evalStatsConfigSchema as ut, parseEvalMetas as v, configureEvalRunLogs as vn, evalChartAxisSchema as vt, createFsCacheStore as w, isInEvalScope as wn, evalChartTypeSchema as wt, buildDeclaredColumnDefs as x, getCurrentScope as xn, evalChartConfigSchema as xt, resolveEvalDefaultConfig as y, evalAssert as yn, evalChartBuiltinMetricSchema as yt, runSummarySchema as z, traceCacheRefSchema as zt };
+export { llmCallsConfigSchema as $, columnFormatSchema as $t, extractApiCalls as A, runInEvalRuntimeScope as An, cacheEntryWithDebugKeySchema as At, runSummarySchema as B, traceCacheRefSchema as Bt, validateCharts as C, getCurrentScope as Cn, evalChartMetricSchema as Ct, sseEnvelopeSchema as D, isInEvalScope as Dn, cacheDebugKeyEntrySchema as Dt, updateManualScoreRequestSchema as E, incrementEvalOutput as En, evalChartsConfigSchema as Et, getEvalDisplayStatus as F, startEvalBackgroundJob as Fn, cacheRecordingOpSchema as Ft, apiCallMetricPlacementSchema as G, traceDisplayConfigSchema as Gt, DEFAULT_LLM_CALLS_CONFIG as H, traceAttributeDisplayInputSchema as Ht, deriveScopedSummaryFromCases as I, repoFile as In, cacheRecordingSchema as It, defaultConfigKeySchema as J, traceSpanKindSchema as Jt, apiCallMetricSchema as K, traceDisplayInputConfigSchema as Kt, deriveStatusFromCaseRows as L, defineEval as Ln, cacheStatusSchema as Lt, applyDerivedCallAttributes as M, runInExistingEvalScope as Mn, cacheListItemSchema as Mt, getNestedAttribute as N, setEvalOutput as Nn, cacheModeSchema as Nt, extractCacheEntries as O, mergeEvalOutput as On, cacheDebugKeyFileSchema as Ot, getEvalTitle as P, setScopeCacheContext as Pn, cacheOperationTypeSchema as Pt, llmCallPricingSchema as Q, columnDefSchema as Qt, deriveStatusFromChildStatuses as R, getEvalRegistry as Rn, serializedCacheSpanSchema as Rt, normalizeScoreDef as S, evalLog as Sn, evalChartConfigSchema as St, createRunRequestSchema as T, getEvalStartTime as Tn, evalChartTypeSchema as Tt, agentEvalsConfigSchema as U, traceAttributeDisplayPlacementSchema as Ut, DEFAULT_API_CALLS_CONFIG as V, traceAttributeDisplayFormatSchema as Vt, apiCallMetricFormatSchema as W, traceAttributeDisplaySchema as Wt, llmCallMetricPlacementSchema as X, traceSpanWarningSchema as Xt, llmCallMetricFormatSchema as Y, traceSpanSchema as Yt, llmCallMetricSchema as Z, cellValueSchema as Zt, loadEvalModule as _, EvalAssertionError as _n, scoreTraceSchema as 
_t, loadPersistedRunSnapshot as a, runArtifactRefSchema as an, assertionFailureSchema as at, loadConfig as b, configureEvalRunLogs as bn, evalChartBuiltinMetricSchema as bt, persistCaseDetail as c, captureEvalSpanError as cn, evalFreshnessStatusSchema as ct, recomputePersistedCaseStatus as d, hashCacheKey as dn, evalStatsConfigSchema as dt, columnKindSchema as en, removeDefaultConfigSchema as et, runTouchesEval as f, hashCacheKeySync as fn, evalSummarySchema as ft, setLatestRunInfoMap as g, serializeCacheValue as gn, runLogPhaseSchema as gt, getTargetEvalIds as h, serializeCacheRecording as hn, runLogLocationSchema as ht, getLatestRunInfos as i, repoFileRefSchema as in, trialSelectionModeSchema as it, extractLlmCalls as j, runInEvalScope as jn, cacheFileSchema as jt, extractCacheHits as k, nextEvalId as kn, cacheEntrySchema as kt, persistRunState as l, evalSpan as ln, evalStatAggregateSchema as lt, buildEvalSummary as m, deserializeCacheValue as mn, runLogLevelSchema as mt, generateRunId as n, jsonCellSchema as nn, resolveLlmCallsConfig as nt, loadPersistedRunSnapshots as o, z$1 as on, caseDetailSchema as ot, resolveArtifactPath as p, deserializeCacheRecording as pn, runLogEntrySchema as pt, apiCallsConfigSchema as q, traceSpanErrorSchema as qt, getLastRunStatuses as r, numberDisplayOptionsSchema as rn, runLogsConfigSchema as rt, nextShortIdFromSnapshots as s, buildTraceTree as sn, caseRowSchema as st, executeRun as t, fileRefSchema as tn, resolveApiCallsConfig as tt, recomputeEvalStatusesInRuns as u, evalTracer as un, evalStatItemSchema as ut, parseEvalMetas as v, advanceEvalTime as vn, evalChartAggregateSchema as vt, createFsCacheStore as w, getEvalCaseInput as wn, evalChartTooltipExtraSchema as wt, buildDeclaredColumnDefs as x, evalAssert as xn, evalChartColorSchema as xt, resolveEvalDefaultConfig as y, appendToEvalOutput as yn, evalChartAxisSchema as yt, runManifestSchema as z, spanCacheOptionsSchema as zt };

package/dist/{runner-C9nP2VKL.mjs → runner-B4SosWgD.mjs} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { n as createRunner } from "./cli-C0EtHhEO.mjs";
-import "./src-D-HuV8I-.mjs";
+import { n as createRunner } from "./cli-Be0x8CS3.mjs";
+import "./src-D6cettg0.mjs";
 //#region ../../apps/server/src/runner.ts
 let runnerInstance = null;
 function getRunnerInstance() {

package/dist/{runner-CyRhIzci.mjs → runner-jSujaSKt.mjs} RENAMED Viewed

@@ -1,2 +1,2 @@
-import { n as initRunner, t as getRunnerInstance } from "./runner-C9nP2VKL.mjs";
+import { n as initRunner, t as getRunnerInstance } from "./runner-B4SosWgD.mjs";
 export { getRunnerInstance, initRunner };

package/dist/src-D6cettg0.mjs ADDED Viewed

@@ -0,0 +1,3 @@
+import "./runOrchestration-D697g6Qe.mjs";
+import "./cli-Be0x8CS3.mjs";
+export {};

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@ls-stack/agent-eval",
-  "version": "0.22.0",
+  "version": "0.24.0",
   "type": "module",
   "bin": {
     "agent-evals": "./dist/bin.mjs"
@@ -59,8 +59,8 @@
     "@types/node": "^24.7.2",
     "typescript": "^5.9.2",
     "@agent-evals/runner": "0.0.1",
-    "@agent-evals/sdk": "0.0.1",
-    "@agent-evals/shared": "0.0.1"
+    "@agent-evals/shared": "0.0.1",
+    "@agent-evals/sdk": "0.0.1"
   },
   "scripts": {
     "build": "pnpm --filter @agent-evals/web build && tsdown",

package/skills/agent-eval/SKILL.md CHANGED Viewed

@@ -156,6 +156,19 @@ for settlement; promise and span errors keep their normal behavior. Use
 `waitForBackgroundJob: false` on a span, or `waitForBackgroundJobs: false` on an
 eval definition, when background work should not delay finalization.
+Eval Date APIs use a shifted wall clock by default: `new Date()` and
+`Date.now()` start at `2026-04-10T00:00:00.000Z` during case generation,
+execution, tracing, derived outputs, and scorers, then continue advancing with
+real elapsed time. Set `startTime` on a specific `defineEval(...)` to use
+another initial clock value, or set `startTime: 'now'` for that eval to use the
+real current clock. Timers are not faked, so async waits still run normally.
+Set `freezeTime: true` to keep the eval clock frozen until it is advanced manually.
+Use `getEvalStartTime()` to read the captured wall-clock start as a `Date`.
+Use `advanceEvalTime(unit, amount)` inside an eval to move the shifted clock
+forward; supported units are `millisecond(s)`, `second(s)`, `minute(s)`,
+`hour(s)`, and `day(s)`. It throws for evals with `startTime: 'now'`, unless
+`freezeTime: true` is also set.
 For libraries or observability exporters that already emit span lifecycle
 events, use `evalTracer.startSpan(...)`, `evalTracer.updateSpan(...)`,
 `evalTracer.endSpan(...)`, or `evalTracer.recordSpan(...)` to translate those
@@ -261,10 +274,12 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
   attribute paths. `latencyMs` is time to first token; duration, total tokens,
   tokens/sec, and USD costs are derived. Override `kinds` to broaden the filter,
   override `attributes.<field>` for non-default primitive span shapes, configure
-  `pricing` to derive USD costs from token counts by model/provider, and add
-  entries to `metrics` to surface arbitrary user metrics (`format: 'string' |
-'number' | 'duration' | 'json' | 'boolean'`, `placements: ['header' |
-'body']`).
+  `pricing` to derive USD costs from token counts by model/provider, add
+  `derivedAttributes` to persist computed values back onto matching LLM spans
+  before trace consumers run, and add entries to `metrics` to surface arbitrary
+  user metrics (`format: 'string' | 'number' | 'duration' | 'json' |
+'boolean'`, `placements: ['header' | 'body']`). `derivedAttributes` keys are
+  dot-paths under `span.attributes`; return `undefined` to skip one span.
 - Default usage config derives missing eval outputs from matching LLM/API spans
   before `outputsSchema` and scores run: `apiCalls`, `costUsd`, `llmTurns`,
   `inputTokens`, `outputTokens`, `totalTokens`, `cachedInputTokens`,
@@ -285,7 +300,8 @@ cacheCreationInputTokens` so cache details are not double-counted.
   and `'fetch'` spans with `method`, `url`, `statusCode`, `request`,
   `response`, `requestBody`, `responseBody`, `headers`, `durationMs`, and
   `error` read from conventional attribute paths. Override `kinds` or
-  `attributes.<field>` for external tracers, and add `metrics` with the same
+  `attributes.<field>` for external tracers, add `derivedAttributes` for
+  computed persisted API span attributes, and add `metrics` with the same
   formats and placements as LLM-call metrics.
 - `runLogs` (in `agent-evals.config.ts`) controls case log capture. Use
   `runLogs: { captureConsole: false }` to keep console output in the terminal

package/dist/src-D-HuV8I-.mjs DELETED Viewed

@@ -1,3 +0,0 @@
-import "./runOrchestration-D1edUDhp.mjs";
-import "./cli-C0EtHhEO.mjs";
-export {};