@ls-stack/agent-eval 0.22.0 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -49,9 +49,46 @@ function repoFile(path, mimeType) {
49
49
  //#region ../sdk/src/runtime.ts
50
50
  const scopeStorage = new AsyncLocalStorage();
51
51
  const runtimeScopeStorage = new AsyncLocalStorage();
52
+ const evalClockStorage = new AsyncLocalStorage();
52
53
  let activeEvalScopeCount = 0;
53
54
  let activeEvalRuntimeScopeCount = 0;
54
55
  let consoleCaptureEnabled = true;
56
+ const defaultEvalStartTimeMs = Date.parse("2026-04-10T00:00:00.000Z");
57
+ const realDate = globalThis.__agentEvalsRealDate ?? Date;
58
+ globalThis.__agentEvalsRealDate = realDate;
59
+ function toDateConstructorArg(value) {
60
+ if (typeof value === "string" || typeof value === "number" || value instanceof realDate) return value;
61
+ return Number(value);
62
+ }
63
+ function toDateNumberArg(value) {
64
+ return typeof value === "number" ? value : Number(value);
65
+ }
66
+ function constructDateFromArgs(args) {
67
+ if (args.length === 0) return new realDate();
68
+ if (args.length === 1) return new realDate(toDateConstructorArg(args[0]));
69
+ return new realDate(toDateNumberArg(args[0]), toDateNumberArg(args[1]), args[2] === void 0 ? 1 : toDateNumberArg(args[2]), args[3] === void 0 ? 0 : toDateNumberArg(args[3]), args[4] === void 0 ? 0 : toDateNumberArg(args[4]), args[5] === void 0 ? 0 : toDateNumberArg(args[5]), args[6] === void 0 ? 0 : toDateNumberArg(args[6]));
70
+ }
71
+ const evalDate = new Proxy(realDate, {
72
+ apply(target, thisArg, argArray_) {
73
+ const nowMs = getEvalClockNowMs();
74
+ if (nowMs !== null) return new target(nowMs).toString();
75
+ return target.call(thisArg);
76
+ },
77
+ construct(target, argArray, newTarget_) {
78
+ const nowMs = getEvalClockNowMs();
79
+ if (argArray.length === 0 && nowMs !== null) return new target(nowMs);
80
+ return constructDateFromArgs(Array.from(argArray));
81
+ },
82
+ get(target, property) {
83
+ if (property === "now") return getEvalDateNow;
84
+ if (property === "parse") return target.parse;
85
+ if (property === "UTC") return target.UTC;
86
+ if (property === "prototype") return target.prototype;
87
+ if (property === "name") return target.name;
88
+ if (property === "length") return target.length;
89
+ }
90
+ });
91
+ globalThis.Date = evalDate;
55
92
  const maxLogMessageLength = 2e4;
56
93
  const maxLogStringLength = 1e4;
57
94
  const maxLogArrayLength = 100;
@@ -79,6 +116,82 @@ var EvalAssertionError = class extends Error {
79
116
  this.name = "EvalAssertionError";
80
117
  }
81
118
  };
119
+ function getEvalClockStateNowMs(state) {
120
+ const elapsedMs = state.frozen ? 0 : realDate.now() - state.realStartMs;
121
+ return state.startMs + elapsedMs + state.offsetMs;
122
+ }
123
+ function getEvalClockNowMs() {
124
+ const state = evalClockStorage.getStore();
125
+ if (state?.shifted !== true) return null;
126
+ return getEvalClockStateNowMs(state);
127
+ }
128
+ function getEvalDateNow() {
129
+ return getEvalClockNowMs() ?? realDate.now();
130
+ }
131
+ /** Return the host process clock, bypassing the eval Date shim. */
132
+ function getRealDateNowMs() {
133
+ return realDate.now();
134
+ }
135
+ /** Return the shifted wall-clock time for a stored eval clock state. */
136
+ function getEvalClockStateTimeMs(state) {
137
+ if (!state.shifted) return null;
138
+ return getEvalClockStateNowMs(state);
139
+ }
140
+ /**
141
+ * Return the wall-clock start time captured for the active eval.
142
+ *
143
+ * For `startTime: 'now'`, this is the real time captured when the eval clock
144
+ * context was created.
145
+ */
146
+ function getEvalStartTime() {
147
+ const state = evalClockStorage.getStore();
148
+ if (state === void 0) throw new Error("getEvalStartTime() must be called inside an active eval");
149
+ return new realDate(state.startMs);
150
+ }
151
+ function resolveEvalStartTimeMs(startTime) {
152
+ if (startTime === void 0) return defaultEvalStartTimeMs;
153
+ if (startTime === "now") return realDate.now();
154
+ const ms = startTime instanceof realDate ? startTime.getTime() : typeof startTime === "number" ? startTime : realDate.parse(startTime);
155
+ if (Number.isFinite(ms)) return ms;
156
+ throw new Error(`Invalid eval startTime "${String(startTime)}". Use a Date, timestamp, ISO date string, or "now".`);
157
+ }
158
+ function createEvalClockState(startTime, freezeTime) {
159
+ const nowMs = realDate.now();
160
+ return {
161
+ startMs: startTime === "now" ? nowMs : resolveEvalStartTimeMs(startTime),
162
+ realStartMs: nowMs,
163
+ offsetMs: 0,
164
+ frozen: freezeTime,
165
+ shifted: startTime !== "now" || freezeTime
166
+ };
167
+ }
168
+ /** Execute a callback with the eval Date clock shifted from `startTime`. */
169
+ async function runWithEvalClock(startTime, fn, options = {}) {
170
+ return await evalClockStorage.run(createEvalClockState(startTime, options.freezeTime === true), fn);
171
+ }
172
+ function getEvalTimeUnitMs(unit) {
173
+ if (unit === "millisecond" || unit === "milliseconds") return 1;
174
+ if (unit === "second" || unit === "seconds") return 1e3;
175
+ if (unit === "minute" || unit === "minutes") return 6e4;
176
+ if (unit === "hour" || unit === "hours") return 36e5;
177
+ if (unit === "day" || unit === "days") return 864e5;
178
+ throw new Error(`Unsupported eval time unit "${unit}"`);
179
+ }
180
+ /**
181
+ * Advance the active eval's shifted Date clock and return the new time.
182
+ *
183
+ * Throws outside an active shifted eval clock. Evals that set
184
+ * `startTime: 'now'` use the real current clock unless `freezeTime: true` is
185
+ * also set.
186
+ */
187
+ function advanceEvalTime(unit, amount) {
188
+ const state = evalClockStorage.getStore();
189
+ if (state === void 0) throw new Error("advanceEvalTime() must be called inside an active eval");
190
+ if (!state.shifted) throw new Error("advanceEvalTime() requires a shifted eval clock. Remove startTime: \"now\" or set freezeTime: true to use it.");
191
+ if (!Number.isFinite(amount)) throw new Error("advanceEvalTime() amount must be a finite number");
192
+ state.offsetMs += getEvalTimeUnitMs(unit) * amount;
193
+ return new realDate(getEvalClockStateNowMs(state));
194
+ }
82
195
  /** Return the current eval scope for the active async context, if any. */
83
196
  function getCurrentScope() {
84
197
  if (activeEvalScopeCount === 0) return void 0;
@@ -349,7 +462,9 @@ async function runInExistingEvalScope(scope, runtimeScope, fn) {
349
462
  activeEvalScopeCount++;
350
463
  try {
351
464
  return await scopeStorage.run(scope, async () => {
352
- return await runInEvalRuntimeScope(runtimeScope, fn);
465
+ return await evalClockStorage.run(scope.evalClockState, async () => {
466
+ return await runInEvalRuntimeScope(runtimeScope, fn);
467
+ });
353
468
  });
354
469
  } finally {
355
470
  activeEvalScopeCount--;
@@ -362,6 +477,8 @@ async function runInExistingEvalScope(scope, runtimeScope, fn) {
362
477
  async function runInEvalScope(caseId, fn, options = {}) {
363
478
  const scope = {
364
479
  caseId,
480
+ startTime: options.startTime,
481
+ evalClockState: createEvalClockState(options.startTime, options.freezeTime === true),
365
482
  idPrefix: options.idPrefix,
366
483
  nextEvalIdCounter: 0,
367
484
  input: options.input,
@@ -1213,7 +1330,7 @@ const errorCoreFields = new Set([
1213
1330
  "stack",
1214
1331
  "capturedAt"
1215
1332
  ]);
1216
- function isRecord$4(value) {
1333
+ function isRecord$5(value) {
1217
1334
  return typeof value === "object" && value !== null && !Array.isArray(value);
1218
1335
  }
1219
1336
  function formatUnknownErrorMessage(error) {
@@ -1241,7 +1358,7 @@ function normalizeTraceError(error, capturedAt = void 0) {
1241
1358
  stack: error.stack,
1242
1359
  capturedAt
1243
1360
  };
1244
- if (isRecord$4(error)) {
1361
+ if (isRecord$5(error)) {
1245
1362
  const extraFields = getErrorExtraFields(error);
1246
1363
  const name = typeof error.name === "string" ? error.name : void 0;
1247
1364
  const stack = typeof error.stack === "string" ? error.stack : void 0;
@@ -1266,7 +1383,7 @@ function normalizeTraceWarnings(warningOrWarnings, additionalWarnings, capturedA
1266
1383
  return (additionalWarnings.length > 0 ? [warningOrWarnings, ...additionalWarnings] : Array.isArray(warningOrWarnings) ? warningOrWarnings : [warningOrWarnings]).map((warning) => normalizeTraceError(warning, capturedAt));
1267
1384
  }
1268
1385
  function isCaptureEvalSpanErrorOptions(value) {
1269
- if (!isRecord$4(value)) return false;
1386
+ if (!isRecord$5(value)) return false;
1270
1387
  const keys = Object.keys(value);
1271
1388
  if (keys.length === 0) return false;
1272
1389
  if (!keys.every((key) => key === "level")) return false;
@@ -1493,9 +1610,12 @@ function mergeSpanAttribute(span, key, patch) {
1493
1610
  ...patch
1494
1611
  } });
1495
1612
  }
1496
- function finishSpanWithoutThrownError(span) {
1613
+ function addElapsedMsToTimestamp(isoTimestamp, elapsedMs) {
1614
+ return new Date(new Date(isoTimestamp).getTime() + elapsedMs).toISOString();
1615
+ }
1616
+ function finishSpanWithoutThrownError(span, realStartedAt) {
1497
1617
  span.status = hasSpanError(span) ? "error" : "ok";
1498
- span.endedAt = (/* @__PURE__ */ new Date()).toISOString();
1618
+ span.endedAt = addElapsedMsToTimestamp(span.startedAt, getRealDateNowMs() - realStartedAt);
1499
1619
  }
1500
1620
  function createSpanHandle(span) {
1501
1621
  return {
@@ -1737,9 +1857,11 @@ async function traceSpanInternal(info, fn) {
1737
1857
  const scope = getCurrentScope();
1738
1858
  if (!scope) return await fn(noopActiveSpan());
1739
1859
  const id = generateSpanId();
1860
+ const parentId = scope.activeSpanStack.at(-1)?.id ?? null;
1861
+ const realStartedAt = getRealDateNowMs();
1740
1862
  const spanRecord = {
1741
1863
  id,
1742
- parentId: scope.activeSpanStack.at(-1)?.id ?? null,
1864
+ parentId,
1743
1865
  caseId: scope.caseId,
1744
1866
  kind: info.kind,
1745
1867
  name: info.name,
@@ -1779,7 +1901,7 @@ async function traceSpanInternal(info, fn) {
1779
1901
  const recording = deserializeCacheRecording(hit.recording);
1780
1902
  replayRecording(scope, spanRecord, recording, { generateSpanId });
1781
1903
  spanRecord.status = recording.finalStatus ?? (hasSpanError(spanRecord) ? "error" : "ok");
1782
- spanRecord.endedAt = (/* @__PURE__ */ new Date()).toISOString();
1904
+ spanRecord.endedAt = addElapsedMsToTimestamp(spanRecord.startedAt, getRealDateNowMs() - realStartedAt);
1783
1905
  return recording.returnValue;
1784
1906
  }
1785
1907
  mergeSpanAttributes(spanRecord, { "cache.status": "miss" });
@@ -1798,7 +1920,7 @@ async function traceSpanInternal(info, fn) {
1798
1920
  scope.recordingStack.pop();
1799
1921
  }
1800
1922
  appendSubSpanOps(scope, frame);
1801
- finishSpanWithoutThrownError(spanRecord);
1923
+ finishSpanWithoutThrownError(spanRecord, realStartedAt);
1802
1924
  if (ctx.mode !== "bypass") {
1803
1925
  const recording = {
1804
1926
  returnValue: bodyResult,
@@ -1832,11 +1954,11 @@ async function traceSpanInternal(info, fn) {
1832
1954
  return bodyResult;
1833
1955
  }
1834
1956
  const result = await fn(activeSpan);
1835
- finishSpanWithoutThrownError(spanRecord);
1957
+ finishSpanWithoutThrownError(spanRecord, realStartedAt);
1836
1958
  return result;
1837
1959
  } catch (error) {
1838
1960
  spanRecord.status = "error";
1839
- spanRecord.endedAt = (/* @__PURE__ */ new Date()).toISOString();
1961
+ spanRecord.endedAt = addElapsedMsToTimestamp(spanRecord.startedAt, getRealDateNowMs() - realStartedAt);
1840
1962
  spanRecord.error = normalizeTraceError(error);
1841
1963
  throw error;
1842
1964
  } finally {
@@ -2605,13 +2727,16 @@ const apiCallMetricFormatSchema = llmCallMetricFormatSchema;
2605
2727
  const llmCallMetricPlacementSchema = z.enum(["header", "body"]);
2606
2728
  /** Where an API-call metric is rendered inside the API calls tab. */
2607
2729
  const apiCallMetricPlacementSchema = llmCallMetricPlacementSchema;
2730
+ const callDerivedAttributeSchema = z.custom((value) => typeof value === "function", { message: "Expected a derived attribute function" });
2608
2731
  /**
2609
2732
  * Schema for a single user-defined metric attached to LLM call rows.
2610
2733
  *
2611
2734
  * Each metric reads `path` from the span's `attributes` and renders the value
2612
- * with the configured `format` and `numberFormat`. `placements` controls
2613
- * whether the metric appears as a chip on the collapsed row header, as a row
2614
- * inside the expanded body, or both. Defaults to `['body']` when omitted.
2735
+ * with the configured `format` and `numberFormat`. Use
2736
+ * `llmCalls.derivedAttributes` when a metric should read a value computed from
2737
+ * other attributes. `placements` controls whether the metric appears as a chip
2738
+ * on the collapsed row header, as a row inside the expanded body, or both.
2739
+ * Defaults to `['body']` when omitted.
2615
2740
  */
2616
2741
  const llmCallMetricSchema = z.object({
2617
2742
  /** Display label for the metric row or header chip. */
@@ -2638,9 +2763,11 @@ const llmCallMetricSchema = z.object({
2638
2763
  * Schema for a single user-defined metric attached to API call rows.
2639
2764
  *
2640
2765
  * Each metric reads `path` from the span's `attributes` and renders the value
2641
- * with the configured `format` and `numberFormat`. `placements` controls
2642
- * whether the metric appears as a chip on the collapsed row header, as a row
2643
- * inside the expanded body, or both. Defaults to `['body']` when omitted.
2766
+ * with the configured `format` and `numberFormat`. Use
2767
+ * `apiCalls.derivedAttributes` when a metric should read a value computed from
2768
+ * other attributes. `placements` controls whether the metric appears as a chip
2769
+ * on the collapsed row header, as a row inside the expanded body, or both.
2770
+ * Defaults to `['body']` when omitted.
2644
2771
  */
2645
2772
  const apiCallMetricSchema = z.object({
2646
2773
  /** Display label for the metric row or header chip. */
@@ -2717,6 +2844,13 @@ const llmCallsConfigSchema = z.object({
2717
2844
  toolCalls: z.string().optional()
2718
2845
  }).optional(),
2719
2846
  /**
2847
+ * Derived attributes persisted onto every matching LLM span before
2848
+ * `deriveFromTracing`, default outputs, trace display, and call metrics read
2849
+ * the trace. Keys are dot-paths under `span.attributes`; return `undefined`
2850
+ * to skip writing the attribute for one span.
2851
+ */
2852
+ derivedAttributes: z.record(z.string().min(1), callDerivedAttributeSchema).optional(),
2853
+ /**
2720
2854
  * Model/provider pricing registry used to calculate LLM-call costs from
2721
2855
  * token counts. Built-in LLM cost fields are only derived from this registry.
2722
2856
  */
@@ -2745,6 +2879,13 @@ const apiCallsConfigSchema = z.object({
2745
2879
  durationMs: z.string().optional(),
2746
2880
  error: z.string().optional()
2747
2881
  }).optional(),
2882
+ /**
2883
+ * Derived attributes persisted onto every matching API span before trace
2884
+ * display and call metrics read the trace. Keys are dot-paths under
2885
+ * `span.attributes`; return `undefined` to skip writing the attribute for
2886
+ * one span.
2887
+ */
2888
+ derivedAttributes: z.record(z.string().min(1), callDerivedAttributeSchema).optional(),
2748
2889
  /** Custom user-defined metrics surfaced on each API call. */
2749
2890
  metrics: z.array(apiCallMetricSchema).optional()
2750
2891
  });
@@ -2776,6 +2917,7 @@ const DEFAULT_LLM_CALLS_CONFIG = {
2776
2917
  reasoning: "reasoning",
2777
2918
  toolCalls: "toolCalls"
2778
2919
  },
2920
+ derivedAttributes: [],
2779
2921
  metrics: [],
2780
2922
  pricing: []
2781
2923
  };
@@ -2799,8 +2941,35 @@ const DEFAULT_API_CALLS_CONFIG = {
2799
2941
  durationMs: "durationMs",
2800
2942
  error: "error"
2801
2943
  },
2944
+ derivedAttributes: [],
2802
2945
  metrics: []
2803
2946
  };
2947
+ function resolveDerivedAttributes(input) {
2948
+ return Object.entries(input ?? {}).map(([path, compute]) => ({
2949
+ path,
2950
+ compute
2951
+ }));
2952
+ }
2953
+ function resolveLlmCallMetric(metric) {
2954
+ return {
2955
+ label: metric.label,
2956
+ tooltip: metric.tooltip,
2957
+ path: metric.path,
2958
+ format: metric.format ?? "string",
2959
+ numberFormat: metric.numberFormat,
2960
+ placements: metric.placements ? [...metric.placements] : ["body"]
2961
+ };
2962
+ }
2963
+ function resolveApiCallMetric(metric) {
2964
+ return {
2965
+ label: metric.label,
2966
+ tooltip: metric.tooltip,
2967
+ path: metric.path,
2968
+ format: metric.format ?? "string",
2969
+ numberFormat: metric.numberFormat,
2970
+ placements: metric.placements ? [...metric.placements] : ["body"]
2971
+ };
2972
+ }
2804
2973
  /**
2805
2974
  * Resolve the user-authored LLM-calls config to a fully-defaulted shape used
2806
2975
  * by the UI to derive the LLM calls tab.
@@ -2820,14 +2989,8 @@ function resolveLlmCallsConfig(input) {
2820
2989
  ...DEFAULT_LLM_CALLS_CONFIG.attributes,
2821
2990
  ...input?.attributes
2822
2991
  },
2823
- metrics: (input?.metrics ?? []).map((m) => ({
2824
- label: m.label,
2825
- tooltip: m.tooltip,
2826
- path: m.path,
2827
- format: m.format ?? "string",
2828
- numberFormat: m.numberFormat,
2829
- placements: m.placements ? [...m.placements] : ["body"]
2830
- })),
2992
+ derivedAttributes: resolveDerivedAttributes(input?.derivedAttributes),
2993
+ metrics: (input?.metrics ?? []).map(resolveLlmCallMetric),
2831
2994
  pricing: (input?.pricing ?? []).map((p) => ({
2832
2995
  model: p.model,
2833
2996
  provider: p.provider,
@@ -2857,14 +3020,8 @@ function resolveApiCallsConfig(input) {
2857
3020
  ...DEFAULT_API_CALLS_CONFIG.attributes,
2858
3021
  ...input?.attributes
2859
3022
  },
2860
- metrics: (input?.metrics ?? []).map((m) => ({
2861
- label: m.label,
2862
- tooltip: m.tooltip,
2863
- path: m.path,
2864
- format: m.format ?? "string",
2865
- numberFormat: m.numberFormat,
2866
- placements: m.placements ? [...m.placements] : ["body"]
2867
- }))
3023
+ derivedAttributes: resolveDerivedAttributes(input?.derivedAttributes),
3024
+ metrics: (input?.metrics ?? []).map(resolveApiCallMetric)
2868
3025
  };
2869
3026
  }
2870
3027
  /** Zod schema for validating `agent-evals.config.ts` input. */
@@ -3084,7 +3241,7 @@ function getEvalTitle(evalLike) {
3084
3241
  }
3085
3242
  //#endregion
3086
3243
  //#region ../shared/src/utils/getNestedAttribute.ts
3087
- function isRecord$3(value) {
3244
+ function isRecord$4(value) {
3088
3245
  return typeof value === "object" && value !== null;
3089
3246
  }
3090
3247
  /**
@@ -3099,12 +3256,84 @@ function getNestedAttribute(value, path) {
3099
3256
  const parts = path.split(".");
3100
3257
  let current = value;
3101
3258
  for (const part of parts) {
3102
- if (!isRecord$3(current) || !(part in current)) return;
3259
+ if (!isRecord$4(current) || !(part in current)) return;
3103
3260
  current = current[part];
3104
3261
  }
3105
3262
  return current;
3106
3263
  }
3107
3264
  //#endregion
3265
+ //#region ../shared/src/utils/deriveCallAttributes.ts
3266
+ function isRecord$3(value) {
3267
+ return typeof value === "object" && value !== null;
3268
+ }
3269
+ function mergeNestedAttribute$1(value, path, attributeValue) {
3270
+ const root = value === void 0 ? {} : { ...value };
3271
+ const parts = path.split(".");
3272
+ let current = root;
3273
+ for (const [index, part] of parts.entries()) {
3274
+ if (index === parts.length - 1) {
3275
+ current[part] = attributeValue;
3276
+ continue;
3277
+ }
3278
+ const nextValue = current[part];
3279
+ const nextRecord = isRecord$3(nextValue) ? { ...nextValue } : {};
3280
+ current[part] = nextRecord;
3281
+ current = nextRecord;
3282
+ }
3283
+ return root;
3284
+ }
3285
+ function applyDerivedAttributesForKind(params) {
3286
+ let attributes = params.span.attributes;
3287
+ for (const derivedAttribute of params.derivedAttributes) {
3288
+ if (derivedAttribute.compute === void 0) continue;
3289
+ const span = {
3290
+ ...params.span,
3291
+ attributes
3292
+ };
3293
+ const value = (() => {
3294
+ try {
3295
+ return derivedAttribute.compute({
3296
+ attributes,
3297
+ span,
3298
+ get: (path) => getNestedAttribute(attributes, path)
3299
+ });
3300
+ } catch {
3301
+ return;
3302
+ }
3303
+ })();
3304
+ if (value === void 0) continue;
3305
+ attributes = mergeNestedAttribute$1(attributes, derivedAttribute.path, value);
3306
+ }
3307
+ if (attributes === params.span.attributes) return params.span;
3308
+ return {
3309
+ ...params.span,
3310
+ attributes
3311
+ };
3312
+ }
3313
+ /**
3314
+ * Persist configured derived attributes onto matching LLM/API spans.
3315
+ *
3316
+ * These derived attributes are applied before trace consumers run, so
3317
+ * `deriveFromTracing`, default usage extraction, trace display, and call
3318
+ * metrics can all read them by normal dot-path lookup.
3319
+ */
3320
+ function applyDerivedCallAttributes(params) {
3321
+ const llmKinds = new Set(params.llmCallsConfig.kinds);
3322
+ const apiKinds = new Set(params.apiCallsConfig.kinds);
3323
+ return params.spans.map((span) => {
3324
+ let nextSpan = span;
3325
+ if (llmKinds.has(span.kind)) nextSpan = applyDerivedAttributesForKind({
3326
+ span: nextSpan,
3327
+ derivedAttributes: params.llmCallsConfig.derivedAttributes
3328
+ });
3329
+ if (apiKinds.has(span.kind)) nextSpan = applyDerivedAttributesForKind({
3330
+ span: nextSpan,
3331
+ derivedAttributes: params.apiCallsConfig.derivedAttributes
3332
+ });
3333
+ return nextSpan;
3334
+ });
3335
+ }
3336
+ //#endregion
3108
3337
  //#region ../shared/src/utils/extractLlmCalls.ts
3109
3338
  function readNumber$2(attributes, path) {
3110
3339
  const raw = getNestedAttribute(attributes, path);
@@ -5235,9 +5464,16 @@ async function runCase(params) {
5235
5464
  mode: cacheMode,
5236
5465
  evalId,
5237
5466
  codeFingerprint
5238
- } : void 0
5467
+ } : void 0,
5468
+ startTime: evalDef.startTime,
5469
+ freezeTime: evalDef.freezeTime
5239
5470
  });
5240
- const traceTree = buildTraceTree(scope.spans, scope.checkpoints);
5471
+ const spansWithDerivedAttributes = applyDerivedCallAttributes({
5472
+ spans: scope.spans,
5473
+ llmCallsConfig,
5474
+ apiCallsConfig
5475
+ });
5476
+ const traceTree = buildTraceTree(spansWithDerivedAttributes, scope.checkpoints);
5241
5477
  const nonAssertError = executeError && !(executeError instanceof EvalAssertionError) ? executeError : null;
5242
5478
  if (executeError instanceof EvalAssertionError && scope.assertionFailures.length === 0) scope.assertionFailures.push(toAssertionFailure(executeError.message, executeError));
5243
5479
  if (!nonAssertError && evalDef.deriveFromTracing) {
@@ -5259,7 +5495,7 @@ async function runCase(params) {
5259
5495
  }
5260
5496
  if (!nonAssertError) addDefaultOutputs({
5261
5497
  outputs: scope.outputs,
5262
- spans: scope.spans,
5498
+ spans: spansWithDerivedAttributes,
5263
5499
  llmCallsConfig,
5264
5500
  apiCallsConfig,
5265
5501
  globalRemove: globalRemoveDefaultConfig,
@@ -5276,6 +5512,7 @@ async function runCase(params) {
5276
5512
  }
5277
5513
  const scoreResults = /* @__PURE__ */ new Map();
5278
5514
  const scoringTraces = {};
5515
+ const scoreStartTime = getEvalClockStateTimeMs(scope.evalClockState) ?? evalDef.startTime;
5279
5516
  if (!nonAssertError && scope.assertionFailures.length === 0 && evalDef.scores) for (const [key, def] of Object.entries(evalDef.scores)) {
5280
5517
  const { compute, passThreshold, label } = normalizeScoreDef(def);
5281
5518
  const scoreRun = await runInEvalScope(evalCase.id, async () => {
@@ -5295,7 +5532,9 @@ async function runCase(params) {
5295
5532
  mode: cacheMode,
5296
5533
  evalId: `${evalId}__score__${key}`,
5297
5534
  codeFingerprint
5298
- } : void 0
5535
+ } : void 0,
5536
+ startTime: scoreStartTime,
5537
+ freezeTime: evalDef.freezeTime
5299
5538
  });
5300
5539
  const { trace, traceDisplay } = resolveTracePresentation(scoreRun.scope.spans, globalTraceDisplay, evalDef.traceDisplay);
5301
5540
  scope.logs.push(...scoreRun.scope.logs.map((entry) => ({
@@ -5344,7 +5583,7 @@ async function runCase(params) {
5344
5583
  }
5345
5584
  }
5346
5585
  const status = nonAssertError ? "error" : passed ? "pass" : "fail";
5347
- const { trace: displayTrace, traceDisplay } = resolveTracePresentation(scope.spans, globalTraceDisplay, evalDef.traceDisplay);
5586
+ const { trace: displayTrace, traceDisplay } = resolveTracePresentation(spansWithDerivedAttributes, globalTraceDisplay, evalDef.traceDisplay);
5348
5587
  const columns = {};
5349
5588
  const columnOverrides = mergeDefaultColumns({
5350
5589
  columns: evalDef.columns,
@@ -5608,7 +5847,7 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
5608
5847
  await runInEvalRuntimeScope("cases", async () => {
5609
5848
  await entry.use(async (evalDef) => {
5610
5849
  const cases = filterEvalCases(resolveRunnableEvalCases({
5611
- cases: typeof evalDef.cases === "function" ? await evalDef.cases() : evalDef.cases ?? [],
5850
+ cases: await runWithEvalClock(evalDef.startTime, async () => typeof evalDef.cases === "function" ? await evalDef.cases() : evalDef.cases ?? [], { freezeTime: evalDef.freezeTime }),
5612
5851
  evalId: evalMeta.id
5613
5852
  }), request.target.evalIds, request.target.caseIds, evalMeta.id);
5614
5853
  runState.summary.totalCases += cases.length;
@@ -5811,4 +6050,4 @@ function toLastRunStatus(status) {
5811
6050
  return status === "pending" ? null : status;
5812
6051
  }
5813
6052
  //#endregion
5814
- export { removeDefaultConfigSchema as $, columnKindSchema as $t, extractApiCalls as A, setEvalOutput as An, cacheFileSchema as At, DEFAULT_API_CALLS_CONFIG as B, traceAttributeDisplayFormatSchema as Bt, validateCharts as C, incrementEvalOutput as Cn, evalChartTooltipExtraSchema as Ct, sseEnvelopeSchema as D, runInEvalRuntimeScope as Dn, cacheDebugKeyFileSchema as Dt, updateManualScoreRequestSchema as E, nextEvalId as En, cacheDebugKeyEntrySchema as Et, deriveScopedSummaryFromCases as F, getEvalRegistry as Fn, cacheRecordingSchema as Ft, apiCallMetricSchema as G, traceDisplayInputConfigSchema as Gt, agentEvalsConfigSchema as H, traceAttributeDisplayPlacementSchema as Ht, deriveStatusFromCaseRows as I, cacheStatusSchema as It, llmCallMetricFormatSchema as J, traceSpanSchema as Jt, apiCallsConfigSchema as K, traceSpanErrorSchema as Kt, deriveStatusFromChildStatuses as L, serializedCacheSpanSchema as Lt, getNestedAttribute as M, startEvalBackgroundJob as Mn, cacheModeSchema as Mt, getEvalTitle as N, repoFile as Nn, cacheOperationTypeSchema as Nt, extractCacheEntries as O, runInEvalScope as On, cacheEntrySchema as Ot, getEvalDisplayStatus as P, defineEval as Pn, cacheRecordingOpSchema as Pt, llmCallsConfigSchema as Q, columnFormatSchema as Qt, runManifestSchema as R, spanCacheOptionsSchema as Rt, normalizeScoreDef as S, getEvalCaseInput as Sn, evalChartMetricSchema as St, createRunRequestSchema as T, mergeEvalOutput as Tn, evalChartsConfigSchema as Tt, apiCallMetricFormatSchema as U, traceAttributeDisplaySchema as Ut, DEFAULT_LLM_CALLS_CONFIG as V, traceAttributeDisplayInputSchema as Vt, apiCallMetricPlacementSchema as W, traceDisplayConfigSchema as Wt, llmCallMetricSchema as X, cellValueSchema as Xt, llmCallMetricPlacementSchema as Y, traceSpanWarningSchema as Yt, llmCallPricingSchema as Z, columnDefSchema as Zt, loadEvalModule as _, appendToEvalOutput as _n, evalChartAggregateSchema as _t, loadPersistedRunSnapshot as a, z$1 as an, caseDetailSchema as at, loadConfig 
as b, evalLog as bn, evalChartColorSchema as bt, persistCaseDetail as c, evalSpan as cn, evalStatAggregateSchema as ct, recomputePersistedCaseStatus as d, hashCacheKeySync as dn, evalSummarySchema as dt, fileRefSchema as en, resolveApiCallsConfig as et, runTouchesEval as f, deserializeCacheRecording as fn, runLogEntrySchema as ft, setLatestRunInfoMap as g, EvalAssertionError as gn, scoreTraceSchema as gt, getTargetEvalIds as h, serializeCacheValue as hn, runLogPhaseSchema as ht, getLatestRunInfos as i, runArtifactRefSchema as in, assertionFailureSchema as it, extractLlmCalls as j, setScopeCacheContext as jn, cacheListItemSchema as jt, extractCacheHits as k, runInExistingEvalScope as kn, cacheEntryWithDebugKeySchema as kt, persistRunState as l, evalTracer as ln, evalStatItemSchema as lt, buildEvalSummary as m, serializeCacheRecording as mn, runLogLocationSchema as mt, generateRunId as n, numberDisplayOptionsSchema as nn, runLogsConfigSchema as nt, loadPersistedRunSnapshots as o, buildTraceTree as on, caseRowSchema as ot, resolveArtifactPath as p, deserializeCacheValue as pn, runLogLevelSchema as pt, defaultConfigKeySchema as q, traceSpanKindSchema as qt, getLastRunStatuses as r, repoFileRefSchema as rn, trialSelectionModeSchema as rt, nextShortIdFromSnapshots as s, captureEvalSpanError as sn, evalFreshnessStatusSchema as st, executeRun as t, jsonCellSchema as tn, resolveLlmCallsConfig as tt, recomputeEvalStatusesInRuns as u, hashCacheKey as un, evalStatsConfigSchema as ut, parseEvalMetas as v, configureEvalRunLogs as vn, evalChartAxisSchema as vt, createFsCacheStore as w, isInEvalScope as wn, evalChartTypeSchema as wt, buildDeclaredColumnDefs as x, getCurrentScope as xn, evalChartConfigSchema as xt, resolveEvalDefaultConfig as y, evalAssert as yn, evalChartBuiltinMetricSchema as yt, runSummarySchema as z, traceCacheRefSchema as zt };
6053
+ export { llmCallsConfigSchema as $, columnFormatSchema as $t, extractApiCalls as A, runInEvalRuntimeScope as An, cacheEntryWithDebugKeySchema as At, runSummarySchema as B, traceCacheRefSchema as Bt, validateCharts as C, getCurrentScope as Cn, evalChartMetricSchema as Ct, sseEnvelopeSchema as D, isInEvalScope as Dn, cacheDebugKeyEntrySchema as Dt, updateManualScoreRequestSchema as E, incrementEvalOutput as En, evalChartsConfigSchema as Et, getEvalDisplayStatus as F, startEvalBackgroundJob as Fn, cacheRecordingOpSchema as Ft, apiCallMetricPlacementSchema as G, traceDisplayConfigSchema as Gt, DEFAULT_LLM_CALLS_CONFIG as H, traceAttributeDisplayInputSchema as Ht, deriveScopedSummaryFromCases as I, repoFile as In, cacheRecordingSchema as It, defaultConfigKeySchema as J, traceSpanKindSchema as Jt, apiCallMetricSchema as K, traceDisplayInputConfigSchema as Kt, deriveStatusFromCaseRows as L, defineEval as Ln, cacheStatusSchema as Lt, applyDerivedCallAttributes as M, runInExistingEvalScope as Mn, cacheListItemSchema as Mt, getNestedAttribute as N, setEvalOutput as Nn, cacheModeSchema as Nt, extractCacheEntries as O, mergeEvalOutput as On, cacheDebugKeyFileSchema as Ot, getEvalTitle as P, setScopeCacheContext as Pn, cacheOperationTypeSchema as Pt, llmCallPricingSchema as Q, columnDefSchema as Qt, deriveStatusFromChildStatuses as R, getEvalRegistry as Rn, serializedCacheSpanSchema as Rt, normalizeScoreDef as S, evalLog as Sn, evalChartConfigSchema as St, createRunRequestSchema as T, getEvalStartTime as Tn, evalChartTypeSchema as Tt, agentEvalsConfigSchema as U, traceAttributeDisplayPlacementSchema as Ut, DEFAULT_API_CALLS_CONFIG as V, traceAttributeDisplayFormatSchema as Vt, apiCallMetricFormatSchema as W, traceAttributeDisplaySchema as Wt, llmCallMetricPlacementSchema as X, traceSpanWarningSchema as Xt, llmCallMetricFormatSchema as Y, traceSpanSchema as Yt, llmCallMetricSchema as Z, cellValueSchema as Zt, loadEvalModule as _, EvalAssertionError as _n, scoreTraceSchema as 
_t, loadPersistedRunSnapshot as a, runArtifactRefSchema as an, assertionFailureSchema as at, loadConfig as b, configureEvalRunLogs as bn, evalChartBuiltinMetricSchema as bt, persistCaseDetail as c, captureEvalSpanError as cn, evalFreshnessStatusSchema as ct, recomputePersistedCaseStatus as d, hashCacheKey as dn, evalStatsConfigSchema as dt, columnKindSchema as en, removeDefaultConfigSchema as et, runTouchesEval as f, hashCacheKeySync as fn, evalSummarySchema as ft, setLatestRunInfoMap as g, serializeCacheValue as gn, runLogPhaseSchema as gt, getTargetEvalIds as h, serializeCacheRecording as hn, runLogLocationSchema as ht, getLatestRunInfos as i, repoFileRefSchema as in, trialSelectionModeSchema as it, extractLlmCalls as j, runInEvalScope as jn, cacheFileSchema as jt, extractCacheHits as k, nextEvalId as kn, cacheEntrySchema as kt, persistRunState as l, evalSpan as ln, evalStatAggregateSchema as lt, buildEvalSummary as m, deserializeCacheValue as mn, runLogLevelSchema as mt, generateRunId as n, jsonCellSchema as nn, resolveLlmCallsConfig as nt, loadPersistedRunSnapshots as o, z$1 as on, caseDetailSchema as ot, resolveArtifactPath as p, deserializeCacheRecording as pn, runLogEntrySchema as pt, apiCallsConfigSchema as q, traceSpanErrorSchema as qt, getLastRunStatuses as r, numberDisplayOptionsSchema as rn, runLogsConfigSchema as rt, nextShortIdFromSnapshots as s, buildTraceTree as sn, caseRowSchema as st, executeRun as t, fileRefSchema as tn, resolveApiCallsConfig as tt, recomputeEvalStatusesInRuns as u, evalTracer as un, evalStatItemSchema as ut, parseEvalMetas as v, advanceEvalTime as vn, evalChartAggregateSchema as vt, createFsCacheStore as w, getEvalCaseInput as wn, evalChartTooltipExtraSchema as wt, buildDeclaredColumnDefs as x, evalAssert as xn, evalChartColorSchema as xt, resolveEvalDefaultConfig as y, appendToEvalOutput as yn, evalChartAxisSchema as yt, runManifestSchema as z, spanCacheOptionsSchema as zt };
@@ -1,5 +1,5 @@
1
- import { n as createRunner } from "./cli-C0EtHhEO.mjs";
2
- import "./src-D-HuV8I-.mjs";
1
+ import { n as createRunner } from "./cli-Be0x8CS3.mjs";
2
+ import "./src-D6cettg0.mjs";
3
3
  //#region ../../apps/server/src/runner.ts
4
4
  let runnerInstance = null;
5
5
  function getRunnerInstance() {
@@ -1,2 +1,2 @@
1
- import { n as initRunner, t as getRunnerInstance } from "./runner-C9nP2VKL.mjs";
1
+ import { n as initRunner, t as getRunnerInstance } from "./runner-B4SosWgD.mjs";
2
2
  export { getRunnerInstance, initRunner };
@@ -0,0 +1,3 @@
1
+ import "./runOrchestration-D697g6Qe.mjs";
2
+ import "./cli-Be0x8CS3.mjs";
3
+ export {};
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ls-stack/agent-eval",
3
- "version": "0.22.0",
3
+ "version": "0.24.0",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "agent-evals": "./dist/bin.mjs"
@@ -59,8 +59,8 @@
59
59
  "@types/node": "^24.7.2",
60
60
  "typescript": "^5.9.2",
61
61
  "@agent-evals/runner": "0.0.1",
62
- "@agent-evals/sdk": "0.0.1",
63
- "@agent-evals/shared": "0.0.1"
62
+ "@agent-evals/shared": "0.0.1",
63
+ "@agent-evals/sdk": "0.0.1"
64
64
  },
65
65
  "scripts": {
66
66
  "build": "pnpm --filter @agent-evals/web build && tsdown",
@@ -156,6 +156,19 @@ for settlement; promise and span errors keep their normal behavior. Use
156
156
  `waitForBackgroundJob: false` on a span, or `waitForBackgroundJobs: false` on an
157
157
  eval definition, when background work should not delay finalization.
158
158
 
159
+ Eval Date APIs use a shifted wall clock by default: `new Date()` and
160
+ `Date.now()` start at `2026-04-10T00:00:00.000Z` during case generation,
161
+ execution, tracing, derived outputs, and scorers, then continue advancing with
162
+ real elapsed time. Set `startTime` on a specific `defineEval(...)` to use
163
+ another initial clock value, or set `startTime: 'now'` for that eval to use the
164
+ real current clock. Timers are not faked, so async waits still run normally.
165
+ Set `freezeTime: true` to keep the eval clock frozen until it is moved manually.
166
+ Use `getEvalStartTime()` to read the captured wall-clock start as a `Date`.
167
+ Use `advanceEvalTime(unit, amount)` inside an eval to move the shifted clock
168
+ forward; supported units are `millisecond(s)`, `second(s)`, `minute(s)`,
169
+ `hour(s)`, and `day(s)`. `advanceEvalTime` throws for evals with `startTime: 'now'`, unless
170
+ `freezeTime: true` is also set.
171
+
159
172
  For libraries or observability exporters that already emit span lifecycle
160
173
  events, use `evalTracer.startSpan(...)`, `evalTracer.updateSpan(...)`,
161
174
  `evalTracer.endSpan(...)`, or `evalTracer.recordSpan(...)` to translate those
@@ -261,10 +274,12 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
261
274
  attribute paths. `latencyMs` is time to first token; duration, total tokens,
262
275
  tokens/sec, and USD costs are derived. Override `kinds` to broaden the filter,
263
276
  override `attributes.<field>` for non-default primitive span shapes, configure
264
- `pricing` to derive USD costs from token counts by model/provider, and add
265
- entries to `metrics` to surface arbitrary user metrics (`format: 'string' |
266
- 'number' | 'duration' | 'json' | 'boolean'`, `placements: ['header' |
267
- 'body']`).
277
+ `pricing` to derive USD costs from token counts by model/provider, add
278
+ `derivedAttributes` to persist computed values back onto matching LLM spans
279
+ before trace consumers run, and add entries to `metrics` to surface arbitrary
280
+ user metrics (`format: 'string' | 'number' | 'duration' | 'json' |
281
+ 'boolean'`, `placements: ['header' | 'body']`). `derivedAttributes` keys are
282
+ dot-paths under `span.attributes`; return `undefined` to skip one span.
268
283
  - Default usage config derives missing eval outputs from matching LLM/API spans
269
284
  before `outputsSchema` and scores run: `apiCalls`, `costUsd`, `llmTurns`,
270
285
  `inputTokens`, `outputTokens`, `totalTokens`, `cachedInputTokens`,
@@ -285,7 +300,8 @@ cacheCreationInputTokens` so cache details are not double-counted.
285
300
  and `'fetch'` spans with `method`, `url`, `statusCode`, `request`,
286
301
  `response`, `requestBody`, `responseBody`, `headers`, `durationMs`, and
287
302
  `error` read from conventional attribute paths. Override `kinds` or
288
- `attributes.<field>` for external tracers, and add `metrics` with the same
303
+ `attributes.<field>` for external tracers, add `derivedAttributes` to persist
304
+ computed attributes onto matching API spans, and add `metrics` with the same
289
305
  formats and placements as LLM-call metrics.
290
306
  - `runLogs` (in `agent-evals.config.ts`) controls case log capture. Use
291
307
  `runLogs: { captureConsole: false }` to keep console output in the terminal
@@ -1,3 +0,0 @@
1
- import "./runOrchestration-D1edUDhp.mjs";
2
- import "./cli-C0EtHhEO.mjs";
3
- export {};