@ls-stack/agent-eval 0.21.0 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -49,9 +49,46 @@ function repoFile(path, mimeType) {
49
49
  //#region ../sdk/src/runtime.ts
50
50
  const scopeStorage = new AsyncLocalStorage();
51
51
  const runtimeScopeStorage = new AsyncLocalStorage();
52
+ const evalClockStorage = new AsyncLocalStorage();
52
53
  let activeEvalScopeCount = 0;
53
54
  let activeEvalRuntimeScopeCount = 0;
54
55
  let consoleCaptureEnabled = true;
56
+ const defaultEvalStartTimeMs = Date.parse("2026-04-10T00:00:00.000Z");
57
+ const realDate = globalThis.__agentEvalsRealDate ?? Date;
58
+ globalThis.__agentEvalsRealDate = realDate;
59
/**
 * Normalize the single-argument form of the `Date` constructor.
 *
 * Strings, numbers, and instances of the captured real `Date` are handed back
 * untouched; every other value is coerced with `Number()`.
 */
function toDateConstructorArg(value) {
  const valueType = typeof value;
  const isPassThrough = valueType === "string" || valueType === "number" || value instanceof realDate;
  return isPassThrough ? value : Number(value);
}
63
/**
 * Coerce one component of the multi-argument `Date` constructor to a number.
 * Numbers are returned as-is; anything else goes through `Number()`.
 */
function toDateNumberArg(value) {
  if (typeof value === "number") return value;
  return Number(value);
}
66
/**
 * Build a real `Date` from the argument list passed to the shimmed constructor.
 *
 * - No arguments: the current real time.
 * - One argument: normalized with `toDateConstructorArg`.
 * - Two or more: each of the first seven components is coerced with
 *   `toDateNumberArg` and spread into the real constructor.
 *
 * Components are defaulted by the native constructor based on how many
 * arguments were supplied, not on whether a supplied value is `undefined`.
 * An explicit `undefined` component therefore yields an Invalid Date, exactly
 * as the unshimmed `Date` does, instead of being silently replaced.
 *
 * @param {unknown[]} args Arguments received by the `Date` shim.
 * @returns {Date} Instance of the captured real `Date`.
 */
function constructDateFromArgs(args) {
  if (args.length === 0) return new realDate();
  if (args.length === 1) return new realDate(toDateConstructorArg(args[0]));
  // Native Date ignores anything past the seventh component; do not coerce it.
  const components = args.slice(0, 7).map((component) => toDateNumberArg(component));
  return new realDate(...components);
}
71
/**
 * Proxy around the captured real `Date` that serves the eval clock.
 *
 * - Called as a function (`Date()`): returns the string form of the eval
 *   clock's current time when one is available, otherwise defers to the real
 *   `Date`.
 * - Constructed with no arguments: returns the eval clock's current time when
 *   one is available. All other constructions go through
 *   `constructDateFromArgs`.
 * - `Date.now` resolves to `getEvalDateNow`; every other static property is
 *   read from the real `Date`.
 */
const evalDate = new Proxy(realDate, {
  apply(target, thisArg) {
    const nowMs = getEvalClockNowMs();
    if (nowMs !== null) return new target(nowMs).toString();
    return target.call(thisArg);
  },
  construct(target, argArray, newTarget) {
    const nowMs = getEvalClockNowMs();
    const instance = argArray.length === 0 && nowMs !== null ? new target(nowMs) : constructDateFromArgs(Array.from(argArray));
    // Honor `class X extends Date`: when invoked through a subclass, the result
    // must carry the subclass prototype rather than the plain Date prototype.
    const requestedPrototype = newTarget.prototype;
    if (requestedPrototype !== target.prototype && requestedPrototype !== null && typeof requestedPrototype === "object") {
      Object.setPrototypeOf(instance, requestedPrototype);
    }
    return instance;
  },
  get(target, property) {
    if (property === "now") return getEvalDateNow;
    // Forward everything else (parse, UTC, prototype, name, length, inherited
    // function members, symbols) so the shim never yields `undefined` for a
    // property the real Date exposes.
    return Reflect.get(target, property);
  }
});
91
+ globalThis.Date = evalDate;
55
92
  const maxLogMessageLength = 2e4;
56
93
  const maxLogStringLength = 1e4;
57
94
  const maxLogArrayLength = 100;
@@ -79,6 +116,82 @@ var EvalAssertionError = class extends Error {
79
116
  this.name = "EvalAssertionError";
80
117
  }
81
118
  };
119
/**
 * Compute the current time, in epoch milliseconds, for an eval clock state.
 *
 * Starts from `state.startMs`, adds the real time elapsed since
 * `state.realStartMs` unless the clock is frozen, then adds `state.offsetMs`.
 */
function getEvalClockStateNowMs(state) {
  let shiftedNowMs = state.startMs;
  if (!state.frozen) shiftedNowMs += realDate.now() - state.realStartMs;
  return shiftedNowMs + state.offsetMs;
}
123
/**
 * Return the shifted eval time for the active async context, or `null` when
 * there is no stored clock state or its `shifted` flag is not exactly `true`.
 */
function getEvalClockNowMs() {
  const clockState = evalClockStorage.getStore();
  const isShifted = clockState?.shifted === true;
  return isShifted ? getEvalClockStateNowMs(clockState) : null;
}
128
/**
 * `Date.now()` replacement: the shifted eval time when one is available,
 * otherwise the real clock.
 */
function getEvalDateNow() {
  const shiftedNowMs = getEvalClockNowMs();
  if (shiftedNowMs === null || shiftedNowMs === void 0) return realDate.now();
  return shiftedNowMs;
}
131
/**
 * Read the host process clock directly from the captured real `Date`,
 * skipping the eval Date shim.
 */
function getRealDateNowMs() {
  const hostNowMs = realDate.now();
  return hostNowMs;
}
135
/**
 * Shifted wall-clock time for a stored eval clock state, or `null` when the
 * state is not shifted.
 */
function getEvalClockStateTimeMs(state) {
  return state.shifted ? getEvalClockStateNowMs(state) : null;
}
140
/**
 * Wall-clock start time recorded for the active eval, as a real `Date`.
 *
 * With `startTime: 'now'` the recorded value is the real time captured when
 * the eval clock state was created.
 *
 * Throws when no eval clock state is stored for the current async context.
 */
function getEvalStartTime() {
  const clockState = evalClockStorage.getStore();
  if (clockState !== void 0) return new realDate(clockState.startMs);
  throw new Error("getEvalStartTime() must be called inside an active eval");
}
151
/**
 * Resolve an eval `startTime` option to epoch milliseconds.
 *
 * `undefined` maps to the default eval start time and `"now"` to the real
 * current time. Real `Date` instances use `getTime()`, numbers are taken
 * as-is, and anything else is parsed with the real `Date.parse`. A non-finite
 * result throws.
 */
function resolveEvalStartTimeMs(startTime) {
  if (startTime === void 0) return defaultEvalStartTimeMs;
  if (startTime === "now") return realDate.now();
  let resolvedMs;
  if (startTime instanceof realDate) resolvedMs = startTime.getTime();
  else if (typeof startTime === "number") resolvedMs = startTime;
  else resolvedMs = realDate.parse(startTime);
  if (!Number.isFinite(resolvedMs)) {
    throw new Error(`Invalid eval startTime "${String(startTime)}". Use a Date, timestamp, ISO date string, or "now".`);
  }
  return resolvedMs;
}
158
/**
 * Create the clock state stored for one eval.
 *
 * The real time is sampled once and used for `realStartMs`, and also for
 * `startMs` when `startTime` is `"now"`. The state counts as shifted whenever
 * `startTime` is not `"now"` or the clock is frozen.
 */
function createEvalClockState(startTime, freezeTime) {
  const realNowMs = realDate.now();
  const usesRealClock = startTime === "now";
  const startMs = usesRealClock ? realNowMs : resolveEvalStartTimeMs(startTime);
  return {
    startMs,
    realStartMs: realNowMs,
    offsetMs: 0,
    frozen: freezeTime,
    shifted: !usesRealClock || freezeTime
  };
}
168
/**
 * Run `fn` with a fresh eval clock state derived from `startTime` stored for
 * the duration of the call. The clock is frozen only when
 * `options.freezeTime` is exactly `true`.
 */
async function runWithEvalClock(startTime, fn, options = {}) {
  const shouldFreeze = options.freezeTime === true;
  const clockState = createEvalClockState(startTime, shouldFreeze);
  return await evalClockStorage.run(clockState, fn);
}
172
/**
 * Number of milliseconds in one eval time unit. Singular and plural spellings
 * of millisecond, second, minute, hour, and day are accepted; any other unit
 * throws.
 */
function getEvalTimeUnitMs(unit) {
  switch (unit) {
    case "millisecond":
    case "milliseconds":
      return 1;
    case "second":
    case "seconds":
      return 1e3;
    case "minute":
    case "minutes":
      return 6e4;
    case "hour":
    case "hours":
      return 36e5;
    case "day":
    case "days":
      return 864e5;
    default:
      throw new Error(`Unsupported eval time unit "${unit}"`);
  }
}
180
/**
 * Move the active eval's shifted Date clock by `amount` of `unit` and return
 * the resulting time as a real `Date`.
 *
 * Throws when there is no active eval clock state, when that state is not
 * shifted (an eval using `startTime: 'now'` without `freezeTime: true`), or
 * when `amount` is not a finite number.
 */
function advanceEvalTime(unit, amount) {
  const clockState = evalClockStorage.getStore();
  if (clockState === void 0) throw new Error("advanceEvalTime() must be called inside an active eval");
  if (!clockState.shifted) throw new Error("advanceEvalTime() requires a shifted eval clock. Remove startTime: \"now\" or set freezeTime: true to use it.");
  if (!Number.isFinite(amount)) throw new Error("advanceEvalTime() amount must be a finite number");
  const deltaMs = getEvalTimeUnitMs(unit) * amount;
  clockState.offsetMs += deltaMs;
  return new realDate(getEvalClockStateNowMs(clockState));
}
82
195
  /** Return the current eval scope for the active async context, if any. */
83
196
  function getCurrentScope() {
84
197
  if (activeEvalScopeCount === 0) return void 0;
@@ -349,7 +462,9 @@ async function runInExistingEvalScope(scope, runtimeScope, fn) {
349
462
  activeEvalScopeCount++;
350
463
  try {
351
464
  return await scopeStorage.run(scope, async () => {
352
- return await runInEvalRuntimeScope(runtimeScope, fn);
465
+ return await evalClockStorage.run(scope.evalClockState, async () => {
466
+ return await runInEvalRuntimeScope(runtimeScope, fn);
467
+ });
353
468
  });
354
469
  } finally {
355
470
  activeEvalScopeCount--;
@@ -362,6 +477,8 @@ async function runInExistingEvalScope(scope, runtimeScope, fn) {
362
477
  async function runInEvalScope(caseId, fn, options = {}) {
363
478
  const scope = {
364
479
  caseId,
480
+ startTime: options.startTime,
481
+ evalClockState: createEvalClockState(options.startTime, options.freezeTime === true),
365
482
  idPrefix: options.idPrefix,
366
483
  nextEvalIdCounter: 0,
367
484
  input: options.input,
@@ -1493,9 +1610,12 @@ function mergeSpanAttribute(span, key, patch) {
1493
1610
  ...patch
1494
1611
  } });
1495
1612
  }
1496
- function finishSpanWithoutThrownError(span) {
1613
/**
 * Add `elapsedMs` milliseconds to an ISO timestamp string and return the
 * result as an ISO string.
 */
function addElapsedMsToTimestamp(isoTimestamp, elapsedMs) {
  const baseMs = new Date(isoTimestamp).getTime();
  const shiftedDate = new Date(baseMs + elapsedMs);
  return shiftedDate.toISOString();
}
1616
/**
 * Close a span whose body returned normally.
 *
 * The status becomes "error" when `hasSpanError` reports an error on the span
 * and "ok" otherwise. `endedAt` is the span's own `startedAt` plus the real
 * time elapsed since `realStartedAt`.
 */
function finishSpanWithoutThrownError(span, realStartedAt) {
  if (hasSpanError(span)) span.status = "error";
  else span.status = "ok";
  const realElapsedMs = getRealDateNowMs() - realStartedAt;
  span.endedAt = addElapsedMsToTimestamp(span.startedAt, realElapsedMs);
}
1500
1620
  function createSpanHandle(span) {
1501
1621
  return {
@@ -1737,9 +1857,11 @@ async function traceSpanInternal(info, fn) {
1737
1857
  const scope = getCurrentScope();
1738
1858
  if (!scope) return await fn(noopActiveSpan());
1739
1859
  const id = generateSpanId();
1860
+ const parentId = scope.activeSpanStack.at(-1)?.id ?? null;
1861
+ const realStartedAt = getRealDateNowMs();
1740
1862
  const spanRecord = {
1741
1863
  id,
1742
- parentId: scope.activeSpanStack.at(-1)?.id ?? null,
1864
+ parentId,
1743
1865
  caseId: scope.caseId,
1744
1866
  kind: info.kind,
1745
1867
  name: info.name,
@@ -1779,7 +1901,7 @@ async function traceSpanInternal(info, fn) {
1779
1901
  const recording = deserializeCacheRecording(hit.recording);
1780
1902
  replayRecording(scope, spanRecord, recording, { generateSpanId });
1781
1903
  spanRecord.status = recording.finalStatus ?? (hasSpanError(spanRecord) ? "error" : "ok");
1782
- spanRecord.endedAt = (/* @__PURE__ */ new Date()).toISOString();
1904
+ spanRecord.endedAt = addElapsedMsToTimestamp(spanRecord.startedAt, getRealDateNowMs() - realStartedAt);
1783
1905
  return recording.returnValue;
1784
1906
  }
1785
1907
  mergeSpanAttributes(spanRecord, { "cache.status": "miss" });
@@ -1798,7 +1920,7 @@ async function traceSpanInternal(info, fn) {
1798
1920
  scope.recordingStack.pop();
1799
1921
  }
1800
1922
  appendSubSpanOps(scope, frame);
1801
- finishSpanWithoutThrownError(spanRecord);
1923
+ finishSpanWithoutThrownError(spanRecord, realStartedAt);
1802
1924
  if (ctx.mode !== "bypass") {
1803
1925
  const recording = {
1804
1926
  returnValue: bodyResult,
@@ -1832,11 +1954,11 @@ async function traceSpanInternal(info, fn) {
1832
1954
  return bodyResult;
1833
1955
  }
1834
1956
  const result = await fn(activeSpan);
1835
- finishSpanWithoutThrownError(spanRecord);
1957
+ finishSpanWithoutThrownError(spanRecord, realStartedAt);
1836
1958
  return result;
1837
1959
  } catch (error) {
1838
1960
  spanRecord.status = "error";
1839
- spanRecord.endedAt = (/* @__PURE__ */ new Date()).toISOString();
1961
+ spanRecord.endedAt = addElapsedMsToTimestamp(spanRecord.startedAt, getRealDateNowMs() - realStartedAt);
1840
1962
  spanRecord.error = normalizeTraceError(error);
1841
1963
  throw error;
1842
1964
  } finally {
@@ -2468,7 +2590,8 @@ const caseRowSchema = z.object({
2468
2590
  "error",
2469
2591
  "cancelled"
2470
2592
  ]),
2471
- latencyMs: z.number().nullable(),
2593
+ /** Elapsed case execution duration in milliseconds, or null before completion. */
2594
+ durationMs: z.number().nullable(),
2472
2595
  costUsd: z.number().nullable().optional(),
2473
2596
  columns: z.record(z.string(), cellValueSchema),
2474
2597
  /** Winning trial index for the persisted case result. */
@@ -2586,7 +2709,7 @@ const defaultConfigKeySchema = z.enum([
2586
2709
  "cachedInputTokens",
2587
2710
  "cacheCreationInputTokens",
2588
2711
  "reasoningTokens",
2589
- "llmLatencyMs"
2712
+ "llmDurationMs"
2590
2713
  ]);
2591
2714
  /** Removal config for built-in eval-level outputs and UI metadata. */
2592
2715
  const removeDefaultConfigSchema = z.union([z.literal(true), z.array(defaultConfigKeySchema)]);
@@ -2663,7 +2786,7 @@ const apiCallMetricSchema = z.object({
2663
2786
  });
2664
2787
  /**
2665
2788
  * Schema for one model/provider pricing entry used to derive LLM-call costs
2666
- * from token counts when a span does not already record explicit USD costs.
2789
+ * from token counts.
2667
2790
  */
2668
2791
  const llmCallPricingSchema = z.object({
2669
2792
  /** Exact model name read from the configured `attributes.model` path. */
@@ -2682,6 +2805,8 @@ const llmCallPricingSchema = z.object({
2682
2805
  cachedInputUsdPerMillion: z.number().nonnegative().optional(),
2683
2806
  /** USD per one million prompt-cache write tokens. */
2684
2807
  cacheCreationInputUsdPerMillion: z.number().nonnegative().optional(),
2808
+ /** USD per one million one-hour prompt-cache write tokens. */
2809
+ cacheCreationInput1hUsdPerMillion: z.number().nonnegative().optional(),
2685
2810
  /** USD per one million reasoning tokens when reported separately. */
2686
2811
  reasoningUsdPerMillion: z.number().nonnegative().optional()
2687
2812
  });
@@ -2692,12 +2817,9 @@ const llmCallsConfigSchema = z.object({
2692
2817
  /**
2693
2818
  * Attribute paths used to extract structured per-call fields. Each entry is
2694
2819
  * a dot-path inside `span.attributes`. Missing paths fall back to the
2695
- * built-in defaults (e.g. `usage.inputTokens`, `costUsd`).
2696
- *
2697
- * Per-token-type cost paths (`inputCost`, `outputCost`, `cachedInputCost`,
2698
- * `reasoningCost`) feed the cost breakdown table in the expanded row when
2699
- * spans provide explicit USD cost overrides. Prefer `pricing` for deriving
2700
- * costs from token counts globally.
2820
+ * built-in defaults (e.g. `usage.inputTokens`). Derived fields such as
2821
+ * total tokens, tokens/sec, duration, and USD costs are intentionally not
2822
+ * configurable as attribute paths.
2701
2823
  */
2702
2824
  attributes: z.object({
2703
2825
  model: z.string().optional(),
@@ -2706,15 +2828,9 @@ const llmCallsConfigSchema = z.object({
2706
2828
  outputTokens: z.string().optional(),
2707
2829
  cachedInputTokens: z.string().optional(),
2708
2830
  cacheCreationInputTokens: z.string().optional(),
2831
+ cacheCreationInput1hTokens: z.string().optional(),
2709
2832
  reasoningTokens: z.string().optional(),
2710
- totalTokens: z.string().optional(),
2711
- tokensPerSecond: z.string().optional(),
2712
- cost: z.string().optional(),
2713
- inputCost: z.string().optional(),
2714
- outputCost: z.string().optional(),
2715
- cachedInputCost: z.string().optional(),
2716
- cacheCreationInputCost: z.string().optional(),
2717
- reasoningCost: z.string().optional(),
2833
+ latencyMs: z.string().optional(),
2718
2834
  steps: z.string().optional(),
2719
2835
  finishReason: z.string().optional(),
2720
2836
  input: z.string().optional(),
@@ -2723,9 +2839,8 @@ const llmCallsConfigSchema = z.object({
2723
2839
  toolCalls: z.string().optional()
2724
2840
  }).optional(),
2725
2841
  /**
2726
- * Model/provider pricing registry used to calculate missing LLM-call costs
2727
- * from token counts. Explicit span attributes (`costUsd`, `cost.inputUsd`,
2728
- * etc.) take precedence over derived prices.
2842
+ * Model/provider pricing registry used to calculate LLM-call costs from
2843
+ * token counts. Built-in LLM cost fields are only derived from this registry.
2729
2844
  */
2730
2845
  pricing: z.array(llmCallPricingSchema).optional(),
2731
2846
  /** Custom user-defined metrics surfaced on each LLM call. */
@@ -2773,15 +2888,9 @@ const DEFAULT_LLM_CALLS_CONFIG = {
2773
2888
  outputTokens: "usage.outputTokens",
2774
2889
  cachedInputTokens: "usage.cachedInputTokens",
2775
2890
  cacheCreationInputTokens: "usage.cacheCreationInputTokens",
2891
+ cacheCreationInput1hTokens: "usage.cacheCreationInput1hTokens",
2776
2892
  reasoningTokens: "usage.reasoningTokens",
2777
- totalTokens: "usage.totalTokens",
2778
- tokensPerSecond: "tokensPerSecond",
2779
- cost: "costUsd",
2780
- inputCost: "cost.inputUsd",
2781
- outputCost: "cost.outputUsd",
2782
- cachedInputCost: "cost.cachedInputUsd",
2783
- cacheCreationInputCost: "cost.cacheCreationInputUsd",
2784
- reasoningCost: "cost.reasoningUsd",
2893
+ latencyMs: "latencyMs",
2785
2894
  steps: "steps",
2786
2895
  finishReason: "finishReason",
2787
2896
  input: "input",
@@ -2823,8 +2932,8 @@ const DEFAULT_API_CALLS_CONFIG = {
2823
2932
  * attribute path.
2824
2933
  * - Missing `metrics[].format` defaults to `'string'`.
2825
2934
  * - Missing `metrics[].placements` defaults to `['body']`.
2826
- * - Missing `pricing` defaults to an empty registry; explicit span costs still
2827
- * take precedence over derived costs.
2935
+ * - Missing `pricing` defaults to an empty registry; built-in costs are only
2936
+ * derived from configured pricing and token counts.
2828
2937
  */
2829
2938
  function resolveLlmCallsConfig(input) {
2830
2939
  return {
@@ -2848,6 +2957,7 @@ function resolveLlmCallsConfig(input) {
2848
2957
  outputUsdPerMillion: p.outputUsdPerMillion,
2849
2958
  cachedInputUsdPerMillion: p.cachedInputUsdPerMillion,
2850
2959
  cacheCreationInputUsdPerMillion: p.cacheCreationInputUsdPerMillion,
2960
+ cacheCreationInput1hUsdPerMillion: p.cacheCreationInput1hUsdPerMillion,
2851
2961
  reasoningUsdPerMillion: p.reasoningUsdPerMillion
2852
2962
  }))
2853
2963
  };
@@ -3037,8 +3147,8 @@ function deriveScopedSummaryFromCases(params) {
3037
3147
  else if (caseRow.status === "cancelled") cancelledCases += 1;
3038
3148
  else if (caseRow.status === "running") runningCases += 1;
3039
3149
  else pendingCases += 1;
3040
- if (caseRow.latencyMs !== null) {
3041
- totalDurationMs += caseRow.latencyMs;
3150
+ if (caseRow.durationMs !== null) {
3151
+ totalDurationMs += caseRow.durationMs;
3042
3152
  hasDuration = true;
3043
3153
  }
3044
3154
  }
@@ -3132,6 +3242,21 @@ function computeTokenCost(tokens, usdPerMillion) {
3132
3242
  if (usdPerMillion === void 0) return null;
3133
3243
  return tokens / 1e6 * usdPerMillion;
3134
3244
  }
3245
/**
 * Cost in USD of prompt-cache write tokens.
 *
 * `cacheCreationInputTokens` is the total write count. When a one-hour count
 * is present, it is capped at that total and priced at `oneHourUsdPerMillion`;
 * the remainder is priced at `usdPerMillion`.
 *
 * A portion with zero tokens contributes zero cost without consulting its
 * rate, so a missing rate for an empty portion (for example a reported
 * one-hour count of `0` with no one-hour price configured) no longer turns a
 * known cost into `null`.
 *
 * @returns {number | null} `null` when the total count is `null` or when a
 *   non-empty portion cannot be priced by `computeTokenCost`.
 */
function computeCacheCreationInputCost({ cacheCreationInputTokens, cacheCreationInput1hTokens, usdPerMillion, oneHourUsdPerMillion }) {
  if (cacheCreationInputTokens === null) return null;
  if (cacheCreationInputTokens === 0) return 0;
  if (cacheCreationInput1hTokens === null) return computeTokenCost(cacheCreationInputTokens, usdPerMillion);
  const oneHourTokens = Math.min(cacheCreationInput1hTokens, cacheCreationInputTokens);
  const shortLivedTokens = cacheCreationInputTokens - oneHourTokens;
  const shortLivedCost = shortLivedTokens === 0 ? 0 : computeTokenCost(shortLivedTokens, usdPerMillion);
  const oneHourCost = oneHourTokens === 0 ? 0 : computeTokenCost(oneHourTokens, oneHourUsdPerMillion);
  if (shortLivedCost === null || oneHourCost === null) return null;
  return shortLivedCost + oneHourCost;
}
3255
/**
 * Input tokens left after subtracting cache-read and cache-write tokens,
 * floored at zero. Missing (`null`/`undefined`) cache counts are treated as
 * zero. Returns `null` when the input count itself is `null`.
 */
function computeBaseInputTokens({ inputTokens, cachedInputTokens, cacheCreationInputTokens }) {
  if (inputTokens === null) return null;
  const cacheReadTokens = cachedInputTokens ?? 0;
  const cacheWriteTokens = cacheCreationInputTokens ?? 0;
  const remainingTokens = inputTokens - (cacheReadTokens + cacheWriteTokens);
  return Math.max(remainingTokens, 0);
}
3135
3260
  function pickPricingEntry({ pricing, model, provider }) {
3136
3261
  if (model === null) return null;
3137
3262
  let fallback = null;
@@ -3145,7 +3270,7 @@ function pickPricingEntry({ pricing, model, provider }) {
3145
3270
  }
3146
3271
  return fallback;
3147
3272
  }
3148
- function computeFallbackTotalCost({ inputTokens, inputCostUsd, outputTokens, outputCostUsd, cachedInputTokens, cachedInputCostUsd, cacheCreationInputTokens, cacheCreationInputCostUsd, reasoningTokens, reasoningCostUsd }) {
3273
+ function computeTotalCost({ inputTokens, inputCostUsd, outputTokens, outputCostUsd, cachedInputTokens, cachedInputCostUsd, cacheCreationInputTokens, cacheCreationInputCostUsd, reasoningTokens, reasoningCostUsd }) {
3149
3274
  const parts = [
3150
3275
  {
3151
3276
  tokens: inputTokens,
@@ -3182,7 +3307,7 @@ function computeFallbackTotalCost({ inputTokens, inputCostUsd, outputTokens, out
3182
3307
  if (hasCost) return total;
3183
3308
  return hasReportedTokens ? 0 : null;
3184
3309
  }
3185
- function computeLatencyMs$1(span) {
3310
+ function computeDurationMs$1(span) {
3186
3311
  if (span.endedAt === null) return null;
3187
3312
  const started = Date.parse(span.startedAt);
3188
3313
  const ended = Date.parse(span.endedAt);
@@ -3190,10 +3315,16 @@ function computeLatencyMs$1(span) {
3190
3315
  const delta = ended - started;
3191
3316
  return delta >= 0 ? delta : null;
3192
3317
  }
3193
- function computeTotalTokens({ declared, input, output, cached, cacheCreation }) {
3194
- if (declared !== null) return declared;
3195
- if (input === null && output === null && cached === null && cacheCreation === null) return null;
3196
- return (input ?? 0) + (output ?? 0) + (cached ?? 0) + (cacheCreation ?? 0);
3318
/**
 * Sum of input and output tokens. A single missing side counts as zero; the
 * result is `null` only when both sides are `null`.
 */
function computeTotalTokens({ input, output }) {
  const hasAnyCount = input !== null || output !== null;
  if (!hasAnyCount) return null;
  return (input ?? 0) + (output ?? 0);
}
3322
/**
 * Output tokens per second of generation time.
 *
 * Generation time is `durationMs` minus `latencyMs` when a latency is given,
 * otherwise the whole `durationMs`. Returns `null` when the output count or
 * duration is `null`, or when the generation time is not positive; returns 0
 * when no output tokens were produced.
 */
function computeTokensPerSecond({ outputTokens, durationMs, latencyMs }) {
  if (outputTokens === null || durationMs === null) return null;
  if (outputTokens === 0) return 0;
  let generationMs = durationMs;
  if (latencyMs !== null) generationMs = durationMs - latencyMs;
  if (generationMs <= 0) return null;
  const generationSeconds = generationMs / 1e3;
  return outputTokens / generationSeconds;
}
3198
3329
  function readSteps(attributes, path) {
3199
3330
  const raw = getNestedAttribute(attributes, path);
@@ -3201,10 +3332,6 @@ function readSteps(attributes, path) {
3201
3332
  stepCount: raw.length,
3202
3333
  stepDetails: raw
3203
3334
  };
3204
- if (typeof raw === "number" && Number.isFinite(raw)) return {
3205
- stepCount: raw,
3206
- stepDetails: null
3207
- };
3208
3335
  return {
3209
3336
  stepCount: null,
3210
3337
  stepDetails: null
@@ -3226,16 +3353,22 @@ function pickError$1(span) {
3226
3353
  * shape consumed by the LLM calls tab.
3227
3354
  *
3228
3355
  * Spans whose `kind` is not in `config.kinds` are dropped. Structured fields
3229
- * (`model`, token counts, explicit cost, etc.) are read via
3356
+ * (`model`, token counts, latency, etc.) are read via
3230
3357
  * `getNestedAttribute` from the configured paths, with safe coercion to
3231
- * `string | null` / `number | null`. When explicit USD costs are absent,
3232
- * configured model pricing derives per-token-type costs from token counts.
3233
- * `totalTokens` falls back to a sum of input + output + cached when no
3234
- * explicit total attribute is present. The `steps` attribute path may resolve
3235
- * to either a number (rendered as the inference-round count) or an array of
3236
- * per-step detail objects (rendered as a Steps section in the body, with
3237
- * `stepCount` derived from the array length). `latencyMs` is `null` while the
3238
- * span is still running. User-defined `metrics` whose path resolves to
3358
+ * `string | null` / `number | null`. `latencyMs` is an explicit
3359
+ * time-to-first-token attribute; full span elapsed time is reported separately
3360
+ * as `durationMs`. Built-in USD costs are derived only from configured model
3361
+ * pricing and token counts. `totalTokens` is always derived from input +
3362
+ * output tokens. Cached input and cache creation tokens are reported
3363
+ * separately because they are subsets of input/output usage. The main cache
3364
+ * creation token field is treated as the total write count; optional one-hour
3365
+ * cache creation tokens only split that total for cost calculation. Base input
3366
+ * cost uses input minus cache read/write tokens so cached tokens are not
3367
+ * charged twice. Cache read/write costs still contribute to the total USD cost
3368
+ * at their configured rates. The `steps` attribute path may resolve to an array
3369
+ * of per-step detail objects, with `stepCount` derived from the array length.
3370
+ * `durationMs` and `tokensPerSecond` are `null` while the span is still
3371
+ * running. User-defined `metrics` whose path resolves to
3239
3372
  * `undefined` are dropped, but `null`, `0`, and `false` are preserved as
3240
3373
  * legitimate values worth displaying. Original span order is preserved so the
3241
3374
  * LLM calls tab matches the ordering in the Trace tab.
@@ -3252,19 +3385,30 @@ function extractLlmCalls(spans, config) {
3252
3385
  const outputTokens = readNumber$2(attrs, config.attributes.outputTokens);
3253
3386
  const cachedInputTokens = readNumber$2(attrs, config.attributes.cachedInputTokens);
3254
3387
  const cacheCreationInputTokens = readNumber$2(attrs, config.attributes.cacheCreationInputTokens);
3388
+ const cacheCreationInput1hTokens = readNumber$2(attrs, config.attributes.cacheCreationInput1hTokens);
3255
3389
  const reasoningTokens = readNumber$2(attrs, config.attributes.reasoningTokens);
3256
- const declaredTotalTokens = readNumber$2(attrs, config.attributes.totalTokens);
3390
+ const latencyMs = readNumber$2(attrs, config.attributes.latencyMs);
3391
+ const durationMs = computeDurationMs$1(span);
3257
3392
  const pricing = pickPricingEntry({
3258
3393
  pricing: config.pricing,
3259
3394
  model,
3260
3395
  provider
3261
3396
  });
3262
- const inputCostUsd = readNumber$2(attrs, config.attributes.inputCost) ?? computeTokenCost(inputTokens, pricing?.inputUsdPerMillion);
3263
- const outputCostUsd = readNumber$2(attrs, config.attributes.outputCost) ?? computeTokenCost(outputTokens, pricing?.outputUsdPerMillion);
3264
- const cachedInputCostUsd = readNumber$2(attrs, config.attributes.cachedInputCost) ?? computeTokenCost(cachedInputTokens, pricing?.cachedInputUsdPerMillion);
3265
- const cacheCreationInputCostUsd = readNumber$2(attrs, config.attributes.cacheCreationInputCost) ?? computeTokenCost(cacheCreationInputTokens, pricing?.cacheCreationInputUsdPerMillion);
3266
- const reasoningCostUsd = readNumber$2(attrs, config.attributes.reasoningCost) ?? computeTokenCost(reasoningTokens, pricing?.reasoningUsdPerMillion);
3267
- const costUsd = readNumber$2(attrs, config.attributes.cost) ?? computeFallbackTotalCost({
3397
+ const inputCostUsd = computeTokenCost(computeBaseInputTokens({
3398
+ inputTokens,
3399
+ cachedInputTokens,
3400
+ cacheCreationInputTokens
3401
+ }), pricing?.inputUsdPerMillion);
3402
+ const outputCostUsd = computeTokenCost(outputTokens, pricing?.outputUsdPerMillion);
3403
+ const cachedInputCostUsd = computeTokenCost(cachedInputTokens, pricing?.cachedInputUsdPerMillion);
3404
+ const cacheCreationInputCostUsd = computeCacheCreationInputCost({
3405
+ cacheCreationInputTokens,
3406
+ cacheCreationInput1hTokens,
3407
+ usdPerMillion: pricing?.cacheCreationInputUsdPerMillion,
3408
+ oneHourUsdPerMillion: pricing?.cacheCreationInput1hUsdPerMillion
3409
+ });
3410
+ const reasoningCostUsd = computeTokenCost(reasoningTokens, pricing?.reasoningUsdPerMillion);
3411
+ const costUsd = computeTotalCost({
3268
3412
  inputTokens,
3269
3413
  inputCostUsd,
3270
3414
  outputTokens,
@@ -3302,13 +3446,15 @@ function extractLlmCalls(spans, config) {
3302
3446
  cacheCreationInputTokens,
3303
3447
  reasoningTokens,
3304
3448
  totalTokens: computeTotalTokens({
3305
- declared: declaredTotalTokens,
3306
3449
  input: inputTokens,
3307
- output: outputTokens,
3308
- cached: cachedInputTokens,
3309
- cacheCreation: cacheCreationInputTokens
3450
+ output: outputTokens
3451
+ }),
3452
+ latencyMs,
3453
+ tokensPerSecond: computeTokensPerSecond({
3454
+ outputTokens,
3455
+ durationMs,
3456
+ latencyMs
3310
3457
  }),
3311
- tokensPerSecond: readNumber$2(attrs, config.attributes.tokensPerSecond),
3312
3458
  costUsd,
3313
3459
  inputCostUsd,
3314
3460
  outputCostUsd,
@@ -3317,7 +3463,7 @@ function extractLlmCalls(spans, config) {
3317
3463
  reasoningCostUsd,
3318
3464
  ...readSteps(attrs, config.attributes.steps),
3319
3465
  finishReason: readString$2(attrs, config.attributes.finishReason),
3320
- latencyMs: computeLatencyMs$1(span),
3466
+ durationMs,
3321
3467
  input: getNestedAttribute(attrs, config.attributes.input),
3322
3468
  output: getNestedAttribute(attrs, config.attributes.output),
3323
3469
  reasoning: getNestedAttribute(attrs, config.attributes.reasoning),
@@ -3342,7 +3488,7 @@ function readString$1(attributes, path) {
3342
3488
  const raw = getNestedAttribute(attributes, path);
3343
3489
  return typeof raw === "string" && raw.length > 0 ? raw : null;
3344
3490
  }
3345
- function computeLatencyMs(span) {
3491
+ function computeDurationMs(span) {
3346
3492
  if (span.endedAt === null) return null;
3347
3493
  const started = Date.parse(span.startedAt);
3348
3494
  const ended = Date.parse(span.endedAt);
@@ -3367,10 +3513,10 @@ function pickError(span) {
3367
3513
  *
3368
3514
  * Spans whose `kind` is not in `config.kinds` are dropped. Structured fields
3369
3515
  * (`method`, `url`, `statusCode`, etc.) are read via `getNestedAttribute` from
3370
- * the configured paths. `durationMs` takes precedence for latency, with a
3371
- * fallback to the span start/end timestamps. User-defined `metrics` whose path
3372
- * resolves to `undefined` are dropped, but `null`, `0`, and `false` are
3373
- * preserved as legitimate values worth displaying. Original span order is
3516
+ * the configured paths. An explicit `durationMs` attribute takes precedence,
3517
+ * with a fallback to the span start/end timestamps. User-defined `metrics`
3518
+ * whose path resolves to `undefined` are dropped, but `null`, `0`, and `false`
3519
+ * are preserved as legitimate values worth displaying. Original span order is
3374
3520
  * preserved so the API calls tab matches the ordering in the Trace tab.
3375
3521
  */
3376
3522
  function extractApiCalls(spans, config) {
@@ -3400,7 +3546,7 @@ function extractApiCalls(spans, config) {
3400
3546
  method: readString$1(attrs, config.attributes.method),
3401
3547
  url: readString$1(attrs, config.attributes.url),
3402
3548
  statusCode: readNumber$1(attrs, config.attributes.statusCode),
3403
- latencyMs: readNumber$1(attrs, config.attributes.durationMs) ?? computeLatencyMs(span),
3549
+ durationMs: readNumber$1(attrs, config.attributes.durationMs) ?? computeDurationMs(span),
3404
3550
  request: getNestedAttribute(attrs, config.attributes.request),
3405
3551
  response: getNestedAttribute(attrs, config.attributes.response),
3406
3552
  requestBody: getNestedAttribute(attrs, config.attributes.requestBody),
@@ -3798,7 +3944,7 @@ async function writeCacheFile(cacheDir, cacheFile) {
3798
3944
  await mkdir(cacheDir, { recursive: true });
3799
3945
  const filePath = ownerPath(cacheDir, cacheFile.owner);
3800
3946
  const tmpPath = `${filePath}.${process.pid.toString()}.tmp`;
3801
- await writeFile(tmpPath, JSON.stringify(cacheFile));
3947
+ await writeFile(tmpPath, JSON.stringify(cacheFile, null, 2));
3802
3948
  await rename(tmpPath, filePath);
3803
3949
  }
3804
3950
  async function readDebugKeyFile(debugDir, owner) {
@@ -4237,7 +4383,8 @@ const DEFAULT_CONFIG_KEYS = [
4237
4383
  "totalTokens",
4238
4384
  "cachedInputTokens",
4239
4385
  "cacheCreationInputTokens",
4240
- "llmLatencyMs"
4386
+ "reasoningTokens",
4387
+ "llmDurationMs"
4241
4388
  ];
4242
4389
  const tokenNumberFormat = { notation: "compact" };
4243
4390
  const countNumberFormat = {
@@ -4303,8 +4450,8 @@ const DEFAULT_COLUMNS = {
4303
4450
  numberFormat: tokenNumberFormat,
4304
4451
  align: "right"
4305
4452
  },
4306
- llmLatencyMs: {
4307
- label: "LLM Latency",
4453
+ llmDurationMs: {
4454
+ label: "LLM Duration",
4308
4455
  format: "duration",
4309
4456
  align: "right"
4310
4457
  }
@@ -4509,8 +4656,14 @@ function addDefaultOutputs(params) {
4509
4656
  });
4510
4657
  assignIfMissing({
4511
4658
  outputs: params.outputs,
4512
- key: "llmLatencyMs",
4513
- value: sumNullable(calls.map((call) => call.latencyMs)),
4659
+ key: "reasoningTokens",
4660
+ value: sumNullable(calls.map((call) => call.reasoningTokens)),
4661
+ activeKeys
4662
+ });
4663
+ assignIfMissing({
4664
+ outputs: params.outputs,
4665
+ key: "llmDurationMs",
4666
+ value: sumNullable(calls.map((call) => call.durationMs)),
4514
4667
  activeKeys
4515
4668
  });
4516
4669
  }
@@ -5204,7 +5357,9 @@ async function runCase(params) {
5204
5357
  mode: cacheMode,
5205
5358
  evalId,
5206
5359
  codeFingerprint
5207
- } : void 0
5360
+ } : void 0,
5361
+ startTime: evalDef.startTime,
5362
+ freezeTime: evalDef.freezeTime
5208
5363
  });
5209
5364
  const traceTree = buildTraceTree(scope.spans, scope.checkpoints);
5210
5365
  const nonAssertError = executeError && !(executeError instanceof EvalAssertionError) ? executeError : null;
@@ -5245,6 +5400,7 @@ async function runCase(params) {
5245
5400
  }
5246
5401
  const scoreResults = /* @__PURE__ */ new Map();
5247
5402
  const scoringTraces = {};
5403
+ const scoreStartTime = getEvalClockStateTimeMs(scope.evalClockState) ?? evalDef.startTime;
5248
5404
  if (!nonAssertError && scope.assertionFailures.length === 0 && evalDef.scores) for (const [key, def] of Object.entries(evalDef.scores)) {
5249
5405
  const { compute, passThreshold, label } = normalizeScoreDef(def);
5250
5406
  const scoreRun = await runInEvalScope(evalCase.id, async () => {
@@ -5264,7 +5420,9 @@ async function runCase(params) {
5264
5420
  mode: cacheMode,
5265
5421
  evalId: `${evalId}__score__${key}`,
5266
5422
  codeFingerprint
5267
- } : void 0
5423
+ } : void 0,
5424
+ startTime: scoreStartTime,
5425
+ freezeTime: evalDef.freezeTime
5268
5426
  });
5269
5427
  const { trace, traceDisplay } = resolveTracePresentation(scoreRun.scope.spans, globalTraceDisplay, evalDef.traceDisplay);
5270
5428
  scope.logs.push(...scoreRun.scope.logs.map((entry) => ({
@@ -5356,7 +5514,7 @@ async function runCase(params) {
5356
5514
  caseDetail,
5357
5515
  caseRowUpdate: {
5358
5516
  status,
5359
- latencyMs: Date.now() - startTime,
5517
+ durationMs: Date.now() - startTime,
5360
5518
  columns
5361
5519
  }
5362
5520
  };
@@ -5577,7 +5735,7 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
5577
5735
  await runInEvalRuntimeScope("cases", async () => {
5578
5736
  await entry.use(async (evalDef) => {
5579
5737
  const cases = filterEvalCases(resolveRunnableEvalCases({
5580
- cases: typeof evalDef.cases === "function" ? await evalDef.cases() : evalDef.cases ?? [],
5738
+ cases: await runWithEvalClock(evalDef.startTime, async () => typeof evalDef.cases === "function" ? await evalDef.cases() : evalDef.cases ?? [], { freezeTime: evalDef.freezeTime }),
5581
5739
  evalId: evalMeta.id
5582
5740
  }), request.target.evalIds, request.target.caseIds, evalMeta.id);
5583
5741
  runState.summary.totalCases += cases.length;
@@ -5647,7 +5805,7 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
5647
5805
  caseId: evalCase.id,
5648
5806
  evalId: evalMeta.id,
5649
5807
  status: caseRowUpdate.status ?? "pending",
5650
- latencyMs: caseRowUpdate.latencyMs ?? null,
5808
+ durationMs: caseRowUpdate.durationMs ?? null,
5651
5809
  columns: caseRowUpdate.columns ?? {},
5652
5810
  trial
5653
5811
  }
@@ -5780,4 +5938,4 @@ function toLastRunStatus(status) {
5780
5938
  return status === "pending" ? null : status;
5781
5939
  }
5782
5940
  //#endregion
5783
- export { removeDefaultConfigSchema as $, columnKindSchema as $t, extractApiCalls as A, setEvalOutput as An, cacheFileSchema as At, DEFAULT_API_CALLS_CONFIG as B, traceAttributeDisplayFormatSchema as Bt, validateCharts as C, incrementEvalOutput as Cn, evalChartTooltipExtraSchema as Ct, sseEnvelopeSchema as D, runInEvalRuntimeScope as Dn, cacheDebugKeyFileSchema as Dt, updateManualScoreRequestSchema as E, nextEvalId as En, cacheDebugKeyEntrySchema as Et, deriveScopedSummaryFromCases as F, getEvalRegistry as Fn, cacheRecordingSchema as Ft, apiCallMetricSchema as G, traceDisplayInputConfigSchema as Gt, agentEvalsConfigSchema as H, traceAttributeDisplayPlacementSchema as Ht, deriveStatusFromCaseRows as I, cacheStatusSchema as It, llmCallMetricFormatSchema as J, traceSpanSchema as Jt, apiCallsConfigSchema as K, traceSpanErrorSchema as Kt, deriveStatusFromChildStatuses as L, serializedCacheSpanSchema as Lt, getNestedAttribute as M, startEvalBackgroundJob as Mn, cacheModeSchema as Mt, getEvalTitle as N, repoFile as Nn, cacheOperationTypeSchema as Nt, extractCacheEntries as O, runInEvalScope as On, cacheEntrySchema as Ot, getEvalDisplayStatus as P, defineEval as Pn, cacheRecordingOpSchema as Pt, llmCallsConfigSchema as Q, columnFormatSchema as Qt, runManifestSchema as R, spanCacheOptionsSchema as Rt, normalizeScoreDef as S, getEvalCaseInput as Sn, evalChartMetricSchema as St, createRunRequestSchema as T, mergeEvalOutput as Tn, evalChartsConfigSchema as Tt, apiCallMetricFormatSchema as U, traceAttributeDisplaySchema as Ut, DEFAULT_LLM_CALLS_CONFIG as V, traceAttributeDisplayInputSchema as Vt, apiCallMetricPlacementSchema as W, traceDisplayConfigSchema as Wt, llmCallMetricSchema as X, cellValueSchema as Xt, llmCallMetricPlacementSchema as Y, traceSpanWarningSchema as Yt, llmCallPricingSchema as Z, columnDefSchema as Zt, loadEvalModule as _, appendToEvalOutput as _n, evalChartAggregateSchema as _t, loadPersistedRunSnapshot as a, z$1 as an, caseDetailSchema as at, loadConfig 
as b, evalLog as bn, evalChartColorSchema as bt, persistCaseDetail as c, evalSpan as cn, evalStatAggregateSchema as ct, recomputePersistedCaseStatus as d, hashCacheKeySync as dn, evalSummarySchema as dt, fileRefSchema as en, resolveApiCallsConfig as et, runTouchesEval as f, deserializeCacheRecording as fn, runLogEntrySchema as ft, setLatestRunInfoMap as g, EvalAssertionError as gn, scoreTraceSchema as gt, getTargetEvalIds as h, serializeCacheValue as hn, runLogPhaseSchema as ht, getLatestRunInfos as i, runArtifactRefSchema as in, assertionFailureSchema as it, extractLlmCalls as j, setScopeCacheContext as jn, cacheListItemSchema as jt, extractCacheHits as k, runInExistingEvalScope as kn, cacheEntryWithDebugKeySchema as kt, persistRunState as l, evalTracer as ln, evalStatItemSchema as lt, buildEvalSummary as m, serializeCacheRecording as mn, runLogLocationSchema as mt, generateRunId as n, numberDisplayOptionsSchema as nn, runLogsConfigSchema as nt, loadPersistedRunSnapshots as o, buildTraceTree as on, caseRowSchema as ot, resolveArtifactPath as p, deserializeCacheValue as pn, runLogLevelSchema as pt, defaultConfigKeySchema as q, traceSpanKindSchema as qt, getLastRunStatuses as r, repoFileRefSchema as rn, trialSelectionModeSchema as rt, nextShortIdFromSnapshots as s, captureEvalSpanError as sn, evalFreshnessStatusSchema as st, executeRun as t, jsonCellSchema as tn, resolveLlmCallsConfig as tt, recomputeEvalStatusesInRuns as u, hashCacheKey as un, evalStatsConfigSchema as ut, parseEvalMetas as v, configureEvalRunLogs as vn, evalChartAxisSchema as vt, createFsCacheStore as w, isInEvalScope as wn, evalChartTypeSchema as wt, buildDeclaredColumnDefs as x, getCurrentScope as xn, evalChartConfigSchema as xt, resolveEvalDefaultConfig as y, evalAssert as yn, evalChartBuiltinMetricSchema as yt, runSummarySchema as z, traceCacheRefSchema as zt };
5941
+ export { removeDefaultConfigSchema as $, columnKindSchema as $t, extractApiCalls as A, runInEvalScope as An, cacheFileSchema as At, DEFAULT_API_CALLS_CONFIG as B, traceAttributeDisplayFormatSchema as Bt, validateCharts as C, getEvalCaseInput as Cn, evalChartTooltipExtraSchema as Ct, sseEnvelopeSchema as D, mergeEvalOutput as Dn, cacheDebugKeyFileSchema as Dt, updateManualScoreRequestSchema as E, isInEvalScope as En, cacheDebugKeyEntrySchema as Et, deriveScopedSummaryFromCases as F, repoFile as Fn, cacheRecordingSchema as Ft, apiCallMetricSchema as G, traceDisplayInputConfigSchema as Gt, agentEvalsConfigSchema as H, traceAttributeDisplayPlacementSchema as Ht, deriveStatusFromCaseRows as I, defineEval as In, cacheStatusSchema as It, llmCallMetricFormatSchema as J, traceSpanSchema as Jt, apiCallsConfigSchema as K, traceSpanErrorSchema as Kt, deriveStatusFromChildStatuses as L, getEvalRegistry as Ln, serializedCacheSpanSchema as Lt, getNestedAttribute as M, setEvalOutput as Mn, cacheModeSchema as Mt, getEvalTitle as N, setScopeCacheContext as Nn, cacheOperationTypeSchema as Nt, extractCacheEntries as O, nextEvalId as On, cacheEntrySchema as Ot, getEvalDisplayStatus as P, startEvalBackgroundJob as Pn, cacheRecordingOpSchema as Pt, llmCallsConfigSchema as Q, columnFormatSchema as Qt, runManifestSchema as R, spanCacheOptionsSchema as Rt, normalizeScoreDef as S, getCurrentScope as Sn, evalChartMetricSchema as St, createRunRequestSchema as T, incrementEvalOutput as Tn, evalChartsConfigSchema as Tt, apiCallMetricFormatSchema as U, traceAttributeDisplaySchema as Ut, DEFAULT_LLM_CALLS_CONFIG as V, traceAttributeDisplayInputSchema as Vt, apiCallMetricPlacementSchema as W, traceDisplayConfigSchema as Wt, llmCallMetricSchema as X, cellValueSchema as Xt, llmCallMetricPlacementSchema as Y, traceSpanWarningSchema as Yt, llmCallPricingSchema as Z, columnDefSchema as Zt, loadEvalModule as _, advanceEvalTime as _n, evalChartAggregateSchema as _t, loadPersistedRunSnapshot as a, z$1 as 
an, caseDetailSchema as at, loadConfig as b, evalAssert as bn, evalChartColorSchema as bt, persistCaseDetail as c, evalSpan as cn, evalStatAggregateSchema as ct, recomputePersistedCaseStatus as d, hashCacheKeySync as dn, evalSummarySchema as dt, fileRefSchema as en, resolveApiCallsConfig as et, runTouchesEval as f, deserializeCacheRecording as fn, runLogEntrySchema as ft, setLatestRunInfoMap as g, EvalAssertionError as gn, scoreTraceSchema as gt, getTargetEvalIds as h, serializeCacheValue as hn, runLogPhaseSchema as ht, getLatestRunInfos as i, runArtifactRefSchema as in, assertionFailureSchema as it, extractLlmCalls as j, runInExistingEvalScope as jn, cacheListItemSchema as jt, extractCacheHits as k, runInEvalRuntimeScope as kn, cacheEntryWithDebugKeySchema as kt, persistRunState as l, evalTracer as ln, evalStatItemSchema as lt, buildEvalSummary as m, serializeCacheRecording as mn, runLogLocationSchema as mt, generateRunId as n, numberDisplayOptionsSchema as nn, runLogsConfigSchema as nt, loadPersistedRunSnapshots as o, buildTraceTree as on, caseRowSchema as ot, resolveArtifactPath as p, deserializeCacheValue as pn, runLogLevelSchema as pt, defaultConfigKeySchema as q, traceSpanKindSchema as qt, getLastRunStatuses as r, repoFileRefSchema as rn, trialSelectionModeSchema as rt, nextShortIdFromSnapshots as s, captureEvalSpanError as sn, evalFreshnessStatusSchema as st, executeRun as t, jsonCellSchema as tn, resolveLlmCallsConfig as tt, recomputeEvalStatusesInRuns as u, hashCacheKey as un, evalStatsConfigSchema as ut, parseEvalMetas as v, appendToEvalOutput as vn, evalChartAxisSchema as vt, createFsCacheStore as w, getEvalStartTime as wn, evalChartTypeSchema as wt, buildDeclaredColumnDefs as x, evalLog as xn, evalChartConfigSchema as xt, resolveEvalDefaultConfig as y, configureEvalRunLogs as yn, evalChartBuiltinMetricSchema as yt, runSummarySchema as z, traceCacheRefSchema as zt };
@@ -1,5 +1,5 @@
1
- import { n as createRunner } from "./cli-DumvanQI.mjs";
2
- import "./src-BoAJb4wC.mjs";
1
+ import { n as createRunner } from "./cli-D3QNOcPN.mjs";
2
+ import "./src-CcXfWT4M.mjs";
3
3
  //#region ../../apps/server/src/runner.ts
4
4
  let runnerInstance = null;
5
5
  function getRunnerInstance() {