@ls-stack/agent-eval 0.21.0 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2468,7 +2468,8 @@ const caseRowSchema = z.object({
2468
2468
  "error",
2469
2469
  "cancelled"
2470
2470
  ]),
2471
- latencyMs: z.number().nullable(),
2471
+ /** Elapsed case execution duration in milliseconds, or null before completion. */
2472
+ durationMs: z.number().nullable(),
2472
2473
  costUsd: z.number().nullable().optional(),
2473
2474
  columns: z.record(z.string(), cellValueSchema),
2474
2475
  /** Winning trial index for the persisted case result. */
@@ -2586,7 +2587,7 @@ const defaultConfigKeySchema = z.enum([
2586
2587
  "cachedInputTokens",
2587
2588
  "cacheCreationInputTokens",
2588
2589
  "reasoningTokens",
2589
- "llmLatencyMs"
2590
+ "llmDurationMs"
2590
2591
  ]);
2591
2592
  /** Removal config for built-in eval-level outputs and UI metadata. */
2592
2593
  const removeDefaultConfigSchema = z.union([z.literal(true), z.array(defaultConfigKeySchema)]);
@@ -2663,7 +2664,7 @@ const apiCallMetricSchema = z.object({
2663
2664
  });
2664
2665
  /**
2665
2666
  * Schema for one model/provider pricing entry used to derive LLM-call costs
2666
- * from token counts when a span does not already record explicit USD costs.
2667
+ * from token counts.
2667
2668
  */
2668
2669
  const llmCallPricingSchema = z.object({
2669
2670
  /** Exact model name read from the configured `attributes.model` path. */
@@ -2682,6 +2683,8 @@ const llmCallPricingSchema = z.object({
2682
2683
  cachedInputUsdPerMillion: z.number().nonnegative().optional(),
2683
2684
  /** USD per one million prompt-cache write tokens. */
2684
2685
  cacheCreationInputUsdPerMillion: z.number().nonnegative().optional(),
2686
+ /** USD per one million one-hour prompt-cache write tokens. */
2687
+ cacheCreationInput1hUsdPerMillion: z.number().nonnegative().optional(),
2685
2688
  /** USD per one million reasoning tokens when reported separately. */
2686
2689
  reasoningUsdPerMillion: z.number().nonnegative().optional()
2687
2690
  });
@@ -2692,12 +2695,9 @@ const llmCallsConfigSchema = z.object({
2692
2695
  /**
2693
2696
  * Attribute paths used to extract structured per-call fields. Each entry is
2694
2697
  * a dot-path inside `span.attributes`. Missing paths fall back to the
2695
- * built-in defaults (e.g. `usage.inputTokens`, `costUsd`).
2696
- *
2697
- * Per-token-type cost paths (`inputCost`, `outputCost`, `cachedInputCost`,
2698
- * `reasoningCost`) feed the cost breakdown table in the expanded row when
2699
- * spans provide explicit USD cost overrides. Prefer `pricing` for deriving
2700
- * costs from token counts globally.
2698
+ * built-in defaults (e.g. `usage.inputTokens`). Derived fields such as
2699
+ * total tokens, tokens/sec, duration, and USD costs are intentionally not
2700
+ * configurable as attribute paths.
2701
2701
  */
2702
2702
  attributes: z.object({
2703
2703
  model: z.string().optional(),
@@ -2706,15 +2706,9 @@ const llmCallsConfigSchema = z.object({
2706
2706
  outputTokens: z.string().optional(),
2707
2707
  cachedInputTokens: z.string().optional(),
2708
2708
  cacheCreationInputTokens: z.string().optional(),
2709
+ cacheCreationInput1hTokens: z.string().optional(),
2709
2710
  reasoningTokens: z.string().optional(),
2710
- totalTokens: z.string().optional(),
2711
- tokensPerSecond: z.string().optional(),
2712
- cost: z.string().optional(),
2713
- inputCost: z.string().optional(),
2714
- outputCost: z.string().optional(),
2715
- cachedInputCost: z.string().optional(),
2716
- cacheCreationInputCost: z.string().optional(),
2717
- reasoningCost: z.string().optional(),
2711
+ latencyMs: z.string().optional(),
2718
2712
  steps: z.string().optional(),
2719
2713
  finishReason: z.string().optional(),
2720
2714
  input: z.string().optional(),
@@ -2723,9 +2717,8 @@ const llmCallsConfigSchema = z.object({
2723
2717
  toolCalls: z.string().optional()
2724
2718
  }).optional(),
2725
2719
  /**
2726
- * Model/provider pricing registry used to calculate missing LLM-call costs
2727
- * from token counts. Explicit span attributes (`costUsd`, `cost.inputUsd`,
2728
- * etc.) take precedence over derived prices.
2720
+ * Model/provider pricing registry used to calculate LLM-call costs from
2721
+ * token counts. Built-in LLM cost fields are only derived from this registry.
2729
2722
  */
2730
2723
  pricing: z.array(llmCallPricingSchema).optional(),
2731
2724
  /** Custom user-defined metrics surfaced on each LLM call. */
@@ -2773,15 +2766,9 @@ const DEFAULT_LLM_CALLS_CONFIG = {
2773
2766
  outputTokens: "usage.outputTokens",
2774
2767
  cachedInputTokens: "usage.cachedInputTokens",
2775
2768
  cacheCreationInputTokens: "usage.cacheCreationInputTokens",
2769
+ cacheCreationInput1hTokens: "usage.cacheCreationInput1hTokens",
2776
2770
  reasoningTokens: "usage.reasoningTokens",
2777
- totalTokens: "usage.totalTokens",
2778
- tokensPerSecond: "tokensPerSecond",
2779
- cost: "costUsd",
2780
- inputCost: "cost.inputUsd",
2781
- outputCost: "cost.outputUsd",
2782
- cachedInputCost: "cost.cachedInputUsd",
2783
- cacheCreationInputCost: "cost.cacheCreationInputUsd",
2784
- reasoningCost: "cost.reasoningUsd",
2771
+ latencyMs: "latencyMs",
2785
2772
  steps: "steps",
2786
2773
  finishReason: "finishReason",
2787
2774
  input: "input",
@@ -2823,8 +2810,8 @@ const DEFAULT_API_CALLS_CONFIG = {
2823
2810
  * attribute path.
2824
2811
  * - Missing `metrics[].format` defaults to `'string'`.
2825
2812
  * - Missing `metrics[].placements` defaults to `['body']`.
2826
- * - Missing `pricing` defaults to an empty registry; explicit span costs still
2827
- * take precedence over derived costs.
2813
+ * - Missing `pricing` defaults to an empty registry; built-in costs are only
2814
+ * derived from configured pricing and token counts.
2828
2815
  */
2829
2816
  function resolveLlmCallsConfig(input) {
2830
2817
  return {
@@ -2848,6 +2835,7 @@ function resolveLlmCallsConfig(input) {
2848
2835
  outputUsdPerMillion: p.outputUsdPerMillion,
2849
2836
  cachedInputUsdPerMillion: p.cachedInputUsdPerMillion,
2850
2837
  cacheCreationInputUsdPerMillion: p.cacheCreationInputUsdPerMillion,
2838
+ cacheCreationInput1hUsdPerMillion: p.cacheCreationInput1hUsdPerMillion,
2851
2839
  reasoningUsdPerMillion: p.reasoningUsdPerMillion
2852
2840
  }))
2853
2841
  };
@@ -3037,8 +3025,8 @@ function deriveScopedSummaryFromCases(params) {
3037
3025
  else if (caseRow.status === "cancelled") cancelledCases += 1;
3038
3026
  else if (caseRow.status === "running") runningCases += 1;
3039
3027
  else pendingCases += 1;
3040
- if (caseRow.latencyMs !== null) {
3041
- totalDurationMs += caseRow.latencyMs;
3028
+ if (caseRow.durationMs !== null) {
3029
+ totalDurationMs += caseRow.durationMs;
3042
3030
  hasDuration = true;
3043
3031
  }
3044
3032
  }
@@ -3132,6 +3120,21 @@ function computeTokenCost(tokens, usdPerMillion) {
3132
3120
  if (usdPerMillion === void 0) return null;
3133
3121
  return tokens / 1e6 * usdPerMillion;
3134
3122
  }
3123
/**
 * Prices prompt-cache write tokens, optionally splitting them between the
 * regular cache-write rate and the one-hour cache-write rate.
 *
 * `cacheCreationInputTokens` is treated as the total write count;
 * `cacheCreationInput1hTokens` only says how much of that total is billed at
 * the one-hour rate.
 *
 * @param {object} params
 * @param {number | null} params.cacheCreationInputTokens Total cache-write tokens.
 * @param {number | null} params.cacheCreationInput1hTokens One-hour share of the total, or null when not reported.
 * @param {number | undefined} params.usdPerMillion Rate for the short-lived share.
 * @param {number | undefined} params.oneHourUsdPerMillion Rate for the one-hour share.
 * @returns {number | null} USD cost, `0` when no tokens were written, or `null`
 *   when the total is unknown or a rate needed for a non-empty share is missing.
 */
function computeCacheCreationInputCost({ cacheCreationInputTokens, cacheCreationInput1hTokens, usdPerMillion, oneHourUsdPerMillion }) {
  if (cacheCreationInputTokens === null) return null;
  if (cacheCreationInputTokens === 0) return 0;
  if (cacheCreationInput1hTokens === null) return computeTokenCost(cacheCreationInputTokens, usdPerMillion);
  // Keep the one-hour share inside [0, total]: a negative count would inflate
  // the short-lived share, and a count above the total would make it negative.
  const oneHourTokens = Math.min(Math.max(cacheCreationInput1hTokens, 0), cacheCreationInputTokens);
  const shortLivedTokens = cacheCreationInputTokens - oneHourTokens;
  // An empty share costs nothing even when its rate is not configured, so a
  // missing rate only nulls the result when that rate is actually needed.
  const shortLivedCost = shortLivedTokens === 0 ? 0 : computeTokenCost(shortLivedTokens, usdPerMillion);
  const oneHourCost = oneHourTokens === 0 ? 0 : computeTokenCost(oneHourTokens, oneHourUsdPerMillion);
  if (shortLivedCost === null || oneHourCost === null) return null;
  return shortLivedCost + oneHourCost;
}
3133
/**
 * Derives the number of input tokens left after removing cache-read and
 * cache-write tokens from the reported input total, so those tokens are not
 * also counted at the base input rate.
 *
 * @param {object} params
 * @param {number | null} params.inputTokens Reported input tokens.
 * @param {number | null | undefined} params.cachedInputTokens Cache-read tokens; nullish counts as 0.
 * @param {number | null | undefined} params.cacheCreationInputTokens Cache-write tokens; nullish counts as 0.
 * @returns {number | null} Non-negative base input token count, or `null` when
 *   `inputTokens` is `null`.
 */
function computeBaseInputTokens({ inputTokens, cachedInputTokens, cacheCreationInputTokens }) {
  if (inputTokens === null) return null;
  const cachedTokens = (cachedInputTokens ?? 0) + (cacheCreationInputTokens ?? 0);
  // Floor at zero so cache counts larger than the input total never produce a
  // negative token count.
  return Math.max(inputTokens - cachedTokens, 0);
}
3135
3138
  function pickPricingEntry({ pricing, model, provider }) {
3136
3139
  if (model === null) return null;
3137
3140
  let fallback = null;
@@ -3145,7 +3148,7 @@ function pickPricingEntry({ pricing, model, provider }) {
3145
3148
  }
3146
3149
  return fallback;
3147
3150
  }
3148
- function computeFallbackTotalCost({ inputTokens, inputCostUsd, outputTokens, outputCostUsd, cachedInputTokens, cachedInputCostUsd, cacheCreationInputTokens, cacheCreationInputCostUsd, reasoningTokens, reasoningCostUsd }) {
3151
+ function computeTotalCost({ inputTokens, inputCostUsd, outputTokens, outputCostUsd, cachedInputTokens, cachedInputCostUsd, cacheCreationInputTokens, cacheCreationInputCostUsd, reasoningTokens, reasoningCostUsd }) {
3149
3152
  const parts = [
3150
3153
  {
3151
3154
  tokens: inputTokens,
@@ -3182,7 +3185,7 @@ function computeFallbackTotalCost({ inputTokens, inputCostUsd, outputTokens, out
3182
3185
  if (hasCost) return total;
3183
3186
  return hasReportedTokens ? 0 : null;
3184
3187
  }
3185
- function computeLatencyMs$1(span) {
3188
+ function computeDurationMs$1(span) {
3186
3189
  if (span.endedAt === null) return null;
3187
3190
  const started = Date.parse(span.startedAt);
3188
3191
  const ended = Date.parse(span.endedAt);
@@ -3190,10 +3193,16 @@ function computeLatencyMs$1(span) {
3190
3193
  const delta = ended - started;
3191
3194
  return delta >= 0 ? delta : null;
3192
3195
  }
3193
- function computeTotalTokens({ declared, input, output, cached, cacheCreation }) {
3194
- if (declared !== null) return declared;
3195
- if (input === null && output === null && cached === null && cacheCreation === null) return null;
3196
- return (input ?? 0) + (output ?? 0) + (cached ?? 0) + (cacheCreation ?? 0);
3196
/**
 * Sums input and output tokens into a total token count.
 *
 * Only these two counts contribute; a single missing side counts as 0.
 *
 * @param {object} params
 * @param {number | null} params.input Input token count.
 * @param {number | null} params.output Output token count.
 * @returns {number | null} `input + output`, or `null` when both are `null`.
 */
function computeTotalTokens({ input, output }) {
  if (input === null && output === null) return null;
  return (input ?? 0) + (output ?? 0);
}
3200
/**
 * Derives output throughput in tokens per second.
 *
 * When `latencyMs` is known it is subtracted from `durationMs`, so the rate is
 * measured over the remaining window; otherwise the whole duration is used.
 *
 * @param {object} params
 * @param {number | null} params.outputTokens Output token count.
 * @param {number | null} params.durationMs Elapsed duration in milliseconds.
 * @param {number | null} params.latencyMs Latency in milliseconds, or null when unknown.
 * @returns {number | null} Tokens per second; `0` when no tokens were produced;
 *   `null` when tokens or duration are unknown or the window is not positive.
 */
function computeTokensPerSecond({ outputTokens, durationMs, latencyMs }) {
  if (outputTokens === null || durationMs === null) return null;
  if (outputTokens === 0) return 0;
  const generationMs = latencyMs === null ? durationMs : durationMs - latencyMs;
  // A zero or negative window would give an infinite or negative rate.
  if (generationMs <= 0) return null;
  return outputTokens / (generationMs / 1e3);
}
3198
3207
  function readSteps(attributes, path) {
3199
3208
  const raw = getNestedAttribute(attributes, path);
@@ -3201,10 +3210,6 @@ function readSteps(attributes, path) {
3201
3210
  stepCount: raw.length,
3202
3211
  stepDetails: raw
3203
3212
  };
3204
- if (typeof raw === "number" && Number.isFinite(raw)) return {
3205
- stepCount: raw,
3206
- stepDetails: null
3207
- };
3208
3213
  return {
3209
3214
  stepCount: null,
3210
3215
  stepDetails: null
@@ -3226,16 +3231,22 @@ function pickError$1(span) {
3226
3231
  * shape consumed by the LLM calls tab.
3227
3232
  *
3228
3233
  * Spans whose `kind` is not in `config.kinds` are dropped. Structured fields
3229
- * (`model`, token counts, explicit cost, etc.) are read via
3234
+ * (`model`, token counts, latency, etc.) are read via
3230
3235
  * `getNestedAttribute` from the configured paths, with safe coercion to
3231
- * `string | null` / `number | null`. When explicit USD costs are absent,
3232
- * configured model pricing derives per-token-type costs from token counts.
3233
- * `totalTokens` falls back to a sum of input + output + cached when no
3234
- * explicit total attribute is present. The `steps` attribute path may resolve
3235
- * to either a number (rendered as the inference-round count) or an array of
3236
- * per-step detail objects (rendered as a Steps section in the body, with
3237
- * `stepCount` derived from the array length). `latencyMs` is `null` while the
3238
- * span is still running. User-defined `metrics` whose path resolves to
3236
+ * `string | null` / `number | null`. `latencyMs` is an explicit
3237
+ * time-to-first-token attribute; full span elapsed time is reported separately
3238
+ * as `durationMs`. Built-in USD costs are derived only from configured model
3239
+ * pricing and token counts. `totalTokens` is always derived from input +
3240
+ * output tokens. Cached input and cache creation tokens are reported
3241
+ * separately because they are subsets of input usage. The main cache
3242
+ * creation token field is treated as the total write count; optional one-hour
3243
+ * cache creation tokens only split that total for cost calculation. Base input
3244
+ * cost uses input minus cache read/write tokens so cached tokens are not
3245
+ * charged twice. Cache read/write costs still contribute to the total USD cost
3246
+ * at their configured rates. The `steps` attribute path may resolve to an array
3247
+ * of per-step detail objects, with `stepCount` derived from the array length.
3248
+ * `durationMs` and `tokensPerSecond` are `null` while the span is still
3249
+ * running. User-defined `metrics` whose path resolves to
3239
3250
  * `undefined` are dropped, but `null`, `0`, and `false` are preserved as
3240
3251
  * legitimate values worth displaying. Original span order is preserved so the
3241
3252
  * LLM calls tab matches the ordering in the Trace tab.
@@ -3252,19 +3263,30 @@ function extractLlmCalls(spans, config) {
3252
3263
  const outputTokens = readNumber$2(attrs, config.attributes.outputTokens);
3253
3264
  const cachedInputTokens = readNumber$2(attrs, config.attributes.cachedInputTokens);
3254
3265
  const cacheCreationInputTokens = readNumber$2(attrs, config.attributes.cacheCreationInputTokens);
3266
+ const cacheCreationInput1hTokens = readNumber$2(attrs, config.attributes.cacheCreationInput1hTokens);
3255
3267
  const reasoningTokens = readNumber$2(attrs, config.attributes.reasoningTokens);
3256
- const declaredTotalTokens = readNumber$2(attrs, config.attributes.totalTokens);
3268
+ const latencyMs = readNumber$2(attrs, config.attributes.latencyMs);
3269
+ const durationMs = computeDurationMs$1(span);
3257
3270
  const pricing = pickPricingEntry({
3258
3271
  pricing: config.pricing,
3259
3272
  model,
3260
3273
  provider
3261
3274
  });
3262
- const inputCostUsd = readNumber$2(attrs, config.attributes.inputCost) ?? computeTokenCost(inputTokens, pricing?.inputUsdPerMillion);
3263
- const outputCostUsd = readNumber$2(attrs, config.attributes.outputCost) ?? computeTokenCost(outputTokens, pricing?.outputUsdPerMillion);
3264
- const cachedInputCostUsd = readNumber$2(attrs, config.attributes.cachedInputCost) ?? computeTokenCost(cachedInputTokens, pricing?.cachedInputUsdPerMillion);
3265
- const cacheCreationInputCostUsd = readNumber$2(attrs, config.attributes.cacheCreationInputCost) ?? computeTokenCost(cacheCreationInputTokens, pricing?.cacheCreationInputUsdPerMillion);
3266
- const reasoningCostUsd = readNumber$2(attrs, config.attributes.reasoningCost) ?? computeTokenCost(reasoningTokens, pricing?.reasoningUsdPerMillion);
3267
- const costUsd = readNumber$2(attrs, config.attributes.cost) ?? computeFallbackTotalCost({
3275
+ const inputCostUsd = computeTokenCost(computeBaseInputTokens({
3276
+ inputTokens,
3277
+ cachedInputTokens,
3278
+ cacheCreationInputTokens
3279
+ }), pricing?.inputUsdPerMillion);
3280
+ const outputCostUsd = computeTokenCost(outputTokens, pricing?.outputUsdPerMillion);
3281
+ const cachedInputCostUsd = computeTokenCost(cachedInputTokens, pricing?.cachedInputUsdPerMillion);
3282
+ const cacheCreationInputCostUsd = computeCacheCreationInputCost({
3283
+ cacheCreationInputTokens,
3284
+ cacheCreationInput1hTokens,
3285
+ usdPerMillion: pricing?.cacheCreationInputUsdPerMillion,
3286
+ oneHourUsdPerMillion: pricing?.cacheCreationInput1hUsdPerMillion
3287
+ });
3288
+ const reasoningCostUsd = computeTokenCost(reasoningTokens, pricing?.reasoningUsdPerMillion);
3289
+ const costUsd = computeTotalCost({
3268
3290
  inputTokens,
3269
3291
  inputCostUsd,
3270
3292
  outputTokens,
@@ -3302,13 +3324,15 @@ function extractLlmCalls(spans, config) {
3302
3324
  cacheCreationInputTokens,
3303
3325
  reasoningTokens,
3304
3326
  totalTokens: computeTotalTokens({
3305
- declared: declaredTotalTokens,
3306
3327
  input: inputTokens,
3307
- output: outputTokens,
3308
- cached: cachedInputTokens,
3309
- cacheCreation: cacheCreationInputTokens
3328
+ output: outputTokens
3329
+ }),
3330
+ latencyMs,
3331
+ tokensPerSecond: computeTokensPerSecond({
3332
+ outputTokens,
3333
+ durationMs,
3334
+ latencyMs
3310
3335
  }),
3311
- tokensPerSecond: readNumber$2(attrs, config.attributes.tokensPerSecond),
3312
3336
  costUsd,
3313
3337
  inputCostUsd,
3314
3338
  outputCostUsd,
@@ -3317,7 +3341,7 @@ function extractLlmCalls(spans, config) {
3317
3341
  reasoningCostUsd,
3318
3342
  ...readSteps(attrs, config.attributes.steps),
3319
3343
  finishReason: readString$2(attrs, config.attributes.finishReason),
3320
- latencyMs: computeLatencyMs$1(span),
3344
+ durationMs,
3321
3345
  input: getNestedAttribute(attrs, config.attributes.input),
3322
3346
  output: getNestedAttribute(attrs, config.attributes.output),
3323
3347
  reasoning: getNestedAttribute(attrs, config.attributes.reasoning),
@@ -3342,7 +3366,7 @@ function readString$1(attributes, path) {
3342
3366
  const raw = getNestedAttribute(attributes, path);
3343
3367
  return typeof raw === "string" && raw.length > 0 ? raw : null;
3344
3368
  }
3345
- function computeLatencyMs(span) {
3369
+ function computeDurationMs(span) {
3346
3370
  if (span.endedAt === null) return null;
3347
3371
  const started = Date.parse(span.startedAt);
3348
3372
  const ended = Date.parse(span.endedAt);
@@ -3367,10 +3391,10 @@ function pickError(span) {
3367
3391
  *
3368
3392
  * Spans whose `kind` is not in `config.kinds` are dropped. Structured fields
3369
3393
  * (`method`, `url`, `statusCode`, etc.) are read via `getNestedAttribute` from
3370
- * the configured paths. `durationMs` takes precedence for latency, with a
3371
- * fallback to the span start/end timestamps. User-defined `metrics` whose path
3372
- * resolves to `undefined` are dropped, but `null`, `0`, and `false` are
3373
- * preserved as legitimate values worth displaying. Original span order is
3394
+ * the configured paths. An explicit `durationMs` attribute takes precedence,
3395
+ * with a fallback to the span start/end timestamps. User-defined `metrics`
3396
+ * whose path resolves to `undefined` are dropped, but `null`, `0`, and `false`
3397
+ * are preserved as legitimate values worth displaying. Original span order is
3374
3398
  * preserved so the API calls tab matches the ordering in the Trace tab.
3375
3399
  */
3376
3400
  function extractApiCalls(spans, config) {
@@ -3400,7 +3424,7 @@ function extractApiCalls(spans, config) {
3400
3424
  method: readString$1(attrs, config.attributes.method),
3401
3425
  url: readString$1(attrs, config.attributes.url),
3402
3426
  statusCode: readNumber$1(attrs, config.attributes.statusCode),
3403
- latencyMs: readNumber$1(attrs, config.attributes.durationMs) ?? computeLatencyMs(span),
3427
+ durationMs: readNumber$1(attrs, config.attributes.durationMs) ?? computeDurationMs(span),
3404
3428
  request: getNestedAttribute(attrs, config.attributes.request),
3405
3429
  response: getNestedAttribute(attrs, config.attributes.response),
3406
3430
  requestBody: getNestedAttribute(attrs, config.attributes.requestBody),
@@ -3798,7 +3822,7 @@ async function writeCacheFile(cacheDir, cacheFile) {
3798
3822
  await mkdir(cacheDir, { recursive: true });
3799
3823
  const filePath = ownerPath(cacheDir, cacheFile.owner);
3800
3824
  const tmpPath = `${filePath}.${process.pid.toString()}.tmp`;
3801
- await writeFile(tmpPath, JSON.stringify(cacheFile));
3825
+ await writeFile(tmpPath, JSON.stringify(cacheFile, null, 2));
3802
3826
  await rename(tmpPath, filePath);
3803
3827
  }
3804
3828
  async function readDebugKeyFile(debugDir, owner) {
@@ -4237,7 +4261,8 @@ const DEFAULT_CONFIG_KEYS = [
4237
4261
  "totalTokens",
4238
4262
  "cachedInputTokens",
4239
4263
  "cacheCreationInputTokens",
4240
- "llmLatencyMs"
4264
+ "reasoningTokens",
4265
+ "llmDurationMs"
4241
4266
  ];
4242
4267
  const tokenNumberFormat = { notation: "compact" };
4243
4268
  const countNumberFormat = {
@@ -4303,8 +4328,8 @@ const DEFAULT_COLUMNS = {
4303
4328
  numberFormat: tokenNumberFormat,
4304
4329
  align: "right"
4305
4330
  },
4306
- llmLatencyMs: {
4307
- label: "LLM Latency",
4331
+ llmDurationMs: {
4332
+ label: "LLM Duration",
4308
4333
  format: "duration",
4309
4334
  align: "right"
4310
4335
  }
@@ -4509,8 +4534,14 @@ function addDefaultOutputs(params) {
4509
4534
  });
4510
4535
  assignIfMissing({
4511
4536
  outputs: params.outputs,
4512
- key: "llmLatencyMs",
4513
- value: sumNullable(calls.map((call) => call.latencyMs)),
4537
+ key: "reasoningTokens",
4538
+ value: sumNullable(calls.map((call) => call.reasoningTokens)),
4539
+ activeKeys
4540
+ });
4541
+ assignIfMissing({
4542
+ outputs: params.outputs,
4543
+ key: "llmDurationMs",
4544
+ value: sumNullable(calls.map((call) => call.durationMs)),
4514
4545
  activeKeys
4515
4546
  });
4516
4547
  }
@@ -5356,7 +5387,7 @@ async function runCase(params) {
5356
5387
  caseDetail,
5357
5388
  caseRowUpdate: {
5358
5389
  status,
5359
- latencyMs: Date.now() - startTime,
5390
+ durationMs: Date.now() - startTime,
5360
5391
  columns
5361
5392
  }
5362
5393
  };
@@ -5647,7 +5678,7 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
5647
5678
  caseId: evalCase.id,
5648
5679
  evalId: evalMeta.id,
5649
5680
  status: caseRowUpdate.status ?? "pending",
5650
- latencyMs: caseRowUpdate.latencyMs ?? null,
5681
+ durationMs: caseRowUpdate.durationMs ?? null,
5651
5682
  columns: caseRowUpdate.columns ?? {},
5652
5683
  trial
5653
5684
  }
@@ -1,5 +1,5 @@
1
- import { n as createRunner } from "./cli-DumvanQI.mjs";
2
- import "./src-BoAJb4wC.mjs";
1
+ import { n as createRunner } from "./cli-C0EtHhEO.mjs";
2
+ import "./src-D-HuV8I-.mjs";
3
3
  //#region ../../apps/server/src/runner.ts
4
4
  let runnerInstance = null;
5
5
  function getRunnerInstance() {
@@ -1,2 +1,2 @@
1
- import { n as initRunner, t as getRunnerInstance } from "./runner-Dy_PECaf.mjs";
1
+ import { n as initRunner, t as getRunnerInstance } from "./runner-C9nP2VKL.mjs";
2
2
  export { getRunnerInstance, initRunner };
@@ -0,0 +1,3 @@
1
+ import "./runOrchestration-D1edUDhp.mjs";
2
+ import "./cli-C0EtHhEO.mjs";
3
+ export {};
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ls-stack/agent-eval",
3
- "version": "0.21.0",
3
+ "version": "0.22.0",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "agent-evals": "./dist/bin.mjs"
@@ -92,20 +92,16 @@ export async function runRefundWorkflow(input: RefundInput) {
92
92
  async () => {
93
93
  let text: string;
94
94
  let usage: { inputTokens: number; outputTokens: number };
95
- let costUsd: number;
96
95
  try {
97
- ({ text, usage, costUsd } = await llm.complete(input.message));
96
+ ({ text, usage } = await llm.complete(input.message));
98
97
  } catch (error) {
99
98
  captureEvalSpanError(error);
100
- ({ text, usage, costUsd } = await llm.completeWithFallback(
101
- input.message,
102
- ));
99
+ ({ text, usage } = await llm.completeWithFallback(input.message));
103
100
  }
104
101
  evalSpan.setAttributes({
105
102
  model: 'gpt-4o-mini',
106
103
  provider: 'openai',
107
104
  usage,
108
- costUsd,
109
105
  });
110
106
  const expectedLocale = getEvalCaseInput('locale');
111
107
  if (typeof expectedLocale === 'string') {
@@ -137,8 +133,8 @@ are more specific. Only the `input` and `output` span attributes are promoted
137
133
  automatically in the trace tree; use `traceDisplay` for other span attributes
138
134
  such as `model` or `usage`. Eval-level LLM usage outputs, columns, stats, and
139
135
  charts are derived from matching LLM spans by default. Prefer
140
- `llmCalls.pricing` for LLM-call cost display instead of writing `costUsd` on
141
- each span.
136
+ `llmCalls.pricing` for LLM-call cost display; built-in costs ignore span
137
+ `costUsd` attributes.
142
138
 
143
139
  Use `captureEvalSpanError(error)` for recoverable errors on the active
144
140
  `evalTracer.span(...)`, such as optional model/tool failures that fall back and
@@ -261,18 +257,28 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
261
257
  See the `TraceDisplayInputConfig` type.
262
258
  - `llmCalls` (in `agent-evals.config.ts`) configures how LLM-call spans are
263
259
  summarized for review. Defaults to `kind: 'llm'` spans with `model`,
264
- `usage.*`, `tokensPerSecond`, `input`, `output`, etc. read from conventional
265
- attribute paths. Override `kinds` to broaden the filter, override
266
- `attributes.<field>` for non-default span shapes, configure `pricing` to
267
- derive USD costs from token counts by model/provider, and add entries to
268
- `metrics` to surface arbitrary user metrics (`format: 'string' | 'number' |
269
- 'duration' | 'json' | 'boolean'`, `placements: ['header' | 'body']`).
260
+ `usage.*`, `latencyMs`, `input`, `output`, etc. read from conventional
261
+ attribute paths. `latencyMs` is time to first token; duration, total tokens,
262
+ tokens/sec, and USD costs are derived. Override `kinds` to broaden the filter,
263
+ override `attributes.<field>` for non-default primitive span shapes, configure
264
+ `pricing` to derive USD costs from token counts by model/provider, and add
265
+ entries to `metrics` to surface arbitrary user metrics (`format: 'string' |
266
+ 'number' | 'duration' | 'json' | 'boolean'`, `placements: ['header' |
267
+ 'body']`).
270
268
  - Default usage config derives missing eval outputs from matching LLM/API spans
271
269
  before `outputsSchema` and scores run: `apiCalls`, `costUsd`, `llmTurns`,
272
270
  `inputTokens`, `outputTokens`, `totalTokens`, `cachedInputTokens`,
273
- `cacheCreationInputTokens`, `reasoningTokens`, and `llmLatencyMs`. Authored
274
- outputs and column overrides win. Remove defaults globally or per eval with
275
- `removeDefaultConfig: true` or a key list such as
271
+ `cacheCreationInputTokens`, `reasoningTokens`, and `llmDurationMs`. Authored
272
+ outputs and column overrides win. `totalTokens` is input + output only; cache
273
+ read/write tokens stay separate and affect `costUsd` at their own rates.
274
+ Derived base input cost uses `inputTokens - cachedInputTokens -
275
+ cacheCreationInputTokens` so cache details are not double-counted.
276
+ `cacheCreationInputTokens` is the total cache-write count; optional
277
+ `cacheCreationInput1hTokens` only splits that total for 1-hour write pricing
278
+ via `cacheCreationInput1hUsdPerMillion`. `llmDurationMs` sums elapsed matched
279
+ LLM span durations; it is not time-to-first-token latency.
280
+ Remove defaults globally or per eval with `removeDefaultConfig: true` or a
281
+ key list such as
276
282
  `removeDefaultConfig: ['apiCalls', 'reasoningTokens']`.
277
283
  - `apiCalls` (in `agent-evals.config.ts`) configures how API-call spans are
278
284
  summarized for review. Defaults to `kind: 'api'`, `'http'`, `'http.client'`,