@ls-stack/agent-eval 0.20.0 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1970,7 +1970,15 @@ const numberDisplayOptionsSchema = z.object({
1970
1970
  compactDisplay: z.enum(["short", "long"]).optional(),
1971
1971
  prefix: z.string().optional(),
1972
1972
  suffix: z.string().optional(),
1973
- decimalPlaces: z.number().int().min(0).optional()
1973
+ minDecimalPlaces: z.number().int().min(0).optional(),
1974
+ maxDecimalPlaces: z.number().int().min(0).optional()
1975
+ }).refine((options) => {
1976
+ if (options.minDecimalPlaces === void 0) return true;
1977
+ if (options.maxDecimalPlaces === void 0) return true;
1978
+ return options.minDecimalPlaces <= options.maxDecimalPlaces;
1979
+ }, {
1980
+ message: "minDecimalPlaces must be less than or equal to maxDecimalPlaces",
1981
+ path: ["minDecimalPlaces"]
1974
1982
  });
1975
1983
  /** Schema for the supported column rendering kinds in list views. */
1976
1984
  const columnKindSchema = z.enum([
@@ -2005,7 +2013,6 @@ const columnDefSchema = z.object({
2005
2013
  passThreshold: z.number().optional(),
2006
2014
  maxStars: z.number().int().min(2).optional(),
2007
2015
  hideInTable: z.boolean().optional(),
2008
- sortable: z.boolean().optional(),
2009
2016
  align: z.enum([
2010
2017
  "left",
2011
2018
  "center",
@@ -2403,6 +2410,8 @@ const evalStatItemSchema = z.discriminatedUnion("kind", [
2403
2410
  label: z.string().optional(),
2404
2411
  aggregate: evalStatAggregateSchema,
2405
2412
  format: columnFormatSchema.optional(),
2413
+ /** Number presentation options applied when `format: 'number'`. */
2414
+ numberFormat: numberDisplayOptionsSchema.optional(),
2406
2415
  accent: z.boolean().optional()
2407
2416
  })
2408
2417
  ]);
@@ -2459,7 +2468,8 @@ const caseRowSchema = z.object({
2459
2468
  "error",
2460
2469
  "cancelled"
2461
2470
  ]),
2462
- latencyMs: z.number().nullable(),
2471
+ /** Elapsed case execution duration in milliseconds, or null before completion. */
2472
+ durationMs: z.number().nullable(),
2463
2473
  costUsd: z.number().nullable().optional(),
2464
2474
  columns: z.record(z.string(), cellValueSchema),
2465
2475
  /** Winning trial index for the persisted case result. */
@@ -2577,7 +2587,7 @@ const defaultConfigKeySchema = z.enum([
2577
2587
  "cachedInputTokens",
2578
2588
  "cacheCreationInputTokens",
2579
2589
  "reasoningTokens",
2580
- "llmLatencyMs"
2590
+ "llmDurationMs"
2581
2591
  ]);
2582
2592
  /** Removal config for built-in eval-level outputs and UI metadata. */
2583
2593
  const removeDefaultConfigSchema = z.union([z.literal(true), z.array(defaultConfigKeySchema)]);
@@ -2654,7 +2664,7 @@ const apiCallMetricSchema = z.object({
2654
2664
  });
2655
2665
  /**
2656
2666
  * Schema for one model/provider pricing entry used to derive LLM-call costs
2657
- * from token counts when a span does not already record explicit USD costs.
2667
+ * from token counts.
2658
2668
  */
2659
2669
  const llmCallPricingSchema = z.object({
2660
2670
  /** Exact model name read from the configured `attributes.model` path. */
@@ -2673,6 +2683,8 @@ const llmCallPricingSchema = z.object({
2673
2683
  cachedInputUsdPerMillion: z.number().nonnegative().optional(),
2674
2684
  /** USD per one million prompt-cache write tokens. */
2675
2685
  cacheCreationInputUsdPerMillion: z.number().nonnegative().optional(),
2686
+ /** USD per one million one-hour prompt-cache write tokens. */
2687
+ cacheCreationInput1hUsdPerMillion: z.number().nonnegative().optional(),
2676
2688
  /** USD per one million reasoning tokens when reported separately. */
2677
2689
  reasoningUsdPerMillion: z.number().nonnegative().optional()
2678
2690
  });
@@ -2683,12 +2695,9 @@ const llmCallsConfigSchema = z.object({
2683
2695
  /**
2684
2696
  * Attribute paths used to extract structured per-call fields. Each entry is
2685
2697
  * a dot-path inside `span.attributes`. Missing paths fall back to the
2686
- * built-in defaults (e.g. `usage.inputTokens`, `costUsd`).
2687
- *
2688
- * Per-token-type cost paths (`inputCost`, `outputCost`, `cachedInputCost`,
2689
- * `reasoningCost`) feed the cost breakdown table in the expanded row when
2690
- * spans provide explicit USD cost overrides. Prefer `pricing` for deriving
2691
- * costs from token counts globally.
2698
+ * built-in defaults (e.g. `usage.inputTokens`). Derived fields such as
2699
+ * total tokens, tokens/sec, duration, and USD costs are intentionally not
2700
+ * configurable as attribute paths.
2692
2701
  */
2693
2702
  attributes: z.object({
2694
2703
  model: z.string().optional(),
@@ -2697,15 +2706,9 @@ const llmCallsConfigSchema = z.object({
2697
2706
  outputTokens: z.string().optional(),
2698
2707
  cachedInputTokens: z.string().optional(),
2699
2708
  cacheCreationInputTokens: z.string().optional(),
2709
+ cacheCreationInput1hTokens: z.string().optional(),
2700
2710
  reasoningTokens: z.string().optional(),
2701
- totalTokens: z.string().optional(),
2702
- tokensPerSecond: z.string().optional(),
2703
- cost: z.string().optional(),
2704
- inputCost: z.string().optional(),
2705
- outputCost: z.string().optional(),
2706
- cachedInputCost: z.string().optional(),
2707
- cacheCreationInputCost: z.string().optional(),
2708
- reasoningCost: z.string().optional(),
2711
+ latencyMs: z.string().optional(),
2709
2712
  steps: z.string().optional(),
2710
2713
  finishReason: z.string().optional(),
2711
2714
  input: z.string().optional(),
@@ -2714,9 +2717,8 @@ const llmCallsConfigSchema = z.object({
2714
2717
  toolCalls: z.string().optional()
2715
2718
  }).optional(),
2716
2719
  /**
2717
- * Model/provider pricing registry used to calculate missing LLM-call costs
2718
- * from token counts. Explicit span attributes (`costUsd`, `cost.inputUsd`,
2719
- * etc.) take precedence over derived prices.
2720
+ * Model/provider pricing registry used to calculate LLM-call costs from
2721
+ * token counts. Built-in LLM cost fields are only derived from this registry.
2720
2722
  */
2721
2723
  pricing: z.array(llmCallPricingSchema).optional(),
2722
2724
  /** Custom user-defined metrics surfaced on each LLM call. */
@@ -2764,15 +2766,9 @@ const DEFAULT_LLM_CALLS_CONFIG = {
2764
2766
  outputTokens: "usage.outputTokens",
2765
2767
  cachedInputTokens: "usage.cachedInputTokens",
2766
2768
  cacheCreationInputTokens: "usage.cacheCreationInputTokens",
2769
+ cacheCreationInput1hTokens: "usage.cacheCreationInput1hTokens",
2767
2770
  reasoningTokens: "usage.reasoningTokens",
2768
- totalTokens: "usage.totalTokens",
2769
- tokensPerSecond: "tokensPerSecond",
2770
- cost: "costUsd",
2771
- inputCost: "cost.inputUsd",
2772
- outputCost: "cost.outputUsd",
2773
- cachedInputCost: "cost.cachedInputUsd",
2774
- cacheCreationInputCost: "cost.cacheCreationInputUsd",
2775
- reasoningCost: "cost.reasoningUsd",
2771
+ latencyMs: "latencyMs",
2776
2772
  steps: "steps",
2777
2773
  finishReason: "finishReason",
2778
2774
  input: "input",
@@ -2814,8 +2810,8 @@ const DEFAULT_API_CALLS_CONFIG = {
2814
2810
  * attribute path.
2815
2811
  * - Missing `metrics[].format` defaults to `'string'`.
2816
2812
  * - Missing `metrics[].placements` defaults to `['body']`.
2817
- * - Missing `pricing` defaults to an empty registry; explicit span costs still
2818
- * take precedence over derived costs.
2813
+ * - Missing `pricing` defaults to an empty registry; built-in costs are only
2814
+ * derived from configured pricing and token counts.
2819
2815
  */
2820
2816
  function resolveLlmCallsConfig(input) {
2821
2817
  return {
@@ -2839,6 +2835,7 @@ function resolveLlmCallsConfig(input) {
2839
2835
  outputUsdPerMillion: p.outputUsdPerMillion,
2840
2836
  cachedInputUsdPerMillion: p.cachedInputUsdPerMillion,
2841
2837
  cacheCreationInputUsdPerMillion: p.cacheCreationInputUsdPerMillion,
2838
+ cacheCreationInput1hUsdPerMillion: p.cacheCreationInput1hUsdPerMillion,
2842
2839
  reasoningUsdPerMillion: p.reasoningUsdPerMillion
2843
2840
  }))
2844
2841
  };
@@ -3028,8 +3025,8 @@ function deriveScopedSummaryFromCases(params) {
3028
3025
  else if (caseRow.status === "cancelled") cancelledCases += 1;
3029
3026
  else if (caseRow.status === "running") runningCases += 1;
3030
3027
  else pendingCases += 1;
3031
- if (caseRow.latencyMs !== null) {
3032
- totalDurationMs += caseRow.latencyMs;
3028
+ if (caseRow.durationMs !== null) {
3029
+ totalDurationMs += caseRow.durationMs;
3033
3030
  hasDuration = true;
3034
3031
  }
3035
3032
  }
@@ -3123,6 +3120,21 @@ function computeTokenCost(tokens, usdPerMillion) {
3123
3120
  if (usdPerMillion === void 0) return null;
3124
3121
  return tokens / 1e6 * usdPerMillion;
3125
3122
  }
3123
+ function computeCacheCreationInputCost({ cacheCreationInputTokens, cacheCreationInput1hTokens, usdPerMillion, oneHourUsdPerMillion }) {
3124
+ if (cacheCreationInputTokens === null) return null;
3125
+ if (cacheCreationInputTokens === 0) return 0;
3126
+ if (cacheCreationInput1hTokens === null) return computeTokenCost(cacheCreationInputTokens, usdPerMillion);
3127
+ const oneHourTokens = Math.min(cacheCreationInput1hTokens, cacheCreationInputTokens);
3128
+ const shortLivedCost = computeTokenCost(cacheCreationInputTokens - oneHourTokens, usdPerMillion);
3129
+ const oneHourCost = computeTokenCost(oneHourTokens, oneHourUsdPerMillion);
3130
+ if (shortLivedCost === null || oneHourCost === null) return null;
3131
+ return shortLivedCost + oneHourCost;
3132
+ }
3133
+ function computeBaseInputTokens({ inputTokens, cachedInputTokens, cacheCreationInputTokens }) {
3134
+ if (inputTokens === null) return null;
3135
+ const cachedTokens = (cachedInputTokens ?? 0) + (cacheCreationInputTokens ?? 0);
3136
+ return Math.max(inputTokens - cachedTokens, 0);
3137
+ }
3126
3138
  function pickPricingEntry({ pricing, model, provider }) {
3127
3139
  if (model === null) return null;
3128
3140
  let fallback = null;
@@ -3136,7 +3148,7 @@ function pickPricingEntry({ pricing, model, provider }) {
3136
3148
  }
3137
3149
  return fallback;
3138
3150
  }
3139
- function computeFallbackTotalCost({ inputTokens, inputCostUsd, outputTokens, outputCostUsd, cachedInputTokens, cachedInputCostUsd, cacheCreationInputTokens, cacheCreationInputCostUsd, reasoningTokens, reasoningCostUsd }) {
3151
+ function computeTotalCost({ inputTokens, inputCostUsd, outputTokens, outputCostUsd, cachedInputTokens, cachedInputCostUsd, cacheCreationInputTokens, cacheCreationInputCostUsd, reasoningTokens, reasoningCostUsd }) {
3140
3152
  const parts = [
3141
3153
  {
3142
3154
  tokens: inputTokens,
@@ -3173,7 +3185,7 @@ function computeFallbackTotalCost({ inputTokens, inputCostUsd, outputTokens, out
3173
3185
  if (hasCost) return total;
3174
3186
  return hasReportedTokens ? 0 : null;
3175
3187
  }
3176
- function computeLatencyMs$1(span) {
3188
+ function computeDurationMs$1(span) {
3177
3189
  if (span.endedAt === null) return null;
3178
3190
  const started = Date.parse(span.startedAt);
3179
3191
  const ended = Date.parse(span.endedAt);
@@ -3181,10 +3193,16 @@ function computeLatencyMs$1(span) {
3181
3193
  const delta = ended - started;
3182
3194
  return delta >= 0 ? delta : null;
3183
3195
  }
3184
- function computeTotalTokens({ declared, input, output, cached, cacheCreation }) {
3185
- if (declared !== null) return declared;
3186
- if (input === null && output === null && cached === null && cacheCreation === null) return null;
3187
- return (input ?? 0) + (output ?? 0) + (cached ?? 0) + (cacheCreation ?? 0);
3196
+ function computeTotalTokens({ input, output }) {
3197
+ if (input === null && output === null) return null;
3198
+ return (input ?? 0) + (output ?? 0);
3199
+ }
3200
+ function computeTokensPerSecond({ outputTokens, durationMs, latencyMs }) {
3201
+ if (outputTokens === null || durationMs === null) return null;
3202
+ if (outputTokens === 0) return 0;
3203
+ const generationMs = latencyMs === null ? durationMs : durationMs - latencyMs;
3204
+ if (generationMs <= 0) return null;
3205
+ return outputTokens / (generationMs / 1e3);
3188
3206
  }
3189
3207
  function readSteps(attributes, path) {
3190
3208
  const raw = getNestedAttribute(attributes, path);
@@ -3192,10 +3210,6 @@ function readSteps(attributes, path) {
3192
3210
  stepCount: raw.length,
3193
3211
  stepDetails: raw
3194
3212
  };
3195
- if (typeof raw === "number" && Number.isFinite(raw)) return {
3196
- stepCount: raw,
3197
- stepDetails: null
3198
- };
3199
3213
  return {
3200
3214
  stepCount: null,
3201
3215
  stepDetails: null
@@ -3217,16 +3231,22 @@ function pickError$1(span) {
3217
3231
  * shape consumed by the LLM calls tab.
3218
3232
  *
3219
3233
  * Spans whose `kind` is not in `config.kinds` are dropped. Structured fields
3220
- * (`model`, token counts, explicit cost, etc.) are read via
3234
+ * (`model`, token counts, latency, etc.) are read via
3221
3235
  * `getNestedAttribute` from the configured paths, with safe coercion to
3222
- * `string | null` / `number | null`. When explicit USD costs are absent,
3223
- * configured model pricing derives per-token-type costs from token counts.
3224
- * `totalTokens` falls back to a sum of input + output + cached when no
3225
- * explicit total attribute is present. The `steps` attribute path may resolve
3226
- * to either a number (rendered as the inference-round count) or an array of
3227
- * per-step detail objects (rendered as a Steps section in the body, with
3228
- * `stepCount` derived from the array length). `latencyMs` is `null` while the
3229
- * span is still running. User-defined `metrics` whose path resolves to
3236
+ * `string | null` / `number | null`. `latencyMs` is an explicit
3237
+ * time-to-first-token attribute; full span elapsed time is reported separately
3238
+ * as `durationMs`. Built-in USD costs are derived only from configured model
3239
+ * pricing and token counts. `totalTokens` is always derived from input +
3240
+ * output tokens. Cached input and cache creation tokens are reported
3241
+ * separately because they are subsets of input/output usage. The main cache
3242
+ * creation token field is treated as the total write count; optional one-hour
3243
+ * cache creation tokens only split that total for cost calculation. Base input
3244
+ * cost uses input minus cache read/write tokens so cached tokens are not
3245
+ * charged twice. Cache read/write costs still contribute to the total USD cost
3246
+ * at their configured rates. The `steps` attribute path may resolve to an array
3247
+ * of per-step detail objects, with `stepCount` derived from the array length.
3248
+ * `durationMs` and `tokensPerSecond` are `null` while the span is still
3249
+ * running. User-defined `metrics` whose path resolves to
3230
3250
  * `undefined` are dropped, but `null`, `0`, and `false` are preserved as
3231
3251
  * legitimate values worth displaying. Original span order is preserved so the
3232
3252
  * LLM calls tab matches the ordering in the Trace tab.
@@ -3243,19 +3263,30 @@ function extractLlmCalls(spans, config) {
3243
3263
  const outputTokens = readNumber$2(attrs, config.attributes.outputTokens);
3244
3264
  const cachedInputTokens = readNumber$2(attrs, config.attributes.cachedInputTokens);
3245
3265
  const cacheCreationInputTokens = readNumber$2(attrs, config.attributes.cacheCreationInputTokens);
3266
+ const cacheCreationInput1hTokens = readNumber$2(attrs, config.attributes.cacheCreationInput1hTokens);
3246
3267
  const reasoningTokens = readNumber$2(attrs, config.attributes.reasoningTokens);
3247
- const declaredTotalTokens = readNumber$2(attrs, config.attributes.totalTokens);
3268
+ const latencyMs = readNumber$2(attrs, config.attributes.latencyMs);
3269
+ const durationMs = computeDurationMs$1(span);
3248
3270
  const pricing = pickPricingEntry({
3249
3271
  pricing: config.pricing,
3250
3272
  model,
3251
3273
  provider
3252
3274
  });
3253
- const inputCostUsd = readNumber$2(attrs, config.attributes.inputCost) ?? computeTokenCost(inputTokens, pricing?.inputUsdPerMillion);
3254
- const outputCostUsd = readNumber$2(attrs, config.attributes.outputCost) ?? computeTokenCost(outputTokens, pricing?.outputUsdPerMillion);
3255
- const cachedInputCostUsd = readNumber$2(attrs, config.attributes.cachedInputCost) ?? computeTokenCost(cachedInputTokens, pricing?.cachedInputUsdPerMillion);
3256
- const cacheCreationInputCostUsd = readNumber$2(attrs, config.attributes.cacheCreationInputCost) ?? computeTokenCost(cacheCreationInputTokens, pricing?.cacheCreationInputUsdPerMillion);
3257
- const reasoningCostUsd = readNumber$2(attrs, config.attributes.reasoningCost) ?? computeTokenCost(reasoningTokens, pricing?.reasoningUsdPerMillion);
3258
- const costUsd = readNumber$2(attrs, config.attributes.cost) ?? computeFallbackTotalCost({
3275
+ const inputCostUsd = computeTokenCost(computeBaseInputTokens({
3276
+ inputTokens,
3277
+ cachedInputTokens,
3278
+ cacheCreationInputTokens
3279
+ }), pricing?.inputUsdPerMillion);
3280
+ const outputCostUsd = computeTokenCost(outputTokens, pricing?.outputUsdPerMillion);
3281
+ const cachedInputCostUsd = computeTokenCost(cachedInputTokens, pricing?.cachedInputUsdPerMillion);
3282
+ const cacheCreationInputCostUsd = computeCacheCreationInputCost({
3283
+ cacheCreationInputTokens,
3284
+ cacheCreationInput1hTokens,
3285
+ usdPerMillion: pricing?.cacheCreationInputUsdPerMillion,
3286
+ oneHourUsdPerMillion: pricing?.cacheCreationInput1hUsdPerMillion
3287
+ });
3288
+ const reasoningCostUsd = computeTokenCost(reasoningTokens, pricing?.reasoningUsdPerMillion);
3289
+ const costUsd = computeTotalCost({
3259
3290
  inputTokens,
3260
3291
  inputCostUsd,
3261
3292
  outputTokens,
@@ -3293,13 +3324,15 @@ function extractLlmCalls(spans, config) {
3293
3324
  cacheCreationInputTokens,
3294
3325
  reasoningTokens,
3295
3326
  totalTokens: computeTotalTokens({
3296
- declared: declaredTotalTokens,
3297
3327
  input: inputTokens,
3298
- output: outputTokens,
3299
- cached: cachedInputTokens,
3300
- cacheCreation: cacheCreationInputTokens
3328
+ output: outputTokens
3329
+ }),
3330
+ latencyMs,
3331
+ tokensPerSecond: computeTokensPerSecond({
3332
+ outputTokens,
3333
+ durationMs,
3334
+ latencyMs
3301
3335
  }),
3302
- tokensPerSecond: readNumber$2(attrs, config.attributes.tokensPerSecond),
3303
3336
  costUsd,
3304
3337
  inputCostUsd,
3305
3338
  outputCostUsd,
@@ -3308,7 +3341,7 @@ function extractLlmCalls(spans, config) {
3308
3341
  reasoningCostUsd,
3309
3342
  ...readSteps(attrs, config.attributes.steps),
3310
3343
  finishReason: readString$2(attrs, config.attributes.finishReason),
3311
- latencyMs: computeLatencyMs$1(span),
3344
+ durationMs,
3312
3345
  input: getNestedAttribute(attrs, config.attributes.input),
3313
3346
  output: getNestedAttribute(attrs, config.attributes.output),
3314
3347
  reasoning: getNestedAttribute(attrs, config.attributes.reasoning),
@@ -3333,7 +3366,7 @@ function readString$1(attributes, path) {
3333
3366
  const raw = getNestedAttribute(attributes, path);
3334
3367
  return typeof raw === "string" && raw.length > 0 ? raw : null;
3335
3368
  }
3336
- function computeLatencyMs(span) {
3369
+ function computeDurationMs(span) {
3337
3370
  if (span.endedAt === null) return null;
3338
3371
  const started = Date.parse(span.startedAt);
3339
3372
  const ended = Date.parse(span.endedAt);
@@ -3358,10 +3391,10 @@ function pickError(span) {
3358
3391
  *
3359
3392
  * Spans whose `kind` is not in `config.kinds` are dropped. Structured fields
3360
3393
  * (`method`, `url`, `statusCode`, etc.) are read via `getNestedAttribute` from
3361
- * the configured paths. `durationMs` takes precedence for latency, with a
3362
- * fallback to the span start/end timestamps. User-defined `metrics` whose path
3363
- * resolves to `undefined` are dropped, but `null`, `0`, and `false` are
3364
- * preserved as legitimate values worth displaying. Original span order is
3394
+ * the configured paths. An explicit `durationMs` attribute takes precedence,
3395
+ * with a fallback to the span start/end timestamps. User-defined `metrics`
3396
+ * whose path resolves to `undefined` are dropped, but `null`, `0`, and `false`
3397
+ * are preserved as legitimate values worth displaying. Original span order is
3365
3398
  * preserved so the API calls tab matches the ordering in the Trace tab.
3366
3399
  */
3367
3400
  function extractApiCalls(spans, config) {
@@ -3391,7 +3424,7 @@ function extractApiCalls(spans, config) {
3391
3424
  method: readString$1(attrs, config.attributes.method),
3392
3425
  url: readString$1(attrs, config.attributes.url),
3393
3426
  statusCode: readNumber$1(attrs, config.attributes.statusCode),
3394
- latencyMs: readNumber$1(attrs, config.attributes.durationMs) ?? computeLatencyMs(span),
3427
+ durationMs: readNumber$1(attrs, config.attributes.durationMs) ?? computeDurationMs(span),
3395
3428
  request: getNestedAttribute(attrs, config.attributes.request),
3396
3429
  response: getNestedAttribute(attrs, config.attributes.response),
3397
3430
  requestBody: getNestedAttribute(attrs, config.attributes.requestBody),
@@ -3789,7 +3822,7 @@ async function writeCacheFile(cacheDir, cacheFile) {
3789
3822
  await mkdir(cacheDir, { recursive: true });
3790
3823
  const filePath = ownerPath(cacheDir, cacheFile.owner);
3791
3824
  const tmpPath = `${filePath}.${process.pid.toString()}.tmp`;
3792
- await writeFile(tmpPath, JSON.stringify(cacheFile));
3825
+ await writeFile(tmpPath, JSON.stringify(cacheFile, null, 2));
3793
3826
  await rename(tmpPath, filePath);
3794
3827
  }
3795
3828
  async function readDebugKeyFile(debugDir, owner) {
@@ -4035,7 +4068,6 @@ function getScoreOverride(def) {
4035
4068
  format: def.format,
4036
4069
  numberFormat: def.numberFormat,
4037
4070
  hideInTable: def.hideInTable,
4038
- sortable: def.sortable,
4039
4071
  align: def.align,
4040
4072
  maxStars: def.maxStars
4041
4073
  };
@@ -4048,7 +4080,6 @@ function mergeOverrides(base, override) {
4048
4080
  format: override.format ?? base.format,
4049
4081
  numberFormat: override.numberFormat ?? base.numberFormat,
4050
4082
  hideInTable: override.hideInTable ?? base.hideInTable,
4051
- sortable: override.sortable ?? base.sortable,
4052
4083
  align: override.align ?? base.align,
4053
4084
  maxStars: override.maxStars ?? base.maxStars
4054
4085
  };
@@ -4163,7 +4194,6 @@ function createColumnDef(params) {
4163
4194
  if (override?.numberFormat !== void 0) def.numberFormat = override.numberFormat;
4164
4195
  if (override?.maxStars !== void 0) def.maxStars = override.maxStars;
4165
4196
  if (override?.hideInTable !== void 0) def.hideInTable = override.hideInTable;
4166
- if (override?.sortable !== void 0) def.sortable = override.sortable;
4167
4197
  if (override?.align !== void 0) def.align = override.align;
4168
4198
  if (!isScore) return def;
4169
4199
  def.isScore = true;
@@ -4232,85 +4262,76 @@ const DEFAULT_CONFIG_KEYS = [
4232
4262
  "cachedInputTokens",
4233
4263
  "cacheCreationInputTokens",
4234
4264
  "reasoningTokens",
4235
- "llmLatencyMs"
4265
+ "llmDurationMs"
4236
4266
  ];
4237
- const tokenNumberFormat = {
4238
- notation: "compact",
4239
- decimalPlaces: 1
4267
+ const tokenNumberFormat = { notation: "compact" };
4268
+ const countNumberFormat = {
4269
+ minDecimalPlaces: 0,
4270
+ maxDecimalPlaces: 0
4271
+ };
4272
+ const costNumberFormat = {
4273
+ prefix: "$",
4274
+ maxDecimalPlaces: 4
4240
4275
  };
4241
- const countNumberFormat = { decimalPlaces: 0 };
4242
4276
  const DEFAULT_COLUMNS = {
4243
4277
  apiCalls: {
4244
4278
  label: "API Calls",
4245
4279
  format: "number",
4246
4280
  numberFormat: countNumberFormat,
4247
- align: "right",
4248
- sortable: true
4281
+ align: "right"
4249
4282
  },
4250
4283
  costUsd: {
4251
4284
  label: "Cost",
4252
4285
  format: "number",
4253
- numberFormat: {
4254
- prefix: "$",
4255
- decimalPlaces: 4
4256
- },
4257
- align: "right",
4258
- sortable: true
4286
+ numberFormat: costNumberFormat,
4287
+ align: "right"
4259
4288
  },
4260
4289
  llmTurns: {
4261
4290
  label: "LLM Turns",
4262
4291
  format: "number",
4263
4292
  numberFormat: countNumberFormat,
4264
- align: "right",
4265
- sortable: true
4293
+ align: "right"
4266
4294
  },
4267
4295
  inputTokens: {
4268
4296
  label: "Input Tokens",
4269
4297
  format: "number",
4270
4298
  numberFormat: tokenNumberFormat,
4271
- align: "right",
4272
- sortable: true
4299
+ align: "right"
4273
4300
  },
4274
4301
  outputTokens: {
4275
4302
  label: "Output Tokens",
4276
4303
  format: "number",
4277
4304
  numberFormat: tokenNumberFormat,
4278
- align: "right",
4279
- sortable: true
4305
+ align: "right"
4280
4306
  },
4281
4307
  totalTokens: {
4282
4308
  label: "Total Tokens",
4283
4309
  format: "number",
4284
4310
  numberFormat: tokenNumberFormat,
4285
- align: "right",
4286
- sortable: true
4311
+ align: "right"
4287
4312
  },
4288
4313
  cachedInputTokens: {
4289
4314
  label: "Cached Input Tokens",
4290
4315
  format: "number",
4291
4316
  numberFormat: tokenNumberFormat,
4292
- align: "right",
4293
- sortable: true
4317
+ align: "right"
4294
4318
  },
4295
4319
  cacheCreationInputTokens: {
4296
4320
  label: "Cache Write Tokens",
4297
4321
  format: "number",
4298
4322
  numberFormat: tokenNumberFormat,
4299
- align: "right",
4300
- sortable: true
4323
+ align: "right"
4301
4324
  },
4302
4325
  reasoningTokens: {
4303
4326
  label: "Reasoning Tokens",
4304
4327
  format: "number",
4305
4328
  numberFormat: tokenNumberFormat,
4306
- align: "right",
4307
- sortable: true
4329
+ align: "right"
4308
4330
  },
4309
- llmLatencyMs: {
4310
- label: "LLM Latency",
4331
+ llmDurationMs: {
4332
+ label: "LLM Duration",
4311
4333
  format: "duration",
4312
- align: "right",
4313
- sortable: true
4334
+ align: "right"
4314
4335
  }
4315
4336
  };
4316
4337
  function resolveRemovedKeys(globalRemove, evalRemove) {
@@ -4336,31 +4357,29 @@ function appendDefaultStats(params) {
4336
4357
  kind: "column",
4337
4358
  key: "apiCalls",
4338
4359
  label: "API Calls",
4339
- aggregate: "avg"
4360
+ aggregate: "avg",
4361
+ numberFormat: countNumberFormat
4340
4362
  });
4341
4363
  if (activeKeys.has("costUsd")) defaults.push({
4342
4364
  kind: "column",
4343
4365
  key: "costUsd",
4344
4366
  label: "LLM Cost",
4345
- aggregate: "sum"
4367
+ aggregate: "avg",
4368
+ numberFormat: costNumberFormat
4346
4369
  });
4347
4370
  if (activeKeys.has("totalTokens")) defaults.push({
4348
4371
  kind: "column",
4349
4372
  key: "totalTokens",
4350
4373
  label: "Tokens",
4351
- aggregate: "sum"
4374
+ aggregate: "avg",
4375
+ numberFormat: tokenNumberFormat
4352
4376
  });
4353
4377
  if (activeKeys.has("llmTurns")) defaults.push({
4354
4378
  kind: "column",
4355
4379
  key: "llmTurns",
4356
4380
  label: "LLM Turns",
4357
- aggregate: "avg"
4358
- });
4359
- if (activeKeys.has("llmLatencyMs")) defaults.push({
4360
- kind: "column",
4361
- key: "llmLatencyMs",
4362
- label: "LLM Latency",
4363
- aggregate: "avg"
4381
+ aggregate: "avg",
4382
+ numberFormat: countNumberFormat
4364
4383
  });
4365
4384
  const merged = [...params.stats ?? [], ...defaults];
4366
4385
  return merged.length > 0 ? merged : void 0;
@@ -4368,24 +4387,13 @@ function appendDefaultStats(params) {
4368
4387
  function appendDefaultCharts(params) {
4369
4388
  const activeKeys = new Set(getActiveDefaultConfigKeys(params));
4370
4389
  const defaults = [];
4371
- if (activeKeys.has("apiCalls")) defaults.push({
4372
- heading: "API Calls",
4373
- type: "bar",
4374
- metrics: [{
4375
- source: "column",
4376
- key: "apiCalls",
4377
- aggregate: "sum",
4378
- label: "API Calls",
4379
- color: "accentDim"
4380
- }]
4381
- });
4382
4390
  if (activeKeys.has("costUsd")) defaults.push({
4383
4391
  heading: "LLM Cost",
4384
4392
  type: "area",
4385
4393
  metrics: [{
4386
4394
  source: "column",
4387
4395
  key: "costUsd",
4388
- aggregate: "sum",
4396
+ aggregate: "avg",
4389
4397
  label: "Cost",
4390
4398
  color: "warning"
4391
4399
  }]
@@ -4394,23 +4402,30 @@ function appendDefaultCharts(params) {
4394
4402
  activeKeys.has("inputTokens") ? {
4395
4403
  source: "column",
4396
4404
  key: "inputTokens",
4397
- aggregate: "sum",
4405
+ aggregate: "avg",
4398
4406
  label: "Input",
4399
4407
  color: "accent"
4400
4408
  } : null,
4401
4409
  activeKeys.has("outputTokens") ? {
4402
4410
  source: "column",
4403
4411
  key: "outputTokens",
4404
- aggregate: "sum",
4412
+ aggregate: "avg",
4405
4413
  label: "Output",
4406
4414
  color: "success"
4407
4415
  } : null,
4408
- activeKeys.has("reasoningTokens") ? {
4416
+ activeKeys.has("cachedInputTokens") ? {
4409
4417
  source: "column",
4410
- key: "reasoningTokens",
4411
- aggregate: "sum",
4412
- label: "Reasoning",
4418
+ key: "cachedInputTokens",
4419
+ aggregate: "avg",
4420
+ label: "Cached Input",
4413
4421
  color: "error"
4422
+ } : null,
4423
+ activeKeys.has("cacheCreationInputTokens") ? {
4424
+ source: "column",
4425
+ key: "cacheCreationInputTokens",
4426
+ aggregate: "avg",
4427
+ label: "Cache Write",
4428
+ color: "warning"
4414
4429
  } : null
4415
4430
  ].filter((metric) => metric !== null);
4416
4431
  if (tokenMetrics.length > 0) defaults.push({
@@ -4420,7 +4435,7 @@ function appendDefaultCharts(params) {
4420
4435
  tooltipExtras: activeKeys.has("totalTokens") ? [{
4421
4436
  source: "column",
4422
4437
  key: "totalTokens",
4423
- aggregate: "sum",
4438
+ aggregate: "avg",
4424
4439
  label: "Total"
4425
4440
  }] : void 0
4426
4441
  });
@@ -4525,8 +4540,8 @@ function addDefaultOutputs(params) {
4525
4540
  });
4526
4541
  assignIfMissing({
4527
4542
  outputs: params.outputs,
4528
- key: "llmLatencyMs",
4529
- value: sumNullable(calls.map((call) => call.latencyMs)),
4543
+ key: "llmDurationMs",
4544
+ value: sumNullable(calls.map((call) => call.durationMs)),
4530
4545
  activeKeys
4531
4546
  });
4532
4547
  }
@@ -5372,7 +5387,7 @@ async function runCase(params) {
5372
5387
  caseDetail,
5373
5388
  caseRowUpdate: {
5374
5389
  status,
5375
- latencyMs: Date.now() - startTime,
5390
+ durationMs: Date.now() - startTime,
5376
5391
  columns
5377
5392
  }
5378
5393
  };
@@ -5663,7 +5678,7 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
5663
5678
  caseId: evalCase.id,
5664
5679
  evalId: evalMeta.id,
5665
5680
  status: caseRowUpdate.status ?? "pending",
5666
- latencyMs: caseRowUpdate.latencyMs ?? null,
5681
+ durationMs: caseRowUpdate.durationMs ?? null,
5667
5682
  columns: caseRowUpdate.columns ?? {},
5668
5683
  trial
5669
5684
  }
@@ -1,5 +1,5 @@
1
- import { n as createRunner } from "./cli-weogme5U.mjs";
2
- import "./src-B879LZfo.mjs";
1
+ import { n as createRunner } from "./cli-C0EtHhEO.mjs";
2
+ import "./src-D-HuV8I-.mjs";
3
3
  //#region ../../apps/server/src/runner.ts
4
4
  let runnerInstance = null;
5
5
  function getRunnerInstance() {
@@ -1,2 +1,2 @@
1
- import { n as initRunner, t as getRunnerInstance } from "./runner-DzrMtgBu.mjs";
1
+ import { n as initRunner, t as getRunnerInstance } from "./runner-C9nP2VKL.mjs";
2
2
  export { getRunnerInstance, initRunner };
@@ -0,0 +1,3 @@
1
+ import "./runOrchestration-D1edUDhp.mjs";
2
+ import "./cli-C0EtHhEO.mjs";
3
+ export {};
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ls-stack/agent-eval",
3
- "version": "0.20.0",
3
+ "version": "0.22.0",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "agent-evals": "./dist/bin.mjs"