@ls-stack/agent-eval 0.19.0 → 0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2566,6 +2566,21 @@ const caseDetailSchema = z.object({
2566
2566
  //#region ../shared/src/schemas/config.ts
2567
2567
  /** Strategy used to collapse repeated trials into one stored case result. */
2568
2568
  const trialSelectionModeSchema = z.enum(["lowestScore", "median"]);
2569
/**
 * Enum of the built-in eval-level output/column keys.
 * The listed order is the order the enum exposes its options in.
 */
const defaultConfigKeySchema = z.enum([
  // Call counts and cost.
  "apiCalls",
  "costUsd",
  "llmTurns",
  // Token usage.
  "inputTokens",
  "outputTokens",
  "totalTokens",
  "cachedInputTokens",
  "cacheCreationInputTokens",
  "reasoningTokens",
  // Timing.
  "llmLatencyMs",
]);
2582
/**
 * Removal config for built-in eval-level outputs and UI metadata:
 * either the literal `true` or an array of built-in keys.
 */
const removeDefaultConfigSchema = z.union([
  z.literal(true),
  z.array(defaultConfigKeySchema),
]);
2569
2584
  /** Render formats supported by an LLM-call metric in the UI. */
2570
2585
  const llmCallMetricFormatSchema = z.enum([
2571
2586
  "string",
@@ -2637,6 +2652,30 @@ const apiCallMetricSchema = z.object({
2637
2652
  */
2638
2653
  placements: z.array(apiCallMetricPlacementSchema).nonempty().optional()
2639
2654
  });
2655
/**
 * Schema for a single model/provider pricing entry. Entries are used to
 * derive LLM-call costs from token counts when a span does not already
 * record explicit USD costs.
 */
const llmCallPricingSchema = (() => {
  // Every rate is an optional, non-negative USD amount per one million tokens.
  const usdPerMillionRate = () => z.number().nonnegative().optional();
  return z.object({
    /** Exact model name read from the configured `attributes.model` path. */
    model: z.string().min(1),
    /**
     * Optional provider discriminator read from `attributes.provider`. When
     * set, the entry only applies to calls from that provider;
     * provider-specific entries take precedence over generic entries for the
     * same model.
     */
    provider: z.string().min(1).optional(),
    /** USD per one million non-cached input tokens. */
    inputUsdPerMillion: usdPerMillionRate(),
    /** USD per one million output tokens. */
    outputUsdPerMillion: usdPerMillionRate(),
    /** USD per one million prompt-cache read tokens. */
    cachedInputUsdPerMillion: usdPerMillionRate(),
    /** USD per one million prompt-cache write tokens. */
    cacheCreationInputUsdPerMillion: usdPerMillionRate(),
    /** USD per one million reasoning tokens when reported separately. */
    reasoningUsdPerMillion: usdPerMillionRate(),
  });
})();
2640
2679
  /** Schema for the global LLM calls config block in `agent-evals.config.ts`. */
2641
2680
  const llmCallsConfigSchema = z.object({
2642
2681
  /** Span kinds treated as LLM calls. Defaults to `['llm']`. */
@@ -2647,8 +2686,9 @@ const llmCallsConfigSchema = z.object({
2647
2686
  * built-in defaults (e.g. `usage.inputTokens`, `costUsd`).
2648
2687
  *
2649
2688
  * Per-token-type cost paths (`inputCost`, `outputCost`, `cachedInputCost`,
2650
- * `reasoningCost`) feed the cost breakdown table in the expanded row.
2651
- * Record them as USD numbers alongside `costUsd` in your span attributes.
2689
+ * `reasoningCost`) feed the cost breakdown table in the expanded row when
2690
+ * spans provide explicit USD cost overrides. Prefer `pricing` for deriving
2691
+ * costs from token counts globally.
2652
2692
  */
2653
2693
  attributes: z.object({
2654
2694
  model: z.string().optional(),
@@ -2659,6 +2699,7 @@ const llmCallsConfigSchema = z.object({
2659
2699
  cacheCreationInputTokens: z.string().optional(),
2660
2700
  reasoningTokens: z.string().optional(),
2661
2701
  totalTokens: z.string().optional(),
2702
+ tokensPerSecond: z.string().optional(),
2662
2703
  cost: z.string().optional(),
2663
2704
  inputCost: z.string().optional(),
2664
2705
  outputCost: z.string().optional(),
@@ -2672,6 +2713,12 @@ const llmCallsConfigSchema = z.object({
2672
2713
  reasoning: z.string().optional(),
2673
2714
  toolCalls: z.string().optional()
2674
2715
  }).optional(),
2716
+ /**
2717
+ * Model/provider pricing registry used to calculate missing LLM-call costs
2718
+ * from token counts. Explicit span attributes (`costUsd`, `cost.inputUsd`,
2719
+ * etc.) take precedence over derived prices.
2720
+ */
2721
+ pricing: z.array(llmCallPricingSchema).optional(),
2675
2722
  /** Custom user-defined metrics surfaced on each LLM call. */
2676
2723
  metrics: z.array(llmCallMetricSchema).optional()
2677
2724
  });
@@ -2719,6 +2766,7 @@ const DEFAULT_LLM_CALLS_CONFIG = {
2719
2766
  cacheCreationInputTokens: "usage.cacheCreationInputTokens",
2720
2767
  reasoningTokens: "usage.reasoningTokens",
2721
2768
  totalTokens: "usage.totalTokens",
2769
+ tokensPerSecond: "tokensPerSecond",
2722
2770
  cost: "costUsd",
2723
2771
  inputCost: "cost.inputUsd",
2724
2772
  outputCost: "cost.outputUsd",
@@ -2732,7 +2780,8 @@ const DEFAULT_LLM_CALLS_CONFIG = {
2732
2780
  reasoning: "reasoning",
2733
2781
  toolCalls: "toolCalls"
2734
2782
  },
2735
- metrics: []
2783
+ metrics: [],
2784
+ pricing: []
2736
2785
  };
2737
2786
  /** Default API-calls config the UI uses before the workspace fetch resolves. */
2738
2787
  const DEFAULT_API_CALLS_CONFIG = {
@@ -2765,6 +2814,8 @@ const DEFAULT_API_CALLS_CONFIG = {
2765
2814
  * attribute path.
2766
2815
  * - Missing `metrics[].format` defaults to `'string'`.
2767
2816
  * - Missing `metrics[].placements` defaults to `['body']`.
2817
+ * - Missing `pricing` defaults to an empty registry; explicit span costs still
2818
+ * take precedence over derived costs.
2768
2819
  */
2769
2820
  function resolveLlmCallsConfig(input) {
2770
2821
  return {
@@ -2780,6 +2831,15 @@ function resolveLlmCallsConfig(input) {
2780
2831
  format: m.format ?? "string",
2781
2832
  numberFormat: m.numberFormat,
2782
2833
  placements: m.placements ? [...m.placements] : ["body"]
2834
+ })),
2835
+ pricing: (input?.pricing ?? []).map((p) => ({
2836
+ model: p.model,
2837
+ provider: p.provider,
2838
+ inputUsdPerMillion: p.inputUsdPerMillion,
2839
+ outputUsdPerMillion: p.outputUsdPerMillion,
2840
+ cachedInputUsdPerMillion: p.cachedInputUsdPerMillion,
2841
+ cacheCreationInputUsdPerMillion: p.cacheCreationInputUsdPerMillion,
2842
+ reasoningUsdPerMillion: p.reasoningUsdPerMillion
2783
2843
  }))
2784
2844
  };
2785
2845
  }
@@ -2821,6 +2881,7 @@ const agentEvalsConfigSchema = z.object({
2821
2881
  allowCliRunAll: z.boolean().optional(),
2822
2882
  traceDisplay: traceDisplayInputConfigSchema.optional(),
2823
2883
  llmCalls: llmCallsConfigSchema.optional(),
2884
+ removeDefaultConfig: removeDefaultConfigSchema.optional(),
2824
2885
  apiCalls: apiCallsConfigSchema.optional(),
2825
2886
  runLogs: runLogsConfigSchema.optional(),
2826
2887
  cache: z.object({
@@ -3056,6 +3117,62 @@ function readString$2(attributes, path) {
3056
3117
  const raw = getNestedAttribute(attributes, path);
3057
3118
  return typeof raw === "string" && raw.length > 0 ? raw : null;
3058
3119
  }
3120
/**
 * Derive the USD cost of one token bucket from a per-million rate.
 *
 * @param {number | null} tokens - Token count, or `null` when not reported.
 * @param {number | undefined} usdPerMillion - Rate in USD per one million tokens.
 * @returns {number | null} `null` when the count is unreported or a non-zero
 *   count has no rate; `0` for a zero count; otherwise the derived cost.
 */
function computeTokenCost(tokens, usdPerMillion) {
  const isUnreported = tokens === null;
  if (isUnreported) return null;
  // Zero tokens cost nothing even when no rate is configured.
  const hasRate = usdPerMillion !== undefined;
  if (tokens !== 0 && !hasRate) return null;
  return tokens === 0 ? 0 : (tokens / 1e6) * usdPerMillion;
}
3126
/**
 * Select the pricing entry that applies to one LLM call.
 *
 * An entry whose `provider` equals the call's provider wins as soon as it is
 * found; otherwise the first entry for the model that declares no provider
 * is used as a generic fallback.
 *
 * @param {object} params
 * @param {Array<object>} [params.pricing] - Pricing registry. Defaults to an
 *   empty registry so configs that omit the optional `pricing` list do not
 *   throw here.
 * @param {string | null} params.model - Model name read from the span.
 * @param {string | null} params.provider - Provider read from the span.
 * @returns {object | null} The matching entry, or `null` when none applies.
 */
function pickPricingEntry({ pricing = [], model, provider }) {
  if (model === null) return null;
  let fallback = null;
  for (const entry of pricing) {
    if (entry.model !== model) continue;
    if (entry.provider === undefined) {
      // Keep only the first generic entry; later ones never override it.
      fallback ??= entry;
      continue;
    }
    if (entry.provider === provider) return entry;
  }
  return fallback;
}
3139
/**
 * Compute a call's total USD cost from its per-token-type costs.
 *
 * Rules, applied per token type:
 * - A known cost is always added, even when the matching token count is
 *   absent or zero, so an explicit per-type cost is never dropped.
 * - A reported, non-zero token count with no cost makes the total unknown
 *   and the function returns `null`.
 * - An unreported or zero count with no cost contributes nothing.
 *
 * @param {object} params - Token counts and costs; every field is
 *   `number | null`.
 * @returns {number | null} The summed cost; `0` when token counts were
 *   reported but nothing was billable; `null` when the total cannot be
 *   determined or nothing was reported.
 */
function computeFallbackTotalCost({ inputTokens, inputCostUsd, outputTokens, outputCostUsd, cachedInputTokens, cachedInputCostUsd, cacheCreationInputTokens, cacheCreationInputCostUsd, reasoningTokens, reasoningCostUsd }) {
  const parts = [
    [inputTokens, inputCostUsd],
    [outputTokens, outputCostUsd],
    [cachedInputTokens, cachedInputCostUsd],
    [cacheCreationInputTokens, cacheCreationInputCostUsd],
    [reasoningTokens, reasoningCostUsd],
  ];
  let total = 0;
  let hasCost = false;
  let hasReportedTokens = false;
  for (const [tokens, cost] of parts) {
    if (tokens !== null) hasReportedTokens = true;
    if (cost !== null) {
      total += cost;
      hasCost = true;
      continue;
    }
    // Billable tokens without any price: the total would be an undercount.
    if (tokens !== null && tokens !== 0) return null;
  }
  if (hasCost) return total;
  return hasReportedTokens ? 0 : null;
}
3059
3176
  function computeLatencyMs$1(span) {
3060
3177
  if (span.endedAt === null) return null;
3061
3178
  const started = Date.parse(span.startedAt);
@@ -3100,9 +3217,11 @@ function pickError$1(span) {
3100
3217
  * shape consumed by the LLM calls tab.
3101
3218
  *
3102
3219
  * Spans whose `kind` is not in `config.kinds` are dropped. Structured fields
3103
- * (`model`, token counts, cost, etc.) are read via `getNestedAttribute` from
3104
- * the configured paths, with safe coercion to `string | null` / `number |
3105
- * null`. `totalTokens` falls back to a sum of input + output + cached when no
3220
+ * (`model`, token counts, explicit cost, etc.) are read via
3221
+ * `getNestedAttribute` from the configured paths, with safe coercion to
3222
+ * `string | null` / `number | null`. When explicit USD costs are absent,
3223
+ * configured model pricing derives per-token-type costs from token counts.
3224
+ * `totalTokens` falls back to a sum of input + output + cached when no
3106
3225
  * explicit total attribute is present. The `steps` attribute path may resolve
3107
3226
  * to either a number (rendered as the inference-round count) or an array of
3108
3227
  * per-step detail objects (rendered as a Steps section in the body, with
@@ -3118,12 +3237,36 @@ function extractLlmCalls(spans, config) {
3118
3237
  for (const span of spans) {
3119
3238
  if (!kindSet.has(span.kind)) continue;
3120
3239
  const attrs = span.attributes;
3240
+ const model = readString$2(attrs, config.attributes.model);
3241
+ const provider = readString$2(attrs, config.attributes.provider);
3121
3242
  const inputTokens = readNumber$2(attrs, config.attributes.inputTokens);
3122
3243
  const outputTokens = readNumber$2(attrs, config.attributes.outputTokens);
3123
3244
  const cachedInputTokens = readNumber$2(attrs, config.attributes.cachedInputTokens);
3124
3245
  const cacheCreationInputTokens = readNumber$2(attrs, config.attributes.cacheCreationInputTokens);
3125
3246
  const reasoningTokens = readNumber$2(attrs, config.attributes.reasoningTokens);
3126
3247
  const declaredTotalTokens = readNumber$2(attrs, config.attributes.totalTokens);
3248
+ const pricing = pickPricingEntry({
3249
+ pricing: config.pricing,
3250
+ model,
3251
+ provider
3252
+ });
3253
+ const inputCostUsd = readNumber$2(attrs, config.attributes.inputCost) ?? computeTokenCost(inputTokens, pricing?.inputUsdPerMillion);
3254
+ const outputCostUsd = readNumber$2(attrs, config.attributes.outputCost) ?? computeTokenCost(outputTokens, pricing?.outputUsdPerMillion);
3255
+ const cachedInputCostUsd = readNumber$2(attrs, config.attributes.cachedInputCost) ?? computeTokenCost(cachedInputTokens, pricing?.cachedInputUsdPerMillion);
3256
+ const cacheCreationInputCostUsd = readNumber$2(attrs, config.attributes.cacheCreationInputCost) ?? computeTokenCost(cacheCreationInputTokens, pricing?.cacheCreationInputUsdPerMillion);
3257
+ const reasoningCostUsd = readNumber$2(attrs, config.attributes.reasoningCost) ?? computeTokenCost(reasoningTokens, pricing?.reasoningUsdPerMillion);
3258
+ const costUsd = readNumber$2(attrs, config.attributes.cost) ?? computeFallbackTotalCost({
3259
+ inputTokens,
3260
+ inputCostUsd,
3261
+ outputTokens,
3262
+ outputCostUsd,
3263
+ cachedInputTokens,
3264
+ cachedInputCostUsd,
3265
+ cacheCreationInputTokens,
3266
+ cacheCreationInputCostUsd,
3267
+ reasoningTokens,
3268
+ reasoningCostUsd
3269
+ });
3127
3270
  const metrics = [];
3128
3271
  for (const metric of config.metrics) {
3129
3272
  const rawValue = getNestedAttribute(attrs, metric.path);
@@ -3142,8 +3285,8 @@ function extractLlmCalls(spans, config) {
3142
3285
  name: span.name,
3143
3286
  kind: span.kind,
3144
3287
  status: span.status,
3145
- model: readString$2(attrs, config.attributes.model),
3146
- provider: readString$2(attrs, config.attributes.provider),
3288
+ model,
3289
+ provider,
3147
3290
  inputTokens,
3148
3291
  outputTokens,
3149
3292
  cachedInputTokens,
@@ -3156,12 +3299,13 @@ function extractLlmCalls(spans, config) {
3156
3299
  cached: cachedInputTokens,
3157
3300
  cacheCreation: cacheCreationInputTokens
3158
3301
  }),
3159
- costUsd: readNumber$2(attrs, config.attributes.cost),
3160
- inputCostUsd: readNumber$2(attrs, config.attributes.inputCost),
3161
- outputCostUsd: readNumber$2(attrs, config.attributes.outputCost),
3162
- cachedInputCostUsd: readNumber$2(attrs, config.attributes.cachedInputCost),
3163
- cacheCreationInputCostUsd: readNumber$2(attrs, config.attributes.cacheCreationInputCost),
3164
- reasoningCostUsd: readNumber$2(attrs, config.attributes.reasoningCost),
3302
+ tokensPerSecond: readNumber$2(attrs, config.attributes.tokensPerSecond),
3303
+ costUsd,
3304
+ inputCostUsd,
3305
+ outputCostUsd,
3306
+ cachedInputCostUsd,
3307
+ cacheCreationInputCostUsd,
3308
+ reasoningCostUsd,
3165
3309
  ...readSteps(attrs, config.attributes.steps),
3166
3310
  finishReason: readString$2(attrs, config.attributes.finishReason),
3167
3311
  latencyMs: computeLatencyMs$1(span),
@@ -3792,6 +3936,80 @@ function isRecordLike(value) {
3792
3936
  return typeof value === "object" && value !== null && !Array.isArray(value);
3793
3937
  }
3794
3938
  //#endregion
3939
+ //#region ../runner/src/chartValidation.ts
3940
/**
 * Check that a column-sourced chart metric points at a declared column and,
 * when it aggregates with `passThresholdRate`, that the column is a score
 * with a numeric `passThreshold`.
 *
 * @param {object} metric - Chart metric with `key` and `aggregate`.
 * @param {Map<string, object>} columnsByKey - Declared columns keyed by column key.
 * @param {string} evalId - Eval id used as the warning prefix.
 * @param {string[]} warnings - Collector that receives a message when the metric is rejected.
 * @returns {boolean} `true` when the metric may be kept.
 */
function isValidColumnMetric(metric, columnsByKey, evalId, warnings) {
  const column = columnsByKey.get(metric.key);
  if (!column) {
    warnings.push(`[${evalId}] chart metric references unknown column "${metric.key}" — dropped`);
    return false;
  }
  if (metric.aggregate !== "passThresholdRate") return true;
  const isThresholdScore = column.isScore === true && typeof column.passThreshold === "number";
  if (isThresholdScore) return true;
  warnings.push(`[${evalId}] chart metric "${metric.key}" uses "passThresholdRate" but the column is not a score with passThreshold — dropped`);
  return false;
}
3954
/**
 * Check that a column-sourced tooltip extra points at a declared column and,
 * when it aggregates with `passThresholdRate`, that the column is a score
 * with a numeric `passThreshold`.
 *
 * @param {object} extra - Tooltip extra with `key` and `aggregate`.
 * @param {Map<string, object>} columnsByKey - Declared columns keyed by column key.
 * @param {string} evalId - Eval id used as the warning prefix.
 * @param {string[]} warnings - Collector that receives a message when the extra is rejected.
 * @returns {boolean} `true` when the extra may be kept.
 */
function isValidTooltipExtra(extra, columnsByKey, evalId, warnings) {
  const column = columnsByKey.get(extra.key);
  if (!column) {
    warnings.push(`[${evalId}] chart tooltip extra references unknown column "${extra.key}" — dropped`);
    return false;
  }
  if (extra.aggregate !== "passThresholdRate") return true;
  const isThresholdScore = column.isScore === true && typeof column.passThreshold === "number";
  if (isThresholdScore) return true;
  warnings.push(`[${evalId}] chart tooltip extra "${extra.key}" uses "passThresholdRate" but the column is not a score with passThreshold — dropped`);
  return false;
}
3968
/**
 * Strip invalid column references from one chart.
 *
 * Entries whose `source` is `"builtin"` are always kept; all other metrics
 * and tooltip extras are kept only when their validator accepts them.
 *
 * @param {object} chart - Authored chart with `metrics` and optional `tooltipExtras`.
 * @param {Map<string, object>} columnsByKey - Declared columns keyed by column key.
 * @param {string} evalId - Eval id used as the warning prefix.
 * @param {string[]} warnings - Collector for validation messages.
 * @returns {object | null} A copy of the chart holding only valid entries,
 *   or `null` when no metric survives.
 */
function sanitizeChart(chart, columnsByKey, evalId, warnings) {
  const keepMetric = (metric) =>
    metric.source === "builtin" || isValidColumnMetric(metric, columnsByKey, evalId, warnings);
  const keepExtra = (extra) =>
    extra.source === "builtin" || isValidTooltipExtra(extra, columnsByKey, evalId, warnings);

  const metrics = chart.metrics.filter(keepMetric);
  if (metrics.length === 0) {
    warnings.push(`[${evalId}] chart had no valid metrics after validation — chart dropped`);
    return null;
  }

  const keptExtras = chart.tooltipExtras?.filter(keepExtra);
  // An empty or missing extras list is normalized to `undefined`.
  const tooltipExtras = keptExtras?.length ? keptExtras : undefined;
  return { ...chart, metrics, tooltipExtras };
}
3987
/**
 * Validate an authored `charts` config against the eval's declared columns.
 *
 * Each chart is passed through `sanitizeChart`; charts that come back empty
 * are dropped. When no chart remains (or none was authored) `charts` is
 * `undefined`, so the UI falls back to rendering no chart.
 *
 * @param {object} params
 * @param {Array<object> | undefined} params.charts - Authored charts.
 * @param {Array<object>} params.columnDefs - Declared column definitions, each with a `key`.
 * @param {string} params.evalId - Eval id used as the warning prefix.
 * @returns {{ charts: Array<object> | undefined, warnings: string[] }}
 */
function validateCharts(params) {
  const { charts, columnDefs, evalId } = params;
  const warnings = [];
  const hasCharts = Boolean(charts) && charts.length !== 0;
  if (!hasCharts) return { charts: undefined, warnings };

  const columnsByKey = new Map(columnDefs.map((def) => [def.key, def]));
  const sanitized = Array.from(charts, (chart) =>
    sanitizeChart(chart, columnsByKey, evalId, warnings),
  ).filter((result) => result);
  return {
    charts: sanitized.length > 0 ? sanitized : undefined,
    warnings,
  };
}
4012
+ //#endregion
3795
4013
  //#region ../runner/src/columnBuilder.ts
3796
4014
  /**
3797
4015
  * Normalize a user-provided score definition (either a function or an
@@ -4003,6 +4221,316 @@ async function loadConfig() {
4003
4221
  }
4004
4222
  }
4005
4223
  //#endregion
4224
+ //#region ../runner/src/defaultConfig.ts
4225
/**
 * Built-in eval-level keys, in declaration order.
 *
 * Copied from `defaultConfigKeySchema` rather than repeated by hand, so the
 * keys accepted by `removeDefaultConfig` and the keys the runner produces
 * cannot drift apart. The copy keeps this array independent of the schema's
 * own option list.
 */
const DEFAULT_CONFIG_KEYS = [...defaultConfigKeySchema.options];
4237
/** Number format shared by every token-count column. */
const tokenNumberFormat = { notation: "compact", decimalPlaces: 1 };

/** Number format shared by the whole-number count columns. */
const countNumberFormat = {
  decimalPlaces: 0,
};

/**
 * Column definitions for the built-in eval-level keys. Columns that use the
 * same number format reference the same format object.
 */
const DEFAULT_COLUMNS = (() => {
  // All numeric built-ins are right-aligned and sortable.
  const numberColumn = (label, numberFormat) => ({
    label,
    format: "number",
    numberFormat,
    align: "right",
    sortable: true,
  });
  return {
    apiCalls: numberColumn("API Calls", countNumberFormat),
    costUsd: numberColumn("Cost", { prefix: "$", decimalPlaces: 4 }),
    llmTurns: numberColumn("LLM Turns", countNumberFormat),
    inputTokens: numberColumn("Input Tokens", tokenNumberFormat),
    outputTokens: numberColumn("Output Tokens", tokenNumberFormat),
    totalTokens: numberColumn("Total Tokens", tokenNumberFormat),
    cachedInputTokens: numberColumn("Cached Input Tokens", tokenNumberFormat),
    cacheCreationInputTokens: numberColumn("Cache Write Tokens", tokenNumberFormat),
    reasoningTokens: numberColumn("Reasoning Tokens", tokenNumberFormat),
    // Latency is the only built-in rendered as a duration, with no number format.
    llmLatencyMs: {
      label: "LLM Latency",
      format: "duration",
      align: "right",
      sortable: true,
    },
  };
})();
4316
/**
 * Combine the global and per-eval removal settings into one set of keys.
 *
 * @param {true | string[] | undefined} globalRemove - Workspace-level setting.
 * @param {true | string[] | undefined} evalRemove - Eval-level setting.
 * @returns {Set<string>} Every built-in key when either setting is `true`;
 *   otherwise the union of both lists.
 */
function resolveRemovedKeys(globalRemove, evalRemove) {
  const removesEverything = globalRemove === true || evalRemove === true;
  if (removesEverything) return new Set(DEFAULT_CONFIG_KEYS);
  const removed = new Set(globalRemove ?? []);
  for (const key of evalRemove ?? []) removed.add(key);
  return removed;
}
4320
/**
 * List the built-in keys that remain enabled after removals are applied.
 *
 * @param {object} params - Carries `globalRemove` and `evalRemove`.
 * @returns {string[]} Enabled keys, in `DEFAULT_CONFIG_KEYS` order.
 */
function getActiveDefaultConfigKeys(params) {
  const removedKeys = resolveRemovedKeys(params.globalRemove, params.evalRemove);
  const active = [];
  for (const key of DEFAULT_CONFIG_KEYS) {
    if (!removedKeys.has(key)) active.push(key);
  }
  return active;
}
4324
/**
 * Merge the enabled built-in column definitions under the authored columns.
 *
 * @param {object} params - Carries `columns`, `globalRemove` and `evalRemove`.
 * @returns {object | undefined} `params.columns` unchanged when no built-in
 *   key is enabled; otherwise a new object in which authored columns
 *   override built-ins that share a key.
 */
function mergeDefaultColumns(params) {
  const activeKeys = getActiveDefaultConfigKeys(params);
  if (activeKeys.length === 0) return params.columns;
  const builtIns = Object.fromEntries(
    activeKeys.map((key) => [key, DEFAULT_COLUMNS[key]]),
  );
  // Authored entries are spread last so they override built-ins.
  return { ...builtIns, ...params.columns };
}
4332
/**
 * Append the built-in summary stats after the authored stats.
 *
 * @param {object} params - Carries `stats`, `globalRemove` and `evalRemove`.
 * @returns {Array<object> | undefined} Authored stats followed by the stats
 *   for enabled built-in keys, or `undefined` when the result is empty.
 */
function appendDefaultStats(params) {
  const activeKeys = new Set(getActiveDefaultConfigKeys(params));
  // [column key, label, aggregate], in the order the stats are emitted.
  const candidates = [
    ["apiCalls", "API Calls", "avg"],
    ["costUsd", "LLM Cost", "sum"],
    ["totalTokens", "Tokens", "sum"],
    ["llmTurns", "LLM Turns", "avg"],
    ["llmLatencyMs", "LLM Latency", "avg"],
  ];
  const defaults = candidates
    .filter(([key]) => activeKeys.has(key))
    .map(([key, label, aggregate]) => ({ kind: "column", key, label, aggregate }));
  const authored = params.stats ?? [];
  const merged = [...authored, ...defaults];
  return merged.length > 0 ? merged : undefined;
}
4368
/**
 * Append the built-in charts after the authored charts.
 *
 * Built-ins are an "API Calls" bar chart, an "LLM Cost" area chart, and an
 * "LLM Tokens" bar chart. The token chart is emitted when at least one of
 * the input/output/reasoning token keys is enabled, and shows the total in
 * its tooltip when `totalTokens` is enabled.
 *
 * @param {object} params - Carries `charts`, `globalRemove` and `evalRemove`.
 * @returns {Array<object> | undefined} Authored charts followed by the
 *   enabled built-in charts, or `undefined` when the result is empty.
 */
function appendDefaultCharts(params) {
  const activeKeys = new Set(getActiveDefaultConfigKeys(params));
  const sumOfColumn = (key, label, color) => ({
    source: "column",
    key,
    aggregate: "sum",
    label,
    color,
  });

  const defaults = [];
  if (activeKeys.has("apiCalls")) {
    defaults.push({
      heading: "API Calls",
      type: "bar",
      metrics: [sumOfColumn("apiCalls", "API Calls", "accentDim")],
    });
  }
  if (activeKeys.has("costUsd")) {
    defaults.push({
      heading: "LLM Cost",
      type: "area",
      metrics: [sumOfColumn("costUsd", "Cost", "warning")],
    });
  }

  const tokenMetrics = [
    ["inputTokens", "Input", "accent"],
    ["outputTokens", "Output", "success"],
    ["reasoningTokens", "Reasoning", "error"],
  ]
    .filter(([key]) => activeKeys.has(key))
    .map(([key, label, color]) => sumOfColumn(key, label, color));
  if (tokenMetrics.length > 0) {
    // The tooltip extra carries no color, unlike the plotted metrics.
    const totalExtra = activeKeys.has("totalTokens")
      ? [{ source: "column", key: "totalTokens", aggregate: "sum", label: "Total" }]
      : undefined;
    defaults.push({
      heading: "LLM Tokens",
      type: "bar",
      metrics: tokenMetrics,
      tooltipExtras: totalExtra,
    });
  }

  const authored = params.charts ?? [];
  const merged = [...authored, ...defaults];
  return merged.length > 0 ? merged : undefined;
}
4430
/**
 * Resolve the effective columns, stats and charts for one eval by combining
 * what the eval authored with the enabled built-in defaults.
 *
 * @param {object} params
 * @param {object} params.evalDef - Eval definition; its `columns`, `stats`,
 *   `charts` and `removeDefaultConfig` fields are read.
 * @param {true | string[] | undefined} params.globalRemove - Workspace-level removal setting.
 * @returns {{ columns: object | undefined, stats: Array<object> | undefined, charts: Array<object> | undefined }}
 */
function resolveEvalDefaultConfig(params) {
  const { evalDef, globalRemove } = params;
  const removal = { globalRemove, evalRemove: evalDef.removeDefaultConfig };
  const columns = mergeDefaultColumns({ columns: evalDef.columns, ...removal });
  const stats = appendDefaultStats({ stats: evalDef.stats, ...removal });
  const charts = appendDefaultCharts({ charts: evalDef.charts, ...removal });
  return { columns, stats, charts };
}
4450
/**
 * Sum the numeric entries of a list, ignoring missing ones.
 *
 * Both `null` and `undefined` entries are skipped, so one call that lacks a
 * field cannot turn the whole sum into `NaN`.
 *
 * @param {Iterable<number | null | undefined>} values - Values to add up.
 * @returns {number | undefined} The sum of the present values, or
 *   `undefined` when no value was present at all.
 */
function sumNullable(values) {
  let total = 0;
  let hasValue = false;
  for (const value of values) {
    // `== null` deliberately matches both null and undefined.
    if (value == null) continue;
    total += value;
    hasValue = true;
  }
  return hasValue ? total : undefined;
}
4460
/**
 * Write a built-in output value unless it is disabled, already present, or
 * has no value.
 *
 * @param {object} params
 * @param {object} params.outputs - Output record; mutated in place.
 * @param {string} params.key - Built-in output key.
 * @param {*} params.value - Value to store; `undefined` means nothing to store.
 * @param {Set<string>} params.activeKeys - Built-in keys that are enabled.
 * @returns {void}
 */
function assignIfMissing(params) {
  const { activeKeys, key, outputs, value } = params;
  const isEnabled = activeKeys.has(key);
  // An existing entry always wins, whatever value it holds.
  const shouldAssign = isEnabled && !(key in outputs) && value !== undefined;
  if (shouldAssign) outputs[key] = value;
}
4466
/**
 * Fill in the built-in eval-level outputs derived from a case's spans.
 *
 * Values are written through `assignIfMissing`. `apiCalls` is set only when
 * at least one API call was extracted. When no LLM call was extracted,
 * nothing further is written. Otherwise `llmTurns` is the number of LLM
 * calls and every other key is the sum of one per-call field.
 *
 * @param {object} params - Carries `outputs` (mutated in place), `spans`,
 *   `llmCallsConfig`, `apiCallsConfig`, `globalRemove` and `evalRemove`.
 * @returns {void}
 */
function addDefaultOutputs(params) {
  const { outputs, spans, llmCallsConfig, apiCallsConfig } = params;
  const activeKeys = new Set(getActiveDefaultConfigKeys(params));
  if (activeKeys.size === 0) return;

  const llmCalls = extractLlmCalls(spans, llmCallsConfig);
  const apiCallCount = extractApiCalls(spans, apiCallsConfig).length;
  const assign = (key, value) => assignIfMissing({ outputs, key, value, activeKeys });

  assign("apiCalls", apiCallCount > 0 ? apiCallCount : undefined);
  if (llmCalls.length === 0) return;
  assign("llmTurns", llmCalls.length);

  // [output key, per-call field]; listed in assignment order.
  const summedFields = [
    ["costUsd", "costUsd"],
    ["inputTokens", "inputTokens"],
    ["outputTokens", "outputTokens"],
    ["totalTokens", "totalTokens"],
    ["cachedInputTokens", "cachedInputTokens"],
    ["cacheCreationInputTokens", "cacheCreationInputTokens"],
    ["reasoningTokens", "reasoningTokens"],
    ["llmLatencyMs", "latencyMs"],
  ];
  for (const [key, field] of summedFields) {
    assign(key, sumNullable(llmCalls.map((call) => call[field])));
  }
}
4533
+ //#endregion
4006
4534
  //#region ../runner/src/discovery.ts
4007
4535
  const evalIdMatchRegex = /\bid\s*:\s*['"]([^'"]+)['"]/;
4008
4536
  const evalTitleMatchRegex = /\btitle\s*:\s*['"]([^'"]+)['"]/;
@@ -4664,7 +5192,7 @@ async function callWithUnknownResult(fn, args) {
4664
5192
  return await Reflect.apply(fn, void 0, args);
4665
5193
  }
4666
5194
  async function runCase(params) {
4667
- const { evalDef, evalId, evalCase, globalTraceDisplay, trial, startTime, cacheAdapter, cacheMode, codeFingerprint, moduleIsolation, evalFilePath, workspaceRoot, artifactDir, runId } = params;
5195
+ const { evalDef, evalId, evalCase, globalTraceDisplay, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode, codeFingerprint, moduleIsolation, evalFilePath, workspaceRoot, artifactDir, runId } = params;
4668
5196
  const scopedIdPrefix = buildScopedEvalIdPrefix({
4669
5197
  evalId,
4670
5198
  evalFilePath,
@@ -4714,6 +5242,14 @@ async function runCase(params) {
4714
5242
  scope.assertionFailures.push(toAssertionFailure(message, e instanceof Error ? e : void 0));
4715
5243
  }
4716
5244
  }
5245
+ if (!nonAssertError) addDefaultOutputs({
5246
+ outputs: scope.outputs,
5247
+ spans: scope.spans,
5248
+ llmCallsConfig,
5249
+ apiCallsConfig,
5250
+ globalRemove: globalRemoveDefaultConfig,
5251
+ evalRemove: evalDef.removeDefaultConfig
5252
+ });
4717
5253
  if (!nonAssertError && evalDef.outputsSchema) {
4718
5254
  const { outputsSchema } = evalDef;
4719
5255
  const parsedOutputs = await runInExistingEvalScope(scope, "outputsSchema", () => outputsSchema.safeParse(getOutputsSchemaInput(outputsSchema, scope.outputs)));
@@ -4795,6 +5331,11 @@ async function runCase(params) {
4795
5331
  const status = nonAssertError ? "error" : passed ? "pass" : "fail";
4796
5332
  const { trace: displayTrace, traceDisplay } = resolveTracePresentation(scope.spans, globalTraceDisplay, evalDef.traceDisplay);
4797
5333
  const columns = {};
5334
+ const columnOverrides = mergeDefaultColumns({
5335
+ columns: evalDef.columns,
5336
+ globalRemove: globalRemoveDefaultConfig,
5337
+ evalRemove: evalDef.removeDefaultConfig
5338
+ });
4798
5339
  for (const [key, value] of Object.entries(scope.outputs)) {
4799
5340
  const cell = isBlob(value) ? await persistInlineArtifact({
4800
5341
  artifactDir,
@@ -4803,7 +5344,7 @@ async function runCase(params) {
4803
5344
  outputKey: key,
4804
5345
  trial,
4805
5346
  value
4806
- }) : toCellValue(value, evalDef.columns?.[key]);
5347
+ }) : toCellValue(value, columnOverrides?.[key]);
4807
5348
  if (cell !== void 0) columns[key] = cell;
4808
5349
  }
4809
5350
  for (const key of Object.keys(evalDef.manualScores ?? {})) columns[key] = null;
@@ -5016,6 +5557,8 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
5016
5557
  key: runState.manifest.id,
5017
5558
  workspaceRoot
5018
5559
  };
5560
+ const llmCallsConfig = resolveLlmCallsConfig(config.llmCalls);
5561
+ const apiCallsConfig = resolveApiCallsConfig(config.apiCalls);
5019
5562
  for (const evalMeta of targetEvals) {
5020
5563
  const evalFilePath = evalMeta.sourceFilePath;
5021
5564
  let codeFingerprint = "";
@@ -5054,7 +5597,20 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
5054
5597
  evalId: evalMeta.id
5055
5598
  }), request.target.evalIds, request.target.caseIds, evalMeta.id);
5056
5599
  runState.summary.totalCases += cases.length;
5057
- const accumulatedColumns = /* @__PURE__ */ new Map();
5600
+ const defaultConfig = resolveEvalDefaultConfig({
5601
+ evalDef,
5602
+ globalRemove: config.removeDefaultConfig
5603
+ });
5604
+ const declaredColumnDefs = buildDeclaredColumnDefs(defaultConfig.columns, evalDef.scores, evalDef.manualScores);
5605
+ const accumulatedColumns = new Map(declaredColumnDefs.map((def) => [def.key, def]));
5606
+ const validatedCharts = validateCharts({
5607
+ charts: defaultConfig.charts,
5608
+ columnDefs: declaredColumnDefs,
5609
+ evalId: evalMeta.id
5610
+ });
5611
+ for (const warning of validatedCharts.warnings) console.warn(warning);
5612
+ evalMeta.stats = defaultConfig.stats;
5613
+ evalMeta.charts = validatedCharts.charts;
5058
5614
  const evalCaseRows = [];
5059
5615
  const preparedCases = [];
5060
5616
  const scoreKeys = Object.freeze(Object.keys(evalDef.scores ?? {}));
@@ -5066,7 +5622,7 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
5066
5622
  preparedCases,
5067
5623
  scoreKeys: Object.freeze([...scoreKeys, ...manualScoreKeys]),
5068
5624
  mergeColumns: (columns) => {
5069
- mergeColumnDefs(accumulatedColumns, columns, evalDef.columns, evalDef.scores, evalDef.manualScores);
5625
+ mergeColumnDefs(accumulatedColumns, columns, defaultConfig.columns, evalDef.scores, evalDef.manualScores);
5070
5626
  }
5071
5627
  };
5072
5628
  preparedEvals.push(preparedEval);
@@ -5087,6 +5643,9 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
5087
5643
  evalId: evalMeta.id,
5088
5644
  evalCase,
5089
5645
  globalTraceDisplay,
5646
+ llmCallsConfig,
5647
+ apiCallsConfig,
5648
+ globalRemoveDefaultConfig: config.removeDefaultConfig,
5090
5649
  trial,
5091
5650
  startTime,
5092
5651
  cacheAdapter: bufferedCacheStore ?? (cacheEnabled ? cacheStore : null),
@@ -5237,4 +5796,4 @@ function toLastRunStatus(status) {
5237
5796
  return status === "pending" ? null : status;
5238
5797
  }
5239
5798
  //#endregion
5240
- export { assertionFailureSchema as $, runArtifactRefSchema as $t, getNestedAttribute as A, getEvalRegistry as An, cacheRecordingSchema as At, agentEvalsConfigSchema as B, traceDisplayInputConfigSchema as Bt, createRunRequestSchema as C, runInEvalScope as Cn, cacheEntrySchema as Ct, extractCacheHits as D, startEvalBackgroundJob as Dn, cacheModeSchema as Dt, extractCacheEntries as E, setScopeCacheContext as En, cacheListItemSchema as Et, deriveStatusFromChildStatuses as F, traceAttributeDisplayFormatSchema as Ft, llmCallMetricFormatSchema as G, cellValueSchema as Gt, apiCallMetricPlacementSchema as H, traceSpanKindSchema as Ht, runManifestSchema as I, traceAttributeDisplayInputSchema as It, llmCallsConfigSchema as J, columnKindSchema as Jt, llmCallMetricPlacementSchema as K, columnDefSchema as Kt, runSummarySchema as L, traceAttributeDisplayPlacementSchema as Lt, getEvalDisplayStatus as M, serializedCacheSpanSchema as Mt, deriveScopedSummaryFromCases as N, spanCacheOptionsSchema as Nt, extractApiCalls as O, repoFile as On, cacheOperationTypeSchema as Ot, deriveStatusFromCaseRows as P, traceCacheRefSchema as Pt, trialSelectionModeSchema as Q, repoFileRefSchema as Qt, DEFAULT_API_CALLS_CONFIG as R, traceAttributeDisplaySchema as Rt, createFsCacheStore as S, runInEvalRuntimeScope as Sn, cacheDebugKeyFileSchema as St, sseEnvelopeSchema as T, setEvalOutput as Tn, cacheFileSchema as Tt, apiCallMetricSchema as U, traceSpanSchema as Ut, apiCallMetricFormatSchema as V, traceSpanErrorSchema as Vt, apiCallsConfigSchema as W, traceSpanWarningSchema as Wt, resolveLlmCallsConfig as X, jsonCellSchema as Xt, resolveApiCallsConfig as Y, fileRefSchema as Yt, runLogsConfigSchema as Z, numberDisplayOptionsSchema as Zt, loadEvalModule as _, getEvalCaseInput as _n, evalChartMetricSchema as _t, loadPersistedRunSnapshot as a, hashCacheKey as an, evalStatsConfigSchema as at, buildDeclaredColumnDefs as b, mergeEvalOutput as bn, evalChartsConfigSchema as bt, persistCaseDetail as c, 
deserializeCacheValue as cn, runLogLevelSchema as ct, recomputePersistedCaseStatus as d, EvalAssertionError as dn, scoreTraceSchema as dt, z$1 as en, caseDetailSchema as et, runTouchesEval as f, appendToEvalOutput as fn, evalChartAggregateSchema as ft, setLatestRunInfoMap as g, getCurrentScope as gn, evalChartConfigSchema as gt, getTargetEvalIds as h, evalLog as hn, evalChartColorSchema as ht, getLatestRunInfos as i, evalTracer as in, evalStatItemSchema as it, getEvalTitle as j, cacheStatusSchema as jt, extractLlmCalls as k, defineEval as kn, cacheRecordingOpSchema as kt, persistRunState as l, serializeCacheRecording as ln, runLogLocationSchema as lt, buildEvalSummary as m, evalAssert as mn, evalChartBuiltinMetricSchema as mt, generateRunId as n, captureEvalSpanError as nn, evalFreshnessStatusSchema as nt, loadPersistedRunSnapshots as o, hashCacheKeySync as on, evalSummarySchema as ot, resolveArtifactPath as p, configureEvalRunLogs as pn, evalChartAxisSchema as pt, llmCallMetricSchema as q, columnFormatSchema as qt, getLastRunStatuses as r, evalSpan as rn, evalStatAggregateSchema as rt, nextShortIdFromSnapshots as s, deserializeCacheRecording as sn, runLogEntrySchema as st, executeRun as t, buildTraceTree as tn, caseRowSchema as tt, recomputeEvalStatusesInRuns as u, serializeCacheValue as un, runLogPhaseSchema as ut, parseEvalMetas as v, incrementEvalOutput as vn, evalChartTooltipExtraSchema as vt, updateManualScoreRequestSchema as w, runInExistingEvalScope as wn, cacheEntryWithDebugKeySchema as wt, normalizeScoreDef as x, nextEvalId as xn, cacheDebugKeyEntrySchema as xt, loadConfig as y, isInEvalScope as yn, evalChartTypeSchema as yt, DEFAULT_LLM_CALLS_CONFIG as z, traceDisplayConfigSchema as zt };
5799
+ export { removeDefaultConfigSchema as $, columnKindSchema as $t, extractApiCalls as A, setEvalOutput as An, cacheFileSchema as At, DEFAULT_API_CALLS_CONFIG as B, traceAttributeDisplayFormatSchema as Bt, validateCharts as C, incrementEvalOutput as Cn, evalChartTooltipExtraSchema as Ct, sseEnvelopeSchema as D, runInEvalRuntimeScope as Dn, cacheDebugKeyFileSchema as Dt, updateManualScoreRequestSchema as E, nextEvalId as En, cacheDebugKeyEntrySchema as Et, deriveScopedSummaryFromCases as F, getEvalRegistry as Fn, cacheRecordingSchema as Ft, apiCallMetricSchema as G, traceDisplayInputConfigSchema as Gt, agentEvalsConfigSchema as H, traceAttributeDisplayPlacementSchema as Ht, deriveStatusFromCaseRows as I, cacheStatusSchema as It, llmCallMetricFormatSchema as J, traceSpanSchema as Jt, apiCallsConfigSchema as K, traceSpanErrorSchema as Kt, deriveStatusFromChildStatuses as L, serializedCacheSpanSchema as Lt, getNestedAttribute as M, startEvalBackgroundJob as Mn, cacheModeSchema as Mt, getEvalTitle as N, repoFile as Nn, cacheOperationTypeSchema as Nt, extractCacheEntries as O, runInEvalScope as On, cacheEntrySchema as Ot, getEvalDisplayStatus as P, defineEval as Pn, cacheRecordingOpSchema as Pt, llmCallsConfigSchema as Q, columnFormatSchema as Qt, runManifestSchema as R, spanCacheOptionsSchema as Rt, normalizeScoreDef as S, getEvalCaseInput as Sn, evalChartMetricSchema as St, createRunRequestSchema as T, mergeEvalOutput as Tn, evalChartsConfigSchema as Tt, apiCallMetricFormatSchema as U, traceAttributeDisplaySchema as Ut, DEFAULT_LLM_CALLS_CONFIG as V, traceAttributeDisplayInputSchema as Vt, apiCallMetricPlacementSchema as W, traceDisplayConfigSchema as Wt, llmCallMetricSchema as X, cellValueSchema as Xt, llmCallMetricPlacementSchema as Y, traceSpanWarningSchema as Yt, llmCallPricingSchema as Z, columnDefSchema as Zt, loadEvalModule as _, appendToEvalOutput as _n, evalChartAggregateSchema as _t, loadPersistedRunSnapshot as a, z$1 as an, caseDetailSchema as at, loadConfig 
as b, evalLog as bn, evalChartColorSchema as bt, persistCaseDetail as c, evalSpan as cn, evalStatAggregateSchema as ct, recomputePersistedCaseStatus as d, hashCacheKeySync as dn, evalSummarySchema as dt, fileRefSchema as en, resolveApiCallsConfig as et, runTouchesEval as f, deserializeCacheRecording as fn, runLogEntrySchema as ft, setLatestRunInfoMap as g, EvalAssertionError as gn, scoreTraceSchema as gt, getTargetEvalIds as h, serializeCacheValue as hn, runLogPhaseSchema as ht, getLatestRunInfos as i, runArtifactRefSchema as in, assertionFailureSchema as it, extractLlmCalls as j, setScopeCacheContext as jn, cacheListItemSchema as jt, extractCacheHits as k, runInExistingEvalScope as kn, cacheEntryWithDebugKeySchema as kt, persistRunState as l, evalTracer as ln, evalStatItemSchema as lt, buildEvalSummary as m, serializeCacheRecording as mn, runLogLocationSchema as mt, generateRunId as n, numberDisplayOptionsSchema as nn, runLogsConfigSchema as nt, loadPersistedRunSnapshots as o, buildTraceTree as on, caseRowSchema as ot, resolveArtifactPath as p, deserializeCacheValue as pn, runLogLevelSchema as pt, defaultConfigKeySchema as q, traceSpanKindSchema as qt, getLastRunStatuses as r, repoFileRefSchema as rn, trialSelectionModeSchema as rt, nextShortIdFromSnapshots as s, captureEvalSpanError as sn, evalFreshnessStatusSchema as st, executeRun as t, jsonCellSchema as tn, resolveLlmCallsConfig as tt, recomputeEvalStatusesInRuns as u, hashCacheKey as un, evalStatsConfigSchema as ut, parseEvalMetas as v, configureEvalRunLogs as vn, evalChartAxisSchema as vt, createFsCacheStore as w, isInEvalScope as wn, evalChartTypeSchema as wt, buildDeclaredColumnDefs as x, getCurrentScope as xn, evalChartConfigSchema as xt, resolveEvalDefaultConfig as y, evalAssert as yn, evalChartBuiltinMetricSchema as yt, runSummarySchema as z, traceCacheRefSchema as zt };