@ls-stack/agent-eval 0.19.0 → 0.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1970,7 +1970,15 @@ const numberDisplayOptionsSchema = z.object({
1970
1970
  compactDisplay: z.enum(["short", "long"]).optional(),
1971
1971
  prefix: z.string().optional(),
1972
1972
  suffix: z.string().optional(),
1973
- decimalPlaces: z.number().int().min(0).optional()
1973
+ minDecimalPlaces: z.number().int().min(0).optional(),
1974
+ maxDecimalPlaces: z.number().int().min(0).optional()
1975
+ }).refine((options) => {
1976
+ if (options.minDecimalPlaces === void 0) return true;
1977
+ if (options.maxDecimalPlaces === void 0) return true;
1978
+ return options.minDecimalPlaces <= options.maxDecimalPlaces;
1979
+ }, {
1980
+ message: "minDecimalPlaces must be less than or equal to maxDecimalPlaces",
1981
+ path: ["minDecimalPlaces"]
1974
1982
  });
1975
1983
  /** Schema for the supported column rendering kinds in list views. */
1976
1984
  const columnKindSchema = z.enum([
@@ -2005,7 +2013,6 @@ const columnDefSchema = z.object({
2005
2013
  passThreshold: z.number().optional(),
2006
2014
  maxStars: z.number().int().min(2).optional(),
2007
2015
  hideInTable: z.boolean().optional(),
2008
- sortable: z.boolean().optional(),
2009
2016
  align: z.enum([
2010
2017
  "left",
2011
2018
  "center",
@@ -2403,6 +2410,8 @@ const evalStatItemSchema = z.discriminatedUnion("kind", [
2403
2410
  label: z.string().optional(),
2404
2411
  aggregate: evalStatAggregateSchema,
2405
2412
  format: columnFormatSchema.optional(),
2413
+ /** Number presentation options applied when `format: 'number'`. */
2414
+ numberFormat: numberDisplayOptionsSchema.optional(),
2406
2415
  accent: z.boolean().optional()
2407
2416
  })
2408
2417
  ]);
@@ -2566,6 +2575,21 @@ const caseDetailSchema = z.object({
2566
2575
  //#region ../shared/src/schemas/config.ts
2567
2576
  /** Strategy used to collapse repeated trials into one stored case result. */
2568
2577
  const trialSelectionModeSchema = z.enum(["lowestScore", "median"]);
2578
/**
 * Keys of the built-in eval-level outputs/columns that a config may refer to
 * by name (for example when listing which defaults to remove).
 */
const defaultConfigKeySchema = z.enum([
  "apiCalls",
  "costUsd",
  "llmTurns",
  "inputTokens",
  "outputTokens",
  "totalTokens",
  "cachedInputTokens",
  "cacheCreationInputTokens",
  "reasoningTokens",
  "llmLatencyMs"
]);
/**
 * Removal config for the built-in eval-level outputs and UI metadata:
 * either the literal `true` or an array of built-in keys.
 */
const removeDefaultConfigSchema = z.union([
  z.literal(true),
  z.array(defaultConfigKeySchema)
]);
2569
2593
  /** Render formats supported by an LLM-call metric in the UI. */
2570
2594
  const llmCallMetricFormatSchema = z.enum([
2571
2595
  "string",
@@ -2637,6 +2661,30 @@ const apiCallMetricSchema = z.object({
2637
2661
  */
2638
2662
  placements: z.array(apiCallMetricPlacementSchema).nonempty().optional()
2639
2663
  });
2664
/**
 * Schema for one model/provider pricing entry. Entries are used to derive
 * LLM-call costs from token counts when a span does not already record
 * explicit USD costs.
 *
 * Fields:
 * - `model`: exact model name read from the configured `attributes.model`
 *   path (non-empty).
 * - `provider`: optional provider discriminator read from
 *   `attributes.provider`. When set, the entry only applies to calls from that
 *   provider; provider-specific entries take precedence over generic entries
 *   for the same model.
 * - `inputUsdPerMillion`: USD per one million non-cached input tokens.
 * - `outputUsdPerMillion`: USD per one million output tokens.
 * - `cachedInputUsdPerMillion`: USD per one million prompt-cache read tokens.
 * - `cacheCreationInputUsdPerMillion`: USD per one million prompt-cache write
 *   tokens.
 * - `reasoningUsdPerMillion`: USD per one million reasoning tokens when
 *   reported separately.
 */
const llmCallPricingSchema = (() => {
  /** Builds a fresh non-empty string field. */
  const nonEmptyString = () => z.string().min(1);
  /** Builds a fresh optional, non-negative USD-per-million-tokens rate field. */
  const usdPerMillionRate = () => z.number().nonnegative().optional();
  return z.object({
    model: nonEmptyString(),
    provider: nonEmptyString().optional(),
    inputUsdPerMillion: usdPerMillionRate(),
    outputUsdPerMillion: usdPerMillionRate(),
    cachedInputUsdPerMillion: usdPerMillionRate(),
    cacheCreationInputUsdPerMillion: usdPerMillionRate(),
    reasoningUsdPerMillion: usdPerMillionRate()
  });
})();
2640
2688
  /** Schema for the global LLM calls config block in `agent-evals.config.ts`. */
2641
2689
  const llmCallsConfigSchema = z.object({
2642
2690
  /** Span kinds treated as LLM calls. Defaults to `['llm']`. */
@@ -2647,8 +2695,9 @@ const llmCallsConfigSchema = z.object({
2647
2695
  * built-in defaults (e.g. `usage.inputTokens`, `costUsd`).
2648
2696
  *
2649
2697
  * Per-token-type cost paths (`inputCost`, `outputCost`, `cachedInputCost`,
2650
- * `reasoningCost`) feed the cost breakdown table in the expanded row.
2651
- * Record them as USD numbers alongside `costUsd` in your span attributes.
2698
+ * `reasoningCost`) feed the cost breakdown table in the expanded row when
2699
+ * spans provide explicit USD cost overrides. Prefer `pricing` for deriving
2700
+ * costs from token counts globally.
2652
2701
  */
2653
2702
  attributes: z.object({
2654
2703
  model: z.string().optional(),
@@ -2659,6 +2708,7 @@ const llmCallsConfigSchema = z.object({
2659
2708
  cacheCreationInputTokens: z.string().optional(),
2660
2709
  reasoningTokens: z.string().optional(),
2661
2710
  totalTokens: z.string().optional(),
2711
+ tokensPerSecond: z.string().optional(),
2662
2712
  cost: z.string().optional(),
2663
2713
  inputCost: z.string().optional(),
2664
2714
  outputCost: z.string().optional(),
@@ -2672,6 +2722,12 @@ const llmCallsConfigSchema = z.object({
2672
2722
  reasoning: z.string().optional(),
2673
2723
  toolCalls: z.string().optional()
2674
2724
  }).optional(),
2725
+ /**
2726
+ * Model/provider pricing registry used to calculate missing LLM-call costs
2727
+ * from token counts. Explicit span attributes (`costUsd`, `cost.inputUsd`,
2728
+ * etc.) take precedence over derived prices.
2729
+ */
2730
+ pricing: z.array(llmCallPricingSchema).optional(),
2675
2731
  /** Custom user-defined metrics surfaced on each LLM call. */
2676
2732
  metrics: z.array(llmCallMetricSchema).optional()
2677
2733
  });
@@ -2719,6 +2775,7 @@ const DEFAULT_LLM_CALLS_CONFIG = {
2719
2775
  cacheCreationInputTokens: "usage.cacheCreationInputTokens",
2720
2776
  reasoningTokens: "usage.reasoningTokens",
2721
2777
  totalTokens: "usage.totalTokens",
2778
+ tokensPerSecond: "tokensPerSecond",
2722
2779
  cost: "costUsd",
2723
2780
  inputCost: "cost.inputUsd",
2724
2781
  outputCost: "cost.outputUsd",
@@ -2732,7 +2789,8 @@ const DEFAULT_LLM_CALLS_CONFIG = {
2732
2789
  reasoning: "reasoning",
2733
2790
  toolCalls: "toolCalls"
2734
2791
  },
2735
- metrics: []
2792
+ metrics: [],
2793
+ pricing: []
2736
2794
  };
2737
2795
  /** Default API-calls config the UI uses before the workspace fetch resolves. */
2738
2796
  const DEFAULT_API_CALLS_CONFIG = {
@@ -2765,6 +2823,8 @@ const DEFAULT_API_CALLS_CONFIG = {
2765
2823
  * attribute path.
2766
2824
  * - Missing `metrics[].format` defaults to `'string'`.
2767
2825
  * - Missing `metrics[].placements` defaults to `['body']`.
2826
+ * - Missing `pricing` defaults to an empty registry; explicit span costs still
2827
+ * take precedence over derived costs.
2768
2828
  */
2769
2829
  function resolveLlmCallsConfig(input) {
2770
2830
  return {
@@ -2780,6 +2840,15 @@ function resolveLlmCallsConfig(input) {
2780
2840
  format: m.format ?? "string",
2781
2841
  numberFormat: m.numberFormat,
2782
2842
  placements: m.placements ? [...m.placements] : ["body"]
2843
+ })),
2844
+ pricing: (input?.pricing ?? []).map((p) => ({
2845
+ model: p.model,
2846
+ provider: p.provider,
2847
+ inputUsdPerMillion: p.inputUsdPerMillion,
2848
+ outputUsdPerMillion: p.outputUsdPerMillion,
2849
+ cachedInputUsdPerMillion: p.cachedInputUsdPerMillion,
2850
+ cacheCreationInputUsdPerMillion: p.cacheCreationInputUsdPerMillion,
2851
+ reasoningUsdPerMillion: p.reasoningUsdPerMillion
2783
2852
  }))
2784
2853
  };
2785
2854
  }
@@ -2821,6 +2890,7 @@ const agentEvalsConfigSchema = z.object({
2821
2890
  allowCliRunAll: z.boolean().optional(),
2822
2891
  traceDisplay: traceDisplayInputConfigSchema.optional(),
2823
2892
  llmCalls: llmCallsConfigSchema.optional(),
2893
+ removeDefaultConfig: removeDefaultConfigSchema.optional(),
2824
2894
  apiCalls: apiCallsConfigSchema.optional(),
2825
2895
  runLogs: runLogsConfigSchema.optional(),
2826
2896
  cache: z.object({
@@ -3056,6 +3126,62 @@ function readString$2(attributes, path) {
3056
3126
  const raw = getNestedAttribute(attributes, path);
3057
3127
  return typeof raw === "string" && raw.length > 0 ? raw : null;
3058
3128
  }
3129
/**
 * Derive the USD cost of one token bucket from a per-million-token rate.
 *
 * @param {number | null} tokens Token count, or `null` when not reported.
 * @param {number | undefined} usdPerMillion Rate in USD per one million tokens.
 * @returns {number | null} `null` when the count is missing, `0` when the
 *   count is zero (even without a rate), `null` when a non-zero count has no
 *   rate, otherwise the derived cost.
 */
function computeTokenCost(tokens, usdPerMillion) {
  if (tokens === null) return null;
  // A zero count costs nothing regardless of whether a rate is configured.
  if (tokens === 0) return 0;
  if (usdPerMillion === undefined) return null;
  const millionsOfTokens = tokens / 1000000;
  return millionsOfTokens * usdPerMillion;
}
3135
/**
 * Select the pricing entry that applies to a call.
 *
 * Only entries whose `model` equals the call's model are considered. An entry
 * that declares a `provider` equal to the call's provider wins; otherwise the
 * first entry without a `provider` is used.
 *
 * @param {{ pricing: Array<object>, model: string | null, provider: string | null }} params
 * @returns {object | null} The matching entry, or `null` when the model is
 *   `null` or nothing matches.
 */
function pickPricingEntry({ pricing, model, provider }) {
  if (model === null) return null;
  const sameModel = pricing.filter((entry) => entry.model === model);
  // Provider-specific entries take precedence wherever they sit in the list.
  const providerMatch = sameModel.find((entry) => entry.provider !== undefined && entry.provider === provider);
  if (providerMatch !== undefined) return providerMatch;
  const generic = sameModel.find((entry) => entry.provider === undefined);
  return generic ?? null;
}
3148
/**
 * Sum per-token-type costs into a total cost for one call.
 *
 * Buckets whose token count is `null` are ignored entirely (their cost is not
 * read). Buckets with a zero count contribute nothing. If any bucket with a
 * non-zero count has a `null` cost, the total is unknown.
 *
 * @param {object} breakdown Token counts and costs for the input, output,
 *   cached-input, cache-creation-input and reasoning buckets
 *   (`<bucket>Tokens` / `<bucket>CostUsd`).
 * @returns {number | null} The summed cost; `0` when counts were reported but
 *   all were zero; `null` when no counts were reported or a needed cost is
 *   missing.
 */
function computeFallbackTotalCost(breakdown) {
  // [tokens, cost] pairs, kept in a fixed order so the sum is accumulated
  // in the same sequence every time.
  const buckets = [
    [breakdown.inputTokens, breakdown.inputCostUsd],
    [breakdown.outputTokens, breakdown.outputCostUsd],
    [breakdown.cachedInputTokens, breakdown.cachedInputCostUsd],
    [breakdown.cacheCreationInputTokens, breakdown.cacheCreationInputCostUsd],
    [breakdown.reasoningTokens, breakdown.reasoningCostUsd]
  ];
  const reported = buckets.filter(([tokens]) => tokens !== null);
  if (reported.length === 0) return null;
  const billable = reported.filter(([tokens]) => tokens !== 0);
  if (billable.some(([, cost]) => cost === null)) return null;
  if (billable.length === 0) return 0;
  return billable.reduce((sum, [, cost]) => sum + cost, 0);
}
3059
3185
  function computeLatencyMs$1(span) {
3060
3186
  if (span.endedAt === null) return null;
3061
3187
  const started = Date.parse(span.startedAt);
@@ -3100,9 +3226,11 @@ function pickError$1(span) {
3100
3226
  * shape consumed by the LLM calls tab.
3101
3227
  *
3102
3228
  * Spans whose `kind` is not in `config.kinds` are dropped. Structured fields
3103
- * (`model`, token counts, cost, etc.) are read via `getNestedAttribute` from
3104
- * the configured paths, with safe coercion to `string | null` / `number |
3105
- * null`. `totalTokens` falls back to a sum of input + output + cached when no
3229
+ * (`model`, token counts, explicit cost, etc.) are read via
3230
+ * `getNestedAttribute` from the configured paths, with safe coercion to
3231
+ * `string | null` / `number | null`. When explicit USD costs are absent,
3232
+ * configured model pricing derives per-token-type costs from token counts.
3233
+ * `totalTokens` falls back to a sum of input + output + cached when no
3106
3234
  * explicit total attribute is present. The `steps` attribute path may resolve
3107
3235
  * to either a number (rendered as the inference-round count) or an array of
3108
3236
  * per-step detail objects (rendered as a Steps section in the body, with
@@ -3118,12 +3246,36 @@ function extractLlmCalls(spans, config) {
3118
3246
  for (const span of spans) {
3119
3247
  if (!kindSet.has(span.kind)) continue;
3120
3248
  const attrs = span.attributes;
3249
+ const model = readString$2(attrs, config.attributes.model);
3250
+ const provider = readString$2(attrs, config.attributes.provider);
3121
3251
  const inputTokens = readNumber$2(attrs, config.attributes.inputTokens);
3122
3252
  const outputTokens = readNumber$2(attrs, config.attributes.outputTokens);
3123
3253
  const cachedInputTokens = readNumber$2(attrs, config.attributes.cachedInputTokens);
3124
3254
  const cacheCreationInputTokens = readNumber$2(attrs, config.attributes.cacheCreationInputTokens);
3125
3255
  const reasoningTokens = readNumber$2(attrs, config.attributes.reasoningTokens);
3126
3256
  const declaredTotalTokens = readNumber$2(attrs, config.attributes.totalTokens);
3257
+ const pricing = pickPricingEntry({
3258
+ pricing: config.pricing,
3259
+ model,
3260
+ provider
3261
+ });
3262
+ const inputCostUsd = readNumber$2(attrs, config.attributes.inputCost) ?? computeTokenCost(inputTokens, pricing?.inputUsdPerMillion);
3263
+ const outputCostUsd = readNumber$2(attrs, config.attributes.outputCost) ?? computeTokenCost(outputTokens, pricing?.outputUsdPerMillion);
3264
+ const cachedInputCostUsd = readNumber$2(attrs, config.attributes.cachedInputCost) ?? computeTokenCost(cachedInputTokens, pricing?.cachedInputUsdPerMillion);
3265
+ const cacheCreationInputCostUsd = readNumber$2(attrs, config.attributes.cacheCreationInputCost) ?? computeTokenCost(cacheCreationInputTokens, pricing?.cacheCreationInputUsdPerMillion);
3266
+ const reasoningCostUsd = readNumber$2(attrs, config.attributes.reasoningCost) ?? computeTokenCost(reasoningTokens, pricing?.reasoningUsdPerMillion);
3267
+ const costUsd = readNumber$2(attrs, config.attributes.cost) ?? computeFallbackTotalCost({
3268
+ inputTokens,
3269
+ inputCostUsd,
3270
+ outputTokens,
3271
+ outputCostUsd,
3272
+ cachedInputTokens,
3273
+ cachedInputCostUsd,
3274
+ cacheCreationInputTokens,
3275
+ cacheCreationInputCostUsd,
3276
+ reasoningTokens,
3277
+ reasoningCostUsd
3278
+ });
3127
3279
  const metrics = [];
3128
3280
  for (const metric of config.metrics) {
3129
3281
  const rawValue = getNestedAttribute(attrs, metric.path);
@@ -3142,8 +3294,8 @@ function extractLlmCalls(spans, config) {
3142
3294
  name: span.name,
3143
3295
  kind: span.kind,
3144
3296
  status: span.status,
3145
- model: readString$2(attrs, config.attributes.model),
3146
- provider: readString$2(attrs, config.attributes.provider),
3297
+ model,
3298
+ provider,
3147
3299
  inputTokens,
3148
3300
  outputTokens,
3149
3301
  cachedInputTokens,
@@ -3156,12 +3308,13 @@ function extractLlmCalls(spans, config) {
3156
3308
  cached: cachedInputTokens,
3157
3309
  cacheCreation: cacheCreationInputTokens
3158
3310
  }),
3159
- costUsd: readNumber$2(attrs, config.attributes.cost),
3160
- inputCostUsd: readNumber$2(attrs, config.attributes.inputCost),
3161
- outputCostUsd: readNumber$2(attrs, config.attributes.outputCost),
3162
- cachedInputCostUsd: readNumber$2(attrs, config.attributes.cachedInputCost),
3163
- cacheCreationInputCostUsd: readNumber$2(attrs, config.attributes.cacheCreationInputCost),
3164
- reasoningCostUsd: readNumber$2(attrs, config.attributes.reasoningCost),
3311
+ tokensPerSecond: readNumber$2(attrs, config.attributes.tokensPerSecond),
3312
+ costUsd,
3313
+ inputCostUsd,
3314
+ outputCostUsd,
3315
+ cachedInputCostUsd,
3316
+ cacheCreationInputCostUsd,
3317
+ reasoningCostUsd,
3165
3318
  ...readSteps(attrs, config.attributes.steps),
3166
3319
  finishReason: readString$2(attrs, config.attributes.finishReason),
3167
3320
  latencyMs: computeLatencyMs$1(span),
@@ -3792,6 +3945,80 @@ function isRecordLike(value) {
3792
3945
  return typeof value === "object" && value !== null && !Array.isArray(value);
3793
3946
  }
3794
3947
  //#endregion
3948
+ //#region ../runner/src/chartValidation.ts
3949
/**
 * Check that a column-sourced chart metric refers to a known column and, when
 * it aggregates with `"passThresholdRate"`, that the column is a score with a
 * numeric `passThreshold`.
 *
 * @param {{ key: string, aggregate?: string }} metric Metric to validate.
 * @param {Map<string, object>} columnsByKey Column definitions keyed by `key`.
 * @param {string} evalId Eval id used as the warning prefix.
 * @param {string[]} warnings Collector; a message is pushed on rejection.
 * @returns {boolean} `true` when the metric may be kept.
 */
function isValidColumnMetric(metric, columnsByKey, evalId, warnings) {
  const { key, aggregate } = metric;
  const column = columnsByKey.get(key);
  if (!column) {
    warnings.push(`[${evalId}] chart metric references unknown column "${key}" — dropped`);
    return false;
  }
  const hasPassThreshold = column.isScore === true && typeof column.passThreshold === "number";
  if (aggregate === "passThresholdRate" && !hasPassThreshold) {
    warnings.push(`[${evalId}] chart metric "${key}" uses "passThresholdRate" but the column is not a score with passThreshold — dropped`);
    return false;
  }
  return true;
}
3963
/**
 * Check that a column-sourced chart tooltip extra refers to a known column
 * and, when it aggregates with `"passThresholdRate"`, that the column is a
 * score with a numeric `passThreshold`.
 *
 * @param {{ key: string, aggregate?: string }} extra Tooltip extra to validate.
 * @param {Map<string, object>} columnsByKey Column definitions keyed by `key`.
 * @param {string} evalId Eval id used as the warning prefix.
 * @param {string[]} warnings Collector; a message is pushed on rejection.
 * @returns {boolean} `true` when the extra may be kept.
 */
function isValidTooltipExtra(extra, columnsByKey, evalId, warnings) {
  const { key, aggregate } = extra;
  const column = columnsByKey.get(key);
  if (!column) {
    warnings.push(`[${evalId}] chart tooltip extra references unknown column "${key}" — dropped`);
    return false;
  }
  const hasPassThreshold = column.isScore === true && typeof column.passThreshold === "number";
  if (aggregate === "passThresholdRate" && !hasPassThreshold) {
    warnings.push(`[${evalId}] chart tooltip extra "${key}" uses "passThresholdRate" but the column is not a score with passThreshold — dropped`);
    return false;
  }
  return true;
}
3977
/**
 * Filter one chart down to its valid metrics and tooltip extras.
 *
 * Entries whose `source` is `"builtin"` are always kept; all others are
 * checked against the declared columns. Metrics are validated first, and a
 * chart left with no metrics is dropped before its tooltip extras are looked
 * at.
 *
 * @param {object} chart Authored chart config with `metrics` and optional
 *   `tooltipExtras`.
 * @param {Map<string, object>} columnsByKey Column definitions keyed by `key`.
 * @param {string} evalId Eval id used as the warning prefix.
 * @param {string[]} warnings Collector for validation warnings.
 * @returns {object | null} A copy of the chart with filtered `metrics` and
 *   `tooltipExtras` (`undefined` when none remain), or `null` when no metric
 *   survived.
 */
function sanitizeChart(chart, columnsByKey, evalId, warnings) {
  const keepMetric = (metric) => metric.source === "builtin" || isValidColumnMetric(metric, columnsByKey, evalId, warnings);
  const keepExtra = (extra) => extra.source === "builtin" || isValidTooltipExtra(extra, columnsByKey, evalId, warnings);

  const metrics = chart.metrics.filter(keepMetric);
  if (metrics.length === 0) {
    warnings.push(`[${evalId}] chart had no valid metrics after validation — chart dropped`);
    return null;
  }

  let tooltipExtras;
  if (chart.tooltipExtras != null) {
    const keptExtras = chart.tooltipExtras.filter(keepExtra);
    // An empty list is normalized to `undefined`.
    if (keptExtras.length > 0) tooltipExtras = keptExtras;
  }
  return {
    ...chart,
    metrics,
    tooltipExtras
  };
}
3996
/**
 * Validate and sanitize an authored `charts` config against the eval's
 * declared columns.
 *
 * Each chart is passed through `sanitizeChart`; charts it rejects are left
 * out. When no charts were authored, or none survive, `charts` is `undefined`
 * so the UI falls back to rendering no chart (matching the opt-in default).
 *
 * @param {{ charts?: object[], columnDefs: object[], evalId: string }} params
 * @returns {{ charts: object[] | undefined, warnings: string[] }} The
 *   surviving charts plus every warning raised while validating.
 */
function validateCharts(params) {
  const { charts, columnDefs, evalId } = params;
  const warnings = [];
  // Nothing authored: return before touching `columnDefs`.
  if (!charts || charts.length === 0) {
    return { charts: void 0, warnings };
  }
  const columnsByKey = new Map(columnDefs.map((def) => [def.key, def]));
  const kept = [];
  for (const chart of charts) {
    const sanitizedChart = sanitizeChart(chart, columnsByKey, evalId, warnings);
    if (sanitizedChart) kept.push(sanitizedChart);
  }
  const survivingCharts = kept.length > 0 ? kept : void 0;
  return { charts: survivingCharts, warnings };
}
4021
+ //#endregion
3795
4022
  //#region ../runner/src/columnBuilder.ts
3796
4023
  /**
3797
4024
  * Normalize a user-provided score definition (either a function or an
@@ -3817,7 +4044,6 @@ function getScoreOverride(def) {
3817
4044
  format: def.format,
3818
4045
  numberFormat: def.numberFormat,
3819
4046
  hideInTable: def.hideInTable,
3820
- sortable: def.sortable,
3821
4047
  align: def.align,
3822
4048
  maxStars: def.maxStars
3823
4049
  };
@@ -3830,7 +4056,6 @@ function mergeOverrides(base, override) {
3830
4056
  format: override.format ?? base.format,
3831
4057
  numberFormat: override.numberFormat ?? base.numberFormat,
3832
4058
  hideInTable: override.hideInTable ?? base.hideInTable,
3833
- sortable: override.sortable ?? base.sortable,
3834
4059
  align: override.align ?? base.align,
3835
4060
  maxStars: override.maxStars ?? base.maxStars
3836
4061
  };
@@ -3945,7 +4170,6 @@ function createColumnDef(params) {
3945
4170
  if (override?.numberFormat !== void 0) def.numberFormat = override.numberFormat;
3946
4171
  if (override?.maxStars !== void 0) def.maxStars = override.maxStars;
3947
4172
  if (override?.hideInTable !== void 0) def.hideInTable = override.hideInTable;
3948
- if (override?.sortable !== void 0) def.sortable = override.sortable;
3949
4173
  if (override?.align !== void 0) def.align = override.align;
3950
4174
  if (!isScore) return def;
3951
4175
  def.isScore = true;
@@ -4003,6 +4227,294 @@ async function loadConfig() {
4003
4227
  }
4004
4228
  }
4005
4229
  //#endregion
4230
+ //#region ../runner/src/defaultConfig.ts
4231
/**
 * Built-in eval-level keys that are active unless removed.
 *
 * NOTE(review): `defaultConfigKeySchema` also accepts "reasoningTokens" and
 * `DEFAULT_COLUMNS` below defines a column for it, but it is not listed here,
 * so that column entry is never selected through this list — confirm whether
 * the omission is intentional.
 */
const DEFAULT_CONFIG_KEYS = [
  "apiCalls",
  "costUsd",
  "llmTurns",
  "inputTokens",
  "outputTokens",
  "totalTokens",
  "cachedInputTokens",
  "cacheCreationInputTokens",
  "llmLatencyMs"
];
/** Number format shared by the token-count columns and stats. */
const tokenNumberFormat = { notation: "compact" };
/** Number format for whole-number counts (no decimal places). */
const countNumberFormat = {
  minDecimalPlaces: 0,
  maxDecimalPlaces: 0
};
/** Number format for USD costs: "$" prefix, at most four decimal places. */
const costNumberFormat = {
  prefix: "$",
  maxDecimalPlaces: 4
};
/** Column definitions for the built-in keys, all right-aligned. */
const DEFAULT_COLUMNS = (() => {
  /** Builds a right-aligned number column that uses the given format object. */
  const numberColumn = (label, numberFormat) => ({
    label,
    format: "number",
    numberFormat,
    align: "right"
  });
  return {
    apiCalls: numberColumn("API Calls", countNumberFormat),
    costUsd: numberColumn("Cost", costNumberFormat),
    llmTurns: numberColumn("LLM Turns", countNumberFormat),
    inputTokens: numberColumn("Input Tokens", tokenNumberFormat),
    outputTokens: numberColumn("Output Tokens", tokenNumberFormat),
    totalTokens: numberColumn("Total Tokens", tokenNumberFormat),
    cachedInputTokens: numberColumn("Cached Input Tokens", tokenNumberFormat),
    cacheCreationInputTokens: numberColumn("Cache Write Tokens", tokenNumberFormat),
    reasoningTokens: numberColumn("Reasoning Tokens", tokenNumberFormat),
    // Latency is the only built-in column that is not a plain number.
    llmLatencyMs: {
      label: "LLM Latency",
      format: "duration",
      align: "right"
    }
  };
})();
4312
/**
 * Combine the global and per-eval removal settings into one set of keys.
 *
 * @param {true | string[] | undefined} globalRemove Workspace-level setting.
 * @param {true | string[] | undefined} evalRemove Eval-level setting.
 * @returns {Set<string>} Every key in `DEFAULT_CONFIG_KEYS` when either
 *   setting is `true`; otherwise the union of both lists.
 */
function resolveRemovedKeys(globalRemove, evalRemove) {
  const removesEverything = [globalRemove, evalRemove].includes(true);
  if (removesEverything) return new Set(DEFAULT_CONFIG_KEYS);
  const removed = new Set();
  for (const key of globalRemove ?? []) removed.add(key);
  for (const key of evalRemove ?? []) removed.add(key);
  return removed;
}
4316
/**
 * List the built-in keys that remain after applying the removal settings,
 * in `DEFAULT_CONFIG_KEYS` order.
 *
 * @param {{ globalRemove?: true | string[], evalRemove?: true | string[] }} params
 * @returns {string[]} The keys that were not removed.
 */
function getActiveDefaultConfigKeys(params) {
  const { globalRemove, evalRemove } = params;
  const removedKeys = resolveRemovedKeys(globalRemove, evalRemove);
  const active = [];
  for (const key of DEFAULT_CONFIG_KEYS) {
    if (!removedKeys.has(key)) active.push(key);
  }
  return active;
}
4320
/**
 * Layer the authored column overrides on top of the active built-in columns.
 *
 * @param {{ columns?: object, globalRemove?: true | string[], evalRemove?: true | string[] }} params
 * @returns {object | undefined} `params.columns` unchanged when every built-in
 *   key is removed; otherwise a new object holding the active built-in columns
 *   first, then the authored columns (authored entries win on key clashes).
 */
function mergeDefaultColumns(params) {
  const activeKeys = getActiveDefaultConfigKeys(params);
  if (activeKeys.length === 0) return params.columns;
  const builtinColumns = {};
  for (const key of activeKeys) {
    builtinColumns[key] = DEFAULT_COLUMNS[key];
  }
  return { ...builtinColumns, ...params.columns };
}
4328
/**
 * Append the built-in stat items for the active keys after the authored ones.
 *
 * Built-in stats exist for `apiCalls`, `costUsd`, `totalTokens` and
 * `llmTurns` (in that order); each is a `"column"` stat aggregated with
 * `"avg"`.
 *
 * @param {{ stats?: object[], globalRemove?: true | string[], evalRemove?: true | string[] }} params
 * @returns {object[] | undefined} Authored stats followed by the built-in
 *   ones, or `undefined` when the combined list is empty.
 */
function appendDefaultStats(params) {
  const activeKeys = new Set(getActiveDefaultConfigKeys(params));
  // [key, label, numberFormat] in display order.
  const builtinStats = [
    ["apiCalls", "API Calls", countNumberFormat],
    ["costUsd", "LLM Cost", costNumberFormat],
    ["totalTokens", "Tokens", tokenNumberFormat],
    ["llmTurns", "LLM Turns", countNumberFormat]
  ];
  const defaults = builtinStats
    .filter(([key]) => activeKeys.has(key))
    .map(([key, label, numberFormat]) => ({
      kind: "column",
      key,
      label,
      aggregate: "avg",
      numberFormat
    }));
  const authored = params.stats ?? [];
  const combined = [...authored, ...defaults];
  if (combined.length === 0) return void 0;
  return combined;
}
4362
/**
 * Append the built-in charts for the active keys after the authored ones.
 *
 * - An "LLM Cost" area chart is added when `costUsd` is active.
 * - An "LLM Tokens" bar chart is added when at least one of the input,
 *   output, cached-input or cache-creation token keys is active; it carries a
 *   "Total" tooltip extra when `totalTokens` is active.
 *
 * @param {{ charts?: object[], globalRemove?: true | string[], evalRemove?: true | string[] }} params
 * @returns {object[] | undefined} Authored charts followed by the built-in
 *   ones, or `undefined` when the combined list is empty.
 */
function appendDefaultCharts(params) {
  const activeKeys = new Set(getActiveDefaultConfigKeys(params));
  /** Builds a column-sourced, avg-aggregated series descriptor. */
  const avgColumnSeries = (key, label, color) => ({
    source: "column",
    key,
    aggregate: "avg",
    label,
    color
  });
  const defaults = [];

  if (activeKeys.has("costUsd")) {
    defaults.push({
      heading: "LLM Cost",
      type: "area",
      metrics: [avgColumnSeries("costUsd", "Cost", "warning")]
    });
  }

  // [key, label, color] for each token series, in display order.
  const tokenSeries = [
    ["inputTokens", "Input", "accent"],
    ["outputTokens", "Output", "success"],
    ["cachedInputTokens", "Cached Input", "error"],
    ["cacheCreationInputTokens", "Cache Write", "warning"]
  ];
  const tokenMetrics = tokenSeries
    .filter(([key]) => activeKeys.has(key))
    .map(([key, label, color]) => avgColumnSeries(key, label, color));
  if (tokenMetrics.length > 0) {
    let tooltipExtras = void 0;
    if (activeKeys.has("totalTokens")) {
      tooltipExtras = [{
        source: "column",
        key: "totalTokens",
        aggregate: "avg",
        label: "Total"
      }];
    }
    defaults.push({
      heading: "LLM Tokens",
      type: "bar",
      metrics: tokenMetrics,
      tooltipExtras
    });
  }

  const authored = params.charts ?? [];
  const combined = [...authored, ...defaults];
  if (combined.length === 0) return void 0;
  return combined;
}
4420
/**
 * Resolve an eval's effective columns, stats and charts by combining what the
 * eval authored with the built-in defaults that were not removed.
 *
 * @param {{ evalDef: object, globalRemove?: true | string[] }} params
 *   `evalDef` supplies `columns`, `stats`, `charts` and `removeDefaultConfig`.
 * @returns {{ columns: object | undefined, stats: object[] | undefined, charts: object[] | undefined }}
 */
function resolveEvalDefaultConfig(params) {
  const { evalDef, globalRemove } = params;
  // The same removal settings apply to all three merges.
  const removal = {
    globalRemove,
    evalRemove: evalDef.removeDefaultConfig
  };
  const columns = mergeDefaultColumns({ columns: evalDef.columns, ...removal });
  const stats = appendDefaultStats({ stats: evalDef.stats, ...removal });
  const charts = appendDefaultCharts({ charts: evalDef.charts, ...removal });
  return { columns, stats, charts };
}
4440
/**
 * Sum the non-`null` entries of a list.
 *
 * @param {Iterable<number | null>} values Values to add up.
 * @returns {number | undefined} The sum, or `undefined` when every entry is
 *   `null` (or the list is empty).
 */
function sumNullable(values) {
  const present = Array.from(values).filter((value) => value !== null);
  if (present.length === 0) return void 0;
  return present.reduce((total, value) => total + value, 0);
}
4450
/**
 * Write `value` to `outputs[key]` only when the key is active, the key is not
 * already present on `outputs` (checked with `in`), and the value is defined.
 *
 * @param {{ outputs: object, key: string, value: unknown, activeKeys: Set<string> }} params
 * @returns {void}
 */
function assignIfMissing(params) {
  const { outputs, key, value, activeKeys } = params;
  const shouldAssign = activeKeys.has(key) && !(key in outputs) && value !== void 0;
  if (shouldAssign) outputs[key] = value;
}
4456
/**
 * Fill in the built-in outputs derived from a case's spans, without
 * overwriting anything already present on `outputs`.
 *
 * `apiCalls` is the number of extracted API calls (left unset when zero). When
 * at least one LLM call was extracted, `llmTurns` is the call count and the
 * cost, token and latency outputs are sums over the calls, skipping `null`
 * values; a sum with no contributing values is left unset.
 *
 * @param {{ outputs: object, spans: object[], llmCallsConfig: object, apiCallsConfig: object, globalRemove?: true | string[], evalRemove?: true | string[] }} params
 * @returns {void} `params.outputs` is mutated in place.
 */
function addDefaultOutputs(params) {
  const { outputs, spans, llmCallsConfig, apiCallsConfig } = params;
  const activeKeys = new Set(getActiveDefaultConfigKeys(params));
  if (activeKeys.size === 0) return;

  const llmCalls = extractLlmCalls(spans, llmCallsConfig);
  const apiCallCount = extractApiCalls(spans, apiCallsConfig).length;
  const assign = (key, value) => assignIfMissing({
    outputs,
    key,
    value,
    activeKeys
  });

  assign("apiCalls", apiCallCount > 0 ? apiCallCount : void 0);
  if (llmCalls.length === 0) return;
  assign("llmTurns", llmCalls.length);

  // [output key, LLM-call field] pairs, assigned in this order.
  const summedOutputs = [
    ["costUsd", "costUsd"],
    ["inputTokens", "inputTokens"],
    ["outputTokens", "outputTokens"],
    ["totalTokens", "totalTokens"],
    ["cachedInputTokens", "cachedInputTokens"],
    ["cacheCreationInputTokens", "cacheCreationInputTokens"],
    ["llmLatencyMs", "latencyMs"]
  ];
  for (const [outputKey, callField] of summedOutputs) {
    assign(outputKey, sumNullable(llmCalls.map((call) => call[callField])));
  }
}
4517
+ //#endregion
4006
4518
  //#region ../runner/src/discovery.ts
4007
4519
  const evalIdMatchRegex = /\bid\s*:\s*['"]([^'"]+)['"]/;
4008
4520
  const evalTitleMatchRegex = /\btitle\s*:\s*['"]([^'"]+)['"]/;
@@ -4664,7 +5176,7 @@ async function callWithUnknownResult(fn, args) {
4664
5176
  return await Reflect.apply(fn, void 0, args);
4665
5177
  }
4666
5178
  async function runCase(params) {
4667
- const { evalDef, evalId, evalCase, globalTraceDisplay, trial, startTime, cacheAdapter, cacheMode, codeFingerprint, moduleIsolation, evalFilePath, workspaceRoot, artifactDir, runId } = params;
5179
+ const { evalDef, evalId, evalCase, globalTraceDisplay, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode, codeFingerprint, moduleIsolation, evalFilePath, workspaceRoot, artifactDir, runId } = params;
4668
5180
  const scopedIdPrefix = buildScopedEvalIdPrefix({
4669
5181
  evalId,
4670
5182
  evalFilePath,
@@ -4714,6 +5226,14 @@ async function runCase(params) {
4714
5226
  scope.assertionFailures.push(toAssertionFailure(message, e instanceof Error ? e : void 0));
4715
5227
  }
4716
5228
  }
5229
+ if (!nonAssertError) addDefaultOutputs({
5230
+ outputs: scope.outputs,
5231
+ spans: scope.spans,
5232
+ llmCallsConfig,
5233
+ apiCallsConfig,
5234
+ globalRemove: globalRemoveDefaultConfig,
5235
+ evalRemove: evalDef.removeDefaultConfig
5236
+ });
4717
5237
  if (!nonAssertError && evalDef.outputsSchema) {
4718
5238
  const { outputsSchema } = evalDef;
4719
5239
  const parsedOutputs = await runInExistingEvalScope(scope, "outputsSchema", () => outputsSchema.safeParse(getOutputsSchemaInput(outputsSchema, scope.outputs)));
@@ -4795,6 +5315,11 @@ async function runCase(params) {
4795
5315
  const status = nonAssertError ? "error" : passed ? "pass" : "fail";
4796
5316
  const { trace: displayTrace, traceDisplay } = resolveTracePresentation(scope.spans, globalTraceDisplay, evalDef.traceDisplay);
4797
5317
  const columns = {};
5318
+ const columnOverrides = mergeDefaultColumns({
5319
+ columns: evalDef.columns,
5320
+ globalRemove: globalRemoveDefaultConfig,
5321
+ evalRemove: evalDef.removeDefaultConfig
5322
+ });
4798
5323
  for (const [key, value] of Object.entries(scope.outputs)) {
4799
5324
  const cell = isBlob(value) ? await persistInlineArtifact({
4800
5325
  artifactDir,
@@ -4803,7 +5328,7 @@ async function runCase(params) {
4803
5328
  outputKey: key,
4804
5329
  trial,
4805
5330
  value
4806
- }) : toCellValue(value, evalDef.columns?.[key]);
5331
+ }) : toCellValue(value, columnOverrides?.[key]);
4807
5332
  if (cell !== void 0) columns[key] = cell;
4808
5333
  }
4809
5334
  for (const key of Object.keys(evalDef.manualScores ?? {})) columns[key] = null;
@@ -5016,6 +5541,8 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
5016
5541
  key: runState.manifest.id,
5017
5542
  workspaceRoot
5018
5543
  };
5544
+ const llmCallsConfig = resolveLlmCallsConfig(config.llmCalls);
5545
+ const apiCallsConfig = resolveApiCallsConfig(config.apiCalls);
5019
5546
  for (const evalMeta of targetEvals) {
5020
5547
  const evalFilePath = evalMeta.sourceFilePath;
5021
5548
  let codeFingerprint = "";
@@ -5054,7 +5581,20 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
5054
5581
  evalId: evalMeta.id
5055
5582
  }), request.target.evalIds, request.target.caseIds, evalMeta.id);
5056
5583
  runState.summary.totalCases += cases.length;
5057
- const accumulatedColumns = /* @__PURE__ */ new Map();
5584
+ const defaultConfig = resolveEvalDefaultConfig({
5585
+ evalDef,
5586
+ globalRemove: config.removeDefaultConfig
5587
+ });
5588
+ const declaredColumnDefs = buildDeclaredColumnDefs(defaultConfig.columns, evalDef.scores, evalDef.manualScores);
5589
+ const accumulatedColumns = new Map(declaredColumnDefs.map((def) => [def.key, def]));
5590
+ const validatedCharts = validateCharts({
5591
+ charts: defaultConfig.charts,
5592
+ columnDefs: declaredColumnDefs,
5593
+ evalId: evalMeta.id
5594
+ });
5595
+ for (const warning of validatedCharts.warnings) console.warn(warning);
5596
+ evalMeta.stats = defaultConfig.stats;
5597
+ evalMeta.charts = validatedCharts.charts;
5058
5598
  const evalCaseRows = [];
5059
5599
  const preparedCases = [];
5060
5600
  const scoreKeys = Object.freeze(Object.keys(evalDef.scores ?? {}));
@@ -5066,7 +5606,7 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
5066
5606
  preparedCases,
5067
5607
  scoreKeys: Object.freeze([...scoreKeys, ...manualScoreKeys]),
5068
5608
  mergeColumns: (columns) => {
5069
- mergeColumnDefs(accumulatedColumns, columns, evalDef.columns, evalDef.scores, evalDef.manualScores);
5609
+ mergeColumnDefs(accumulatedColumns, columns, defaultConfig.columns, evalDef.scores, evalDef.manualScores);
5070
5610
  }
5071
5611
  };
5072
5612
  preparedEvals.push(preparedEval);
@@ -5087,6 +5627,9 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
5087
5627
  evalId: evalMeta.id,
5088
5628
  evalCase,
5089
5629
  globalTraceDisplay,
5630
+ llmCallsConfig,
5631
+ apiCallsConfig,
5632
+ globalRemoveDefaultConfig: config.removeDefaultConfig,
5090
5633
  trial,
5091
5634
  startTime,
5092
5635
  cacheAdapter: bufferedCacheStore ?? (cacheEnabled ? cacheStore : null),
@@ -5237,4 +5780,4 @@ function toLastRunStatus(status) {
5237
5780
  return status === "pending" ? null : status;
5238
5781
  }
5239
5782
  //#endregion
5240
- export { assertionFailureSchema as $, runArtifactRefSchema as $t, getNestedAttribute as A, getEvalRegistry as An, cacheRecordingSchema as At, agentEvalsConfigSchema as B, traceDisplayInputConfigSchema as Bt, createRunRequestSchema as C, runInEvalScope as Cn, cacheEntrySchema as Ct, extractCacheHits as D, startEvalBackgroundJob as Dn, cacheModeSchema as Dt, extractCacheEntries as E, setScopeCacheContext as En, cacheListItemSchema as Et, deriveStatusFromChildStatuses as F, traceAttributeDisplayFormatSchema as Ft, llmCallMetricFormatSchema as G, cellValueSchema as Gt, apiCallMetricPlacementSchema as H, traceSpanKindSchema as Ht, runManifestSchema as I, traceAttributeDisplayInputSchema as It, llmCallsConfigSchema as J, columnKindSchema as Jt, llmCallMetricPlacementSchema as K, columnDefSchema as Kt, runSummarySchema as L, traceAttributeDisplayPlacementSchema as Lt, getEvalDisplayStatus as M, serializedCacheSpanSchema as Mt, deriveScopedSummaryFromCases as N, spanCacheOptionsSchema as Nt, extractApiCalls as O, repoFile as On, cacheOperationTypeSchema as Ot, deriveStatusFromCaseRows as P, traceCacheRefSchema as Pt, trialSelectionModeSchema as Q, repoFileRefSchema as Qt, DEFAULT_API_CALLS_CONFIG as R, traceAttributeDisplaySchema as Rt, createFsCacheStore as S, runInEvalRuntimeScope as Sn, cacheDebugKeyFileSchema as St, sseEnvelopeSchema as T, setEvalOutput as Tn, cacheFileSchema as Tt, apiCallMetricSchema as U, traceSpanSchema as Ut, apiCallMetricFormatSchema as V, traceSpanErrorSchema as Vt, apiCallsConfigSchema as W, traceSpanWarningSchema as Wt, resolveLlmCallsConfig as X, jsonCellSchema as Xt, resolveApiCallsConfig as Y, fileRefSchema as Yt, runLogsConfigSchema as Z, numberDisplayOptionsSchema as Zt, loadEvalModule as _, getEvalCaseInput as _n, evalChartMetricSchema as _t, loadPersistedRunSnapshot as a, hashCacheKey as an, evalStatsConfigSchema as at, buildDeclaredColumnDefs as b, mergeEvalOutput as bn, evalChartsConfigSchema as bt, persistCaseDetail as c, 
deserializeCacheValue as cn, runLogLevelSchema as ct, recomputePersistedCaseStatus as d, EvalAssertionError as dn, scoreTraceSchema as dt, z$1 as en, caseDetailSchema as et, runTouchesEval as f, appendToEvalOutput as fn, evalChartAggregateSchema as ft, setLatestRunInfoMap as g, getCurrentScope as gn, evalChartConfigSchema as gt, getTargetEvalIds as h, evalLog as hn, evalChartColorSchema as ht, getLatestRunInfos as i, evalTracer as in, evalStatItemSchema as it, getEvalTitle as j, cacheStatusSchema as jt, extractLlmCalls as k, defineEval as kn, cacheRecordingOpSchema as kt, persistRunState as l, serializeCacheRecording as ln, runLogLocationSchema as lt, buildEvalSummary as m, evalAssert as mn, evalChartBuiltinMetricSchema as mt, generateRunId as n, captureEvalSpanError as nn, evalFreshnessStatusSchema as nt, loadPersistedRunSnapshots as o, hashCacheKeySync as on, evalSummarySchema as ot, resolveArtifactPath as p, configureEvalRunLogs as pn, evalChartAxisSchema as pt, llmCallMetricSchema as q, columnFormatSchema as qt, getLastRunStatuses as r, evalSpan as rn, evalStatAggregateSchema as rt, nextShortIdFromSnapshots as s, deserializeCacheRecording as sn, runLogEntrySchema as st, executeRun as t, buildTraceTree as tn, caseRowSchema as tt, recomputeEvalStatusesInRuns as u, serializeCacheValue as un, runLogPhaseSchema as ut, parseEvalMetas as v, incrementEvalOutput as vn, evalChartTooltipExtraSchema as vt, updateManualScoreRequestSchema as w, runInExistingEvalScope as wn, cacheEntryWithDebugKeySchema as wt, normalizeScoreDef as x, nextEvalId as xn, cacheDebugKeyEntrySchema as xt, loadConfig as y, isInEvalScope as yn, evalChartTypeSchema as yt, DEFAULT_LLM_CALLS_CONFIG as z, traceDisplayConfigSchema as zt };
5783
+ export { removeDefaultConfigSchema as $, columnKindSchema as $t, extractApiCalls as A, setEvalOutput as An, cacheFileSchema as At, DEFAULT_API_CALLS_CONFIG as B, traceAttributeDisplayFormatSchema as Bt, validateCharts as C, incrementEvalOutput as Cn, evalChartTooltipExtraSchema as Ct, sseEnvelopeSchema as D, runInEvalRuntimeScope as Dn, cacheDebugKeyFileSchema as Dt, updateManualScoreRequestSchema as E, nextEvalId as En, cacheDebugKeyEntrySchema as Et, deriveScopedSummaryFromCases as F, getEvalRegistry as Fn, cacheRecordingSchema as Ft, apiCallMetricSchema as G, traceDisplayInputConfigSchema as Gt, agentEvalsConfigSchema as H, traceAttributeDisplayPlacementSchema as Ht, deriveStatusFromCaseRows as I, cacheStatusSchema as It, llmCallMetricFormatSchema as J, traceSpanSchema as Jt, apiCallsConfigSchema as K, traceSpanErrorSchema as Kt, deriveStatusFromChildStatuses as L, serializedCacheSpanSchema as Lt, getNestedAttribute as M, startEvalBackgroundJob as Mn, cacheModeSchema as Mt, getEvalTitle as N, repoFile as Nn, cacheOperationTypeSchema as Nt, extractCacheEntries as O, runInEvalScope as On, cacheEntrySchema as Ot, getEvalDisplayStatus as P, defineEval as Pn, cacheRecordingOpSchema as Pt, llmCallsConfigSchema as Q, columnFormatSchema as Qt, runManifestSchema as R, spanCacheOptionsSchema as Rt, normalizeScoreDef as S, getEvalCaseInput as Sn, evalChartMetricSchema as St, createRunRequestSchema as T, mergeEvalOutput as Tn, evalChartsConfigSchema as Tt, apiCallMetricFormatSchema as U, traceAttributeDisplaySchema as Ut, DEFAULT_LLM_CALLS_CONFIG as V, traceAttributeDisplayInputSchema as Vt, apiCallMetricPlacementSchema as W, traceDisplayConfigSchema as Wt, llmCallMetricSchema as X, cellValueSchema as Xt, llmCallMetricPlacementSchema as Y, traceSpanWarningSchema as Yt, llmCallPricingSchema as Z, columnDefSchema as Zt, loadEvalModule as _, appendToEvalOutput as _n, evalChartAggregateSchema as _t, loadPersistedRunSnapshot as a, z$1 as an, caseDetailSchema as at, loadConfig 
as b, evalLog as bn, evalChartColorSchema as bt, persistCaseDetail as c, evalSpan as cn, evalStatAggregateSchema as ct, recomputePersistedCaseStatus as d, hashCacheKeySync as dn, evalSummarySchema as dt, fileRefSchema as en, resolveApiCallsConfig as et, runTouchesEval as f, deserializeCacheRecording as fn, runLogEntrySchema as ft, setLatestRunInfoMap as g, EvalAssertionError as gn, scoreTraceSchema as gt, getTargetEvalIds as h, serializeCacheValue as hn, runLogPhaseSchema as ht, getLatestRunInfos as i, runArtifactRefSchema as in, assertionFailureSchema as it, extractLlmCalls as j, setScopeCacheContext as jn, cacheListItemSchema as jt, extractCacheHits as k, runInExistingEvalScope as kn, cacheEntryWithDebugKeySchema as kt, persistRunState as l, evalTracer as ln, evalStatItemSchema as lt, buildEvalSummary as m, serializeCacheRecording as mn, runLogLocationSchema as mt, generateRunId as n, numberDisplayOptionsSchema as nn, runLogsConfigSchema as nt, loadPersistedRunSnapshots as o, buildTraceTree as on, caseRowSchema as ot, resolveArtifactPath as p, deserializeCacheValue as pn, runLogLevelSchema as pt, defaultConfigKeySchema as q, traceSpanKindSchema as qt, getLastRunStatuses as r, repoFileRefSchema as rn, trialSelectionModeSchema as rt, nextShortIdFromSnapshots as s, captureEvalSpanError as sn, evalFreshnessStatusSchema as st, executeRun as t, jsonCellSchema as tn, resolveLlmCallsConfig as tt, recomputeEvalStatusesInRuns as u, hashCacheKey as un, evalStatsConfigSchema as ut, parseEvalMetas as v, configureEvalRunLogs as vn, evalChartAxisSchema as vt, createFsCacheStore as w, isInEvalScope as wn, evalChartTypeSchema as wt, buildDeclaredColumnDefs as x, getCurrentScope as xn, evalChartConfigSchema as xt, resolveEvalDefaultConfig as y, evalAssert as yn, evalChartBuiltinMetricSchema as yt, runSummarySchema as z, traceCacheRefSchema as zt };