@ls-stack/agent-eval 0.19.0 → 0.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-hAlVvT-Q.mjs → app-CmwmcUgG.mjs} +3 -3
- package/dist/apps/web/dist/assets/index-EXO08yya.js +118 -0
- package/dist/apps/web/dist/assets/index-r0dVFK0B.css +1 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/{cli-3zANEAhG.mjs → cli-DumvanQI.mjs} +10 -80
- package/dist/index.d.mts +131 -14
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +1 -1
- package/dist/{runOrchestration-BBg_VUH5.mjs → runOrchestration-zYAcAPtS.mjs} +567 -24
- package/dist/{runner-DxlahWDo.mjs → runner-BcwyX9CO.mjs} +1 -1
- package/dist/{runner-RmZPRz-h.mjs → runner-Dy_PECaf.mjs} +2 -2
- package/dist/src-BoAJb4wC.mjs +3 -0
- package/package.json +1 -1
- package/skills/agent-eval/SKILL.md +35 -14
- package/dist/apps/web/dist/assets/index-C761goIh.css +0 -1
- package/dist/apps/web/dist/assets/index-DS552a3u.js +0 -118
- package/dist/src-BC4OrajN.mjs +0 -3
|
@@ -1970,7 +1970,15 @@ const numberDisplayOptionsSchema = z.object({
|
|
|
1970
1970
|
compactDisplay: z.enum(["short", "long"]).optional(),
|
|
1971
1971
|
prefix: z.string().optional(),
|
|
1972
1972
|
suffix: z.string().optional(),
|
|
1973
|
-
|
|
1973
|
+
minDecimalPlaces: z.number().int().min(0).optional(),
|
|
1974
|
+
maxDecimalPlaces: z.number().int().min(0).optional()
|
|
1975
|
+
}).refine((options) => {
|
|
1976
|
+
if (options.minDecimalPlaces === void 0) return true;
|
|
1977
|
+
if (options.maxDecimalPlaces === void 0) return true;
|
|
1978
|
+
return options.minDecimalPlaces <= options.maxDecimalPlaces;
|
|
1979
|
+
}, {
|
|
1980
|
+
message: "minDecimalPlaces must be less than or equal to maxDecimalPlaces",
|
|
1981
|
+
path: ["minDecimalPlaces"]
|
|
1974
1982
|
});
|
|
1975
1983
|
/** Schema for the supported column rendering kinds in list views. */
|
|
1976
1984
|
const columnKindSchema = z.enum([
|
|
@@ -2005,7 +2013,6 @@ const columnDefSchema = z.object({
|
|
|
2005
2013
|
passThreshold: z.number().optional(),
|
|
2006
2014
|
maxStars: z.number().int().min(2).optional(),
|
|
2007
2015
|
hideInTable: z.boolean().optional(),
|
|
2008
|
-
sortable: z.boolean().optional(),
|
|
2009
2016
|
align: z.enum([
|
|
2010
2017
|
"left",
|
|
2011
2018
|
"center",
|
|
@@ -2403,6 +2410,8 @@ const evalStatItemSchema = z.discriminatedUnion("kind", [
|
|
|
2403
2410
|
label: z.string().optional(),
|
|
2404
2411
|
aggregate: evalStatAggregateSchema,
|
|
2405
2412
|
format: columnFormatSchema.optional(),
|
|
2413
|
+
/** Number presentation options applied when `format: 'number'`. */
|
|
2414
|
+
numberFormat: numberDisplayOptionsSchema.optional(),
|
|
2406
2415
|
accent: z.boolean().optional()
|
|
2407
2416
|
})
|
|
2408
2417
|
]);
|
|
@@ -2566,6 +2575,21 @@ const caseDetailSchema = z.object({
|
|
|
2566
2575
|
//#region ../shared/src/schemas/config.ts
|
|
2567
2576
|
/** Strategy used to collapse repeated trials into one stored case result. */
|
|
2568
2577
|
const trialSelectionModeSchema = z.enum(["lowestScore", "median"]);
|
|
2578
|
+
/**
 * Keys of the built-in eval-level outputs/columns that a config may refer to
 * (for example when listing defaults to remove).
 */
const defaultConfigKeySchema = z.enum([
	"apiCalls",
	"costUsd",
	"llmTurns",
	"inputTokens",
	"outputTokens",
	"totalTokens",
	"cachedInputTokens",
	"cacheCreationInputTokens",
	"reasoningTokens",
	"llmLatencyMs"
]);
/**
 * Removal config for built-in eval-level outputs and UI metadata: either the
 * literal `true` or an explicit list of built-in keys.
 */
const removeDefaultConfigSchema = z.union([z.literal(true), defaultConfigKeySchema.array()]);
|
|
2569
2593
|
/** Render formats supported by an LLM-call metric in the UI. */
|
|
2570
2594
|
const llmCallMetricFormatSchema = z.enum([
|
|
2571
2595
|
"string",
|
|
@@ -2637,6 +2661,30 @@ const apiCallMetricSchema = z.object({
|
|
|
2637
2661
|
*/
|
|
2638
2662
|
placements: z.array(apiCallMetricPlacementSchema).nonempty().optional()
|
|
2639
2663
|
});
|
|
2664
|
+
/**
 * Schema for one model/provider pricing entry. Entries are used to derive
 * LLM-call costs from token counts when a span does not already record
 * explicit USD costs.
 */
const llmCallPricingSchema = (() => {
	/** Non-empty identifier string. */
	const identifier = () => z.string().min(1);
	/** Optional, non-negative USD rate per one million tokens. */
	const usdPerMillion = () => z.number().nonnegative().optional();
	return z.object({
		/** Exact model name read from the configured `attributes.model` path. */
		model: identifier(),
		/**
		 * Optional provider discriminator read from `attributes.provider`. When
		 * set, the entry only applies to calls from that provider;
		 * provider-specific entries take precedence over generic entries for the
		 * same model.
		 */
		provider: identifier().optional(),
		/** USD per one million non-cached input tokens. */
		inputUsdPerMillion: usdPerMillion(),
		/** USD per one million output tokens. */
		outputUsdPerMillion: usdPerMillion(),
		/** USD per one million prompt-cache read tokens. */
		cachedInputUsdPerMillion: usdPerMillion(),
		/** USD per one million prompt-cache write tokens. */
		cacheCreationInputUsdPerMillion: usdPerMillion(),
		/** USD per one million reasoning tokens when reported separately. */
		reasoningUsdPerMillion: usdPerMillion()
	});
})();
|
|
2640
2688
|
/** Schema for the global LLM calls config block in `agent-evals.config.ts`. */
|
|
2641
2689
|
const llmCallsConfigSchema = z.object({
|
|
2642
2690
|
/** Span kinds treated as LLM calls. Defaults to `['llm']`. */
|
|
@@ -2647,8 +2695,9 @@ const llmCallsConfigSchema = z.object({
|
|
|
2647
2695
|
* built-in defaults (e.g. `usage.inputTokens`, `costUsd`).
|
|
2648
2696
|
*
|
|
2649
2697
|
* Per-token-type cost paths (`inputCost`, `outputCost`, `cachedInputCost`,
|
|
2650
|
-
* `reasoningCost`) feed the cost breakdown table in the expanded row
|
|
2651
|
-
*
|
|
2698
|
+
* `reasoningCost`) feed the cost breakdown table in the expanded row when
|
|
2699
|
+
* spans provide explicit USD cost overrides. Prefer `pricing` for deriving
|
|
2700
|
+
* costs from token counts globally.
|
|
2652
2701
|
*/
|
|
2653
2702
|
attributes: z.object({
|
|
2654
2703
|
model: z.string().optional(),
|
|
@@ -2659,6 +2708,7 @@ const llmCallsConfigSchema = z.object({
|
|
|
2659
2708
|
cacheCreationInputTokens: z.string().optional(),
|
|
2660
2709
|
reasoningTokens: z.string().optional(),
|
|
2661
2710
|
totalTokens: z.string().optional(),
|
|
2711
|
+
tokensPerSecond: z.string().optional(),
|
|
2662
2712
|
cost: z.string().optional(),
|
|
2663
2713
|
inputCost: z.string().optional(),
|
|
2664
2714
|
outputCost: z.string().optional(),
|
|
@@ -2672,6 +2722,12 @@ const llmCallsConfigSchema = z.object({
|
|
|
2672
2722
|
reasoning: z.string().optional(),
|
|
2673
2723
|
toolCalls: z.string().optional()
|
|
2674
2724
|
}).optional(),
|
|
2725
|
+
/**
|
|
2726
|
+
* Model/provider pricing registry used to calculate missing LLM-call costs
|
|
2727
|
+
* from token counts. Explicit span attributes (`costUsd`, `cost.inputUsd`,
|
|
2728
|
+
* etc.) take precedence over derived prices.
|
|
2729
|
+
*/
|
|
2730
|
+
pricing: z.array(llmCallPricingSchema).optional(),
|
|
2675
2731
|
/** Custom user-defined metrics surfaced on each LLM call. */
|
|
2676
2732
|
metrics: z.array(llmCallMetricSchema).optional()
|
|
2677
2733
|
});
|
|
@@ -2719,6 +2775,7 @@ const DEFAULT_LLM_CALLS_CONFIG = {
|
|
|
2719
2775
|
cacheCreationInputTokens: "usage.cacheCreationInputTokens",
|
|
2720
2776
|
reasoningTokens: "usage.reasoningTokens",
|
|
2721
2777
|
totalTokens: "usage.totalTokens",
|
|
2778
|
+
tokensPerSecond: "tokensPerSecond",
|
|
2722
2779
|
cost: "costUsd",
|
|
2723
2780
|
inputCost: "cost.inputUsd",
|
|
2724
2781
|
outputCost: "cost.outputUsd",
|
|
@@ -2732,7 +2789,8 @@ const DEFAULT_LLM_CALLS_CONFIG = {
|
|
|
2732
2789
|
reasoning: "reasoning",
|
|
2733
2790
|
toolCalls: "toolCalls"
|
|
2734
2791
|
},
|
|
2735
|
-
metrics: []
|
|
2792
|
+
metrics: [],
|
|
2793
|
+
pricing: []
|
|
2736
2794
|
};
|
|
2737
2795
|
/** Default API-calls config the UI uses before the workspace fetch resolves. */
|
|
2738
2796
|
const DEFAULT_API_CALLS_CONFIG = {
|
|
@@ -2765,6 +2823,8 @@ const DEFAULT_API_CALLS_CONFIG = {
|
|
|
2765
2823
|
* attribute path.
|
|
2766
2824
|
* - Missing `metrics[].format` defaults to `'string'`.
|
|
2767
2825
|
* - Missing `metrics[].placements` defaults to `['body']`.
|
|
2826
|
+
* - Missing `pricing` defaults to an empty registry; explicit span costs still
|
|
2827
|
+
* take precedence over derived costs.
|
|
2768
2828
|
*/
|
|
2769
2829
|
function resolveLlmCallsConfig(input) {
|
|
2770
2830
|
return {
|
|
@@ -2780,6 +2840,15 @@ function resolveLlmCallsConfig(input) {
|
|
|
2780
2840
|
format: m.format ?? "string",
|
|
2781
2841
|
numberFormat: m.numberFormat,
|
|
2782
2842
|
placements: m.placements ? [...m.placements] : ["body"]
|
|
2843
|
+
})),
|
|
2844
|
+
pricing: (input?.pricing ?? []).map((p) => ({
|
|
2845
|
+
model: p.model,
|
|
2846
|
+
provider: p.provider,
|
|
2847
|
+
inputUsdPerMillion: p.inputUsdPerMillion,
|
|
2848
|
+
outputUsdPerMillion: p.outputUsdPerMillion,
|
|
2849
|
+
cachedInputUsdPerMillion: p.cachedInputUsdPerMillion,
|
|
2850
|
+
cacheCreationInputUsdPerMillion: p.cacheCreationInputUsdPerMillion,
|
|
2851
|
+
reasoningUsdPerMillion: p.reasoningUsdPerMillion
|
|
2783
2852
|
}))
|
|
2784
2853
|
};
|
|
2785
2854
|
}
|
|
@@ -2821,6 +2890,7 @@ const agentEvalsConfigSchema = z.object({
|
|
|
2821
2890
|
allowCliRunAll: z.boolean().optional(),
|
|
2822
2891
|
traceDisplay: traceDisplayInputConfigSchema.optional(),
|
|
2823
2892
|
llmCalls: llmCallsConfigSchema.optional(),
|
|
2893
|
+
removeDefaultConfig: removeDefaultConfigSchema.optional(),
|
|
2824
2894
|
apiCalls: apiCallsConfigSchema.optional(),
|
|
2825
2895
|
runLogs: runLogsConfigSchema.optional(),
|
|
2826
2896
|
cache: z.object({
|
|
@@ -3056,6 +3126,62 @@ function readString$2(attributes, path) {
|
|
|
3056
3126
|
const raw = getNestedAttribute(attributes, path);
|
|
3057
3127
|
return typeof raw === "string" && raw.length > 0 ? raw : null;
|
|
3058
3128
|
}
|
|
3129
|
+
/**
 * Derive the USD cost of one token bucket from a per-million-token price.
 *
 * @param {number | null | undefined} tokens - Token count for the bucket;
 *   `null`/`undefined` means the count was not reported.
 * @param {number | null | undefined} usdPerMillion - Price in USD per one
 *   million tokens; `null`/`undefined` means no price is configured.
 * @returns {number | null} `null` when the count or the price is unknown,
 *   `0` when the count is zero (regardless of price), otherwise
 *   `tokens / 1e6 * usdPerMillion`.
 */
function computeTokenCost(tokens, usdPerMillion) {
	// `== null` also covers `undefined`, which would otherwise produce NaN.
	if (tokens == null) return null;
	// Zero tokens cost nothing even when no price is configured.
	if (tokens === 0) return 0;
	// A missing price means "unknown", never "free".
	if (usdPerMillion == null) return null;
	return tokens / 1e6 * usdPerMillion;
}
|
|
3135
|
+
/**
 * Select the pricing entry that applies to one LLM call.
 *
 * Only entries whose `model` equals the call's model are considered. The
 * first entry whose `provider` equals the call's provider wins; otherwise the
 * first entry without a `provider` is used as the generic fallback.
 *
 * @param {object} params
 * @param {Array<object> | null | undefined} params.pricing - Pricing
 *   registry; a missing registry is treated as empty.
 * @param {string | null} params.model - Model name of the call.
 * @param {string | null} params.provider - Provider of the call.
 * @returns {object | null} The matching entry, or `null` when the model is
 *   unknown or nothing matches.
 */
function pickPricingEntry({ pricing, model, provider }) {
	if (model == null) return null;
	let fallback = null;
	// Tolerate configs that carry no `pricing` array instead of throwing.
	for (const entry of pricing ?? []) {
		if (entry.model !== model) continue;
		if (entry.provider === void 0) {
			// Remember only the first generic entry; keep scanning for a
			// provider-specific match, which takes precedence.
			fallback ??= entry;
			continue;
		}
		if (entry.provider === provider) return entry;
	}
	return fallback;
}
|
|
3148
|
+
/**
 * Derive a total USD cost for one LLM call from its per-token-type costs.
 *
 * Each token type contributes only when its token count is reported and
 * non-zero. If any such type has no known cost the total is unknown.
 *
 * @param {object} params - Token counts and matching USD costs for the input,
 *   output, cached-input, cache-creation and reasoning buckets. Each value is
 *   a number or `null`; omitted fields are treated as `null`.
 * @returns {number | null} The summed cost; `0` when token counts were
 *   reported but all are zero; `null` when no token counts were reported or a
 *   non-zero bucket has no cost.
 */
function computeFallbackTotalCost({ inputTokens, inputCostUsd, outputTokens, outputCostUsd, cachedInputTokens, cachedInputCostUsd, cacheCreationInputTokens, cacheCreationInputCostUsd, reasoningTokens, reasoningCostUsd }) {
	const buckets = [
		[inputTokens, inputCostUsd],
		[outputTokens, outputCostUsd],
		[cachedInputTokens, cachedInputCostUsd],
		[cacheCreationInputTokens, cacheCreationInputCostUsd],
		[reasoningTokens, reasoningCostUsd]
	];
	let total = 0;
	let hasCost = false;
	let hasReportedTokens = false;
	for (const [tokens, cost] of buckets) {
		// `== null` also skips omitted fields so NaN never leaks into the sum.
		if (tokens == null) continue;
		hasReportedTokens = true;
		if (tokens === 0) continue;
		// A priced-less, non-empty bucket makes the whole total unknowable.
		if (cost == null) return null;
		total += cost;
		hasCost = true;
	}
	if (hasCost) return total;
	return hasReportedTokens ? 0 : null;
}
|
|
3059
3185
|
function computeLatencyMs$1(span) {
|
|
3060
3186
|
if (span.endedAt === null) return null;
|
|
3061
3187
|
const started = Date.parse(span.startedAt);
|
|
@@ -3100,9 +3226,11 @@ function pickError$1(span) {
|
|
|
3100
3226
|
* shape consumed by the LLM calls tab.
|
|
3101
3227
|
*
|
|
3102
3228
|
* Spans whose `kind` is not in `config.kinds` are dropped. Structured fields
|
|
3103
|
-
* (`model`, token counts, cost, etc.) are read via
|
|
3104
|
-
* the configured paths, with safe coercion to
|
|
3105
|
-
*
|
|
3229
|
+
* (`model`, token counts, explicit cost, etc.) are read via
|
|
3230
|
+
* `getNestedAttribute` from the configured paths, with safe coercion to
|
|
3231
|
+
* `string | null` / `number | null`. When explicit USD costs are absent,
|
|
3232
|
+
* configured model pricing derives per-token-type costs from token counts.
|
|
3233
|
+
* `totalTokens` falls back to a sum of input + output + cached when no
|
|
3106
3234
|
* explicit total attribute is present. The `steps` attribute path may resolve
|
|
3107
3235
|
* to either a number (rendered as the inference-round count) or an array of
|
|
3108
3236
|
* per-step detail objects (rendered as a Steps section in the body, with
|
|
@@ -3118,12 +3246,36 @@ function extractLlmCalls(spans, config) {
|
|
|
3118
3246
|
for (const span of spans) {
|
|
3119
3247
|
if (!kindSet.has(span.kind)) continue;
|
|
3120
3248
|
const attrs = span.attributes;
|
|
3249
|
+
const model = readString$2(attrs, config.attributes.model);
|
|
3250
|
+
const provider = readString$2(attrs, config.attributes.provider);
|
|
3121
3251
|
const inputTokens = readNumber$2(attrs, config.attributes.inputTokens);
|
|
3122
3252
|
const outputTokens = readNumber$2(attrs, config.attributes.outputTokens);
|
|
3123
3253
|
const cachedInputTokens = readNumber$2(attrs, config.attributes.cachedInputTokens);
|
|
3124
3254
|
const cacheCreationInputTokens = readNumber$2(attrs, config.attributes.cacheCreationInputTokens);
|
|
3125
3255
|
const reasoningTokens = readNumber$2(attrs, config.attributes.reasoningTokens);
|
|
3126
3256
|
const declaredTotalTokens = readNumber$2(attrs, config.attributes.totalTokens);
|
|
3257
|
+
const pricing = pickPricingEntry({
|
|
3258
|
+
pricing: config.pricing,
|
|
3259
|
+
model,
|
|
3260
|
+
provider
|
|
3261
|
+
});
|
|
3262
|
+
const inputCostUsd = readNumber$2(attrs, config.attributes.inputCost) ?? computeTokenCost(inputTokens, pricing?.inputUsdPerMillion);
|
|
3263
|
+
const outputCostUsd = readNumber$2(attrs, config.attributes.outputCost) ?? computeTokenCost(outputTokens, pricing?.outputUsdPerMillion);
|
|
3264
|
+
const cachedInputCostUsd = readNumber$2(attrs, config.attributes.cachedInputCost) ?? computeTokenCost(cachedInputTokens, pricing?.cachedInputUsdPerMillion);
|
|
3265
|
+
const cacheCreationInputCostUsd = readNumber$2(attrs, config.attributes.cacheCreationInputCost) ?? computeTokenCost(cacheCreationInputTokens, pricing?.cacheCreationInputUsdPerMillion);
|
|
3266
|
+
const reasoningCostUsd = readNumber$2(attrs, config.attributes.reasoningCost) ?? computeTokenCost(reasoningTokens, pricing?.reasoningUsdPerMillion);
|
|
3267
|
+
const costUsd = readNumber$2(attrs, config.attributes.cost) ?? computeFallbackTotalCost({
|
|
3268
|
+
inputTokens,
|
|
3269
|
+
inputCostUsd,
|
|
3270
|
+
outputTokens,
|
|
3271
|
+
outputCostUsd,
|
|
3272
|
+
cachedInputTokens,
|
|
3273
|
+
cachedInputCostUsd,
|
|
3274
|
+
cacheCreationInputTokens,
|
|
3275
|
+
cacheCreationInputCostUsd,
|
|
3276
|
+
reasoningTokens,
|
|
3277
|
+
reasoningCostUsd
|
|
3278
|
+
});
|
|
3127
3279
|
const metrics = [];
|
|
3128
3280
|
for (const metric of config.metrics) {
|
|
3129
3281
|
const rawValue = getNestedAttribute(attrs, metric.path);
|
|
@@ -3142,8 +3294,8 @@ function extractLlmCalls(spans, config) {
|
|
|
3142
3294
|
name: span.name,
|
|
3143
3295
|
kind: span.kind,
|
|
3144
3296
|
status: span.status,
|
|
3145
|
-
model
|
|
3146
|
-
provider
|
|
3297
|
+
model,
|
|
3298
|
+
provider,
|
|
3147
3299
|
inputTokens,
|
|
3148
3300
|
outputTokens,
|
|
3149
3301
|
cachedInputTokens,
|
|
@@ -3156,12 +3308,13 @@ function extractLlmCalls(spans, config) {
|
|
|
3156
3308
|
cached: cachedInputTokens,
|
|
3157
3309
|
cacheCreation: cacheCreationInputTokens
|
|
3158
3310
|
}),
|
|
3159
|
-
|
|
3160
|
-
|
|
3161
|
-
|
|
3162
|
-
|
|
3163
|
-
|
|
3164
|
-
|
|
3311
|
+
tokensPerSecond: readNumber$2(attrs, config.attributes.tokensPerSecond),
|
|
3312
|
+
costUsd,
|
|
3313
|
+
inputCostUsd,
|
|
3314
|
+
outputCostUsd,
|
|
3315
|
+
cachedInputCostUsd,
|
|
3316
|
+
cacheCreationInputCostUsd,
|
|
3317
|
+
reasoningCostUsd,
|
|
3165
3318
|
...readSteps(attrs, config.attributes.steps),
|
|
3166
3319
|
finishReason: readString$2(attrs, config.attributes.finishReason),
|
|
3167
3320
|
latencyMs: computeLatencyMs$1(span),
|
|
@@ -3792,6 +3945,80 @@ function isRecordLike(value) {
|
|
|
3792
3945
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
3793
3946
|
}
|
|
3794
3947
|
//#endregion
|
|
3948
|
+
//#region ../runner/src/chartValidation.ts
|
|
3949
|
+
/**
 * Shared check for a column-backed chart entry (a metric or a tooltip extra).
 * Pushes a warning and returns `false` when the entry references an unknown
 * column, or uses the `passThresholdRate` aggregate on a column that is not a
 * score with a numeric `passThreshold`.
 *
 * @param {{ key: string, aggregate?: string }} entry - Entry to validate.
 * @param {string} noun - Entry kind used in warning text ("metric" or
 *   "tooltip extra").
 * @param {Map<string, object>} columnsByKey - Declared columns by key.
 * @param {string} evalId - Eval id used as the warning prefix.
 * @param {string[]} warnings - Accumulator; mutated in place.
 * @returns {boolean} Whether the entry may be kept.
 */
function isValidChartColumnEntry(entry, noun, columnsByKey, evalId, warnings) {
	const columnDef = columnsByKey.get(entry.key);
	if (!columnDef) {
		warnings.push(`[${evalId}] chart ${noun} references unknown column "${entry.key}" — dropped`);
		return false;
	}
	if (entry.aggregate !== "passThresholdRate") return true;
	if (columnDef.isScore === true && typeof columnDef.passThreshold === "number") return true;
	warnings.push(`[${evalId}] chart ${noun} "${entry.key}" uses "passThresholdRate" but the column is not a score with passThreshold — dropped`);
	return false;
}
/** Validate one column-backed chart metric; see `isValidChartColumnEntry`. */
function isValidColumnMetric(metric, columnsByKey, evalId, warnings) {
	return isValidChartColumnEntry(metric, "metric", columnsByKey, evalId, warnings);
}
/** Validate one column-backed tooltip extra; see `isValidChartColumnEntry`. */
function isValidTooltipExtra(extra, columnsByKey, evalId, warnings) {
	return isValidChartColumnEntry(extra, "tooltip extra", columnsByKey, evalId, warnings);
}
/**
 * Sanitize a single chart: keep `builtin` entries untouched, drop invalid
 * column-backed metrics and tooltip extras, and drop the chart entirely
 * (returning `null`) when no metric survives.
 *
 * @returns {object | null} A copy of the chart with filtered `metrics` and
 *   `tooltipExtras` (`undefined` when none remain), or `null`.
 */
function sanitizeChart(chart, columnsByKey, evalId, warnings) {
	const metrics = chart.metrics.filter((metric) => {
		if (metric.source === "builtin") return true;
		return isValidColumnMetric(metric, columnsByKey, evalId, warnings);
	});
	if (metrics.length === 0) {
		warnings.push(`[${evalId}] chart had no valid metrics after validation — chart dropped`);
		return null;
	}
	const tooltipExtras = chart.tooltipExtras?.filter((extra) => {
		if (extra.source === "builtin") return true;
		return isValidTooltipExtra(extra, columnsByKey, evalId, warnings);
	});
	return {
		...chart,
		metrics,
		tooltipExtras: tooltipExtras?.length ? tooltipExtras : void 0
	};
}
/**
 * Validate and sanitize an authored `charts` config against the eval's
 * declared columns. Drops metrics/extras that reference unknown columns or
 * misuse `passThresholdRate`, and drops entire charts whose metrics are all
 * invalid. Returns `charts: undefined` when nothing valid remains so the UI
 * falls back to rendering no chart (matching the opt-in default).
 *
 * @param {{ charts?: object[], columnDefs: object[], evalId: string }} params
 * @returns {{ charts: object[] | undefined, warnings: string[] }}
 */
function validateCharts(params) {
	const { charts, columnDefs, evalId } = params;
	if (!charts || charts.length === 0) return {
		charts: void 0,
		warnings: []
	};
	const columnsByKey = new Map(columnDefs.map((def) => [def.key, def]));
	const warnings = [];
	const sanitized = [];
	for (const chart of charts) {
		const result = sanitizeChart(chart, columnsByKey, evalId, warnings);
		if (result) sanitized.push(result);
	}
	return {
		charts: sanitized.length > 0 ? sanitized : void 0,
		warnings
	};
}
|
|
4021
|
+
//#endregion
|
|
3795
4022
|
//#region ../runner/src/columnBuilder.ts
|
|
3796
4023
|
/**
|
|
3797
4024
|
* Normalize a user-provided score definition (either a function or an
|
|
@@ -3817,7 +4044,6 @@ function getScoreOverride(def) {
|
|
|
3817
4044
|
format: def.format,
|
|
3818
4045
|
numberFormat: def.numberFormat,
|
|
3819
4046
|
hideInTable: def.hideInTable,
|
|
3820
|
-
sortable: def.sortable,
|
|
3821
4047
|
align: def.align,
|
|
3822
4048
|
maxStars: def.maxStars
|
|
3823
4049
|
};
|
|
@@ -3830,7 +4056,6 @@ function mergeOverrides(base, override) {
|
|
|
3830
4056
|
format: override.format ?? base.format,
|
|
3831
4057
|
numberFormat: override.numberFormat ?? base.numberFormat,
|
|
3832
4058
|
hideInTable: override.hideInTable ?? base.hideInTable,
|
|
3833
|
-
sortable: override.sortable ?? base.sortable,
|
|
3834
4059
|
align: override.align ?? base.align,
|
|
3835
4060
|
maxStars: override.maxStars ?? base.maxStars
|
|
3836
4061
|
};
|
|
@@ -3945,7 +4170,6 @@ function createColumnDef(params) {
|
|
|
3945
4170
|
if (override?.numberFormat !== void 0) def.numberFormat = override.numberFormat;
|
|
3946
4171
|
if (override?.maxStars !== void 0) def.maxStars = override.maxStars;
|
|
3947
4172
|
if (override?.hideInTable !== void 0) def.hideInTable = override.hideInTable;
|
|
3948
|
-
if (override?.sortable !== void 0) def.sortable = override.sortable;
|
|
3949
4173
|
if (override?.align !== void 0) def.align = override.align;
|
|
3950
4174
|
if (!isScore) return def;
|
|
3951
4175
|
def.isScore = true;
|
|
@@ -4003,6 +4227,294 @@ async function loadConfig() {
|
|
|
4003
4227
|
}
|
|
4004
4228
|
}
|
|
4005
4229
|
//#endregion
|
|
4230
|
+
//#region ../runner/src/defaultConfig.ts
|
|
4231
|
+
/**
 * Built-in default keys, in the order they are considered when default
 * columns are merged.
 *
 * NOTE(review): `reasoningTokens` has an entry in DEFAULT_COLUMNS below but is
 * not listed here, so that entry is never selected through this list —
 * confirm whether the omission is intentional.
 */
const DEFAULT_CONFIG_KEYS = [
	"apiCalls",
	"costUsd",
	"llmTurns",
	"inputTokens",
	"outputTokens",
	"totalTokens",
	"cachedInputTokens",
	"cacheCreationInputTokens",
	"llmLatencyMs"
];
/** Number format shared by every token-count column/stat. */
const tokenNumberFormat = { notation: "compact" };
/** Number format for whole-number counts (no decimal places). */
const countNumberFormat = {
	minDecimalPlaces: 0,
	maxDecimalPlaces: 0
};
/** Number format for USD costs: `$` prefix, at most four decimal places. */
const costNumberFormat = {
	prefix: "$",
	maxDecimalPlaces: 4
};
/** Build a right-aligned numeric column definition. */
const rightAlignedNumberColumn = (label, numberFormat) => ({
	label,
	format: "number",
	numberFormat,
	align: "right"
});
/** Column definitions for the built-in default keys. */
const DEFAULT_COLUMNS = {
	apiCalls: rightAlignedNumberColumn("API Calls", countNumberFormat),
	costUsd: rightAlignedNumberColumn("Cost", costNumberFormat),
	llmTurns: rightAlignedNumberColumn("LLM Turns", countNumberFormat),
	inputTokens: rightAlignedNumberColumn("Input Tokens", tokenNumberFormat),
	outputTokens: rightAlignedNumberColumn("Output Tokens", tokenNumberFormat),
	totalTokens: rightAlignedNumberColumn("Total Tokens", tokenNumberFormat),
	cachedInputTokens: rightAlignedNumberColumn("Cached Input Tokens", tokenNumberFormat),
	cacheCreationInputTokens: rightAlignedNumberColumn("Cache Write Tokens", tokenNumberFormat),
	reasoningTokens: rightAlignedNumberColumn("Reasoning Tokens", tokenNumberFormat),
	llmLatencyMs: {
		label: "LLM Latency",
		format: "duration",
		align: "right"
	}
};
|
|
4312
|
+
/**
 * Combine the global and per-eval removal settings into one set of removed
 * default keys. A `true` on either side removes every key in
 * DEFAULT_CONFIG_KEYS; otherwise the two key lists are unioned.
 */
function resolveRemovedKeys(globalRemove, evalRemove) {
	const removeEverything = globalRemove === true || evalRemove === true;
	if (removeEverything) return new Set(DEFAULT_CONFIG_KEYS);
	const removed = new Set(globalRemove ?? []);
	for (const key of evalRemove ?? []) removed.add(key);
	return removed;
}
|
|
4316
|
+
/**
 * List the default keys that remain active after applying
 * `params.globalRemove` and `params.evalRemove`, in DEFAULT_CONFIG_KEYS order.
 */
function getActiveDefaultConfigKeys(params) {
	const removedKeys = resolveRemovedKeys(params.globalRemove, params.evalRemove);
	const active = [];
	for (const key of DEFAULT_CONFIG_KEYS) {
		if (!removedKeys.has(key)) active.push(key);
	}
	return active;
}
|
|
4320
|
+
/**
 * Merge the active built-in column definitions underneath the authored
 * `params.columns`; authored entries win on key collisions. When no default
 * key is active, `params.columns` is returned as-is.
 */
function mergeDefaultColumns(params) {
	const activeKeys = getActiveDefaultConfigKeys(params);
	if (activeKeys.length === 0) return params.columns;
	const defaultEntries = activeKeys.map((key) => [key, DEFAULT_COLUMNS[key]]);
	const defaults = Object.fromEntries(defaultEntries);
	return {
		...defaults,
		...params.columns
	};
}
|
|
4328
|
+
/**
 * Append the built-in stat items (API calls, LLM cost, tokens, LLM turns) for
 * every active default key after the authored `params.stats`.
 *
 * @returns The combined list, or `undefined` when it would be empty.
 */
function appendDefaultStats(params) {
	const active = new Set(getActiveDefaultConfigKeys(params));
	// [key, label, numberFormat] in display order.
	const candidates = [
		["apiCalls", "API Calls", countNumberFormat],
		["costUsd", "LLM Cost", costNumberFormat],
		["totalTokens", "Tokens", tokenNumberFormat],
		["llmTurns", "LLM Turns", countNumberFormat]
	];
	const defaults = [];
	for (const [key, label, numberFormat] of candidates) {
		if (!active.has(key)) continue;
		defaults.push({
			kind: "column",
			key,
			label,
			aggregate: "avg",
			numberFormat
		});
	}
	const combined = [...params.stats ?? [], ...defaults];
	if (combined.length === 0) return void 0;
	return combined;
}
|
|
4362
|
+
/**
 * Append the built-in charts after the authored `params.charts`: an
 * "LLM Cost" area chart when `costUsd` is active, and an "LLM Tokens" bar
 * chart containing one series per active token key (with a "Total" tooltip
 * extra when `totalTokens` is active).
 *
 * @returns The combined list, or `undefined` when it would be empty.
 */
function appendDefaultCharts(params) {
	const active = new Set(getActiveDefaultConfigKeys(params));
	// Column-backed "avg" entry; `color` is only set when one is given.
	const avgOf = (key, label, color) => {
		const entry = {
			source: "column",
			key,
			aggregate: "avg",
			label
		};
		if (color !== void 0) entry.color = color;
		return entry;
	};
	const defaults = [];
	if (active.has("costUsd")) {
		defaults.push({
			heading: "LLM Cost",
			type: "area",
			metrics: [avgOf("costUsd", "Cost", "warning")]
		});
	}
	// [key, label, color] in series order.
	const tokenSeries = [
		["inputTokens", "Input", "accent"],
		["outputTokens", "Output", "success"],
		["cachedInputTokens", "Cached Input", "error"],
		["cacheCreationInputTokens", "Cache Write", "warning"]
	];
	const tokenMetrics = [];
	for (const [key, label, color] of tokenSeries) {
		if (active.has(key)) tokenMetrics.push(avgOf(key, label, color));
	}
	if (tokenMetrics.length > 0) {
		defaults.push({
			heading: "LLM Tokens",
			type: "bar",
			metrics: tokenMetrics,
			tooltipExtras: active.has("totalTokens") ? [avgOf("totalTokens", "Total")] : void 0
		});
	}
	const combined = [...params.charts ?? [], ...defaults];
	if (combined.length === 0) return void 0;
	return combined;
}
|
|
4420
|
+
/**
 * Resolve the effective `columns`, `stats` and `charts` of an eval by
 * layering the built-in defaults onto the eval definition, honoring both the
 * global and the eval-level `removeDefaultConfig` settings.
 */
function resolveEvalDefaultConfig(params) {
	const { evalDef, globalRemove } = params;
	const removal = {
		globalRemove,
		evalRemove: evalDef.removeDefaultConfig
	};
	const columns = mergeDefaultColumns({
		columns: evalDef.columns,
		...removal
	});
	const stats = appendDefaultStats({
		stats: evalDef.stats,
		...removal
	});
	const charts = appendDefaultCharts({
		charts: evalDef.charts,
		...removal
	});
	return {
		columns,
		stats,
		charts
	};
}
|
|
4440
|
+
/**
 * Sum the numeric entries of `values`, skipping missing ones.
 *
 * @param {Iterable<number | null | undefined>} values - Values to add up.
 * @returns {number | undefined} The sum of the present values, or `undefined`
 *   when every entry is missing (or the input is empty).
 */
function sumNullable(values) {
	let total = 0;
	let hasValue = false;
	for (const value of values) {
		// `== null` skips `undefined` too; adding it would yield NaN.
		if (value == null) continue;
		total += value;
		hasValue = true;
	}
	return hasValue ? total : void 0;
}
|
|
4450
|
+
/**
 * Write `params.value` to `params.outputs[params.key]` only when the key is
 * an active default key, the outputs object does not already have it, and
 * the value is defined. Otherwise the outputs are left untouched.
 */
function assignIfMissing(params) {
	const isActive = params.activeKeys.has(params.key);
	if (!isActive) return;
	const alreadyPresent = params.key in params.outputs;
	if (alreadyPresent || params.value === void 0) return;
	params.outputs[params.key] = params.value;
}
|
|
4456
|
+
/**
 * Fills `params.outputs` with default metrics derived from the recorded
 * spans, without overwriting keys that are already present.
 *
 * Flow:
 *  1. Collect the active default keys via `getActiveDefaultConfigKeys(params)`;
 *     return immediately if there are none.
 *  2. Extract LLM calls and API calls from `params.spans`.
 *  3. Store the API call count under `apiCalls` (only when it is > 0).
 *  4. If there are no LLM calls, stop; otherwise store the call count under
 *     `llmTurns` and the per-call sums of the cost/token/latency fields.
 *
 * Every write goes through `assignIfMissing`, so inactive keys, keys that
 * already exist on `outputs`, and `undefined` values are skipped.
 *
 * @param {object} params
 * @param {object} params.outputs - Output map, mutated in place.
 * @param {*} params.spans - Spans passed to the call extractors.
 * @param {*} params.llmCallsConfig - Config passed to `extractLlmCalls`.
 * @param {*} params.apiCallsConfig - Config passed to `extractApiCalls`.
 * @returns {void}
 */
function addDefaultOutputs(params) {
  const { outputs } = params;
  const activeKeys = new Set(getActiveDefaultConfigKeys(params));
  if (activeKeys.size === 0) return;

  const calls = extractLlmCalls(params.spans, params.llmCallsConfig);
  const apiCalls = extractApiCalls(params.spans, params.apiCallsConfig);

  const apiCallCount = apiCalls.length;
  assignIfMissing({
    outputs,
    key: "apiCalls",
    value: apiCallCount > 0 ? apiCallCount : void 0,
    activeKeys
  });

  // All remaining metrics are derived from LLM calls only.
  if (calls.length === 0) return;

  assignIfMissing({
    outputs,
    key: "llmTurns",
    value: calls.length,
    activeKeys
  });

  // [output key, field read from each call]; order matches assignment order.
  const summedMetrics = [
    ["costUsd", "costUsd"],
    ["inputTokens", "inputTokens"],
    ["outputTokens", "outputTokens"],
    ["totalTokens", "totalTokens"],
    ["cachedInputTokens", "cachedInputTokens"],
    ["cacheCreationInputTokens", "cacheCreationInputTokens"],
    ["llmLatencyMs", "latencyMs"]
  ];
  for (const [key, field] of summedMetrics) {
    assignIfMissing({
      outputs,
      key,
      value: sumNullable(calls.map((call) => call[field])),
      activeKeys
    });
  }
}
|
|
4517
|
+
//#endregion
|
|
4006
4518
|
//#region ../runner/src/discovery.ts
|
|
4007
4519
|
// Matches an `id:` property (word-bounded, optional whitespace around the colon)
// followed by a single- or double-quoted string; capture group 1 is the text
// between the quotes. The opening and closing quote are not required to be the same character.
const evalIdMatchRegex = /\bid\s*:\s*['"]([^'"]+)['"]/;
|
|
4008
4520
|
// Matches a `title:` property (word-bounded, optional whitespace around the colon)
// followed by a single- or double-quoted string; capture group 1 is the text
// between the quotes. The opening and closing quote are not required to be the same character.
const evalTitleMatchRegex = /\btitle\s*:\s*['"]([^'"]+)['"]/;
|
|
@@ -4664,7 +5176,7 @@ async function callWithUnknownResult(fn, args) {
|
|
|
4664
5176
|
return await Reflect.apply(fn, void 0, args);
|
|
4665
5177
|
}
|
|
4666
5178
|
async function runCase(params) {
|
|
4667
|
-
const { evalDef, evalId, evalCase, globalTraceDisplay, trial, startTime, cacheAdapter, cacheMode, codeFingerprint, moduleIsolation, evalFilePath, workspaceRoot, artifactDir, runId } = params;
|
|
5179
|
+
const { evalDef, evalId, evalCase, globalTraceDisplay, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode, codeFingerprint, moduleIsolation, evalFilePath, workspaceRoot, artifactDir, runId } = params;
|
|
4668
5180
|
const scopedIdPrefix = buildScopedEvalIdPrefix({
|
|
4669
5181
|
evalId,
|
|
4670
5182
|
evalFilePath,
|
|
@@ -4714,6 +5226,14 @@ async function runCase(params) {
|
|
|
4714
5226
|
scope.assertionFailures.push(toAssertionFailure(message, e instanceof Error ? e : void 0));
|
|
4715
5227
|
}
|
|
4716
5228
|
}
|
|
5229
|
+
if (!nonAssertError) addDefaultOutputs({
|
|
5230
|
+
outputs: scope.outputs,
|
|
5231
|
+
spans: scope.spans,
|
|
5232
|
+
llmCallsConfig,
|
|
5233
|
+
apiCallsConfig,
|
|
5234
|
+
globalRemove: globalRemoveDefaultConfig,
|
|
5235
|
+
evalRemove: evalDef.removeDefaultConfig
|
|
5236
|
+
});
|
|
4717
5237
|
if (!nonAssertError && evalDef.outputsSchema) {
|
|
4718
5238
|
const { outputsSchema } = evalDef;
|
|
4719
5239
|
const parsedOutputs = await runInExistingEvalScope(scope, "outputsSchema", () => outputsSchema.safeParse(getOutputsSchemaInput(outputsSchema, scope.outputs)));
|
|
@@ -4795,6 +5315,11 @@ async function runCase(params) {
|
|
|
4795
5315
|
const status = nonAssertError ? "error" : passed ? "pass" : "fail";
|
|
4796
5316
|
const { trace: displayTrace, traceDisplay } = resolveTracePresentation(scope.spans, globalTraceDisplay, evalDef.traceDisplay);
|
|
4797
5317
|
const columns = {};
|
|
5318
|
+
const columnOverrides = mergeDefaultColumns({
|
|
5319
|
+
columns: evalDef.columns,
|
|
5320
|
+
globalRemove: globalRemoveDefaultConfig,
|
|
5321
|
+
evalRemove: evalDef.removeDefaultConfig
|
|
5322
|
+
});
|
|
4798
5323
|
for (const [key, value] of Object.entries(scope.outputs)) {
|
|
4799
5324
|
const cell = isBlob(value) ? await persistInlineArtifact({
|
|
4800
5325
|
artifactDir,
|
|
@@ -4803,7 +5328,7 @@ async function runCase(params) {
|
|
|
4803
5328
|
outputKey: key,
|
|
4804
5329
|
trial,
|
|
4805
5330
|
value
|
|
4806
|
-
}) : toCellValue(value,
|
|
5331
|
+
}) : toCellValue(value, columnOverrides?.[key]);
|
|
4807
5332
|
if (cell !== void 0) columns[key] = cell;
|
|
4808
5333
|
}
|
|
4809
5334
|
for (const key of Object.keys(evalDef.manualScores ?? {})) columns[key] = null;
|
|
@@ -5016,6 +5541,8 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
5016
5541
|
key: runState.manifest.id,
|
|
5017
5542
|
workspaceRoot
|
|
5018
5543
|
};
|
|
5544
|
+
const llmCallsConfig = resolveLlmCallsConfig(config.llmCalls);
|
|
5545
|
+
const apiCallsConfig = resolveApiCallsConfig(config.apiCalls);
|
|
5019
5546
|
for (const evalMeta of targetEvals) {
|
|
5020
5547
|
const evalFilePath = evalMeta.sourceFilePath;
|
|
5021
5548
|
let codeFingerprint = "";
|
|
@@ -5054,7 +5581,20 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
5054
5581
|
evalId: evalMeta.id
|
|
5055
5582
|
}), request.target.evalIds, request.target.caseIds, evalMeta.id);
|
|
5056
5583
|
runState.summary.totalCases += cases.length;
|
|
5057
|
-
const
|
|
5584
|
+
const defaultConfig = resolveEvalDefaultConfig({
|
|
5585
|
+
evalDef,
|
|
5586
|
+
globalRemove: config.removeDefaultConfig
|
|
5587
|
+
});
|
|
5588
|
+
const declaredColumnDefs = buildDeclaredColumnDefs(defaultConfig.columns, evalDef.scores, evalDef.manualScores);
|
|
5589
|
+
const accumulatedColumns = new Map(declaredColumnDefs.map((def) => [def.key, def]));
|
|
5590
|
+
const validatedCharts = validateCharts({
|
|
5591
|
+
charts: defaultConfig.charts,
|
|
5592
|
+
columnDefs: declaredColumnDefs,
|
|
5593
|
+
evalId: evalMeta.id
|
|
5594
|
+
});
|
|
5595
|
+
for (const warning of validatedCharts.warnings) console.warn(warning);
|
|
5596
|
+
evalMeta.stats = defaultConfig.stats;
|
|
5597
|
+
evalMeta.charts = validatedCharts.charts;
|
|
5058
5598
|
const evalCaseRows = [];
|
|
5059
5599
|
const preparedCases = [];
|
|
5060
5600
|
const scoreKeys = Object.freeze(Object.keys(evalDef.scores ?? {}));
|
|
@@ -5066,7 +5606,7 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
5066
5606
|
preparedCases,
|
|
5067
5607
|
scoreKeys: Object.freeze([...scoreKeys, ...manualScoreKeys]),
|
|
5068
5608
|
mergeColumns: (columns) => {
|
|
5069
|
-
mergeColumnDefs(accumulatedColumns, columns,
|
|
5609
|
+
mergeColumnDefs(accumulatedColumns, columns, defaultConfig.columns, evalDef.scores, evalDef.manualScores);
|
|
5070
5610
|
}
|
|
5071
5611
|
};
|
|
5072
5612
|
preparedEvals.push(preparedEval);
|
|
@@ -5087,6 +5627,9 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
5087
5627
|
evalId: evalMeta.id,
|
|
5088
5628
|
evalCase,
|
|
5089
5629
|
globalTraceDisplay,
|
|
5630
|
+
llmCallsConfig,
|
|
5631
|
+
apiCallsConfig,
|
|
5632
|
+
globalRemoveDefaultConfig: config.removeDefaultConfig,
|
|
5090
5633
|
trial,
|
|
5091
5634
|
startTime,
|
|
5092
5635
|
cacheAdapter: bufferedCacheStore ?? (cacheEnabled ? cacheStore : null),
|
|
@@ -5237,4 +5780,4 @@ function toLastRunStatus(status) {
|
|
|
5237
5780
|
return status === "pending" ? null : status;
|
|
5238
5781
|
}
|
|
5239
5782
|
//#endregion
|
|
5240
|
-
export {
|
|
5783
|
+
export { removeDefaultConfigSchema as $, columnKindSchema as $t, extractApiCalls as A, setEvalOutput as An, cacheFileSchema as At, DEFAULT_API_CALLS_CONFIG as B, traceAttributeDisplayFormatSchema as Bt, validateCharts as C, incrementEvalOutput as Cn, evalChartTooltipExtraSchema as Ct, sseEnvelopeSchema as D, runInEvalRuntimeScope as Dn, cacheDebugKeyFileSchema as Dt, updateManualScoreRequestSchema as E, nextEvalId as En, cacheDebugKeyEntrySchema as Et, deriveScopedSummaryFromCases as F, getEvalRegistry as Fn, cacheRecordingSchema as Ft, apiCallMetricSchema as G, traceDisplayInputConfigSchema as Gt, agentEvalsConfigSchema as H, traceAttributeDisplayPlacementSchema as Ht, deriveStatusFromCaseRows as I, cacheStatusSchema as It, llmCallMetricFormatSchema as J, traceSpanSchema as Jt, apiCallsConfigSchema as K, traceSpanErrorSchema as Kt, deriveStatusFromChildStatuses as L, serializedCacheSpanSchema as Lt, getNestedAttribute as M, startEvalBackgroundJob as Mn, cacheModeSchema as Mt, getEvalTitle as N, repoFile as Nn, cacheOperationTypeSchema as Nt, extractCacheEntries as O, runInEvalScope as On, cacheEntrySchema as Ot, getEvalDisplayStatus as P, defineEval as Pn, cacheRecordingOpSchema as Pt, llmCallsConfigSchema as Q, columnFormatSchema as Qt, runManifestSchema as R, spanCacheOptionsSchema as Rt, normalizeScoreDef as S, getEvalCaseInput as Sn, evalChartMetricSchema as St, createRunRequestSchema as T, mergeEvalOutput as Tn, evalChartsConfigSchema as Tt, apiCallMetricFormatSchema as U, traceAttributeDisplaySchema as Ut, DEFAULT_LLM_CALLS_CONFIG as V, traceAttributeDisplayInputSchema as Vt, apiCallMetricPlacementSchema as W, traceDisplayConfigSchema as Wt, llmCallMetricSchema as X, cellValueSchema as Xt, llmCallMetricPlacementSchema as Y, traceSpanWarningSchema as Yt, llmCallPricingSchema as Z, columnDefSchema as Zt, loadEvalModule as _, appendToEvalOutput as _n, evalChartAggregateSchema as _t, loadPersistedRunSnapshot as a, z$1 as an, caseDetailSchema as at, loadConfig as 
b, evalLog as bn, evalChartColorSchema as bt, persistCaseDetail as c, evalSpan as cn, evalStatAggregateSchema as ct, recomputePersistedCaseStatus as d, hashCacheKeySync as dn, evalSummarySchema as dt, fileRefSchema as en, resolveApiCallsConfig as et, runTouchesEval as f, deserializeCacheRecording as fn, runLogEntrySchema as ft, setLatestRunInfoMap as g, EvalAssertionError as gn, scoreTraceSchema as gt, getTargetEvalIds as h, serializeCacheValue as hn, runLogPhaseSchema as ht, getLatestRunInfos as i, runArtifactRefSchema as in, assertionFailureSchema as it, extractLlmCalls as j, setScopeCacheContext as jn, cacheListItemSchema as jt, extractCacheHits as k, runInExistingEvalScope as kn, cacheEntryWithDebugKeySchema as kt, persistRunState as l, evalTracer as ln, evalStatItemSchema as lt, buildEvalSummary as m, serializeCacheRecording as mn, runLogLocationSchema as mt, generateRunId as n, numberDisplayOptionsSchema as nn, runLogsConfigSchema as nt, loadPersistedRunSnapshots as o, buildTraceTree as on, caseRowSchema as ot, resolveArtifactPath as p, deserializeCacheValue as pn, runLogLevelSchema as pt, defaultConfigKeySchema as q, traceSpanKindSchema as qt, getLastRunStatuses as r, repoFileRefSchema as rn, trialSelectionModeSchema as rt, nextShortIdFromSnapshots as s, captureEvalSpanError as sn, evalFreshnessStatusSchema as st, executeRun as t, jsonCellSchema as tn, resolveLlmCallsConfig as tt, recomputeEvalStatusesInRuns as u, hashCacheKey as un, evalStatsConfigSchema as ut, parseEvalMetas as v, configureEvalRunLogs as vn, evalChartAxisSchema as vt, createFsCacheStore as w, isInEvalScope as wn, evalChartTypeSchema as wt, buildDeclaredColumnDefs as x, getCurrentScope as xn, evalChartConfigSchema as xt, resolveEvalDefaultConfig as y, evalAssert as yn, evalChartBuiltinMetricSchema as yt, runSummarySchema as z, traceCacheRefSchema as zt };
|