@ls-stack/agent-eval 0.19.0 → 0.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-hAlVvT-Q.mjs → app-DsiLU65H.mjs} +3 -3
- package/dist/apps/web/dist/assets/index-Cba4MFa0.js +118 -0
- package/dist/apps/web/dist/assets/index-CvR6QCLa.css +1 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/{cli-3zANEAhG.mjs → cli-weogme5U.mjs} +10 -80
- package/dist/index.d.mts +121 -6
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +1 -1
- package/dist/{runOrchestration-BBg_VUH5.mjs → runOrchestration-Cv1kiOAG.mjs} +578 -19
- package/dist/{runner-DxlahWDo.mjs → runner-B25oRQxX.mjs} +1 -1
- package/dist/{runner-RmZPRz-h.mjs → runner-DzrMtgBu.mjs} +2 -2
- package/dist/src-B879LZfo.mjs +3 -0
- package/package.json +1 -1
- package/skills/agent-eval/SKILL.md +31 -14
- package/dist/apps/web/dist/assets/index-C761goIh.css +0 -1
- package/dist/apps/web/dist/assets/index-DS552a3u.js +0 -118
- package/dist/src-BC4OrajN.mjs +0 -3
|
@@ -2566,6 +2566,21 @@ const caseDetailSchema = z.object({
|
|
|
2566
2566
|
//#region ../shared/src/schemas/config.ts
|
|
2567
2567
|
/** Strategy used to collapse repeated trials into one stored case result. */
|
|
2568
2568
|
const trialSelectionModeSchema = z.enum(["lowestScore", "median"]);
|
|
2569
|
+
/**
 * Built-in eval-level output/column keys.
 *
 * Enumerates every key that the built-in (default) eval outputs and columns
 * can be addressed by.
 */
const defaultConfigKeySchema = z.enum([
  "apiCalls",
  "costUsd",
  "llmTurns",
  "inputTokens",
  "outputTokens",
  "totalTokens",
  "cachedInputTokens",
  "cacheCreationInputTokens",
  "reasoningTokens",
  "llmLatencyMs",
]);
|
|
2582
|
+
/**
 * Removal config for built-in eval-level outputs and UI metadata.
 *
 * Accepts either the literal `true` or an array of built-in config keys.
 */
const removeDefaultConfigSchema = z.union([
  z.literal(true),
  z.array(defaultConfigKeySchema),
]);
|
|
2569
2584
|
/** Render formats supported by an LLM-call metric in the UI. */
|
|
2570
2585
|
const llmCallMetricFormatSchema = z.enum([
|
|
2571
2586
|
"string",
|
|
@@ -2637,6 +2652,30 @@ const apiCallMetricSchema = z.object({
|
|
|
2637
2652
|
*/
|
|
2638
2653
|
placements: z.array(apiCallMetricPlacementSchema).nonempty().optional()
|
|
2639
2654
|
});
|
|
2655
|
+
/**
 * Schema for one model/provider pricing entry used to derive LLM-call costs
 * from token counts when a span does not already record explicit USD costs.
 *
 * Every rate is optional and must be a non-negative number expressed in USD
 * per one million tokens.
 */
const llmCallPricingSchema = z.object({
  /** Exact model name read from the configured `attributes.model` path. */
  model: z.string().min(1),

  /**
   * Optional provider discriminator read from `attributes.provider`. When set,
   * the entry only applies to calls from that provider; provider-specific
   * entries take precedence over generic entries for the same model.
   */
  provider: z.string().min(1).optional(),

  /** USD per one million non-cached input tokens. */
  inputUsdPerMillion: z.number().nonnegative().optional(),

  /** USD per one million output tokens. */
  outputUsdPerMillion: z.number().nonnegative().optional(),

  /** USD per one million prompt-cache read tokens. */
  cachedInputUsdPerMillion: z.number().nonnegative().optional(),

  /** USD per one million prompt-cache write tokens. */
  cacheCreationInputUsdPerMillion: z.number().nonnegative().optional(),

  /** USD per one million reasoning tokens when reported separately. */
  reasoningUsdPerMillion: z.number().nonnegative().optional(),
});
|
|
2640
2679
|
/** Schema for the global LLM calls config block in `agent-evals.config.ts`. */
|
|
2641
2680
|
const llmCallsConfigSchema = z.object({
|
|
2642
2681
|
/** Span kinds treated as LLM calls. Defaults to `['llm']`. */
|
|
@@ -2647,8 +2686,9 @@ const llmCallsConfigSchema = z.object({
|
|
|
2647
2686
|
* built-in defaults (e.g. `usage.inputTokens`, `costUsd`).
|
|
2648
2687
|
*
|
|
2649
2688
|
* Per-token-type cost paths (`inputCost`, `outputCost`, `cachedInputCost`,
|
|
2650
|
-
* `reasoningCost`) feed the cost breakdown table in the expanded row
|
|
2651
|
-
*
|
|
2689
|
+
* `reasoningCost`) feed the cost breakdown table in the expanded row when
|
|
2690
|
+
* spans provide explicit USD cost overrides. Prefer `pricing` for deriving
|
|
2691
|
+
* costs from token counts globally.
|
|
2652
2692
|
*/
|
|
2653
2693
|
attributes: z.object({
|
|
2654
2694
|
model: z.string().optional(),
|
|
@@ -2659,6 +2699,7 @@ const llmCallsConfigSchema = z.object({
|
|
|
2659
2699
|
cacheCreationInputTokens: z.string().optional(),
|
|
2660
2700
|
reasoningTokens: z.string().optional(),
|
|
2661
2701
|
totalTokens: z.string().optional(),
|
|
2702
|
+
tokensPerSecond: z.string().optional(),
|
|
2662
2703
|
cost: z.string().optional(),
|
|
2663
2704
|
inputCost: z.string().optional(),
|
|
2664
2705
|
outputCost: z.string().optional(),
|
|
@@ -2672,6 +2713,12 @@ const llmCallsConfigSchema = z.object({
|
|
|
2672
2713
|
reasoning: z.string().optional(),
|
|
2673
2714
|
toolCalls: z.string().optional()
|
|
2674
2715
|
}).optional(),
|
|
2716
|
+
/**
|
|
2717
|
+
* Model/provider pricing registry used to calculate missing LLM-call costs
|
|
2718
|
+
* from token counts. Explicit span attributes (`costUsd`, `cost.inputUsd`,
|
|
2719
|
+
* etc.) take precedence over derived prices.
|
|
2720
|
+
*/
|
|
2721
|
+
pricing: z.array(llmCallPricingSchema).optional(),
|
|
2675
2722
|
/** Custom user-defined metrics surfaced on each LLM call. */
|
|
2676
2723
|
metrics: z.array(llmCallMetricSchema).optional()
|
|
2677
2724
|
});
|
|
@@ -2719,6 +2766,7 @@ const DEFAULT_LLM_CALLS_CONFIG = {
|
|
|
2719
2766
|
cacheCreationInputTokens: "usage.cacheCreationInputTokens",
|
|
2720
2767
|
reasoningTokens: "usage.reasoningTokens",
|
|
2721
2768
|
totalTokens: "usage.totalTokens",
|
|
2769
|
+
tokensPerSecond: "tokensPerSecond",
|
|
2722
2770
|
cost: "costUsd",
|
|
2723
2771
|
inputCost: "cost.inputUsd",
|
|
2724
2772
|
outputCost: "cost.outputUsd",
|
|
@@ -2732,7 +2780,8 @@ const DEFAULT_LLM_CALLS_CONFIG = {
|
|
|
2732
2780
|
reasoning: "reasoning",
|
|
2733
2781
|
toolCalls: "toolCalls"
|
|
2734
2782
|
},
|
|
2735
|
-
metrics: []
|
|
2783
|
+
metrics: [],
|
|
2784
|
+
pricing: []
|
|
2736
2785
|
};
|
|
2737
2786
|
/** Default API-calls config the UI uses before the workspace fetch resolves. */
|
|
2738
2787
|
const DEFAULT_API_CALLS_CONFIG = {
|
|
@@ -2765,6 +2814,8 @@ const DEFAULT_API_CALLS_CONFIG = {
|
|
|
2765
2814
|
* attribute path.
|
|
2766
2815
|
* - Missing `metrics[].format` defaults to `'string'`.
|
|
2767
2816
|
* - Missing `metrics[].placements` defaults to `['body']`.
|
|
2817
|
+
* - Missing `pricing` defaults to an empty registry; explicit span costs still
|
|
2818
|
+
* take precedence over derived costs.
|
|
2768
2819
|
*/
|
|
2769
2820
|
function resolveLlmCallsConfig(input) {
|
|
2770
2821
|
return {
|
|
@@ -2780,6 +2831,15 @@ function resolveLlmCallsConfig(input) {
|
|
|
2780
2831
|
format: m.format ?? "string",
|
|
2781
2832
|
numberFormat: m.numberFormat,
|
|
2782
2833
|
placements: m.placements ? [...m.placements] : ["body"]
|
|
2834
|
+
})),
|
|
2835
|
+
pricing: (input?.pricing ?? []).map((p) => ({
|
|
2836
|
+
model: p.model,
|
|
2837
|
+
provider: p.provider,
|
|
2838
|
+
inputUsdPerMillion: p.inputUsdPerMillion,
|
|
2839
|
+
outputUsdPerMillion: p.outputUsdPerMillion,
|
|
2840
|
+
cachedInputUsdPerMillion: p.cachedInputUsdPerMillion,
|
|
2841
|
+
cacheCreationInputUsdPerMillion: p.cacheCreationInputUsdPerMillion,
|
|
2842
|
+
reasoningUsdPerMillion: p.reasoningUsdPerMillion
|
|
2783
2843
|
}))
|
|
2784
2844
|
};
|
|
2785
2845
|
}
|
|
@@ -2821,6 +2881,7 @@ const agentEvalsConfigSchema = z.object({
|
|
|
2821
2881
|
allowCliRunAll: z.boolean().optional(),
|
|
2822
2882
|
traceDisplay: traceDisplayInputConfigSchema.optional(),
|
|
2823
2883
|
llmCalls: llmCallsConfigSchema.optional(),
|
|
2884
|
+
removeDefaultConfig: removeDefaultConfigSchema.optional(),
|
|
2824
2885
|
apiCalls: apiCallsConfigSchema.optional(),
|
|
2825
2886
|
runLogs: runLogsConfigSchema.optional(),
|
|
2826
2887
|
cache: z.object({
|
|
@@ -3056,6 +3117,62 @@ function readString$2(attributes, path) {
|
|
|
3056
3117
|
const raw = getNestedAttribute(attributes, path);
|
|
3057
3118
|
return typeof raw === "string" && raw.length > 0 ? raw : null;
|
|
3058
3119
|
}
|
|
3120
|
+
/**
 * Derive the USD cost of one token bucket from a per-million-token price.
 *
 * @param {number | null | undefined} tokens - Token count for the bucket;
 *   nullish means the count was not reported.
 * @param {number | null | undefined} usdPerMillion - Price in USD per one
 *   million tokens; nullish means no price is known for the bucket.
 * @returns {number | null} The derived cost, `0` when the bucket is empty, or
 *   `null` when either the count or the price is unknown.
 */
function computeTokenCost(tokens, usdPerMillion) {
  // Nullish check: a missing (`undefined`) count must not turn into NaN.
  if (tokens == null) return null;
  // Zero tokens cost nothing, even when no price is configured.
  if (tokens === 0) return 0;
  // A missing price means "unknown cost"; never report it as free.
  if (usdPerMillion == null) return null;
  return (tokens / 1e6) * usdPerMillion;
}
|
|
3126
|
+
/**
 * Select the pricing entry that applies to one LLM call.
 *
 * Entries are scanned in order. The first entry whose `model` matches and
 * whose `provider` equals the call's provider wins immediately; otherwise the
 * first matching entry without a `provider` is used as the generic fallback.
 *
 * @param {object} params
 * @param {Array<object> | null | undefined} params.pricing - Pricing
 *   registry; a missing registry is treated as empty.
 * @param {string | null | undefined} params.model - Model name of the call.
 * @param {string | null | undefined} params.provider - Provider of the call.
 * @returns {object | null} The matching entry, or `null` when none applies.
 */
function pickPricingEntry({ pricing, model, provider }) {
  if (model == null) return null;
  let fallback = null;
  // `?? []` keeps config objects that lack a `pricing` field from throwing.
  for (const entry of pricing ?? []) {
    if (entry.model !== model) continue;
    if (entry.provider === void 0) {
      // Remember only the first generic entry; later duplicates are ignored.
      fallback ??= entry;
      continue;
    }
    if (entry.provider === provider) return entry;
  }
  return fallback;
}
|
|
3139
|
+
/**
 * Derive a total USD cost for one LLM call from its per-bucket costs.
 *
 * Each bucket pairs a token count with its cost. Buckets whose count is not
 * reported (nullish) are ignored, and buckets with zero tokens contribute
 * nothing. If any bucket has tokens but no known cost, the total is unknown.
 *
 * @param {object} params - Token counts and costs for the input, output,
 *   cached-input, cache-creation and reasoning buckets. Any field may be
 *   nullish.
 * @returns {number | null} The summed cost; `0` when token counts were
 *   reported but all were zero; `null` when no counts were reported or a
 *   non-empty bucket has no cost.
 */
function computeFallbackTotalCost({ inputTokens, inputCostUsd, outputTokens, outputCostUsd, cachedInputTokens, cachedInputCostUsd, cacheCreationInputTokens, cacheCreationInputCostUsd, reasoningTokens, reasoningCostUsd }) {
  const buckets = [
    [inputTokens, inputCostUsd],
    [outputTokens, outputCostUsd],
    [cachedInputTokens, cachedInputCostUsd],
    [cacheCreationInputTokens, cacheCreationInputCostUsd],
    [reasoningTokens, reasoningCostUsd],
  ];
  let total = 0;
  let hasCost = false;
  let hasReportedTokens = false;
  for (const [tokens, cost] of buckets) {
    // Nullish check: an omitted field counts as "not reported", not as NaN.
    if (tokens == null) continue;
    hasReportedTokens = true;
    if (tokens === 0) continue;
    // Tokens were used but cannot be priced, so the total is unknown.
    if (cost == null) return null;
    total += cost;
    hasCost = true;
  }
  if (hasCost) return total;
  return hasReportedTokens ? 0 : null;
}
|
|
3059
3176
|
function computeLatencyMs$1(span) {
|
|
3060
3177
|
if (span.endedAt === null) return null;
|
|
3061
3178
|
const started = Date.parse(span.startedAt);
|
|
@@ -3100,9 +3217,11 @@ function pickError$1(span) {
|
|
|
3100
3217
|
* shape consumed by the LLM calls tab.
|
|
3101
3218
|
*
|
|
3102
3219
|
* Spans whose `kind` is not in `config.kinds` are dropped. Structured fields
|
|
3103
|
-
* (`model`, token counts, cost, etc.) are read via
|
|
3104
|
-
* the configured paths, with safe coercion to
|
|
3105
|
-
*
|
|
3220
|
+
* (`model`, token counts, explicit cost, etc.) are read via
|
|
3221
|
+
* `getNestedAttribute` from the configured paths, with safe coercion to
|
|
3222
|
+
* `string | null` / `number | null`. When explicit USD costs are absent,
|
|
3223
|
+
* configured model pricing derives per-token-type costs from token counts.
|
|
3224
|
+
* `totalTokens` falls back to a sum of input + output + cached when no
|
|
3106
3225
|
* explicit total attribute is present. The `steps` attribute path may resolve
|
|
3107
3226
|
* to either a number (rendered as the inference-round count) or an array of
|
|
3108
3227
|
* per-step detail objects (rendered as a Steps section in the body, with
|
|
@@ -3118,12 +3237,36 @@ function extractLlmCalls(spans, config) {
|
|
|
3118
3237
|
for (const span of spans) {
|
|
3119
3238
|
if (!kindSet.has(span.kind)) continue;
|
|
3120
3239
|
const attrs = span.attributes;
|
|
3240
|
+
const model = readString$2(attrs, config.attributes.model);
|
|
3241
|
+
const provider = readString$2(attrs, config.attributes.provider);
|
|
3121
3242
|
const inputTokens = readNumber$2(attrs, config.attributes.inputTokens);
|
|
3122
3243
|
const outputTokens = readNumber$2(attrs, config.attributes.outputTokens);
|
|
3123
3244
|
const cachedInputTokens = readNumber$2(attrs, config.attributes.cachedInputTokens);
|
|
3124
3245
|
const cacheCreationInputTokens = readNumber$2(attrs, config.attributes.cacheCreationInputTokens);
|
|
3125
3246
|
const reasoningTokens = readNumber$2(attrs, config.attributes.reasoningTokens);
|
|
3126
3247
|
const declaredTotalTokens = readNumber$2(attrs, config.attributes.totalTokens);
|
|
3248
|
+
const pricing = pickPricingEntry({
|
|
3249
|
+
pricing: config.pricing,
|
|
3250
|
+
model,
|
|
3251
|
+
provider
|
|
3252
|
+
});
|
|
3253
|
+
const inputCostUsd = readNumber$2(attrs, config.attributes.inputCost) ?? computeTokenCost(inputTokens, pricing?.inputUsdPerMillion);
|
|
3254
|
+
const outputCostUsd = readNumber$2(attrs, config.attributes.outputCost) ?? computeTokenCost(outputTokens, pricing?.outputUsdPerMillion);
|
|
3255
|
+
const cachedInputCostUsd = readNumber$2(attrs, config.attributes.cachedInputCost) ?? computeTokenCost(cachedInputTokens, pricing?.cachedInputUsdPerMillion);
|
|
3256
|
+
const cacheCreationInputCostUsd = readNumber$2(attrs, config.attributes.cacheCreationInputCost) ?? computeTokenCost(cacheCreationInputTokens, pricing?.cacheCreationInputUsdPerMillion);
|
|
3257
|
+
const reasoningCostUsd = readNumber$2(attrs, config.attributes.reasoningCost) ?? computeTokenCost(reasoningTokens, pricing?.reasoningUsdPerMillion);
|
|
3258
|
+
const costUsd = readNumber$2(attrs, config.attributes.cost) ?? computeFallbackTotalCost({
|
|
3259
|
+
inputTokens,
|
|
3260
|
+
inputCostUsd,
|
|
3261
|
+
outputTokens,
|
|
3262
|
+
outputCostUsd,
|
|
3263
|
+
cachedInputTokens,
|
|
3264
|
+
cachedInputCostUsd,
|
|
3265
|
+
cacheCreationInputTokens,
|
|
3266
|
+
cacheCreationInputCostUsd,
|
|
3267
|
+
reasoningTokens,
|
|
3268
|
+
reasoningCostUsd
|
|
3269
|
+
});
|
|
3127
3270
|
const metrics = [];
|
|
3128
3271
|
for (const metric of config.metrics) {
|
|
3129
3272
|
const rawValue = getNestedAttribute(attrs, metric.path);
|
|
@@ -3142,8 +3285,8 @@ function extractLlmCalls(spans, config) {
|
|
|
3142
3285
|
name: span.name,
|
|
3143
3286
|
kind: span.kind,
|
|
3144
3287
|
status: span.status,
|
|
3145
|
-
model
|
|
3146
|
-
provider
|
|
3288
|
+
model,
|
|
3289
|
+
provider,
|
|
3147
3290
|
inputTokens,
|
|
3148
3291
|
outputTokens,
|
|
3149
3292
|
cachedInputTokens,
|
|
@@ -3156,12 +3299,13 @@ function extractLlmCalls(spans, config) {
|
|
|
3156
3299
|
cached: cachedInputTokens,
|
|
3157
3300
|
cacheCreation: cacheCreationInputTokens
|
|
3158
3301
|
}),
|
|
3159
|
-
|
|
3160
|
-
|
|
3161
|
-
|
|
3162
|
-
|
|
3163
|
-
|
|
3164
|
-
|
|
3302
|
+
tokensPerSecond: readNumber$2(attrs, config.attributes.tokensPerSecond),
|
|
3303
|
+
costUsd,
|
|
3304
|
+
inputCostUsd,
|
|
3305
|
+
outputCostUsd,
|
|
3306
|
+
cachedInputCostUsd,
|
|
3307
|
+
cacheCreationInputCostUsd,
|
|
3308
|
+
reasoningCostUsd,
|
|
3165
3309
|
...readSteps(attrs, config.attributes.steps),
|
|
3166
3310
|
finishReason: readString$2(attrs, config.attributes.finishReason),
|
|
3167
3311
|
latencyMs: computeLatencyMs$1(span),
|
|
@@ -3792,6 +3936,80 @@ function isRecordLike(value) {
|
|
|
3792
3936
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
3793
3937
|
}
|
|
3794
3938
|
//#endregion
|
|
3939
|
+
//#region ../runner/src/chartValidation.ts
|
|
3940
|
+
/**
 * Shared validation for a chart entry that references a column.
 *
 * Pushes a warning and returns `false` when the referenced column is unknown,
 * or when the entry uses the `passThresholdRate` aggregate on a column that
 * is not a score with a numeric `passThreshold`.
 *
 * @param {{ key: string, aggregate?: string }} ref - Chart entry to check.
 * @param {string} subject - Noun used in warnings ("metric" / "tooltip extra").
 * @param {Map<string, object>} columnsByKey - Column definitions by key.
 * @param {string} evalId - Eval id used as the warning prefix.
 * @param {string[]} warnings - Mutable list that receives warning messages.
 * @returns {boolean} `true` when the entry may be kept.
 */
function isValidChartColumnRef(ref, subject, columnsByKey, evalId, warnings) {
  const columnDef = columnsByKey.get(ref.key);
  if (!columnDef) {
    warnings.push(`[${evalId}] chart ${subject} references unknown column "${ref.key}" — dropped`);
    return false;
  }
  const isThresholdScore = columnDef.isScore === true && typeof columnDef.passThreshold === "number";
  if (ref.aggregate === "passThresholdRate" && !isThresholdScore) {
    warnings.push(`[${evalId}] chart ${subject} "${ref.key}" uses "passThresholdRate" but the column is not a score with passThreshold — dropped`);
    return false;
  }
  return true;
}

/**
 * Check a column-sourced chart metric against the declared columns.
 * Appends a warning to `warnings` and returns `false` when it must be dropped.
 */
function isValidColumnMetric(metric, columnsByKey, evalId, warnings) {
  return isValidChartColumnRef(metric, "metric", columnsByKey, evalId, warnings);
}

/**
 * Check a column-sourced chart tooltip extra against the declared columns.
 * Appends a warning to `warnings` and returns `false` when it must be dropped.
 */
function isValidTooltipExtra(extra, columnsByKey, evalId, warnings) {
  return isValidChartColumnRef(extra, "tooltip extra", columnsByKey, evalId, warnings);
}
|
|
3968
|
+
/**
 * Strip invalid column references from a single authored chart.
 *
 * Entries whose `source` is "builtin" are always kept; every other metric or
 * tooltip extra is kept only if its validator accepts it (the validators
 * append to `warnings`).
 *
 * @returns {object | null} A copy of the chart with filtered `metrics` and
 *   `tooltipExtras` (`undefined` when none remain), or `null` when no metric
 *   survives validation.
 */
function sanitizeChart(chart, columnsByKey, evalId, warnings) {
  const keepMetric = (metric) =>
    metric.source === "builtin" || isValidColumnMetric(metric, columnsByKey, evalId, warnings);
  const keepExtra = (extra) =>
    extra.source === "builtin" || isValidTooltipExtra(extra, columnsByKey, evalId, warnings);

  const keptMetrics = chart.metrics.filter(keepMetric);
  if (keptMetrics.length === 0) {
    warnings.push(`[${evalId}] chart had no valid metrics after validation — chart dropped`);
    return null;
  }

  // Tooltip extras are optional: skip filtering when the chart declares none.
  let keptExtras;
  if (chart.tooltipExtras != null) {
    keptExtras = chart.tooltipExtras.filter(keepExtra);
  }
  const hasExtras = keptExtras !== void 0 && keptExtras.length > 0;

  return {
    ...chart,
    metrics: keptMetrics,
    tooltipExtras: hasExtras ? keptExtras : void 0,
  };
}
|
|
3987
|
+
/**
 * Validate and sanitize an authored `charts` config against the eval's
 * declared columns.
 *
 * Metrics/extras that reference unknown columns or misuse `passThresholdRate`
 * are dropped, and a chart is dropped entirely when none of its metrics are
 * valid. `charts` comes back as `undefined` when nothing valid remains, so
 * the UI falls back to rendering no chart (matching the opt-in default).
 *
 * @param {{ charts?: object[], columnDefs: object[], evalId: string }} params
 * @returns {{ charts: object[] | undefined, warnings: string[] }}
 */
function validateCharts(params) {
  const { charts, columnDefs, evalId } = params;
  const warnings = [];

  // Nothing authored: report "no charts" without touching the column defs.
  if (!charts || charts.length === 0) {
    return { charts: void 0, warnings };
  }

  const columnsByKey = new Map();
  for (const def of columnDefs) {
    columnsByKey.set(def.key, def);
  }

  const sanitized = [];
  for (const chart of charts) {
    const cleaned = sanitizeChart(chart, columnsByKey, evalId, warnings);
    if (cleaned) sanitized.push(cleaned);
  }

  return {
    charts: sanitized.length === 0 ? void 0 : sanitized,
    warnings,
  };
}
|
|
4012
|
+
//#endregion
|
|
3795
4013
|
//#region ../runner/src/columnBuilder.ts
|
|
3796
4014
|
/**
|
|
3797
4015
|
* Normalize a user-provided score definition (either a function or an
|
|
@@ -4003,6 +4221,316 @@ async function loadConfig() {
|
|
|
4003
4221
|
}
|
|
4004
4222
|
}
|
|
4005
4223
|
//#endregion
|
|
4224
|
+
//#region ../runner/src/defaultConfig.ts
|
|
4225
|
+
/**
 * Keys of every built-in eval-level output/column, in the order they are
 * listed here. Consumers filter this list to decide which defaults are active.
 */
const DEFAULT_CONFIG_KEYS = [
  "apiCalls",
  "costUsd",
  "llmTurns",
  "inputTokens",
  "outputTokens",
  "totalTokens",
  "cachedInputTokens",
  "cacheCreationInputTokens",
  "reasoningTokens",
  "llmLatencyMs",
];
|
|
4237
|
+
/** Number format shared by all token-count columns (compact, one decimal). */
const tokenNumberFormat = {
  notation: "compact",
  decimalPlaces: 1,
};

/** Number format shared by plain count columns (no decimals). */
const countNumberFormat = { decimalPlaces: 0 };

/**
 * Build a right-aligned, sortable numeric column definition.
 * The `numberFormat` object is stored by reference, not copied.
 */
const numberColumn = (label, numberFormat) => ({
  label,
  format: "number",
  numberFormat,
  align: "right",
  sortable: true,
});

/** Column definitions for the built-in eval-level outputs, keyed by output key. */
const DEFAULT_COLUMNS = {
  apiCalls: numberColumn("API Calls", countNumberFormat),
  costUsd: numberColumn("Cost", { prefix: "$", decimalPlaces: 4 }),
  llmTurns: numberColumn("LLM Turns", countNumberFormat),
  inputTokens: numberColumn("Input Tokens", tokenNumberFormat),
  outputTokens: numberColumn("Output Tokens", tokenNumberFormat),
  totalTokens: numberColumn("Total Tokens", tokenNumberFormat),
  cachedInputTokens: numberColumn("Cached Input Tokens", tokenNumberFormat),
  cacheCreationInputTokens: numberColumn("Cache Write Tokens", tokenNumberFormat),
  reasoningTokens: numberColumn("Reasoning Tokens", tokenNumberFormat),
  // Latency is the only duration column and carries no number format.
  llmLatencyMs: {
    label: "LLM Latency",
    format: "duration",
    align: "right",
    sortable: true,
  },
};
|
|
4316
|
+
/**
 * Combine the global and per-eval removal settings into one set of keys.
 *
 * @param {true | string[] | undefined} globalRemove - Workspace-level setting.
 * @param {true | string[] | undefined} evalRemove - Eval-level setting.
 * @returns {Set<string>} Every built-in key when either setting is `true`;
 *   otherwise the union of both lists.
 */
function resolveRemovedKeys(globalRemove, evalRemove) {
  const removesEverything = globalRemove === true || evalRemove === true;
  if (removesEverything) {
    return new Set(DEFAULT_CONFIG_KEYS);
  }
  const removed = new Set(globalRemove ?? []);
  for (const key of evalRemove ?? []) {
    removed.add(key);
  }
  return removed;
}
|
|
4320
|
+
/**
 * List the built-in config keys that remain after removals are applied.
 *
 * @param {{ globalRemove?: true | string[], evalRemove?: true | string[] }} params
 * @returns {string[]} Active keys, in `DEFAULT_CONFIG_KEYS` order.
 */
function getActiveDefaultConfigKeys(params) {
  const removedKeys = resolveRemovedKeys(params.globalRemove, params.evalRemove);
  const activeKeys = [];
  for (const key of DEFAULT_CONFIG_KEYS) {
    if (!removedKeys.has(key)) activeKeys.push(key);
  }
  return activeKeys;
}
|
|
4324
|
+
/**
 * Merge the active built-in column definitions with the authored columns.
 *
 * Authored columns are spread last, so an authored definition replaces the
 * built-in one for the same key. When every built-in key has been removed,
 * `params.columns` is returned untouched (it may be `undefined`).
 *
 * @param {{ columns?: object, globalRemove?: true | string[], evalRemove?: true | string[] }} params
 * @returns {object | undefined} The merged column map.
 */
function mergeDefaultColumns(params) {
  const activeKeys = getActiveDefaultConfigKeys(params);
  if (activeKeys.length === 0) {
    return params.columns;
  }
  const builtinColumns = {};
  for (const key of activeKeys) {
    builtinColumns[key] = DEFAULT_COLUMNS[key];
  }
  return { ...builtinColumns, ...params.columns };
}
|
|
4332
|
+
/**
 * Append the built-in column stats to the authored stats.
 *
 * A built-in stat is added only when its key is still active. Authored stats
 * come first and the built-in ones follow in a fixed order.
 *
 * @param {{ stats?: object[], globalRemove?: true | string[], evalRemove?: true | string[] }} params
 * @returns {object[] | undefined} The combined list, or `undefined` when empty.
 */
function appendDefaultStats(params) {
  const activeKeys = new Set(getActiveDefaultConfigKeys(params));

  // [column key, stat label, aggregate], in the order the stats are emitted.
  const builtinStats = [
    ["apiCalls", "API Calls", "avg"],
    ["costUsd", "LLM Cost", "sum"],
    ["totalTokens", "Tokens", "sum"],
    ["llmTurns", "LLM Turns", "avg"],
    ["llmLatencyMs", "LLM Latency", "avg"],
  ];

  const defaults = builtinStats
    .filter(([key]) => activeKeys.has(key))
    .map(([key, label, aggregate]) => ({
      kind: "column",
      key,
      label,
      aggregate,
    }));

  const merged = [...(params.stats ?? []), ...defaults];
  return merged.length > 0 ? merged : void 0;
}
|
|
4368
|
+
/**
 * Append the built-in charts to the authored charts.
 *
 * Adds an "API Calls" bar chart, an "LLM Cost" area chart, and an "LLM Tokens"
 * bar chart, each only while the keys it plots are still active. The tokens
 * chart is emitted when at least one of input/output/reasoning tokens is
 * active, and carries a "Total" tooltip extra only while `totalTokens` is
 * active.
 *
 * @param {{ charts?: object[], globalRemove?: true | string[], evalRemove?: true | string[] }} params
 * @returns {object[] | undefined} The combined list, or `undefined` when empty.
 */
function appendDefaultCharts(params) {
  const activeKeys = new Set(getActiveDefaultConfigKeys(params));

  // Every built-in metric sums a column; only key, label and color vary.
  const sumMetric = (key, label, color) => ({
    source: "column",
    key,
    aggregate: "sum",
    label,
    color,
  });

  const defaults = [];

  if (activeKeys.has("apiCalls")) {
    defaults.push({
      heading: "API Calls",
      type: "bar",
      metrics: [sumMetric("apiCalls", "API Calls", "accentDim")],
    });
  }

  if (activeKeys.has("costUsd")) {
    defaults.push({
      heading: "LLM Cost",
      type: "area",
      metrics: [sumMetric("costUsd", "Cost", "warning")],
    });
  }

  const tokenMetrics = [
    ["inputTokens", "Input", "accent"],
    ["outputTokens", "Output", "success"],
    ["reasoningTokens", "Reasoning", "error"],
  ]
    .filter(([key]) => activeKeys.has(key))
    .map(([key, label, color]) => sumMetric(key, label, color));

  if (tokenMetrics.length > 0) {
    let tooltipExtras;
    if (activeKeys.has("totalTokens")) {
      tooltipExtras = [{
        source: "column",
        key: "totalTokens",
        aggregate: "sum",
        label: "Total",
      }];
    }
    defaults.push({
      heading: "LLM Tokens",
      type: "bar",
      metrics: tokenMetrics,
      tooltipExtras,
    });
  }

  const merged = [...(params.charts ?? []), ...defaults];
  return merged.length > 0 ? merged : void 0;
}
|
|
4430
|
+
/**
 * Resolve the effective columns, stats and charts for one eval by layering
 * the built-in defaults onto what the eval definition authored.
 *
 * @param {{ evalDef: object, globalRemove?: true | string[] }} params -
 *   `evalDef` supplies `columns`, `stats`, `charts` and its own
 *   `removeDefaultConfig`; `globalRemove` is the workspace-level removal.
 * @returns {{ columns: object | undefined, stats: object[] | undefined, charts: object[] | undefined }}
 */
function resolveEvalDefaultConfig(params) {
  const { evalDef, globalRemove } = params;
  // Both removal settings are forwarded unchanged to each merge helper.
  const removal = {
    globalRemove,
    evalRemove: evalDef.removeDefaultConfig,
  };
  const columns = mergeDefaultColumns({ columns: evalDef.columns, ...removal });
  const stats = appendDefaultStats({ stats: evalDef.stats, ...removal });
  const charts = appendDefaultCharts({ charts: evalDef.charts, ...removal });
  return { columns, stats, charts };
}
|
|
4450
|
+
/**
 * Sum the numeric entries of a list, ignoring missing ones.
 *
 * @param {Iterable<number | null | undefined>} values - Values to add up.
 * @returns {number | undefined} The sum of the non-nullish entries, or
 *   `undefined` when there are none (so callers can tell "no data" from `0`).
 */
function sumNullable(values) {
  let total = 0;
  let hasValue = false;
  for (const value of values) {
    // Skip `undefined` as well as `null`: adding `undefined` would yield NaN.
    if (value == null) continue;
    total += value;
    hasValue = true;
  }
  return hasValue ? total : void 0;
}
|
|
4460
|
+
/**
 * Write one built-in output value unless it must be left alone.
 *
 * The value is written to `params.outputs[params.key]` only when the key is
 * in `params.activeKeys`, the key is not already present in `params.outputs`,
 * and the value is not `undefined`.
 *
 * @param {{ outputs: object, key: string, value: unknown, activeKeys: Set<string> }} params
 * @returns {void}
 */
function assignIfMissing(params) {
  const { activeKeys, key, outputs, value } = params;
  // Short-circuit order matters: `outputs` is only inspected for active keys.
  const shouldAssign = activeKeys.has(key) && !(key in outputs) && value !== void 0;
  if (shouldAssign) {
    outputs[key] = value;
  }
}
|
|
4466
|
+
/**
 * Fill in the built-in eval-level outputs from the recorded spans.
 *
 * Does nothing when every built-in key has been removed. Otherwise it
 * extracts LLM calls and API calls from `params.spans` and writes, via
 * `assignIfMissing`, the API-call count, the LLM-turn count, and per-field
 * sums over the LLM calls. Existing entries in `params.outputs` are never
 * overwritten. LLM-derived outputs are skipped when no LLM calls were found.
 *
 * @param {object} params - `outputs` (mutated in place), `spans`,
 *   `llmCallsConfig`, `apiCallsConfig`, `globalRemove`, `evalRemove`.
 * @returns {void}
 */
function addDefaultOutputs(params) {
  const activeKeys = new Set(getActiveDefaultConfigKeys(params));
  if (activeKeys.size === 0) return;

  const { outputs } = params;
  const llmCalls = extractLlmCalls(params.spans, params.llmCallsConfig);
  const apiCallCount = extractApiCalls(params.spans, params.apiCallsConfig).length;

  const assign = (key, value) => assignIfMissing({
    outputs,
    key,
    value,
    activeKeys,
  });

  // A zero API-call count is reported as "no value" rather than 0.
  assign("apiCalls", apiCallCount > 0 ? apiCallCount : void 0);

  if (llmCalls.length === 0) return;
  assign("llmTurns", llmCalls.length);

  // [output key, LLM-call field summed into it], in assignment order.
  const summedOutputs = [
    ["costUsd", "costUsd"],
    ["inputTokens", "inputTokens"],
    ["outputTokens", "outputTokens"],
    ["totalTokens", "totalTokens"],
    ["cachedInputTokens", "cachedInputTokens"],
    ["cacheCreationInputTokens", "cacheCreationInputTokens"],
    ["reasoningTokens", "reasoningTokens"],
    ["llmLatencyMs", "latencyMs"],
  ];
  for (const [outputKey, callField] of summedOutputs) {
    assign(outputKey, sumNullable(llmCalls.map((call) => call[callField])));
  }
}
|
|
4533
|
+
//#endregion
|
|
4006
4534
|
//#region ../runner/src/discovery.ts
|
|
4007
4535
|
const evalIdMatchRegex = /\bid\s*:\s*['"]([^'"]+)['"]/;
|
|
4008
4536
|
const evalTitleMatchRegex = /\btitle\s*:\s*['"]([^'"]+)['"]/;
|
|
@@ -4664,7 +5192,7 @@ async function callWithUnknownResult(fn, args) {
|
|
|
4664
5192
|
return await Reflect.apply(fn, void 0, args);
|
|
4665
5193
|
}
|
|
4666
5194
|
async function runCase(params) {
|
|
4667
|
-
const { evalDef, evalId, evalCase, globalTraceDisplay, trial, startTime, cacheAdapter, cacheMode, codeFingerprint, moduleIsolation, evalFilePath, workspaceRoot, artifactDir, runId } = params;
|
|
5195
|
+
const { evalDef, evalId, evalCase, globalTraceDisplay, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode, codeFingerprint, moduleIsolation, evalFilePath, workspaceRoot, artifactDir, runId } = params;
|
|
4668
5196
|
const scopedIdPrefix = buildScopedEvalIdPrefix({
|
|
4669
5197
|
evalId,
|
|
4670
5198
|
evalFilePath,
|
|
@@ -4714,6 +5242,14 @@ async function runCase(params) {
|
|
|
4714
5242
|
scope.assertionFailures.push(toAssertionFailure(message, e instanceof Error ? e : void 0));
|
|
4715
5243
|
}
|
|
4716
5244
|
}
|
|
5245
|
+
if (!nonAssertError) addDefaultOutputs({
|
|
5246
|
+
outputs: scope.outputs,
|
|
5247
|
+
spans: scope.spans,
|
|
5248
|
+
llmCallsConfig,
|
|
5249
|
+
apiCallsConfig,
|
|
5250
|
+
globalRemove: globalRemoveDefaultConfig,
|
|
5251
|
+
evalRemove: evalDef.removeDefaultConfig
|
|
5252
|
+
});
|
|
4717
5253
|
if (!nonAssertError && evalDef.outputsSchema) {
|
|
4718
5254
|
const { outputsSchema } = evalDef;
|
|
4719
5255
|
const parsedOutputs = await runInExistingEvalScope(scope, "outputsSchema", () => outputsSchema.safeParse(getOutputsSchemaInput(outputsSchema, scope.outputs)));
|
|
@@ -4795,6 +5331,11 @@ async function runCase(params) {
|
|
|
4795
5331
|
const status = nonAssertError ? "error" : passed ? "pass" : "fail";
|
|
4796
5332
|
const { trace: displayTrace, traceDisplay } = resolveTracePresentation(scope.spans, globalTraceDisplay, evalDef.traceDisplay);
|
|
4797
5333
|
const columns = {};
|
|
5334
|
+
const columnOverrides = mergeDefaultColumns({
|
|
5335
|
+
columns: evalDef.columns,
|
|
5336
|
+
globalRemove: globalRemoveDefaultConfig,
|
|
5337
|
+
evalRemove: evalDef.removeDefaultConfig
|
|
5338
|
+
});
|
|
4798
5339
|
for (const [key, value] of Object.entries(scope.outputs)) {
|
|
4799
5340
|
const cell = isBlob(value) ? await persistInlineArtifact({
|
|
4800
5341
|
artifactDir,
|
|
@@ -4803,7 +5344,7 @@ async function runCase(params) {
|
|
|
4803
5344
|
outputKey: key,
|
|
4804
5345
|
trial,
|
|
4805
5346
|
value
|
|
4806
|
-
}) : toCellValue(value,
|
|
5347
|
+
}) : toCellValue(value, columnOverrides?.[key]);
|
|
4807
5348
|
if (cell !== void 0) columns[key] = cell;
|
|
4808
5349
|
}
|
|
4809
5350
|
for (const key of Object.keys(evalDef.manualScores ?? {})) columns[key] = null;
|
|
@@ -5016,6 +5557,8 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
5016
5557
|
key: runState.manifest.id,
|
|
5017
5558
|
workspaceRoot
|
|
5018
5559
|
};
|
|
5560
|
+
const llmCallsConfig = resolveLlmCallsConfig(config.llmCalls);
|
|
5561
|
+
const apiCallsConfig = resolveApiCallsConfig(config.apiCalls);
|
|
5019
5562
|
for (const evalMeta of targetEvals) {
|
|
5020
5563
|
const evalFilePath = evalMeta.sourceFilePath;
|
|
5021
5564
|
let codeFingerprint = "";
|
|
@@ -5054,7 +5597,20 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
5054
5597
|
evalId: evalMeta.id
|
|
5055
5598
|
}), request.target.evalIds, request.target.caseIds, evalMeta.id);
|
|
5056
5599
|
runState.summary.totalCases += cases.length;
|
|
5057
|
-
const
|
|
5600
|
+
const defaultConfig = resolveEvalDefaultConfig({
|
|
5601
|
+
evalDef,
|
|
5602
|
+
globalRemove: config.removeDefaultConfig
|
|
5603
|
+
});
|
|
5604
|
+
const declaredColumnDefs = buildDeclaredColumnDefs(defaultConfig.columns, evalDef.scores, evalDef.manualScores);
|
|
5605
|
+
const accumulatedColumns = new Map(declaredColumnDefs.map((def) => [def.key, def]));
|
|
5606
|
+
const validatedCharts = validateCharts({
|
|
5607
|
+
charts: defaultConfig.charts,
|
|
5608
|
+
columnDefs: declaredColumnDefs,
|
|
5609
|
+
evalId: evalMeta.id
|
|
5610
|
+
});
|
|
5611
|
+
for (const warning of validatedCharts.warnings) console.warn(warning);
|
|
5612
|
+
evalMeta.stats = defaultConfig.stats;
|
|
5613
|
+
evalMeta.charts = validatedCharts.charts;
|
|
5058
5614
|
const evalCaseRows = [];
|
|
5059
5615
|
const preparedCases = [];
|
|
5060
5616
|
const scoreKeys = Object.freeze(Object.keys(evalDef.scores ?? {}));
|
|
@@ -5066,7 +5622,7 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
5066
5622
|
preparedCases,
|
|
5067
5623
|
scoreKeys: Object.freeze([...scoreKeys, ...manualScoreKeys]),
|
|
5068
5624
|
mergeColumns: (columns) => {
|
|
5069
|
-
mergeColumnDefs(accumulatedColumns, columns,
|
|
5625
|
+
mergeColumnDefs(accumulatedColumns, columns, defaultConfig.columns, evalDef.scores, evalDef.manualScores);
|
|
5070
5626
|
}
|
|
5071
5627
|
};
|
|
5072
5628
|
preparedEvals.push(preparedEval);
|
|
@@ -5087,6 +5643,9 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
5087
5643
|
evalId: evalMeta.id,
|
|
5088
5644
|
evalCase,
|
|
5089
5645
|
globalTraceDisplay,
|
|
5646
|
+
llmCallsConfig,
|
|
5647
|
+
apiCallsConfig,
|
|
5648
|
+
globalRemoveDefaultConfig: config.removeDefaultConfig,
|
|
5090
5649
|
trial,
|
|
5091
5650
|
startTime,
|
|
5092
5651
|
cacheAdapter: bufferedCacheStore ?? (cacheEnabled ? cacheStore : null),
|
|
@@ -5237,4 +5796,4 @@ function toLastRunStatus(status) {
|
|
|
5237
5796
|
return status === "pending" ? null : status;
|
|
5238
5797
|
}
|
|
5239
5798
|
//#endregion
|
|
5240
|
-
export {
|
|
5799
|
+
export { removeDefaultConfigSchema as $, columnKindSchema as $t, extractApiCalls as A, setEvalOutput as An, cacheFileSchema as At, DEFAULT_API_CALLS_CONFIG as B, traceAttributeDisplayFormatSchema as Bt, validateCharts as C, incrementEvalOutput as Cn, evalChartTooltipExtraSchema as Ct, sseEnvelopeSchema as D, runInEvalRuntimeScope as Dn, cacheDebugKeyFileSchema as Dt, updateManualScoreRequestSchema as E, nextEvalId as En, cacheDebugKeyEntrySchema as Et, deriveScopedSummaryFromCases as F, getEvalRegistry as Fn, cacheRecordingSchema as Ft, apiCallMetricSchema as G, traceDisplayInputConfigSchema as Gt, agentEvalsConfigSchema as H, traceAttributeDisplayPlacementSchema as Ht, deriveStatusFromCaseRows as I, cacheStatusSchema as It, llmCallMetricFormatSchema as J, traceSpanSchema as Jt, apiCallsConfigSchema as K, traceSpanErrorSchema as Kt, deriveStatusFromChildStatuses as L, serializedCacheSpanSchema as Lt, getNestedAttribute as M, startEvalBackgroundJob as Mn, cacheModeSchema as Mt, getEvalTitle as N, repoFile as Nn, cacheOperationTypeSchema as Nt, extractCacheEntries as O, runInEvalScope as On, cacheEntrySchema as Ot, getEvalDisplayStatus as P, defineEval as Pn, cacheRecordingOpSchema as Pt, llmCallsConfigSchema as Q, columnFormatSchema as Qt, runManifestSchema as R, spanCacheOptionsSchema as Rt, normalizeScoreDef as S, getEvalCaseInput as Sn, evalChartMetricSchema as St, createRunRequestSchema as T, mergeEvalOutput as Tn, evalChartsConfigSchema as Tt, apiCallMetricFormatSchema as U, traceAttributeDisplaySchema as Ut, DEFAULT_LLM_CALLS_CONFIG as V, traceAttributeDisplayInputSchema as Vt, apiCallMetricPlacementSchema as W, traceDisplayConfigSchema as Wt, llmCallMetricSchema as X, cellValueSchema as Xt, llmCallMetricPlacementSchema as Y, traceSpanWarningSchema as Yt, llmCallPricingSchema as Z, columnDefSchema as Zt, loadEvalModule as _, appendToEvalOutput as _n, evalChartAggregateSchema as _t, loadPersistedRunSnapshot as a, z$1 as an, caseDetailSchema as at, loadConfig as 
b, evalLog as bn, evalChartColorSchema as bt, persistCaseDetail as c, evalSpan as cn, evalStatAggregateSchema as ct, recomputePersistedCaseStatus as d, hashCacheKeySync as dn, evalSummarySchema as dt, fileRefSchema as en, resolveApiCallsConfig as et, runTouchesEval as f, deserializeCacheRecording as fn, runLogEntrySchema as ft, setLatestRunInfoMap as g, EvalAssertionError as gn, scoreTraceSchema as gt, getTargetEvalIds as h, serializeCacheValue as hn, runLogPhaseSchema as ht, getLatestRunInfos as i, runArtifactRefSchema as in, assertionFailureSchema as it, extractLlmCalls as j, setScopeCacheContext as jn, cacheListItemSchema as jt, extractCacheHits as k, runInExistingEvalScope as kn, cacheEntryWithDebugKeySchema as kt, persistRunState as l, evalTracer as ln, evalStatItemSchema as lt, buildEvalSummary as m, serializeCacheRecording as mn, runLogLocationSchema as mt, generateRunId as n, numberDisplayOptionsSchema as nn, runLogsConfigSchema as nt, loadPersistedRunSnapshots as o, buildTraceTree as on, caseRowSchema as ot, resolveArtifactPath as p, deserializeCacheValue as pn, runLogLevelSchema as pt, defaultConfigKeySchema as q, traceSpanKindSchema as qt, getLastRunStatuses as r, repoFileRefSchema as rn, trialSelectionModeSchema as rt, nextShortIdFromSnapshots as s, captureEvalSpanError as sn, evalFreshnessStatusSchema as st, executeRun as t, jsonCellSchema as tn, resolveLlmCallsConfig as tt, recomputeEvalStatusesInRuns as u, hashCacheKey as un, evalStatsConfigSchema as ut, parseEvalMetas as v, configureEvalRunLogs as vn, evalChartAxisSchema as vt, createFsCacheStore as w, isInEvalScope as wn, evalChartTypeSchema as wt, buildDeclaredColumnDefs as x, getCurrentScope as xn, evalChartConfigSchema as xt, resolveEvalDefaultConfig as y, evalAssert as yn, evalChartBuiltinMetricSchema as yt, runSummarySchema as z, traceCacheRefSchema as zt };
|