@ls-stack/agent-eval 0.20.0 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-DsiLU65H.mjs → app-moDHbg1O.mjs} +3 -3
- package/dist/apps/web/dist/assets/index-AUDD3rNB.js +118 -0
- package/dist/apps/web/dist/assets/{index-CvR6QCLa.css → index-r0dVFK0B.css} +1 -1
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/{cli-weogme5U.mjs → cli-C0EtHhEO.mjs} +3 -3
- package/dist/index.d.mts +56 -61
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +1 -1
- package/dist/{runOrchestration-Cv1kiOAG.mjs → runOrchestration-D1edUDhp.mjs} +155 -140
- package/dist/{runner-DzrMtgBu.mjs → runner-C9nP2VKL.mjs} +2 -2
- package/dist/{runner-B25oRQxX.mjs → runner-CyRhIzci.mjs} +1 -1
- package/dist/src-D-HuV8I-.mjs +3 -0
- package/package.json +1 -1
- package/skills/agent-eval/SKILL.md +30 -20
- package/dist/apps/web/dist/assets/index-Cba4MFa0.js +0 -118
- package/dist/src-B879LZfo.mjs +0 -3
|
@@ -1970,7 +1970,15 @@ const numberDisplayOptionsSchema = z.object({
|
|
|
1970
1970
|
compactDisplay: z.enum(["short", "long"]).optional(),
|
|
1971
1971
|
prefix: z.string().optional(),
|
|
1972
1972
|
suffix: z.string().optional(),
|
|
1973
|
-
|
|
1973
|
+
minDecimalPlaces: z.number().int().min(0).optional(),
|
|
1974
|
+
maxDecimalPlaces: z.number().int().min(0).optional()
|
|
1975
|
+
}).refine((options) => {
|
|
1976
|
+
if (options.minDecimalPlaces === void 0) return true;
|
|
1977
|
+
if (options.maxDecimalPlaces === void 0) return true;
|
|
1978
|
+
return options.minDecimalPlaces <= options.maxDecimalPlaces;
|
|
1979
|
+
}, {
|
|
1980
|
+
message: "minDecimalPlaces must be less than or equal to maxDecimalPlaces",
|
|
1981
|
+
path: ["minDecimalPlaces"]
|
|
1974
1982
|
});
|
|
1975
1983
|
/** Schema for the supported column rendering kinds in list views. */
|
|
1976
1984
|
const columnKindSchema = z.enum([
|
|
@@ -2005,7 +2013,6 @@ const columnDefSchema = z.object({
|
|
|
2005
2013
|
passThreshold: z.number().optional(),
|
|
2006
2014
|
maxStars: z.number().int().min(2).optional(),
|
|
2007
2015
|
hideInTable: z.boolean().optional(),
|
|
2008
|
-
sortable: z.boolean().optional(),
|
|
2009
2016
|
align: z.enum([
|
|
2010
2017
|
"left",
|
|
2011
2018
|
"center",
|
|
@@ -2403,6 +2410,8 @@ const evalStatItemSchema = z.discriminatedUnion("kind", [
|
|
|
2403
2410
|
label: z.string().optional(),
|
|
2404
2411
|
aggregate: evalStatAggregateSchema,
|
|
2405
2412
|
format: columnFormatSchema.optional(),
|
|
2413
|
+
/** Number presentation options applied when `format: 'number'`. */
|
|
2414
|
+
numberFormat: numberDisplayOptionsSchema.optional(),
|
|
2406
2415
|
accent: z.boolean().optional()
|
|
2407
2416
|
})
|
|
2408
2417
|
]);
|
|
@@ -2459,7 +2468,8 @@ const caseRowSchema = z.object({
|
|
|
2459
2468
|
"error",
|
|
2460
2469
|
"cancelled"
|
|
2461
2470
|
]),
|
|
2462
|
-
|
|
2471
|
+
/** Elapsed case execution duration in milliseconds, or null before completion. */
|
|
2472
|
+
durationMs: z.number().nullable(),
|
|
2463
2473
|
costUsd: z.number().nullable().optional(),
|
|
2464
2474
|
columns: z.record(z.string(), cellValueSchema),
|
|
2465
2475
|
/** Winning trial index for the persisted case result. */
|
|
@@ -2577,7 +2587,7 @@ const defaultConfigKeySchema = z.enum([
|
|
|
2577
2587
|
"cachedInputTokens",
|
|
2578
2588
|
"cacheCreationInputTokens",
|
|
2579
2589
|
"reasoningTokens",
|
|
2580
|
-
"llmLatencyMs"
|
|
2590
|
+
"llmDurationMs"
|
|
2581
2591
|
]);
|
|
2582
2592
|
/** Removal config for built-in eval-level outputs and UI metadata. */
|
|
2583
2593
|
const removeDefaultConfigSchema = z.union([z.literal(true), z.array(defaultConfigKeySchema)]);
|
|
@@ -2654,7 +2664,7 @@ const apiCallMetricSchema = z.object({
|
|
|
2654
2664
|
});
|
|
2655
2665
|
/**
|
|
2656
2666
|
* Schema for one model/provider pricing entry used to derive LLM-call costs
|
|
2657
|
-
* from token counts
|
|
2667
|
+
* from token counts.
|
|
2658
2668
|
*/
|
|
2659
2669
|
const llmCallPricingSchema = z.object({
|
|
2660
2670
|
/** Exact model name read from the configured `attributes.model` path. */
|
|
@@ -2673,6 +2683,8 @@ const llmCallPricingSchema = z.object({
|
|
|
2673
2683
|
cachedInputUsdPerMillion: z.number().nonnegative().optional(),
|
|
2674
2684
|
/** USD per one million prompt-cache write tokens. */
|
|
2675
2685
|
cacheCreationInputUsdPerMillion: z.number().nonnegative().optional(),
|
|
2686
|
+
/** USD per one million one-hour prompt-cache write tokens. */
|
|
2687
|
+
cacheCreationInput1hUsdPerMillion: z.number().nonnegative().optional(),
|
|
2676
2688
|
/** USD per one million reasoning tokens when reported separately. */
|
|
2677
2689
|
reasoningUsdPerMillion: z.number().nonnegative().optional()
|
|
2678
2690
|
});
|
|
@@ -2683,12 +2695,9 @@ const llmCallsConfigSchema = z.object({
|
|
|
2683
2695
|
/**
|
|
2684
2696
|
* Attribute paths used to extract structured per-call fields. Each entry is
|
|
2685
2697
|
* a dot-path inside `span.attributes`. Missing paths fall back to the
|
|
2686
|
-
* built-in defaults (e.g. `usage.inputTokens
|
|
2687
|
-
*
|
|
2688
|
-
*
|
|
2689
|
-
* `reasoningCost`) feed the cost breakdown table in the expanded row when
|
|
2690
|
-
* spans provide explicit USD cost overrides. Prefer `pricing` for deriving
|
|
2691
|
-
* costs from token counts globally.
|
|
2698
|
+
* built-in defaults (e.g. `usage.inputTokens`). Derived fields such as
|
|
2699
|
+
* total tokens, tokens/sec, duration, and USD costs are intentionally not
|
|
2700
|
+
* configurable as attribute paths.
|
|
2692
2701
|
*/
|
|
2693
2702
|
attributes: z.object({
|
|
2694
2703
|
model: z.string().optional(),
|
|
@@ -2697,15 +2706,9 @@ const llmCallsConfigSchema = z.object({
|
|
|
2697
2706
|
outputTokens: z.string().optional(),
|
|
2698
2707
|
cachedInputTokens: z.string().optional(),
|
|
2699
2708
|
cacheCreationInputTokens: z.string().optional(),
|
|
2709
|
+
cacheCreationInput1hTokens: z.string().optional(),
|
|
2700
2710
|
reasoningTokens: z.string().optional(),
|
|
2701
|
-
|
|
2702
|
-
tokensPerSecond: z.string().optional(),
|
|
2703
|
-
cost: z.string().optional(),
|
|
2704
|
-
inputCost: z.string().optional(),
|
|
2705
|
-
outputCost: z.string().optional(),
|
|
2706
|
-
cachedInputCost: z.string().optional(),
|
|
2707
|
-
cacheCreationInputCost: z.string().optional(),
|
|
2708
|
-
reasoningCost: z.string().optional(),
|
|
2711
|
+
latencyMs: z.string().optional(),
|
|
2709
2712
|
steps: z.string().optional(),
|
|
2710
2713
|
finishReason: z.string().optional(),
|
|
2711
2714
|
input: z.string().optional(),
|
|
@@ -2714,9 +2717,8 @@ const llmCallsConfigSchema = z.object({
|
|
|
2714
2717
|
toolCalls: z.string().optional()
|
|
2715
2718
|
}).optional(),
|
|
2716
2719
|
/**
|
|
2717
|
-
* Model/provider pricing registry used to calculate
|
|
2718
|
-
*
|
|
2719
|
-
* etc.) take precedence over derived prices.
|
|
2720
|
+
* Model/provider pricing registry used to calculate LLM-call costs from
|
|
2721
|
+
* token counts. Built-in LLM cost fields are only derived from this registry.
|
|
2720
2722
|
*/
|
|
2721
2723
|
pricing: z.array(llmCallPricingSchema).optional(),
|
|
2722
2724
|
/** Custom user-defined metrics surfaced on each LLM call. */
|
|
@@ -2764,15 +2766,9 @@ const DEFAULT_LLM_CALLS_CONFIG = {
|
|
|
2764
2766
|
outputTokens: "usage.outputTokens",
|
|
2765
2767
|
cachedInputTokens: "usage.cachedInputTokens",
|
|
2766
2768
|
cacheCreationInputTokens: "usage.cacheCreationInputTokens",
|
|
2769
|
+
cacheCreationInput1hTokens: "usage.cacheCreationInput1hTokens",
|
|
2767
2770
|
reasoningTokens: "usage.reasoningTokens",
|
|
2768
|
-
|
|
2769
|
-
tokensPerSecond: "tokensPerSecond",
|
|
2770
|
-
cost: "costUsd",
|
|
2771
|
-
inputCost: "cost.inputUsd",
|
|
2772
|
-
outputCost: "cost.outputUsd",
|
|
2773
|
-
cachedInputCost: "cost.cachedInputUsd",
|
|
2774
|
-
cacheCreationInputCost: "cost.cacheCreationInputUsd",
|
|
2775
|
-
reasoningCost: "cost.reasoningUsd",
|
|
2771
|
+
latencyMs: "latencyMs",
|
|
2776
2772
|
steps: "steps",
|
|
2777
2773
|
finishReason: "finishReason",
|
|
2778
2774
|
input: "input",
|
|
@@ -2814,8 +2810,8 @@ const DEFAULT_API_CALLS_CONFIG = {
|
|
|
2814
2810
|
* attribute path.
|
|
2815
2811
|
* - Missing `metrics[].format` defaults to `'string'`.
|
|
2816
2812
|
* - Missing `metrics[].placements` defaults to `['body']`.
|
|
2817
|
-
* - Missing `pricing` defaults to an empty registry;
|
|
2818
|
-
*
|
|
2813
|
+
* - Missing `pricing` defaults to an empty registry; built-in costs are only
|
|
2814
|
+
* derived from configured pricing and token counts.
|
|
2819
2815
|
*/
|
|
2820
2816
|
function resolveLlmCallsConfig(input) {
|
|
2821
2817
|
return {
|
|
@@ -2839,6 +2835,7 @@ function resolveLlmCallsConfig(input) {
|
|
|
2839
2835
|
outputUsdPerMillion: p.outputUsdPerMillion,
|
|
2840
2836
|
cachedInputUsdPerMillion: p.cachedInputUsdPerMillion,
|
|
2841
2837
|
cacheCreationInputUsdPerMillion: p.cacheCreationInputUsdPerMillion,
|
|
2838
|
+
cacheCreationInput1hUsdPerMillion: p.cacheCreationInput1hUsdPerMillion,
|
|
2842
2839
|
reasoningUsdPerMillion: p.reasoningUsdPerMillion
|
|
2843
2840
|
}))
|
|
2844
2841
|
};
|
|
@@ -3028,8 +3025,8 @@ function deriveScopedSummaryFromCases(params) {
|
|
|
3028
3025
|
else if (caseRow.status === "cancelled") cancelledCases += 1;
|
|
3029
3026
|
else if (caseRow.status === "running") runningCases += 1;
|
|
3030
3027
|
else pendingCases += 1;
|
|
3031
|
-
if (caseRow.
|
|
3032
|
-
totalDurationMs += caseRow.
|
|
3028
|
+
if (caseRow.durationMs !== null) {
|
|
3029
|
+
totalDurationMs += caseRow.durationMs;
|
|
3033
3030
|
hasDuration = true;
|
|
3034
3031
|
}
|
|
3035
3032
|
}
|
|
@@ -3123,6 +3120,21 @@ function computeTokenCost(tokens, usdPerMillion) {
|
|
|
3123
3120
|
if (usdPerMillion === void 0) return null;
|
|
3124
3121
|
return tokens / 1e6 * usdPerMillion;
|
|
3125
3122
|
}
|
|
3123
|
+
function computeCacheCreationInputCost({ cacheCreationInputTokens, cacheCreationInput1hTokens, usdPerMillion, oneHourUsdPerMillion }) {
|
|
3124
|
+
if (cacheCreationInputTokens === null) return null;
|
|
3125
|
+
if (cacheCreationInputTokens === 0) return 0;
|
|
3126
|
+
if (cacheCreationInput1hTokens === null) return computeTokenCost(cacheCreationInputTokens, usdPerMillion);
|
|
3127
|
+
const oneHourTokens = Math.min(cacheCreationInput1hTokens, cacheCreationInputTokens);
|
|
3128
|
+
const shortLivedCost = computeTokenCost(cacheCreationInputTokens - oneHourTokens, usdPerMillion);
|
|
3129
|
+
const oneHourCost = computeTokenCost(oneHourTokens, oneHourUsdPerMillion);
|
|
3130
|
+
if (shortLivedCost === null || oneHourCost === null) return null;
|
|
3131
|
+
return shortLivedCost + oneHourCost;
|
|
3132
|
+
}
|
|
3133
|
+
function computeBaseInputTokens({ inputTokens, cachedInputTokens, cacheCreationInputTokens }) {
|
|
3134
|
+
if (inputTokens === null) return null;
|
|
3135
|
+
const cachedTokens = (cachedInputTokens ?? 0) + (cacheCreationInputTokens ?? 0);
|
|
3136
|
+
return Math.max(inputTokens - cachedTokens, 0);
|
|
3137
|
+
}
|
|
3126
3138
|
function pickPricingEntry({ pricing, model, provider }) {
|
|
3127
3139
|
if (model === null) return null;
|
|
3128
3140
|
let fallback = null;
|
|
@@ -3136,7 +3148,7 @@ function pickPricingEntry({ pricing, model, provider }) {
|
|
|
3136
3148
|
}
|
|
3137
3149
|
return fallback;
|
|
3138
3150
|
}
|
|
3139
|
-
function
|
|
3151
|
+
function computeTotalCost({ inputTokens, inputCostUsd, outputTokens, outputCostUsd, cachedInputTokens, cachedInputCostUsd, cacheCreationInputTokens, cacheCreationInputCostUsd, reasoningTokens, reasoningCostUsd }) {
|
|
3140
3152
|
const parts = [
|
|
3141
3153
|
{
|
|
3142
3154
|
tokens: inputTokens,
|
|
@@ -3173,7 +3185,7 @@ function computeFallbackTotalCost({ inputTokens, inputCostUsd, outputTokens, out
|
|
|
3173
3185
|
if (hasCost) return total;
|
|
3174
3186
|
return hasReportedTokens ? 0 : null;
|
|
3175
3187
|
}
|
|
3176
|
-
function computeLatencyMs$1(span) {
|
|
3188
|
+
function computeDurationMs$1(span) {
|
|
3177
3189
|
if (span.endedAt === null) return null;
|
|
3178
3190
|
const started = Date.parse(span.startedAt);
|
|
3179
3191
|
const ended = Date.parse(span.endedAt);
|
|
@@ -3181,10 +3193,16 @@ function computeLatencyMs$1(span) {
|
|
|
3181
3193
|
const delta = ended - started;
|
|
3182
3194
|
return delta >= 0 ? delta : null;
|
|
3183
3195
|
}
|
|
3184
|
-
function computeTotalTokens({
|
|
3185
|
-
if (
|
|
3186
|
-
|
|
3187
|
-
|
|
3196
|
+
function computeTotalTokens({ input, output }) {
|
|
3197
|
+
if (input === null && output === null) return null;
|
|
3198
|
+
return (input ?? 0) + (output ?? 0);
|
|
3199
|
+
}
|
|
3200
|
+
function computeTokensPerSecond({ outputTokens, durationMs, latencyMs }) {
|
|
3201
|
+
if (outputTokens === null || durationMs === null) return null;
|
|
3202
|
+
if (outputTokens === 0) return 0;
|
|
3203
|
+
const generationMs = latencyMs === null ? durationMs : durationMs - latencyMs;
|
|
3204
|
+
if (generationMs <= 0) return null;
|
|
3205
|
+
return outputTokens / (generationMs / 1e3);
|
|
3188
3206
|
}
|
|
3189
3207
|
function readSteps(attributes, path) {
|
|
3190
3208
|
const raw = getNestedAttribute(attributes, path);
|
|
@@ -3192,10 +3210,6 @@ function readSteps(attributes, path) {
|
|
|
3192
3210
|
stepCount: raw.length,
|
|
3193
3211
|
stepDetails: raw
|
|
3194
3212
|
};
|
|
3195
|
-
if (typeof raw === "number" && Number.isFinite(raw)) return {
|
|
3196
|
-
stepCount: raw,
|
|
3197
|
-
stepDetails: null
|
|
3198
|
-
};
|
|
3199
3213
|
return {
|
|
3200
3214
|
stepCount: null,
|
|
3201
3215
|
stepDetails: null
|
|
@@ -3217,16 +3231,22 @@ function pickError$1(span) {
|
|
|
3217
3231
|
* shape consumed by the LLM calls tab.
|
|
3218
3232
|
*
|
|
3219
3233
|
* Spans whose `kind` is not in `config.kinds` are dropped. Structured fields
|
|
3220
|
-
* (`model`, token counts,
|
|
3234
|
+
* (`model`, token counts, latency, etc.) are read via
|
|
3221
3235
|
* `getNestedAttribute` from the configured paths, with safe coercion to
|
|
3222
|
-
* `string | null` / `number | null`.
|
|
3223
|
-
*
|
|
3224
|
-
* `
|
|
3225
|
-
*
|
|
3226
|
-
*
|
|
3227
|
-
*
|
|
3228
|
-
*
|
|
3229
|
-
*
|
|
3236
|
+
* `string | null` / `number | null`. `latencyMs` is an explicit
|
|
3237
|
+
* time-to-first-token attribute; full span elapsed time is reported separately
|
|
3238
|
+
* as `durationMs`. Built-in USD costs are derived only from configured model
|
|
3239
|
+
* pricing and token counts. `totalTokens` is always derived from input +
|
|
3240
|
+
* output tokens. Cached input and cache creation tokens are reported
|
|
3241
|
+
* separately because they are subsets of input/output usage. The main cache
|
|
3242
|
+
* creation token field is treated as the total write count; optional one-hour
|
|
3243
|
+
* cache creation tokens only split that total for cost calculation. Base input
|
|
3244
|
+
* cost uses input minus cache read/write tokens so cached tokens are not
|
|
3245
|
+
* charged twice. Cache read/write costs still contribute to the total USD cost
|
|
3246
|
+
* at their configured rates. The `steps` attribute path may resolve to an array
|
|
3247
|
+
* of per-step detail objects, with `stepCount` derived from the array length.
|
|
3248
|
+
* `durationMs` and `tokensPerSecond` are `null` while the span is still
|
|
3249
|
+
* running. User-defined `metrics` whose path resolves to
|
|
3230
3250
|
* `undefined` are dropped, but `null`, `0`, and `false` are preserved as
|
|
3231
3251
|
* legitimate values worth displaying. Original span order is preserved so the
|
|
3232
3252
|
* LLM calls tab matches the ordering in the Trace tab.
|
|
@@ -3243,19 +3263,30 @@ function extractLlmCalls(spans, config) {
|
|
|
3243
3263
|
const outputTokens = readNumber$2(attrs, config.attributes.outputTokens);
|
|
3244
3264
|
const cachedInputTokens = readNumber$2(attrs, config.attributes.cachedInputTokens);
|
|
3245
3265
|
const cacheCreationInputTokens = readNumber$2(attrs, config.attributes.cacheCreationInputTokens);
|
|
3266
|
+
const cacheCreationInput1hTokens = readNumber$2(attrs, config.attributes.cacheCreationInput1hTokens);
|
|
3246
3267
|
const reasoningTokens = readNumber$2(attrs, config.attributes.reasoningTokens);
|
|
3247
|
-
const
|
|
3268
|
+
const latencyMs = readNumber$2(attrs, config.attributes.latencyMs);
|
|
3269
|
+
const durationMs = computeDurationMs$1(span);
|
|
3248
3270
|
const pricing = pickPricingEntry({
|
|
3249
3271
|
pricing: config.pricing,
|
|
3250
3272
|
model,
|
|
3251
3273
|
provider
|
|
3252
3274
|
});
|
|
3253
|
-
const inputCostUsd =
|
|
3254
|
-
|
|
3255
|
-
|
|
3256
|
-
|
|
3257
|
-
|
|
3258
|
-
const
|
|
3275
|
+
const inputCostUsd = computeTokenCost(computeBaseInputTokens({
|
|
3276
|
+
inputTokens,
|
|
3277
|
+
cachedInputTokens,
|
|
3278
|
+
cacheCreationInputTokens
|
|
3279
|
+
}), pricing?.inputUsdPerMillion);
|
|
3280
|
+
const outputCostUsd = computeTokenCost(outputTokens, pricing?.outputUsdPerMillion);
|
|
3281
|
+
const cachedInputCostUsd = computeTokenCost(cachedInputTokens, pricing?.cachedInputUsdPerMillion);
|
|
3282
|
+
const cacheCreationInputCostUsd = computeCacheCreationInputCost({
|
|
3283
|
+
cacheCreationInputTokens,
|
|
3284
|
+
cacheCreationInput1hTokens,
|
|
3285
|
+
usdPerMillion: pricing?.cacheCreationInputUsdPerMillion,
|
|
3286
|
+
oneHourUsdPerMillion: pricing?.cacheCreationInput1hUsdPerMillion
|
|
3287
|
+
});
|
|
3288
|
+
const reasoningCostUsd = computeTokenCost(reasoningTokens, pricing?.reasoningUsdPerMillion);
|
|
3289
|
+
const costUsd = computeTotalCost({
|
|
3259
3290
|
inputTokens,
|
|
3260
3291
|
inputCostUsd,
|
|
3261
3292
|
outputTokens,
|
|
@@ -3293,13 +3324,15 @@ function extractLlmCalls(spans, config) {
|
|
|
3293
3324
|
cacheCreationInputTokens,
|
|
3294
3325
|
reasoningTokens,
|
|
3295
3326
|
totalTokens: computeTotalTokens({
|
|
3296
|
-
declared: declaredTotalTokens,
|
|
3297
3327
|
input: inputTokens,
|
|
3298
|
-
output: outputTokens
|
|
3299
|
-
|
|
3300
|
-
|
|
3328
|
+
output: outputTokens
|
|
3329
|
+
}),
|
|
3330
|
+
latencyMs,
|
|
3331
|
+
tokensPerSecond: computeTokensPerSecond({
|
|
3332
|
+
outputTokens,
|
|
3333
|
+
durationMs,
|
|
3334
|
+
latencyMs
|
|
3301
3335
|
}),
|
|
3302
|
-
tokensPerSecond: readNumber$2(attrs, config.attributes.tokensPerSecond),
|
|
3303
3336
|
costUsd,
|
|
3304
3337
|
inputCostUsd,
|
|
3305
3338
|
outputCostUsd,
|
|
@@ -3308,7 +3341,7 @@ function extractLlmCalls(spans, config) {
|
|
|
3308
3341
|
reasoningCostUsd,
|
|
3309
3342
|
...readSteps(attrs, config.attributes.steps),
|
|
3310
3343
|
finishReason: readString$2(attrs, config.attributes.finishReason),
|
|
3311
|
-
|
|
3344
|
+
durationMs,
|
|
3312
3345
|
input: getNestedAttribute(attrs, config.attributes.input),
|
|
3313
3346
|
output: getNestedAttribute(attrs, config.attributes.output),
|
|
3314
3347
|
reasoning: getNestedAttribute(attrs, config.attributes.reasoning),
|
|
@@ -3333,7 +3366,7 @@ function readString$1(attributes, path) {
|
|
|
3333
3366
|
const raw = getNestedAttribute(attributes, path);
|
|
3334
3367
|
return typeof raw === "string" && raw.length > 0 ? raw : null;
|
|
3335
3368
|
}
|
|
3336
|
-
function computeLatencyMs(span) {
|
|
3369
|
+
function computeDurationMs(span) {
|
|
3337
3370
|
if (span.endedAt === null) return null;
|
|
3338
3371
|
const started = Date.parse(span.startedAt);
|
|
3339
3372
|
const ended = Date.parse(span.endedAt);
|
|
@@ -3358,10 +3391,10 @@ function pickError(span) {
|
|
|
3358
3391
|
*
|
|
3359
3392
|
* Spans whose `kind` is not in `config.kinds` are dropped. Structured fields
|
|
3360
3393
|
* (`method`, `url`, `statusCode`, etc.) are read via `getNestedAttribute` from
|
|
3361
|
-
* the configured paths. `durationMs` takes precedence
|
|
3362
|
-
* fallback to the span start/end timestamps. User-defined `metrics`
|
|
3363
|
-
* resolves to `undefined` are dropped, but `null`, `0`, and `false`
|
|
3364
|
-
* preserved as legitimate values worth displaying. Original span order is
|
|
3394
|
+
* the configured paths. An explicit `durationMs` attribute takes precedence,
|
|
3395
|
+
* with a fallback to the span start/end timestamps. User-defined `metrics`
|
|
3396
|
+
* whose path resolves to `undefined` are dropped, but `null`, `0`, and `false`
|
|
3397
|
+
* are preserved as legitimate values worth displaying. Original span order is
|
|
3365
3398
|
* preserved so the API calls tab matches the ordering in the Trace tab.
|
|
3366
3399
|
*/
|
|
3367
3400
|
function extractApiCalls(spans, config) {
|
|
@@ -3391,7 +3424,7 @@ function extractApiCalls(spans, config) {
|
|
|
3391
3424
|
method: readString$1(attrs, config.attributes.method),
|
|
3392
3425
|
url: readString$1(attrs, config.attributes.url),
|
|
3393
3426
|
statusCode: readNumber$1(attrs, config.attributes.statusCode),
|
|
3394
|
-
|
|
3427
|
+
durationMs: readNumber$1(attrs, config.attributes.durationMs) ?? computeDurationMs(span),
|
|
3395
3428
|
request: getNestedAttribute(attrs, config.attributes.request),
|
|
3396
3429
|
response: getNestedAttribute(attrs, config.attributes.response),
|
|
3397
3430
|
requestBody: getNestedAttribute(attrs, config.attributes.requestBody),
|
|
@@ -3789,7 +3822,7 @@ async function writeCacheFile(cacheDir, cacheFile) {
|
|
|
3789
3822
|
await mkdir(cacheDir, { recursive: true });
|
|
3790
3823
|
const filePath = ownerPath(cacheDir, cacheFile.owner);
|
|
3791
3824
|
const tmpPath = `${filePath}.${process.pid.toString()}.tmp`;
|
|
3792
|
-
await writeFile(tmpPath, JSON.stringify(cacheFile));
|
|
3825
|
+
await writeFile(tmpPath, JSON.stringify(cacheFile, null, 2));
|
|
3793
3826
|
await rename(tmpPath, filePath);
|
|
3794
3827
|
}
|
|
3795
3828
|
async function readDebugKeyFile(debugDir, owner) {
|
|
@@ -4035,7 +4068,6 @@ function getScoreOverride(def) {
|
|
|
4035
4068
|
format: def.format,
|
|
4036
4069
|
numberFormat: def.numberFormat,
|
|
4037
4070
|
hideInTable: def.hideInTable,
|
|
4038
|
-
sortable: def.sortable,
|
|
4039
4071
|
align: def.align,
|
|
4040
4072
|
maxStars: def.maxStars
|
|
4041
4073
|
};
|
|
@@ -4048,7 +4080,6 @@ function mergeOverrides(base, override) {
|
|
|
4048
4080
|
format: override.format ?? base.format,
|
|
4049
4081
|
numberFormat: override.numberFormat ?? base.numberFormat,
|
|
4050
4082
|
hideInTable: override.hideInTable ?? base.hideInTable,
|
|
4051
|
-
sortable: override.sortable ?? base.sortable,
|
|
4052
4083
|
align: override.align ?? base.align,
|
|
4053
4084
|
maxStars: override.maxStars ?? base.maxStars
|
|
4054
4085
|
};
|
|
@@ -4163,7 +4194,6 @@ function createColumnDef(params) {
|
|
|
4163
4194
|
if (override?.numberFormat !== void 0) def.numberFormat = override.numberFormat;
|
|
4164
4195
|
if (override?.maxStars !== void 0) def.maxStars = override.maxStars;
|
|
4165
4196
|
if (override?.hideInTable !== void 0) def.hideInTable = override.hideInTable;
|
|
4166
|
-
if (override?.sortable !== void 0) def.sortable = override.sortable;
|
|
4167
4197
|
if (override?.align !== void 0) def.align = override.align;
|
|
4168
4198
|
if (!isScore) return def;
|
|
4169
4199
|
def.isScore = true;
|
|
@@ -4232,85 +4262,76 @@ const DEFAULT_CONFIG_KEYS = [
|
|
|
4232
4262
|
"cachedInputTokens",
|
|
4233
4263
|
"cacheCreationInputTokens",
|
|
4234
4264
|
"reasoningTokens",
|
|
4235
|
-
"llmLatencyMs"
|
|
4265
|
+
"llmDurationMs"
|
|
4236
4266
|
];
|
|
4237
|
-
const tokenNumberFormat = {
|
|
4238
|
-
|
|
4239
|
-
|
|
4267
|
+
const tokenNumberFormat = { notation: "compact" };
|
|
4268
|
+
const countNumberFormat = {
|
|
4269
|
+
minDecimalPlaces: 0,
|
|
4270
|
+
maxDecimalPlaces: 0
|
|
4271
|
+
};
|
|
4272
|
+
const costNumberFormat = {
|
|
4273
|
+
prefix: "$",
|
|
4274
|
+
maxDecimalPlaces: 4
|
|
4240
4275
|
};
|
|
4241
|
-
const countNumberFormat = { decimalPlaces: 0 };
|
|
4242
4276
|
const DEFAULT_COLUMNS = {
|
|
4243
4277
|
apiCalls: {
|
|
4244
4278
|
label: "API Calls",
|
|
4245
4279
|
format: "number",
|
|
4246
4280
|
numberFormat: countNumberFormat,
|
|
4247
|
-
align: "right"
|
|
4248
|
-
sortable: true
|
|
4281
|
+
align: "right"
|
|
4249
4282
|
},
|
|
4250
4283
|
costUsd: {
|
|
4251
4284
|
label: "Cost",
|
|
4252
4285
|
format: "number",
|
|
4253
|
-
numberFormat:
|
|
4254
|
-
|
|
4255
|
-
decimalPlaces: 4
|
|
4256
|
-
},
|
|
4257
|
-
align: "right",
|
|
4258
|
-
sortable: true
|
|
4286
|
+
numberFormat: costNumberFormat,
|
|
4287
|
+
align: "right"
|
|
4259
4288
|
},
|
|
4260
4289
|
llmTurns: {
|
|
4261
4290
|
label: "LLM Turns",
|
|
4262
4291
|
format: "number",
|
|
4263
4292
|
numberFormat: countNumberFormat,
|
|
4264
|
-
align: "right"
|
|
4265
|
-
sortable: true
|
|
4293
|
+
align: "right"
|
|
4266
4294
|
},
|
|
4267
4295
|
inputTokens: {
|
|
4268
4296
|
label: "Input Tokens",
|
|
4269
4297
|
format: "number",
|
|
4270
4298
|
numberFormat: tokenNumberFormat,
|
|
4271
|
-
align: "right"
|
|
4272
|
-
sortable: true
|
|
4299
|
+
align: "right"
|
|
4273
4300
|
},
|
|
4274
4301
|
outputTokens: {
|
|
4275
4302
|
label: "Output Tokens",
|
|
4276
4303
|
format: "number",
|
|
4277
4304
|
numberFormat: tokenNumberFormat,
|
|
4278
|
-
align: "right"
|
|
4279
|
-
sortable: true
|
|
4305
|
+
align: "right"
|
|
4280
4306
|
},
|
|
4281
4307
|
totalTokens: {
|
|
4282
4308
|
label: "Total Tokens",
|
|
4283
4309
|
format: "number",
|
|
4284
4310
|
numberFormat: tokenNumberFormat,
|
|
4285
|
-
align: "right"
|
|
4286
|
-
sortable: true
|
|
4311
|
+
align: "right"
|
|
4287
4312
|
},
|
|
4288
4313
|
cachedInputTokens: {
|
|
4289
4314
|
label: "Cached Input Tokens",
|
|
4290
4315
|
format: "number",
|
|
4291
4316
|
numberFormat: tokenNumberFormat,
|
|
4292
|
-
align: "right"
|
|
4293
|
-
sortable: true
|
|
4317
|
+
align: "right"
|
|
4294
4318
|
},
|
|
4295
4319
|
cacheCreationInputTokens: {
|
|
4296
4320
|
label: "Cache Write Tokens",
|
|
4297
4321
|
format: "number",
|
|
4298
4322
|
numberFormat: tokenNumberFormat,
|
|
4299
|
-
align: "right"
|
|
4300
|
-
sortable: true
|
|
4323
|
+
align: "right"
|
|
4301
4324
|
},
|
|
4302
4325
|
reasoningTokens: {
|
|
4303
4326
|
label: "Reasoning Tokens",
|
|
4304
4327
|
format: "number",
|
|
4305
4328
|
numberFormat: tokenNumberFormat,
|
|
4306
|
-
align: "right"
|
|
4307
|
-
sortable: true
|
|
4329
|
+
align: "right"
|
|
4308
4330
|
},
|
|
4309
|
-
|
|
4310
|
-
label: "LLM
|
|
4331
|
+
llmDurationMs: {
|
|
4332
|
+
label: "LLM Duration",
|
|
4311
4333
|
format: "duration",
|
|
4312
|
-
align: "right"
|
|
4313
|
-
sortable: true
|
|
4334
|
+
align: "right"
|
|
4314
4335
|
}
|
|
4315
4336
|
};
|
|
4316
4337
|
function resolveRemovedKeys(globalRemove, evalRemove) {
|
|
@@ -4336,31 +4357,29 @@ function appendDefaultStats(params) {
|
|
|
4336
4357
|
kind: "column",
|
|
4337
4358
|
key: "apiCalls",
|
|
4338
4359
|
label: "API Calls",
|
|
4339
|
-
aggregate: "avg"
|
|
4360
|
+
aggregate: "avg",
|
|
4361
|
+
numberFormat: countNumberFormat
|
|
4340
4362
|
});
|
|
4341
4363
|
if (activeKeys.has("costUsd")) defaults.push({
|
|
4342
4364
|
kind: "column",
|
|
4343
4365
|
key: "costUsd",
|
|
4344
4366
|
label: "LLM Cost",
|
|
4345
|
-
aggregate: "
|
|
4367
|
+
aggregate: "avg",
|
|
4368
|
+
numberFormat: costNumberFormat
|
|
4346
4369
|
});
|
|
4347
4370
|
if (activeKeys.has("totalTokens")) defaults.push({
|
|
4348
4371
|
kind: "column",
|
|
4349
4372
|
key: "totalTokens",
|
|
4350
4373
|
label: "Tokens",
|
|
4351
|
-
aggregate: "
|
|
4374
|
+
aggregate: "avg",
|
|
4375
|
+
numberFormat: tokenNumberFormat
|
|
4352
4376
|
});
|
|
4353
4377
|
if (activeKeys.has("llmTurns")) defaults.push({
|
|
4354
4378
|
kind: "column",
|
|
4355
4379
|
key: "llmTurns",
|
|
4356
4380
|
label: "LLM Turns",
|
|
4357
|
-
aggregate: "avg"
|
|
4358
|
-
|
|
4359
|
-
if (activeKeys.has("llmLatencyMs")) defaults.push({
|
|
4360
|
-
kind: "column",
|
|
4361
|
-
key: "llmLatencyMs",
|
|
4362
|
-
label: "LLM Latency",
|
|
4363
|
-
aggregate: "avg"
|
|
4381
|
+
aggregate: "avg",
|
|
4382
|
+
numberFormat: countNumberFormat
|
|
4364
4383
|
});
|
|
4365
4384
|
const merged = [...params.stats ?? [], ...defaults];
|
|
4366
4385
|
return merged.length > 0 ? merged : void 0;
|
|
@@ -4368,24 +4387,13 @@ function appendDefaultStats(params) {
|
|
|
4368
4387
|
function appendDefaultCharts(params) {
|
|
4369
4388
|
const activeKeys = new Set(getActiveDefaultConfigKeys(params));
|
|
4370
4389
|
const defaults = [];
|
|
4371
|
-
if (activeKeys.has("apiCalls")) defaults.push({
|
|
4372
|
-
heading: "API Calls",
|
|
4373
|
-
type: "bar",
|
|
4374
|
-
metrics: [{
|
|
4375
|
-
source: "column",
|
|
4376
|
-
key: "apiCalls",
|
|
4377
|
-
aggregate: "sum",
|
|
4378
|
-
label: "API Calls",
|
|
4379
|
-
color: "accentDim"
|
|
4380
|
-
}]
|
|
4381
|
-
});
|
|
4382
4390
|
if (activeKeys.has("costUsd")) defaults.push({
|
|
4383
4391
|
heading: "LLM Cost",
|
|
4384
4392
|
type: "area",
|
|
4385
4393
|
metrics: [{
|
|
4386
4394
|
source: "column",
|
|
4387
4395
|
key: "costUsd",
|
|
4388
|
-
aggregate: "
|
|
4396
|
+
aggregate: "avg",
|
|
4389
4397
|
label: "Cost",
|
|
4390
4398
|
color: "warning"
|
|
4391
4399
|
}]
|
|
@@ -4394,23 +4402,30 @@ function appendDefaultCharts(params) {
|
|
|
4394
4402
|
activeKeys.has("inputTokens") ? {
|
|
4395
4403
|
source: "column",
|
|
4396
4404
|
key: "inputTokens",
|
|
4397
|
-
aggregate: "
|
|
4405
|
+
aggregate: "avg",
|
|
4398
4406
|
label: "Input",
|
|
4399
4407
|
color: "accent"
|
|
4400
4408
|
} : null,
|
|
4401
4409
|
activeKeys.has("outputTokens") ? {
|
|
4402
4410
|
source: "column",
|
|
4403
4411
|
key: "outputTokens",
|
|
4404
|
-
aggregate: "
|
|
4412
|
+
aggregate: "avg",
|
|
4405
4413
|
label: "Output",
|
|
4406
4414
|
color: "success"
|
|
4407
4415
|
} : null,
|
|
4408
|
-
activeKeys.has("
|
|
4416
|
+
activeKeys.has("cachedInputTokens") ? {
|
|
4409
4417
|
source: "column",
|
|
4410
|
-
key: "
|
|
4411
|
-
aggregate: "
|
|
4412
|
-
label: "
|
|
4418
|
+
key: "cachedInputTokens",
|
|
4419
|
+
aggregate: "avg",
|
|
4420
|
+
label: "Cached Input",
|
|
4413
4421
|
color: "error"
|
|
4422
|
+
} : null,
|
|
4423
|
+
activeKeys.has("cacheCreationInputTokens") ? {
|
|
4424
|
+
source: "column",
|
|
4425
|
+
key: "cacheCreationInputTokens",
|
|
4426
|
+
aggregate: "avg",
|
|
4427
|
+
label: "Cache Write",
|
|
4428
|
+
color: "warning"
|
|
4414
4429
|
} : null
|
|
4415
4430
|
].filter((metric) => metric !== null);
|
|
4416
4431
|
if (tokenMetrics.length > 0) defaults.push({
|
|
@@ -4420,7 +4435,7 @@ function appendDefaultCharts(params) {
|
|
|
4420
4435
|
tooltipExtras: activeKeys.has("totalTokens") ? [{
|
|
4421
4436
|
source: "column",
|
|
4422
4437
|
key: "totalTokens",
|
|
4423
|
-
aggregate: "
|
|
4438
|
+
aggregate: "avg",
|
|
4424
4439
|
label: "Total"
|
|
4425
4440
|
}] : void 0
|
|
4426
4441
|
});
|
|
@@ -4525,8 +4540,8 @@ function addDefaultOutputs(params) {
|
|
|
4525
4540
|
});
|
|
4526
4541
|
assignIfMissing({
|
|
4527
4542
|
outputs: params.outputs,
|
|
4528
|
-
key: "
|
|
4529
|
-
value: sumNullable(calls.map((call) => call.
|
|
4543
|
+
key: "llmDurationMs",
|
|
4544
|
+
value: sumNullable(calls.map((call) => call.durationMs)),
|
|
4530
4545
|
activeKeys
|
|
4531
4546
|
});
|
|
4532
4547
|
}
|
|
@@ -5372,7 +5387,7 @@ async function runCase(params) {
|
|
|
5372
5387
|
caseDetail,
|
|
5373
5388
|
caseRowUpdate: {
|
|
5374
5389
|
status,
|
|
5375
|
-
|
|
5390
|
+
durationMs: Date.now() - startTime,
|
|
5376
5391
|
columns
|
|
5377
5392
|
}
|
|
5378
5393
|
};
|
|
@@ -5663,7 +5678,7 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
5663
5678
|
caseId: evalCase.id,
|
|
5664
5679
|
evalId: evalMeta.id,
|
|
5665
5680
|
status: caseRowUpdate.status ?? "pending",
|
|
5666
|
-
|
|
5681
|
+
durationMs: caseRowUpdate.durationMs ?? null,
|
|
5667
5682
|
columns: caseRowUpdate.columns ?? {},
|
|
5668
5683
|
trial
|
|
5669
5684
|
}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-weogme5U.mjs";
|
|
2
|
-
import "./src-B879LZfo.mjs";
|
|
1
|
+
import { n as createRunner } from "./cli-C0EtHhEO.mjs";
|
|
2
|
+
import "./src-D-HuV8I-.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-DzrMtgBu.mjs";
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-C9nP2VKL.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|