@ls-stack/agent-eval 0.21.0 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-CmwmcUgG.mjs → app-moDHbg1O.mjs} +3 -3
- package/dist/apps/web/dist/assets/index-AUDD3rNB.js +118 -0
- package/dist/apps/web/dist/index.html +1 -1
- package/dist/bin.mjs +1 -1
- package/dist/{cli-DumvanQI.mjs → cli-C0EtHhEO.mjs} +3 -3
- package/dist/index.d.mts +46 -53
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +1 -1
- package/dist/{runOrchestration-zYAcAPtS.mjs → runOrchestration-D1edUDhp.mjs} +109 -78
- package/dist/{runner-Dy_PECaf.mjs → runner-C9nP2VKL.mjs} +2 -2
- package/dist/{runner-BcwyX9CO.mjs → runner-CyRhIzci.mjs} +1 -1
- package/dist/src-D-HuV8I-.mjs +3 -0
- package/package.json +1 -1
- package/skills/agent-eval/SKILL.md +23 -17
- package/dist/apps/web/dist/assets/index-EXO08yya.js +0 -118
- package/dist/src-BoAJb4wC.mjs +0 -3
|
@@ -2468,7 +2468,8 @@ const caseRowSchema = z.object({
|
|
|
2468
2468
|
"error",
|
|
2469
2469
|
"cancelled"
|
|
2470
2470
|
]),
|
|
2471
|
-
|
|
2471
|
+
/** Elapsed case execution duration in milliseconds, or null before completion. */
|
|
2472
|
+
durationMs: z.number().nullable(),
|
|
2472
2473
|
costUsd: z.number().nullable().optional(),
|
|
2473
2474
|
columns: z.record(z.string(), cellValueSchema),
|
|
2474
2475
|
/** Winning trial index for the persisted case result. */
|
|
@@ -2586,7 +2587,7 @@ const defaultConfigKeySchema = z.enum([
|
|
|
2586
2587
|
"cachedInputTokens",
|
|
2587
2588
|
"cacheCreationInputTokens",
|
|
2588
2589
|
"reasoningTokens",
|
|
2589
|
-
"
|
|
2590
|
+
"llmDurationMs"
|
|
2590
2591
|
]);
|
|
2591
2592
|
/** Removal config for built-in eval-level outputs and UI metadata. */
|
|
2592
2593
|
const removeDefaultConfigSchema = z.union([z.literal(true), z.array(defaultConfigKeySchema)]);
|
|
@@ -2663,7 +2664,7 @@ const apiCallMetricSchema = z.object({
|
|
|
2663
2664
|
});
|
|
2664
2665
|
/**
|
|
2665
2666
|
* Schema for one model/provider pricing entry used to derive LLM-call costs
|
|
2666
|
-
* from token counts
|
|
2667
|
+
* from token counts.
|
|
2667
2668
|
*/
|
|
2668
2669
|
const llmCallPricingSchema = z.object({
|
|
2669
2670
|
/** Exact model name read from the configured `attributes.model` path. */
|
|
@@ -2682,6 +2683,8 @@ const llmCallPricingSchema = z.object({
|
|
|
2682
2683
|
cachedInputUsdPerMillion: z.number().nonnegative().optional(),
|
|
2683
2684
|
/** USD per one million prompt-cache write tokens. */
|
|
2684
2685
|
cacheCreationInputUsdPerMillion: z.number().nonnegative().optional(),
|
|
2686
|
+
/** USD per one million one-hour prompt-cache write tokens. */
|
|
2687
|
+
cacheCreationInput1hUsdPerMillion: z.number().nonnegative().optional(),
|
|
2685
2688
|
/** USD per one million reasoning tokens when reported separately. */
|
|
2686
2689
|
reasoningUsdPerMillion: z.number().nonnegative().optional()
|
|
2687
2690
|
});
|
|
@@ -2692,12 +2695,9 @@ const llmCallsConfigSchema = z.object({
|
|
|
2692
2695
|
/**
|
|
2693
2696
|
* Attribute paths used to extract structured per-call fields. Each entry is
|
|
2694
2697
|
* a dot-path inside `span.attributes`. Missing paths fall back to the
|
|
2695
|
-
* built-in defaults (e.g. `usage.inputTokens
|
|
2696
|
-
*
|
|
2697
|
-
*
|
|
2698
|
-
* `reasoningCost`) feed the cost breakdown table in the expanded row when
|
|
2699
|
-
* spans provide explicit USD cost overrides. Prefer `pricing` for deriving
|
|
2700
|
-
* costs from token counts globally.
|
|
2698
|
+
* built-in defaults (e.g. `usage.inputTokens`). Derived fields such as
|
|
2699
|
+
* total tokens, tokens/sec, duration, and USD costs are intentionally not
|
|
2700
|
+
* configurable as attribute paths.
|
|
2701
2701
|
*/
|
|
2702
2702
|
attributes: z.object({
|
|
2703
2703
|
model: z.string().optional(),
|
|
@@ -2706,15 +2706,9 @@ const llmCallsConfigSchema = z.object({
|
|
|
2706
2706
|
outputTokens: z.string().optional(),
|
|
2707
2707
|
cachedInputTokens: z.string().optional(),
|
|
2708
2708
|
cacheCreationInputTokens: z.string().optional(),
|
|
2709
|
+
cacheCreationInput1hTokens: z.string().optional(),
|
|
2709
2710
|
reasoningTokens: z.string().optional(),
|
|
2710
|
-
|
|
2711
|
-
tokensPerSecond: z.string().optional(),
|
|
2712
|
-
cost: z.string().optional(),
|
|
2713
|
-
inputCost: z.string().optional(),
|
|
2714
|
-
outputCost: z.string().optional(),
|
|
2715
|
-
cachedInputCost: z.string().optional(),
|
|
2716
|
-
cacheCreationInputCost: z.string().optional(),
|
|
2717
|
-
reasoningCost: z.string().optional(),
|
|
2711
|
+
latencyMs: z.string().optional(),
|
|
2718
2712
|
steps: z.string().optional(),
|
|
2719
2713
|
finishReason: z.string().optional(),
|
|
2720
2714
|
input: z.string().optional(),
|
|
@@ -2723,9 +2717,8 @@ const llmCallsConfigSchema = z.object({
|
|
|
2723
2717
|
toolCalls: z.string().optional()
|
|
2724
2718
|
}).optional(),
|
|
2725
2719
|
/**
|
|
2726
|
-
* Model/provider pricing registry used to calculate
|
|
2727
|
-
*
|
|
2728
|
-
* etc.) take precedence over derived prices.
|
|
2720
|
+
* Model/provider pricing registry used to calculate LLM-call costs from
|
|
2721
|
+
* token counts. Built-in LLM cost fields are only derived from this registry.
|
|
2729
2722
|
*/
|
|
2730
2723
|
pricing: z.array(llmCallPricingSchema).optional(),
|
|
2731
2724
|
/** Custom user-defined metrics surfaced on each LLM call. */
|
|
@@ -2773,15 +2766,9 @@ const DEFAULT_LLM_CALLS_CONFIG = {
|
|
|
2773
2766
|
outputTokens: "usage.outputTokens",
|
|
2774
2767
|
cachedInputTokens: "usage.cachedInputTokens",
|
|
2775
2768
|
cacheCreationInputTokens: "usage.cacheCreationInputTokens",
|
|
2769
|
+
cacheCreationInput1hTokens: "usage.cacheCreationInput1hTokens",
|
|
2776
2770
|
reasoningTokens: "usage.reasoningTokens",
|
|
2777
|
-
|
|
2778
|
-
tokensPerSecond: "tokensPerSecond",
|
|
2779
|
-
cost: "costUsd",
|
|
2780
|
-
inputCost: "cost.inputUsd",
|
|
2781
|
-
outputCost: "cost.outputUsd",
|
|
2782
|
-
cachedInputCost: "cost.cachedInputUsd",
|
|
2783
|
-
cacheCreationInputCost: "cost.cacheCreationInputUsd",
|
|
2784
|
-
reasoningCost: "cost.reasoningUsd",
|
|
2771
|
+
latencyMs: "latencyMs",
|
|
2785
2772
|
steps: "steps",
|
|
2786
2773
|
finishReason: "finishReason",
|
|
2787
2774
|
input: "input",
|
|
@@ -2823,8 +2810,8 @@ const DEFAULT_API_CALLS_CONFIG = {
|
|
|
2823
2810
|
* attribute path.
|
|
2824
2811
|
* - Missing `metrics[].format` defaults to `'string'`.
|
|
2825
2812
|
* - Missing `metrics[].placements` defaults to `['body']`.
|
|
2826
|
-
* - Missing `pricing` defaults to an empty registry;
|
|
2827
|
-
*
|
|
2813
|
+
* - Missing `pricing` defaults to an empty registry; built-in costs are only
|
|
2814
|
+
* derived from configured pricing and token counts.
|
|
2828
2815
|
*/
|
|
2829
2816
|
function resolveLlmCallsConfig(input) {
|
|
2830
2817
|
return {
|
|
@@ -2848,6 +2835,7 @@ function resolveLlmCallsConfig(input) {
|
|
|
2848
2835
|
outputUsdPerMillion: p.outputUsdPerMillion,
|
|
2849
2836
|
cachedInputUsdPerMillion: p.cachedInputUsdPerMillion,
|
|
2850
2837
|
cacheCreationInputUsdPerMillion: p.cacheCreationInputUsdPerMillion,
|
|
2838
|
+
cacheCreationInput1hUsdPerMillion: p.cacheCreationInput1hUsdPerMillion,
|
|
2851
2839
|
reasoningUsdPerMillion: p.reasoningUsdPerMillion
|
|
2852
2840
|
}))
|
|
2853
2841
|
};
|
|
@@ -3037,8 +3025,8 @@ function deriveScopedSummaryFromCases(params) {
|
|
|
3037
3025
|
else if (caseRow.status === "cancelled") cancelledCases += 1;
|
|
3038
3026
|
else if (caseRow.status === "running") runningCases += 1;
|
|
3039
3027
|
else pendingCases += 1;
|
|
3040
|
-
if (caseRow.
|
|
3041
|
-
totalDurationMs += caseRow.
|
|
3028
|
+
if (caseRow.durationMs !== null) {
|
|
3029
|
+
totalDurationMs += caseRow.durationMs;
|
|
3042
3030
|
hasDuration = true;
|
|
3043
3031
|
}
|
|
3044
3032
|
}
|
|
@@ -3132,6 +3120,21 @@ function computeTokenCost(tokens, usdPerMillion) {
|
|
|
3132
3120
|
if (usdPerMillion === void 0) return null;
|
|
3133
3121
|
return tokens / 1e6 * usdPerMillion;
|
|
3134
3122
|
}
|
|
3123
|
+
function computeCacheCreationInputCost({ cacheCreationInputTokens, cacheCreationInput1hTokens, usdPerMillion, oneHourUsdPerMillion }) {
|
|
3124
|
+
if (cacheCreationInputTokens === null) return null;
|
|
3125
|
+
if (cacheCreationInputTokens === 0) return 0;
|
|
3126
|
+
if (cacheCreationInput1hTokens === null) return computeTokenCost(cacheCreationInputTokens, usdPerMillion);
|
|
3127
|
+
const oneHourTokens = Math.min(cacheCreationInput1hTokens, cacheCreationInputTokens);
|
|
3128
|
+
const shortLivedCost = computeTokenCost(cacheCreationInputTokens - oneHourTokens, usdPerMillion);
|
|
3129
|
+
const oneHourCost = computeTokenCost(oneHourTokens, oneHourUsdPerMillion);
|
|
3130
|
+
if (shortLivedCost === null || oneHourCost === null) return null;
|
|
3131
|
+
return shortLivedCost + oneHourCost;
|
|
3132
|
+
}
|
|
3133
|
+
function computeBaseInputTokens({ inputTokens, cachedInputTokens, cacheCreationInputTokens }) {
|
|
3134
|
+
if (inputTokens === null) return null;
|
|
3135
|
+
const cachedTokens = (cachedInputTokens ?? 0) + (cacheCreationInputTokens ?? 0);
|
|
3136
|
+
return Math.max(inputTokens - cachedTokens, 0);
|
|
3137
|
+
}
|
|
3135
3138
|
function pickPricingEntry({ pricing, model, provider }) {
|
|
3136
3139
|
if (model === null) return null;
|
|
3137
3140
|
let fallback = null;
|
|
@@ -3145,7 +3148,7 @@ function pickPricingEntry({ pricing, model, provider }) {
|
|
|
3145
3148
|
}
|
|
3146
3149
|
return fallback;
|
|
3147
3150
|
}
|
|
3148
|
-
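function computeFallbackTotalCost({ inputTokens, inputCostUsd, outputTokens, outputCostUsd, cachedInputTokens, cachedInputCostUsd, cacheCreationInputTokens, cacheCreationInputCostUsd, reasoningTokens, reasoningCostUsd }) {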
|
|
3151
|
+
function computeTotalCost({ inputTokens, inputCostUsd, outputTokens, outputCostUsd, cachedInputTokens, cachedInputCostUsd, cacheCreationInputTokens, cacheCreationInputCostUsd, reasoningTokens, reasoningCostUsd }) {
|
|
3149
3152
|
const parts = [
|
|
3150
3153
|
{
|
|
3151
3154
|
tokens: inputTokens,
|
|
@@ -3182,7 +3185,7 @@ function computeFallbackTotalCost({ inputTokens, inputCostUsd, outputTokens, out
|
|
|
3182
3185
|
if (hasCost) return total;
|
|
3183
3186
|
return hasReportedTokens ? 0 : null;
|
|
3184
3187
|
}
|
|
3185
|
-
function computeLatencyMs$1(span) {
|
|
3188
|
+
function computeDurationMs$1(span) {
|
|
3186
3189
|
if (span.endedAt === null) return null;
|
|
3187
3190
|
const started = Date.parse(span.startedAt);
|
|
3188
3191
|
const ended = Date.parse(span.endedAt);
|
|
@@ -3190,10 +3193,16 @@ function computeLatencyMs$1(span) {
|
|
|
3190
3193
|
const delta = ended - started;
|
|
3191
3194
|
return delta >= 0 ? delta : null;
|
|
3192
3195
|
}
|
|
3193
|
-
function computeTotalTokens({
|
|
3194
|
-
if (
|
|
3195
|
-
|
|
3196
|
-
|
|
3196
|
+
function computeTotalTokens({ input, output }) {
|
|
3197
|
+
if (input === null && output === null) return null;
|
|
3198
|
+
return (input ?? 0) + (output ?? 0);
|
|
3199
|
+
}
|
|
3200
|
+
function computeTokensPerSecond({ outputTokens, durationMs, latencyMs }) {
|
|
3201
|
+
if (outputTokens === null || durationMs === null) return null;
|
|
3202
|
+
if (outputTokens === 0) return 0;
|
|
3203
|
+
const generationMs = latencyMs === null ? durationMs : durationMs - latencyMs;
|
|
3204
|
+
if (generationMs <= 0) return null;
|
|
3205
|
+
return outputTokens / (generationMs / 1e3);
|
|
3197
3206
|
}
|
|
3198
3207
|
function readSteps(attributes, path) {
|
|
3199
3208
|
const raw = getNestedAttribute(attributes, path);
|
|
@@ -3201,10 +3210,6 @@ function readSteps(attributes, path) {
|
|
|
3201
3210
|
stepCount: raw.length,
|
|
3202
3211
|
stepDetails: raw
|
|
3203
3212
|
};
|
|
3204
|
-
if (typeof raw === "number" && Number.isFinite(raw)) return {
|
|
3205
|
-
stepCount: raw,
|
|
3206
|
-
stepDetails: null
|
|
3207
|
-
};
|
|
3208
3213
|
return {
|
|
3209
3214
|
stepCount: null,
|
|
3210
3215
|
stepDetails: null
|
|
@@ -3226,16 +3231,22 @@ function pickError$1(span) {
|
|
|
3226
3231
|
* shape consumed by the LLM calls tab.
|
|
3227
3232
|
*
|
|
3228
3233
|
* Spans whose `kind` is not in `config.kinds` are dropped. Structured fields
|
|
3229
|
-
* (`model`, token counts,
|
|
3234
|
+
* (`model`, token counts, latency, etc.) are read via
|
|
3230
3235
|
* `getNestedAttribute` from the configured paths, with safe coercion to
|
|
3231
|
-
* `string | null` / `number | null`.
|
|
3232
|
-
*
|
|
3233
|
-
* `
|
|
3234
|
-
*
|
|
3235
|
-
*
|
|
3236
|
-
*
|
|
3237
|
-
*
|
|
3238
|
-
*
|
|
3236
|
+
* `string | null` / `number | null`. `latencyMs` is an explicit
|
|
3237
|
+
* time-to-first-token attribute; full span elapsed time is reported separately
|
|
3238
|
+
* as `durationMs`. Built-in USD costs are derived only from configured model
|
|
3239
|
+
* pricing and token counts. `totalTokens` is always derived from input +
|
|
3240
|
+
* output tokens. Cached input and cache creation tokens are reported
|
|
3241
|
+
* separately because they are subsets of input/output usage. The main cache
|
|
3242
|
+
* creation token field is treated as the total write count; optional one-hour
|
|
3243
|
+
* cache creation tokens only split that total for cost calculation. Base input
|
|
3244
|
+
* cost uses input minus cache read/write tokens so cached tokens are not
|
|
3245
|
+
* charged twice. Cache read/write costs still contribute to the total USD cost
|
|
3246
|
+
* at their configured rates. The `steps` attribute path may resolve to an array
|
|
3247
|
+
* of per-step detail objects, with `stepCount` derived from the array length.
|
|
3248
|
+
* `durationMs` and `tokensPerSecond` are `null` while the span is still
|
|
3249
|
+
* running. User-defined `metrics` whose path resolves to
|
|
3239
3250
|
* `undefined` are dropped, but `null`, `0`, and `false` are preserved as
|
|
3240
3251
|
* legitimate values worth displaying. Original span order is preserved so the
|
|
3241
3252
|
* LLM calls tab matches the ordering in the Trace tab.
|
|
@@ -3252,19 +3263,30 @@ function extractLlmCalls(spans, config) {
|
|
|
3252
3263
|
const outputTokens = readNumber$2(attrs, config.attributes.outputTokens);
|
|
3253
3264
|
const cachedInputTokens = readNumber$2(attrs, config.attributes.cachedInputTokens);
|
|
3254
3265
|
const cacheCreationInputTokens = readNumber$2(attrs, config.attributes.cacheCreationInputTokens);
|
|
3266
|
+
const cacheCreationInput1hTokens = readNumber$2(attrs, config.attributes.cacheCreationInput1hTokens);
|
|
3255
3267
|
const reasoningTokens = readNumber$2(attrs, config.attributes.reasoningTokens);
|
|
3256
|
-
const
|
|
3268
|
+
const latencyMs = readNumber$2(attrs, config.attributes.latencyMs);
|
|
3269
|
+
const durationMs = computeDurationMs$1(span);
|
|
3257
3270
|
const pricing = pickPricingEntry({
|
|
3258
3271
|
pricing: config.pricing,
|
|
3259
3272
|
model,
|
|
3260
3273
|
provider
|
|
3261
3274
|
});
|
|
3262
|
-
const inputCostUsd =
|
|
3263
|
-
|
|
3264
|
-
|
|
3265
|
-
|
|
3266
|
-
|
|
3267
|
-
const
|
|
3275
|
+
const inputCostUsd = computeTokenCost(computeBaseInputTokens({
|
|
3276
|
+
inputTokens,
|
|
3277
|
+
cachedInputTokens,
|
|
3278
|
+
cacheCreationInputTokens
|
|
3279
|
+
}), pricing?.inputUsdPerMillion);
|
|
3280
|
+
const outputCostUsd = computeTokenCost(outputTokens, pricing?.outputUsdPerMillion);
|
|
3281
|
+
const cachedInputCostUsd = computeTokenCost(cachedInputTokens, pricing?.cachedInputUsdPerMillion);
|
|
3282
|
+
const cacheCreationInputCostUsd = computeCacheCreationInputCost({
|
|
3283
|
+
cacheCreationInputTokens,
|
|
3284
|
+
cacheCreationInput1hTokens,
|
|
3285
|
+
usdPerMillion: pricing?.cacheCreationInputUsdPerMillion,
|
|
3286
|
+
oneHourUsdPerMillion: pricing?.cacheCreationInput1hUsdPerMillion
|
|
3287
|
+
});
|
|
3288
|
+
const reasoningCostUsd = computeTokenCost(reasoningTokens, pricing?.reasoningUsdPerMillion);
|
|
3289
|
+
const costUsd = computeTotalCost({
|
|
3268
3290
|
inputTokens,
|
|
3269
3291
|
inputCostUsd,
|
|
3270
3292
|
outputTokens,
|
|
@@ -3302,13 +3324,15 @@ function extractLlmCalls(spans, config) {
|
|
|
3302
3324
|
cacheCreationInputTokens,
|
|
3303
3325
|
reasoningTokens,
|
|
3304
3326
|
totalTokens: computeTotalTokens({
|
|
3305
|
-
declared: declaredTotalTokens,
|
|
3306
3327
|
input: inputTokens,
|
|
3307
|
-
output: outputTokens
|
|
3308
|
-
|
|
3309
|
-
|
|
3328
|
+
output: outputTokens
|
|
3329
|
+
}),
|
|
3330
|
+
latencyMs,
|
|
3331
|
+
tokensPerSecond: computeTokensPerSecond({
|
|
3332
|
+
outputTokens,
|
|
3333
|
+
durationMs,
|
|
3334
|
+
latencyMs
|
|
3310
3335
|
}),
|
|
3311
|
-
tokensPerSecond: readNumber$2(attrs, config.attributes.tokensPerSecond),
|
|
3312
3336
|
costUsd,
|
|
3313
3337
|
inputCostUsd,
|
|
3314
3338
|
outputCostUsd,
|
|
@@ -3317,7 +3341,7 @@ function extractLlmCalls(spans, config) {
|
|
|
3317
3341
|
reasoningCostUsd,
|
|
3318
3342
|
...readSteps(attrs, config.attributes.steps),
|
|
3319
3343
|
finishReason: readString$2(attrs, config.attributes.finishReason),
|
|
3320
|
-
|
|
3344
|
+
durationMs,
|
|
3321
3345
|
input: getNestedAttribute(attrs, config.attributes.input),
|
|
3322
3346
|
output: getNestedAttribute(attrs, config.attributes.output),
|
|
3323
3347
|
reasoning: getNestedAttribute(attrs, config.attributes.reasoning),
|
|
@@ -3342,7 +3366,7 @@ function readString$1(attributes, path) {
|
|
|
3342
3366
|
const raw = getNestedAttribute(attributes, path);
|
|
3343
3367
|
return typeof raw === "string" && raw.length > 0 ? raw : null;
|
|
3344
3368
|
}
|
|
3345
|
-
function computeLatencyMs(span) {
|
|
3369
|
+
function computeDurationMs(span) {
|
|
3346
3370
|
if (span.endedAt === null) return null;
|
|
3347
3371
|
const started = Date.parse(span.startedAt);
|
|
3348
3372
|
const ended = Date.parse(span.endedAt);
|
|
@@ -3367,10 +3391,10 @@ function pickError(span) {
|
|
|
3367
3391
|
*
|
|
3368
3392
|
* Spans whose `kind` is not in `config.kinds` are dropped. Structured fields
|
|
3369
3393
|
* (`method`, `url`, `statusCode`, etc.) are read via `getNestedAttribute` from
|
|
3370
|
-
* the configured paths. `durationMs` takes precedence
|
|
3371
|
-
* fallback to the span start/end timestamps. User-defined `metrics`
|
|
3372
|
-
* resolves to `undefined` are dropped, but `null`, `0`, and `false`
|
|
3373
|
-
* preserved as legitimate values worth displaying. Original span order is
|
|
3394
|
+
* the configured paths. An explicit `durationMs` attribute takes precedence,
|
|
3395
|
+
* with a fallback to the span start/end timestamps. User-defined `metrics`
|
|
3396
|
+
* whose path resolves to `undefined` are dropped, but `null`, `0`, and `false`
|
|
3397
|
+
* are preserved as legitimate values worth displaying. Original span order is
|
|
3374
3398
|
* preserved so the API calls tab matches the ordering in the Trace tab.
|
|
3375
3399
|
*/
|
|
3376
3400
|
function extractApiCalls(spans, config) {
|
|
@@ -3400,7 +3424,7 @@ function extractApiCalls(spans, config) {
|
|
|
3400
3424
|
method: readString$1(attrs, config.attributes.method),
|
|
3401
3425
|
url: readString$1(attrs, config.attributes.url),
|
|
3402
3426
|
statusCode: readNumber$1(attrs, config.attributes.statusCode),
|
|
3403
|
-
|
|
3427
|
+
durationMs: readNumber$1(attrs, config.attributes.durationMs) ?? computeDurationMs(span),
|
|
3404
3428
|
request: getNestedAttribute(attrs, config.attributes.request),
|
|
3405
3429
|
response: getNestedAttribute(attrs, config.attributes.response),
|
|
3406
3430
|
requestBody: getNestedAttribute(attrs, config.attributes.requestBody),
|
|
@@ -3798,7 +3822,7 @@ async function writeCacheFile(cacheDir, cacheFile) {
|
|
|
3798
3822
|
await mkdir(cacheDir, { recursive: true });
|
|
3799
3823
|
const filePath = ownerPath(cacheDir, cacheFile.owner);
|
|
3800
3824
|
const tmpPath = `${filePath}.${process.pid.toString()}.tmp`;
|
|
3801
|
-
await writeFile(tmpPath, JSON.stringify(cacheFile));
|
|
3825
|
+
await writeFile(tmpPath, JSON.stringify(cacheFile, null, 2));
|
|
3802
3826
|
await rename(tmpPath, filePath);
|
|
3803
3827
|
}
|
|
3804
3828
|
async function readDebugKeyFile(debugDir, owner) {
|
|
@@ -4237,7 +4261,8 @@ const DEFAULT_CONFIG_KEYS = [
|
|
|
4237
4261
|
"totalTokens",
|
|
4238
4262
|
"cachedInputTokens",
|
|
4239
4263
|
"cacheCreationInputTokens",
|
|
4240
|
-
"
|
|
4264
|
+
"reasoningTokens",
|
|
4265
|
+
"llmDurationMs"
|
|
4241
4266
|
];
|
|
4242
4267
|
const tokenNumberFormat = { notation: "compact" };
|
|
4243
4268
|
const countNumberFormat = {
|
|
@@ -4303,8 +4328,8 @@ const DEFAULT_COLUMNS = {
|
|
|
4303
4328
|
numberFormat: tokenNumberFormat,
|
|
4304
4329
|
align: "right"
|
|
4305
4330
|
},
|
|
4306
|
-
|
|
4307
|
-
label: "LLM
|
|
4331
|
+
llmDurationMs: {
|
|
4332
|
+
label: "LLM Duration",
|
|
4308
4333
|
format: "duration",
|
|
4309
4334
|
align: "right"
|
|
4310
4335
|
}
|
|
@@ -4509,8 +4534,14 @@ function addDefaultOutputs(params) {
|
|
|
4509
4534
|
});
|
|
4510
4535
|
assignIfMissing({
|
|
4511
4536
|
outputs: params.outputs,
|
|
4512
|
-
key: "
|
|
4513
|
-
value: sumNullable(calls.map((call) => call.
|
|
4537
|
+
key: "reasoningTokens",
|
|
4538
|
+
value: sumNullable(calls.map((call) => call.reasoningTokens)),
|
|
4539
|
+
activeKeys
|
|
4540
|
+
});
|
|
4541
|
+
assignIfMissing({
|
|
4542
|
+
outputs: params.outputs,
|
|
4543
|
+
key: "llmDurationMs",
|
|
4544
|
+
value: sumNullable(calls.map((call) => call.durationMs)),
|
|
4514
4545
|
activeKeys
|
|
4515
4546
|
});
|
|
4516
4547
|
}
|
|
@@ -5356,7 +5387,7 @@ async function runCase(params) {
|
|
|
5356
5387
|
caseDetail,
|
|
5357
5388
|
caseRowUpdate: {
|
|
5358
5389
|
status,
|
|
5359
|
-
|
|
5390
|
+
durationMs: Date.now() - startTime,
|
|
5360
5391
|
columns
|
|
5361
5392
|
}
|
|
5362
5393
|
};
|
|
@@ -5647,7 +5678,7 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
5647
5678
|
caseId: evalCase.id,
|
|
5648
5679
|
evalId: evalMeta.id,
|
|
5649
5680
|
status: caseRowUpdate.status ?? "pending",
|
|
5650
|
-
|
|
5681
|
+
durationMs: caseRowUpdate.durationMs ?? null,
|
|
5651
5682
|
columns: caseRowUpdate.columns ?? {},
|
|
5652
5683
|
trial
|
|
5653
5684
|
}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-
|
|
2
|
-
import "./src-
|
|
1
|
+
import { n as createRunner } from "./cli-C0EtHhEO.mjs";
|
|
2
|
+
import "./src-D-HuV8I-.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-C9nP2VKL.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
package/package.json
CHANGED
|
@@ -92,20 +92,16 @@ export async function runRefundWorkflow(input: RefundInput) {
|
|
|
92
92
|
async () => {
|
|
93
93
|
let text: string;
|
|
94
94
|
let usage: { inputTokens: number; outputTokens: number };
|
|
95
|
-
let costUsd: number;
|
|
96
95
|
try {
|
|
97
|
-
({ text, usage
|
|
96
|
+
({ text, usage } = await llm.complete(input.message));
|
|
98
97
|
} catch (error) {
|
|
99
98
|
captureEvalSpanError(error);
|
|
100
|
-
({ text, usage
|
|
101
|
-
input.message,
|
|
102
|
-
));
|
|
99
|
+
({ text, usage } = await llm.completeWithFallback(input.message));
|
|
103
100
|
}
|
|
104
101
|
evalSpan.setAttributes({
|
|
105
102
|
model: 'gpt-4o-mini',
|
|
106
103
|
provider: 'openai',
|
|
107
104
|
usage,
|
|
108
|
-
costUsd,
|
|
109
105
|
});
|
|
110
106
|
const expectedLocale = getEvalCaseInput('locale');
|
|
111
107
|
if (typeof expectedLocale === 'string') {
|
|
@@ -137,8 +133,8 @@ are more specific. Only the `input` and `output` span attributes are promoted
|
|
|
137
133
|
automatically in the trace tree; use `traceDisplay` for other span attributes
|
|
138
134
|
such as `model` or `usage`. Eval-level LLM usage outputs, columns, stats, and
|
|
139
135
|
charts are derived from matching LLM spans by default. Prefer
|
|
140
|
-
`llmCalls.pricing` for LLM-call cost display
|
|
141
|
-
|
|
136
|
+
`llmCalls.pricing` for LLM-call cost display; built-in costs ignore span
|
|
137
|
+
`costUsd` attributes.
|
|
142
138
|
|
|
143
139
|
Use `captureEvalSpanError(error)` for recoverable errors on the active
|
|
144
140
|
`evalTracer.span(...)`, such as optional model/tool failures that fall back and
|
|
@@ -261,18 +257,28 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
|
|
|
261
257
|
See the `TraceDisplayInputConfig` type.
|
|
262
258
|
- `llmCalls` (in `agent-evals.config.ts`) configures how LLM-call spans are
|
|
263
259
|
summarized for review. Defaults to `kind: 'llm'` spans with `model`,
|
|
264
|
-
`usage.*`, `
|
|
265
|
-
attribute paths.
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
`
|
|
269
|
-
|
|
260
|
+
`usage.*`, `latencyMs`, `input`, `output`, etc. read from conventional
|
|
261
|
+
attribute paths. `latencyMs` is time to first token; duration, total tokens,
|
|
262
|
+
tokens/sec, and USD costs are derived. Override `kinds` to broaden the filter,
|
|
263
|
+
override `attributes.<field>` for non-default primitive span shapes, configure
|
|
264
|
+
`pricing` to derive USD costs from token counts by model/provider, and add
|
|
265
|
+
entries to `metrics` to surface arbitrary user metrics (`format: 'string' |
|
|
266
|
+
'number' | 'duration' | 'json' | 'boolean'`, `placements: ['header' |
|
|
267
|
+
'body']`).
|
|
270
268
|
- Default usage config derives missing eval outputs from matching LLM/API spans
|
|
271
269
|
before `outputsSchema` and scores run: `apiCalls`, `costUsd`, `llmTurns`,
|
|
272
270
|
`inputTokens`, `outputTokens`, `totalTokens`, `cachedInputTokens`,
|
|
273
|
-
`cacheCreationInputTokens`, `reasoningTokens`, and `
|
|
274
|
-
outputs and column overrides win.
|
|
275
|
-
|
|
271
|
+
`cacheCreationInputTokens`, `reasoningTokens`, and `llmDurationMs`. Authored
|
|
272
|
+
outputs and column overrides win. `totalTokens` is input + output only; cache
|
|
273
|
+
read/write tokens stay separate and affect `costUsd` at their own rates.
|
|
274
|
+
Derived base input cost uses `inputTokens - cachedInputTokens -
|
|
275
|
+
cacheCreationInputTokens` so cache details are not double-counted.
|
|
276
|
+
`cacheCreationInputTokens` is the total cache-write count; optional
|
|
277
|
+
`cacheCreationInput1hTokens` only splits that total for 1-hour write pricing
|
|
278
|
+
via `cacheCreationInput1hUsdPerMillion`. `llmDurationMs` sums elapsed matched
|
|
279
|
+
LLM span durations; it is not time-to-first-token latency.
|
|
280
|
+
Remove defaults globally or per eval with `removeDefaultConfig: true` or a
|
|
281
|
+
key list such as
|
|
276
282
|
`removeDefaultConfig: ['apiCalls', 'reasoningTokens']`.
|
|
277
283
|
- `apiCalls` (in `agent-evals.config.ts`) configures how API-call spans are
|
|
278
284
|
summarized for review. Defaults to `kind: 'api'`, `'http'`, `'http.client'`,
|