@ls-stack/agent-eval 0.61.0 → 0.61.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/apps/web/dist/assets/{index-DxZsizjg.js → index-DXQ_LDQw.js} +77 -77
- package/dist/apps/web/dist/assets/{index-CM_zUhl_.css → index-zWPuRQmP.css} +1 -1
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/index.d.mts +35 -35
- package/package.json +3 -3
- package/skills/agent-eval/SKILL.md +2 -2
package/dist/index.d.mts
CHANGED
|
@@ -2558,8 +2558,8 @@ declare const evalSummarySchema$1: z.ZodObject<{
|
|
|
2558
2558
|
}>;
|
|
2559
2559
|
label: z.ZodOptional<z.ZodString>;
|
|
2560
2560
|
color: z.ZodOptional<z.ZodEnum<{
|
|
2561
|
-
error: "error";
|
|
2562
2561
|
success: "success";
|
|
2562
|
+
error: "error";
|
|
2563
2563
|
warning: "warning";
|
|
2564
2564
|
accent: "accent";
|
|
2565
2565
|
accentDim: "accentDim";
|
|
@@ -2582,8 +2582,8 @@ declare const evalSummarySchema$1: z.ZodObject<{
|
|
|
2582
2582
|
}>;
|
|
2583
2583
|
label: z.ZodOptional<z.ZodString>;
|
|
2584
2584
|
color: z.ZodOptional<z.ZodEnum<{
|
|
2585
|
-
error: "error";
|
|
2586
2585
|
success: "success";
|
|
2586
|
+
error: "error";
|
|
2587
2587
|
warning: "warning";
|
|
2588
2588
|
accent: "accent";
|
|
2589
2589
|
accentDim: "accentDim";
|
|
@@ -2920,10 +2920,10 @@ declare const scoreTraceSchema: z.ZodObject<{
|
|
|
2920
2920
|
namespace: z.ZodString;
|
|
2921
2921
|
key: z.ZodString;
|
|
2922
2922
|
status: z.ZodEnum<{
|
|
2923
|
-
bypass: "bypass";
|
|
2924
|
-
refresh: "refresh";
|
|
2925
2923
|
hit: "hit";
|
|
2926
2924
|
miss: "miss";
|
|
2925
|
+
refresh: "refresh";
|
|
2926
|
+
bypass: "bypass";
|
|
2927
2927
|
}>;
|
|
2928
2928
|
read: z.ZodOptional<z.ZodBoolean>;
|
|
2929
2929
|
stored: z.ZodOptional<z.ZodBoolean>;
|
|
@@ -3092,10 +3092,10 @@ declare const caseDetailSchema$1: z.ZodObject<{
|
|
|
3092
3092
|
namespace: z.ZodString;
|
|
3093
3093
|
key: z.ZodString;
|
|
3094
3094
|
status: z.ZodEnum<{
|
|
3095
|
-
bypass: "bypass";
|
|
3096
|
-
refresh: "refresh";
|
|
3097
3095
|
hit: "hit";
|
|
3098
3096
|
miss: "miss";
|
|
3097
|
+
refresh: "refresh";
|
|
3098
|
+
bypass: "bypass";
|
|
3099
3099
|
}>;
|
|
3100
3100
|
read: z.ZodOptional<z.ZodBoolean>;
|
|
3101
3101
|
stored: z.ZodOptional<z.ZodBoolean>;
|
|
@@ -3213,10 +3213,10 @@ declare const caseDetailSchema$1: z.ZodObject<{
|
|
|
3213
3213
|
namespace: z.ZodString;
|
|
3214
3214
|
key: z.ZodString;
|
|
3215
3215
|
status: z.ZodEnum<{
|
|
3216
|
-
bypass: "bypass";
|
|
3217
|
-
refresh: "refresh";
|
|
3218
3216
|
hit: "hit";
|
|
3219
3217
|
miss: "miss";
|
|
3218
|
+
refresh: "refresh";
|
|
3219
|
+
bypass: "bypass";
|
|
3220
3220
|
}>;
|
|
3221
3221
|
read: z.ZodOptional<z.ZodBoolean>;
|
|
3222
3222
|
stored: z.ZodOptional<z.ZodBoolean>;
|
|
@@ -3283,8 +3283,8 @@ type EvalChartAggregate = z.infer<typeof evalChartAggregateSchema>;
|
|
|
3283
3283
|
* not emit raw hex so authored evals stay decoupled from the web theme.
|
|
3284
3284
|
*/
|
|
3285
3285
|
declare const evalChartColorSchema: z.ZodEnum<{
|
|
3286
|
-
error: "error";
|
|
3287
3286
|
success: "success";
|
|
3287
|
+
error: "error";
|
|
3288
3288
|
warning: "warning";
|
|
3289
3289
|
accent: "accent";
|
|
3290
3290
|
accentDim: "accentDim";
|
|
@@ -3312,8 +3312,8 @@ declare const evalChartMetricSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
|
3312
3312
|
}>;
|
|
3313
3313
|
label: z.ZodOptional<z.ZodString>;
|
|
3314
3314
|
color: z.ZodOptional<z.ZodEnum<{
|
|
3315
|
-
error: "error";
|
|
3316
3315
|
success: "success";
|
|
3316
|
+
error: "error";
|
|
3317
3317
|
warning: "warning";
|
|
3318
3318
|
accent: "accent";
|
|
3319
3319
|
accentDim: "accentDim";
|
|
@@ -3336,8 +3336,8 @@ declare const evalChartMetricSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
|
3336
3336
|
}>;
|
|
3337
3337
|
label: z.ZodOptional<z.ZodString>;
|
|
3338
3338
|
color: z.ZodOptional<z.ZodEnum<{
|
|
3339
|
-
error: "error";
|
|
3340
3339
|
success: "success";
|
|
3340
|
+
error: "error";
|
|
3341
3341
|
warning: "warning";
|
|
3342
3342
|
accent: "accent";
|
|
3343
3343
|
accentDim: "accentDim";
|
|
@@ -3395,8 +3395,8 @@ declare const evalChartConfigSchema: z.ZodObject<{
|
|
|
3395
3395
|
}>;
|
|
3396
3396
|
label: z.ZodOptional<z.ZodString>;
|
|
3397
3397
|
color: z.ZodOptional<z.ZodEnum<{
|
|
3398
|
-
error: "error";
|
|
3399
3398
|
success: "success";
|
|
3399
|
+
error: "error";
|
|
3400
3400
|
warning: "warning";
|
|
3401
3401
|
accent: "accent";
|
|
3402
3402
|
accentDim: "accentDim";
|
|
@@ -3419,8 +3419,8 @@ declare const evalChartConfigSchema: z.ZodObject<{
|
|
|
3419
3419
|
}>;
|
|
3420
3420
|
label: z.ZodOptional<z.ZodString>;
|
|
3421
3421
|
color: z.ZodOptional<z.ZodEnum<{
|
|
3422
|
-
error: "error";
|
|
3423
3422
|
success: "success";
|
|
3423
|
+
error: "error";
|
|
3424
3424
|
warning: "warning";
|
|
3425
3425
|
accent: "accent";
|
|
3426
3426
|
accentDim: "accentDim";
|
|
@@ -3485,8 +3485,8 @@ declare const evalChartsConfigSchema: z.ZodArray<z.ZodObject<{
|
|
|
3485
3485
|
}>;
|
|
3486
3486
|
label: z.ZodOptional<z.ZodString>;
|
|
3487
3487
|
color: z.ZodOptional<z.ZodEnum<{
|
|
3488
|
-
error: "error";
|
|
3489
3488
|
success: "success";
|
|
3489
|
+
error: "error";
|
|
3490
3490
|
warning: "warning";
|
|
3491
3491
|
accent: "accent";
|
|
3492
3492
|
accentDim: "accentDim";
|
|
@@ -3509,8 +3509,8 @@ declare const evalChartsConfigSchema: z.ZodArray<z.ZodObject<{
|
|
|
3509
3509
|
}>;
|
|
3510
3510
|
label: z.ZodOptional<z.ZodString>;
|
|
3511
3511
|
color: z.ZodOptional<z.ZodEnum<{
|
|
3512
|
-
error: "error";
|
|
3513
3512
|
success: "success";
|
|
3513
|
+
error: "error";
|
|
3514
3514
|
warning: "warning";
|
|
3515
3515
|
accent: "accent";
|
|
3516
3516
|
accentDim: "accentDim";
|
|
@@ -3589,9 +3589,9 @@ declare const runManifestSchema$1: z.ZodObject<{
|
|
|
3589
3589
|
median: "median";
|
|
3590
3590
|
}>>>;
|
|
3591
3591
|
cacheMode: z.ZodOptional<z.ZodEnum<{
|
|
3592
|
-
use: "use";
|
|
3593
|
-
bypass: "bypass";
|
|
3594
3592
|
refresh: "refresh";
|
|
3593
|
+
bypass: "bypass";
|
|
3594
|
+
use: "use";
|
|
3595
3595
|
}>>;
|
|
3596
3596
|
}, z.core.$strip>;
|
|
3597
3597
|
/** Persisted lifecycle metadata for a single eval run. */
|
|
@@ -4575,9 +4575,9 @@ declare function extractApiCalls(spans: EvalTraceSpan$1[], config: ResolvedApiCa
|
|
|
4575
4575
|
* - `refresh`: never read, always write (forces re-execution and overwrites).
|
|
4576
4576
|
*/
|
|
4577
4577
|
declare const cacheModeSchema: z.ZodEnum<{
|
|
4578
|
-
use: "use";
|
|
4579
|
-
bypass: "bypass";
|
|
4580
4578
|
refresh: "refresh";
|
|
4579
|
+
bypass: "bypass";
|
|
4580
|
+
use: "use";
|
|
4581
4581
|
}>;
|
|
4582
4582
|
/** Mode controlling how cached spans behave during a run. */
|
|
4583
4583
|
type CacheMode = z.infer<typeof cacheModeSchema>;
|
|
@@ -4591,17 +4591,17 @@ declare const spanCacheOptionsSchema: z.ZodObject<{
|
|
|
4591
4591
|
type SpanCacheOptions = z.infer<typeof spanCacheOptionsSchema>;
|
|
4592
4592
|
/** Category of operation stored in the eval cache. */
|
|
4593
4593
|
declare const cacheOperationTypeSchema: z.ZodEnum<{
|
|
4594
|
-
span: "span";
|
|
4595
4594
|
value: "value";
|
|
4595
|
+
span: "span";
|
|
4596
4596
|
}>;
|
|
4597
4597
|
/** Category of operation stored in the eval cache. */
|
|
4598
4598
|
type CacheOperationType = z.infer<typeof cacheOperationTypeSchema>;
|
|
4599
4599
|
/** Status of a cache lookup recorded on a span or case scope. */
|
|
4600
4600
|
declare const cacheStatusSchema: z.ZodEnum<{
|
|
4601
|
-
bypass: "bypass";
|
|
4602
|
-
refresh: "refresh";
|
|
4603
4601
|
hit: "hit";
|
|
4604
4602
|
miss: "miss";
|
|
4603
|
+
refresh: "refresh";
|
|
4604
|
+
bypass: "bypass";
|
|
4605
4605
|
}>;
|
|
4606
4606
|
/** Status of a cache lookup recorded on a span or case scope. */
|
|
4607
4607
|
type CacheStatus = z.infer<typeof cacheStatusSchema>;
|
|
@@ -4618,10 +4618,10 @@ declare const traceCacheRefSchema: z.ZodObject<{
|
|
|
4618
4618
|
namespace: z.ZodString;
|
|
4619
4619
|
key: z.ZodString;
|
|
4620
4620
|
status: z.ZodEnum<{
|
|
4621
|
-
bypass: "bypass";
|
|
4622
|
-
refresh: "refresh";
|
|
4623
4621
|
hit: "hit";
|
|
4624
4622
|
miss: "miss";
|
|
4623
|
+
refresh: "refresh";
|
|
4624
|
+
bypass: "bypass";
|
|
4625
4625
|
}>;
|
|
4626
4626
|
read: z.ZodOptional<z.ZodBoolean>;
|
|
4627
4627
|
stored: z.ZodOptional<z.ZodBoolean>;
|
|
@@ -4817,8 +4817,8 @@ declare const cacheEntrySchema: z.ZodObject<{
|
|
|
4817
4817
|
key: z.ZodString;
|
|
4818
4818
|
namespace: z.ZodString;
|
|
4819
4819
|
operationType: z.ZodOptional<z.ZodEnum<{
|
|
4820
|
-
span: "span";
|
|
4821
4820
|
value: "value";
|
|
4821
|
+
span: "span";
|
|
4822
4822
|
}>>;
|
|
4823
4823
|
operationName: z.ZodOptional<z.ZodString>;
|
|
4824
4824
|
spanName: z.ZodOptional<z.ZodString>;
|
|
@@ -4924,8 +4924,8 @@ declare const cacheDebugKeyEntrySchema: z.ZodObject<{
|
|
|
4924
4924
|
key: z.ZodString;
|
|
4925
4925
|
namespace: z.ZodString;
|
|
4926
4926
|
operationType: z.ZodEnum<{
|
|
4927
|
-
span: "span";
|
|
4928
4927
|
value: "value";
|
|
4928
|
+
span: "span";
|
|
4929
4929
|
}>;
|
|
4930
4930
|
operationName: z.ZodString;
|
|
4931
4931
|
storedAt: z.ZodString;
|
|
@@ -4935,8 +4935,8 @@ declare const cacheDebugKeyEntrySchema: z.ZodObject<{
|
|
|
4935
4935
|
key: z.ZodString;
|
|
4936
4936
|
namespace: z.ZodString;
|
|
4937
4937
|
operationType: z.ZodOptional<z.ZodEnum<{
|
|
4938
|
-
span: "span";
|
|
4939
4938
|
value: "value";
|
|
4939
|
+
span: "span";
|
|
4940
4940
|
}>>;
|
|
4941
4941
|
operationName: z.ZodOptional<z.ZodString>;
|
|
4942
4942
|
spanName: z.ZodOptional<z.ZodString>;
|
|
@@ -5042,8 +5042,8 @@ declare const cacheEntryWithDebugKeySchema$1: z.ZodObject<{
|
|
|
5042
5042
|
key: z.ZodString;
|
|
5043
5043
|
namespace: z.ZodString;
|
|
5044
5044
|
operationType: z.ZodOptional<z.ZodEnum<{
|
|
5045
|
-
span: "span";
|
|
5046
5045
|
value: "value";
|
|
5046
|
+
span: "span";
|
|
5047
5047
|
}>>;
|
|
5048
5048
|
operationName: z.ZodOptional<z.ZodString>;
|
|
5049
5049
|
spanName: z.ZodOptional<z.ZodString>;
|
|
@@ -5140,8 +5140,8 @@ declare const cacheEntryWithDebugKeySchema$1: z.ZodObject<{
|
|
|
5140
5140
|
key: z.ZodString;
|
|
5141
5141
|
namespace: z.ZodString;
|
|
5142
5142
|
operationType: z.ZodEnum<{
|
|
5143
|
-
span: "span";
|
|
5144
5143
|
value: "value";
|
|
5144
|
+
span: "span";
|
|
5145
5145
|
}>;
|
|
5146
5146
|
operationName: z.ZodString;
|
|
5147
5147
|
storedAt: z.ZodString;
|
|
@@ -5151,8 +5151,8 @@ declare const cacheEntryWithDebugKeySchema$1: z.ZodObject<{
|
|
|
5151
5151
|
key: z.ZodString;
|
|
5152
5152
|
namespace: z.ZodString;
|
|
5153
5153
|
operationType: z.ZodOptional<z.ZodEnum<{
|
|
5154
|
-
span: "span";
|
|
5155
5154
|
value: "value";
|
|
5155
|
+
span: "span";
|
|
5156
5156
|
}>>;
|
|
5157
5157
|
operationName: z.ZodOptional<z.ZodString>;
|
|
5158
5158
|
spanName: z.ZodOptional<z.ZodString>;
|
|
@@ -5258,8 +5258,8 @@ declare const cacheFileSchema: z.ZodObject<{
|
|
|
5258
5258
|
key: z.ZodString;
|
|
5259
5259
|
namespace: z.ZodString;
|
|
5260
5260
|
operationType: z.ZodOptional<z.ZodEnum<{
|
|
5261
|
-
span: "span";
|
|
5262
5261
|
value: "value";
|
|
5262
|
+
span: "span";
|
|
5263
5263
|
}>>;
|
|
5264
5264
|
operationName: z.ZodOptional<z.ZodString>;
|
|
5265
5265
|
spanName: z.ZodOptional<z.ZodString>;
|
|
@@ -5364,8 +5364,8 @@ declare const cacheDebugKeyFileSchema: z.ZodObject<{
|
|
|
5364
5364
|
key: z.ZodString;
|
|
5365
5365
|
namespace: z.ZodString;
|
|
5366
5366
|
operationType: z.ZodEnum<{
|
|
5367
|
-
span: "span";
|
|
5368
5367
|
value: "value";
|
|
5368
|
+
span: "span";
|
|
5369
5369
|
}>;
|
|
5370
5370
|
operationName: z.ZodString;
|
|
5371
5371
|
storedAt: z.ZodString;
|
|
@@ -5375,8 +5375,8 @@ declare const cacheDebugKeyFileSchema: z.ZodObject<{
|
|
|
5375
5375
|
key: z.ZodString;
|
|
5376
5376
|
namespace: z.ZodString;
|
|
5377
5377
|
operationType: z.ZodOptional<z.ZodEnum<{
|
|
5378
|
-
span: "span";
|
|
5379
5378
|
value: "value";
|
|
5379
|
+
span: "span";
|
|
5380
5380
|
}>>;
|
|
5381
5381
|
operationName: z.ZodOptional<z.ZodString>;
|
|
5382
5382
|
spanName: z.ZodOptional<z.ZodString>;
|
|
@@ -5587,9 +5587,9 @@ declare const createRunRequestSchema$1: z.ZodObject<{
|
|
|
5587
5587
|
temporary: z.ZodOptional<z.ZodBoolean>;
|
|
5588
5588
|
cache: z.ZodOptional<z.ZodObject<{
|
|
5589
5589
|
mode: z.ZodDefault<z.ZodEnum<{
|
|
5590
|
-
use: "use";
|
|
5591
|
-
bypass: "bypass";
|
|
5592
5590
|
refresh: "refresh";
|
|
5591
|
+
bypass: "bypass";
|
|
5592
|
+
use: "use";
|
|
5593
5593
|
}>>;
|
|
5594
5594
|
}, z.core.$strip>>;
|
|
5595
5595
|
manualInputs: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ls-stack/agent-eval",
|
|
3
|
-
"version": "0.61.0",
|
|
3
|
+
"version": "0.61.2",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"bin": {
|
|
6
6
|
"agent-evals": "./dist/bin.mjs"
|
|
@@ -33,8 +33,8 @@
|
|
|
33
33
|
"@types/node": "^24.7.2",
|
|
34
34
|
"typescript": "^5.9.2",
|
|
35
35
|
"@agent-evals/runner": "0.0.1",
|
|
36
|
-
"@agent-evals/shared": "0.0.1",
|
|
37
|
-
"@agent-evals/sdk": "0.0.1"
|
|
36
|
+
"@agent-evals/sdk": "0.0.1",
|
|
37
|
+
"@agent-evals/shared": "0.0.1"
|
|
38
38
|
},
|
|
39
39
|
"scripts": {
|
|
40
40
|
"build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",
|
|
package/skills/agent-eval/SKILL.md
CHANGED
|
@@ -212,11 +212,11 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape (forma
|
|
|
212
212
|
- `tracingAssertions` is a single function that can be authored globally or locally on one eval when a finished-trace invariant should pass or fail the case without creating a fake score column. It receives the same `{ trace, input, case }` context as `deriveFromTracing`; call `evalAssert(...)` or `evalExpect(...)` inside it. Useful trace helpers include `trace.findSpan(name)`, `trace.findSpans(name)`, `trace.hasSpan(name)`, `trace.findSpansByKind(kind)`, `trace.findToolCallSpans()`, `trace.listToolCallSpanNames()`, `trace.hasToolCallSpan(name)`, `trace.getToolCallSpans(name)`, `trace.getToolCallSpanCount(toolName)`, `trace.hasToolCallSpanCount(toolName, expectedCalls)`, `trace.listSpanNames(kind?)`, `trace.listSpanNamesDfs(kind?)`, and `trace.flattenDfs()`. The tool-call helpers include both `kind: 'tool'` spans and imported execution spans recorded as `kind: 'tool_call'`. Tool-name checks and counts match the span `name` as well as GenAI/Mastra identity attributes such as `genAI["gen_ai.tool.name"]` and `mastra.entityName`; list helpers prefer those tool identity attributes when present. `getToolCallSpans(name)` returns one normalized object per matching call, including parsed `arguments`, parsed `result`, `description`, `toolType`, `attributes`, and the original `span`.
|
|
213
213
|
- `traceDisplay` promotes selected span attributes into the trace tree and detail pane; it supports aggregation across subtrees (`scope`, `mode`) and user-defined `transform(...)` for derived views (e.g. currency conversion). See the `TraceDisplayInputConfig` type.
|
|
214
214
|
- `llmCalls` (in `agent-evals.config.ts`) configures how LLM-call spans are summarized for review. Defaults to `kind: 'llm'` spans with `model`, `usage.*`, `latencyMs`, `input`, `output`, etc. read from conventional attribute paths. The default `steps` path reads an array from `span.attributes.steps`; if it is missing, direct child `model_step` spans are shown as that call's steps. Tool calls are aggregated from the configured `toolCalls` path plus step-level `toolCalls` on authored step arrays or direct `model_step` child spans, including Mastra's serialized `mastra.model_step.output` format, and child `tool_call` execution spans under each model step. `latencyMs` is time to first token; duration, total tokens, output tokens/sec, and USD costs are derived. Override `kinds` to broaden the filter, override `attributes.<field>` for non-default primitive span shapes, configure model-keyed `pricing` to derive USD costs from token counts, with nested `providers` entries for provider-specific rates, add `costCurrencies` to show converted cost columns in the expanded breakdown table only, add `derivedAttributes` to persist computed values back onto matching LLM spans before trace consumers run, and add entries to `metrics` to surface arbitrary user metrics (`format: 'string' | 'number' | 'duration' | 'json' | 'boolean'`, `placements: ['header' | 'body']`). `derivedAttributes` can be a keyed map for one-off fields or one callback that returns multiple path/value pairs. Derived keys are dot-paths under `span.attributes`; return `undefined` to skip one span or one returned key.
|
|
215
|
-
- Default usage config derives missing eval outputs from matching LLM/API spans before `outputsSchema` and scores run: `apiCalls`, `costUsd`, `llmTurns`, `inputTokens`, `outputTokens`, `totalTokens`, `cachedInputTokens`, `cacheCreationInputTokens`, `reasoningTokens`, and `llmDurationMs`. Authored outputs and column overrides win. Default usage columns, stats, and charts use `hideIfNoValue: true`. Default LLM usage charts configure cost, input tokens, and output tokens separately and use `dedupeConsecutiveValues: true` to skip repeated adjacent chart values. `totalTokens` is input + output only; cache read/write tokens stay separate and affect `costUsd` at their own rates. `llmTurns` is the maximum per-call turn count in the case run, using configured steps when available and otherwise one turn per matched LLM call span. Derived base input cost uses `inputTokens - cachedInputTokens - cacheCreationInputTokens` so cache details are not double-counted. `cacheCreationInputTokens` is the total cache-write count; optional `cacheCreationInput1hTokens` only splits that total for 1-hour write pricing via `cacheCreationInput1hUsdPerMillion`. `llmDurationMs` sums elapsed matched LLM span durations; it is not time-to-first-token latency. Remove defaults globally or per eval with `removeDefaultConfig: true` or a key list such as `removeDefaultConfig: ['apiCalls', 'reasoningTokens']`.
|
|
215
|
+
- Default usage config derives missing eval outputs from matching LLM/API spans before `outputsSchema` and scores run: `apiCalls`, `costUsd`, `llmTurns`, `inputTokens`, `outputTokens`, `totalTokens`, `cachedInputTokens`, `cacheCreationInputTokens`, `reasoningTokens`, and `llmDurationMs`. Authored outputs and column overrides win. The web UI fills in baseline run-health stats (`cases`, `passRate`, `duration`) and a pass-rate/duration history chart when an eval has not already authored equivalent run-health UI. If discovery metadata is missing but saved runs contain runtime columns such as `costUsd`, `inputTokens`, or `apiCalls`, the single-eval page can infer the standard usage stats and charts from those saved run values. Default usage columns, stats, and charts use `hideIfNoValue: true`. Default LLM usage charts configure cost, input tokens, and output tokens separately and use `dedupeConsecutiveValues: true` to skip repeated adjacent chart values. `totalTokens` is input + output only; cache read/write tokens stay separate and affect `costUsd` at their own rates. `llmTurns` is the maximum per-call turn count in the case run, using configured steps when available and otherwise one turn per matched LLM call span. Derived base input cost uses `inputTokens - cachedInputTokens - cacheCreationInputTokens` so cache details are not double-counted. `cacheCreationInputTokens` is the total cache-write count; optional `cacheCreationInput1hTokens` only splits that total for 1-hour write pricing via `cacheCreationInput1hUsdPerMillion`. `llmDurationMs` sums elapsed matched LLM span durations; it is not time-to-first-token latency. Remove defaults globally or per eval with `removeDefaultConfig: true` or a key list such as `removeDefaultConfig: ['apiCalls', 'reasoningTokens']`.
|
|
216
216
|
- `apiCalls` (in `agent-evals.config.ts`) configures how API-call spans are summarized for review. Defaults to `kind: 'api'`, `'http'`, `'http.client'`, and `'fetch'` spans with `method`, `url`, `statusCode`, `request`, `routeAlias`, `response`, `requestBody`, `responseBody`, `headers`, `durationMs`, and `error` read from conventional attribute paths. Override `kinds` or `attributes.<field>` for external tracers. Set a per-span `routeAlias` attribute such as `/v3/tabs/:id` to group dynamic URL paths in API-call route labels and endpoint charts while preserving original URLs in row details. Add `derivedAttributes` as a keyed map or object-returning callback for computed persisted API span attributes, and add `metrics` with the same formats and placements as LLM-call metrics.
|
|
217
217
|
- `runLogs` (in `agent-evals.config.ts`) controls case log capture. Use `runLogs: { captureConsole: false }` to keep console output in the terminal without persisting console calls to case details. Manual `evalLog(...)` calls are still captured. Captured log locations store the selected user-facing source frame and the full JavaScript stack so agents can inspect additional frames in persisted artifacts when diagnosing where a log came from.
|
|
218
218
|
|
|
219
|
-
Stats rows and history charts can be authored via `stats` / `charts` on the eval definition. Global `stats` in `agent-evals.config.ts` combine with eval-level stats. Native stat kinds include `cases`, `passRate`, `duration`, and `cacheHits`; `cacheHits` shows Agent Eval operation-level cache hits over total cache operations (`hits/total`) from spans and `evalTracer.cache(...)` refs, not LLM provider prompt-cache read tokens such as `cachedInputTokens`. Cache-hit stats use a separate aggregate control and default to `sum`; `avg` is average per-case hit rate, and min/max/best/worst select cases by hit rate. `duration` aggregates per-case durations using the same modes as column stats. Usage stats and LLM usage charts are added by default unless removed with `removeDefaultConfig`. Column stats can override `format` and `numberFormat`, otherwise they inherit from the matching column. Duration and column stat aggregates support `avg`, `min`, `max`, `sum`, `best` (highest finite value), and `worst` (lowest finite value). Use `defaultStatAggregate` in `agent-evals.config.ts` to set the workspace-wide initial duration/column stat mode, or on an eval definition to override it for that eval. Number formats use `maxDecimalPlaces` to cap decimals and `minDecimalPlaces` to pad trailing zeroes. Without `maxDecimalPlaces`, the default cap is 3 decimal places. Stats and charts support `hideIfNoValue: true`. Charts support `dedupeConsecutiveValues: true` to omit consecutive points whose plotted metrics and tooltip extras match the previous kept point. Their shapes live in the types; no need to memorize the option set.
|
|
219
|
+
Stats rows and history charts can be authored via `stats` / `charts` on the eval definition. Global `stats` in `agent-evals.config.ts` combine with eval-level stats. The web UI automatically supplies missing `cases`, `passRate`, and `duration` stats plus a pass-rate/duration history chart, including for a single completed run. Native stat kinds include `cases`, `passRate`, `duration`, and `cacheHits`; `cacheHits` shows Agent Eval operation-level cache hits over total cache operations (`hits/total`) from spans and `evalTracer.cache(...)` refs, not LLM provider prompt-cache read tokens such as `cachedInputTokens`. Cache-hit stats use a separate aggregate control and default to `sum`; `avg` is average per-case hit rate, and min/max/best/worst select cases by hit rate. `duration` aggregates per-case durations using the same modes as column stats. Usage stats and LLM usage charts are added by default unless removed with `removeDefaultConfig`. Column stats can override `format` and `numberFormat`, otherwise they inherit from the matching column. Duration and column stat aggregates support `avg`, `min`, `max`, `sum`, `best` (highest finite value), and `worst` (lowest finite value). Use `defaultStatAggregate` in `agent-evals.config.ts` to set the workspace-wide initial duration/column stat mode, or on an eval definition to override it for that eval. Number formats use `maxDecimalPlaces` to cap decimals and `minDecimalPlaces` to pad trailing zeroes. Without `maxDecimalPlaces`, the default cap is 3 decimal places. Stats and charts support `hideIfNoValue: true`. Charts support `dedupeConsecutiveValues: true` to omit consecutive points whose plotted metrics and tooltip extras match the previous kept point. Rendered charts with no plottable values show an unavailable state instead of a blank frame. Their shapes live in the types; no need to memorize the option set.
|
|
220
220
|
|
|
221
221
|
## Cached operations
|
|
222
222
|
|