@ls-stack/agent-eval 0.20.0 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -92,20 +92,16 @@ export async function runRefundWorkflow(input: RefundInput) {
92
92
  async () => {
93
93
  let text: string;
94
94
  let usage: { inputTokens: number; outputTokens: number };
95
- let costUsd: number;
96
95
  try {
97
- ({ text, usage, costUsd } = await llm.complete(input.message));
96
+ ({ text, usage } = await llm.complete(input.message));
98
97
  } catch (error) {
99
98
  captureEvalSpanError(error);
100
- ({ text, usage, costUsd } = await llm.completeWithFallback(
101
- input.message,
102
- ));
99
+ ({ text, usage } = await llm.completeWithFallback(input.message));
103
100
  }
104
101
  evalSpan.setAttributes({
105
102
  model: 'gpt-4o-mini',
106
103
  provider: 'openai',
107
104
  usage,
108
- costUsd,
109
105
  });
110
106
  const expectedLocale = getEvalCaseInput('locale');
111
107
  if (typeof expectedLocale === 'string') {
@@ -137,8 +133,8 @@ are more specific. Only the `input` and `output` span attributes are promoted
137
133
  automatically in the trace tree; use `traceDisplay` for other span attributes
138
134
  such as `model` or `usage`. Eval-level LLM usage outputs, columns, stats, and
139
135
  charts are derived from matching LLM spans by default. Prefer
140
- `llmCalls.pricing` for LLM-call cost display instead of writing `costUsd` on
141
- each span.
136
+ `llmCalls.pricing` for LLM-call cost display; built-in costs ignore span
137
+ `costUsd` attributes.
142
138
 
143
139
  Use `captureEvalSpanError(error)` for recoverable errors on the active
144
140
  `evalTracer.span(...)`, such as optional model/tool failures that fall back and
@@ -261,18 +257,28 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
261
257
  See the `TraceDisplayInputConfig` type.
262
258
  - `llmCalls` (in `agent-evals.config.ts`) configures how LLM-call spans are
263
259
  summarized for review. Defaults to `kind: 'llm'` spans with `model`,
264
- `usage.*`, `tokensPerSecond`, `input`, `output`, etc. read from conventional
265
- attribute paths. Override `kinds` to broaden the filter, override
266
- `attributes.<field>` for non-default span shapes, configure `pricing` to
267
- derive USD costs from token counts by model/provider, and add entries to
268
- `metrics` to surface arbitrary user metrics (`format: 'string' | 'number' |
269
- 'duration' | 'json' | 'boolean'`, `placements: ['header' | 'body']`).
260
+ `usage.*`, `latencyMs`, `input`, `output`, etc. read from conventional
261
+ attribute paths. `latencyMs` is time to first token; duration, total tokens,
262
+ tokens/sec, and USD costs are derived. Override `kinds` to broaden the filter,
263
+ override `attributes.<field>` for non-default primitive span shapes, configure
264
+ `pricing` to derive USD costs from token counts by model/provider, and add
265
+ entries to `metrics` to surface arbitrary user metrics (`format: 'string' |
266
+ 'number' | 'duration' | 'json' | 'boolean'`, `placements: ['header' |
267
+ 'body']`).
270
268
  - Default usage config derives missing eval outputs from matching LLM/API spans
271
269
  before `outputsSchema` and scores run: `apiCalls`, `costUsd`, `llmTurns`,
272
270
  `inputTokens`, `outputTokens`, `totalTokens`, `cachedInputTokens`,
273
- `cacheCreationInputTokens`, `reasoningTokens`, and `llmLatencyMs`. Authored
274
- outputs and column overrides win. Remove defaults globally or per eval with
275
- `removeDefaultConfig: true` or a key list such as
271
+ `cacheCreationInputTokens`, `reasoningTokens`, and `llmDurationMs`. Authored
272
+ outputs and column overrides win. `totalTokens` is input + output only; cache
273
+ read/write tokens stay separate and affect `costUsd` at their own rates.
274
+ Derived base input cost uses `inputTokens - cachedInputTokens -
275
+ cacheCreationInputTokens` so cache details are not double-counted.
276
+ `cacheCreationInputTokens` is the total cache-write count; optional
277
+ `cacheCreationInput1hTokens` only splits that total for 1-hour write pricing
278
+ via `cacheCreationInput1hUsdPerMillion`. `llmDurationMs` sums the elapsed
279
+ durations of matched LLM spans; it is not time-to-first-token latency.
280
+ Remove defaults globally or per eval with `removeDefaultConfig: true` or a
281
+ key list such as
276
282
  `removeDefaultConfig: ['apiCalls', 'reasoningTokens']`.
277
283
  - `apiCalls` (in `agent-evals.config.ts`) configures how API-call spans are
278
284
  summarized for review. Defaults to `kind: 'api'`, `'http'`, `'http.client'`,
@@ -287,9 +293,13 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
287
293
  are still captured.
288
294
 
289
295
  Stats rows and history charts on the eval card can be authored via `stats` /
290
- `charts` on the eval definition. Usage stats/charts are added by default
291
- unless removed with `removeDefaultConfig`. Their shapes live in the types; no
292
- need to memorize the option set.
296
+ `charts` on the eval definition. Usage stats and LLM usage charts are added by
297
+ default unless removed with `removeDefaultConfig`. Column stats can override
298
+ `format` and `numberFormat`; otherwise they inherit from the matching column.
299
+ Number formats use `maxDecimalPlaces` to cap decimals and `minDecimalPlaces`
300
+ to pad trailing zeroes. Without `maxDecimalPlaces`, they render up to 3 decimal
301
+ places.
302
+ The `stats` and `charts` shapes live in the types; no need to memorize the option set.
293
303
 
294
304
  ## Cached operations
295
305