@ls-stack/agent-eval 0.20.0 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-DsiLU65H.mjs → app-moDHbg1O.mjs} +3 -3
- package/dist/apps/web/dist/assets/index-AUDD3rNB.js +118 -0
- package/dist/apps/web/dist/assets/{index-CvR6QCLa.css → index-r0dVFK0B.css} +1 -1
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/{cli-weogme5U.mjs → cli-C0EtHhEO.mjs} +3 -3
- package/dist/index.d.mts +56 -61
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +1 -1
- package/dist/{runOrchestration-Cv1kiOAG.mjs → runOrchestration-D1edUDhp.mjs} +155 -140
- package/dist/{runner-DzrMtgBu.mjs → runner-C9nP2VKL.mjs} +2 -2
- package/dist/{runner-B25oRQxX.mjs → runner-CyRhIzci.mjs} +1 -1
- package/dist/src-D-HuV8I-.mjs +3 -0
- package/package.json +1 -1
- package/skills/agent-eval/SKILL.md +30 -20
- package/dist/apps/web/dist/assets/index-Cba4MFa0.js +0 -118
- package/dist/src-B879LZfo.mjs +0 -3
|
@@ -92,20 +92,16 @@ export async function runRefundWorkflow(input: RefundInput) {
|
|
|
92
92
|
async () => {
|
|
93
93
|
let text: string;
|
|
94
94
|
let usage: { inputTokens: number; outputTokens: number };
|
|
95
|
-
let costUsd: number;
|
|
96
95
|
try {
|
|
97
|
-
({ text, usage
|
|
96
|
+
({ text, usage } = await llm.complete(input.message));
|
|
98
97
|
} catch (error) {
|
|
99
98
|
captureEvalSpanError(error);
|
|
100
|
-
({ text, usage
|
|
101
|
-
input.message,
|
|
102
|
-
));
|
|
99
|
+
({ text, usage } = await llm.completeWithFallback(input.message));
|
|
103
100
|
}
|
|
104
101
|
evalSpan.setAttributes({
|
|
105
102
|
model: 'gpt-4o-mini',
|
|
106
103
|
provider: 'openai',
|
|
107
104
|
usage,
|
|
108
|
-
costUsd,
|
|
109
105
|
});
|
|
110
106
|
const expectedLocale = getEvalCaseInput('locale');
|
|
111
107
|
if (typeof expectedLocale === 'string') {
|
|
@@ -137,8 +133,8 @@ are more specific. Only the `input` and `output` span attributes are promoted
|
|
|
137
133
|
automatically in the trace tree; use `traceDisplay` for other span attributes
|
|
138
134
|
such as `model` or `usage`. Eval-level LLM usage outputs, columns, stats, and
|
|
139
135
|
charts are derived from matching LLM spans by default. Prefer
|
|
140
|
-
`llmCalls.pricing` for LLM-call cost display
|
|
141
|
-
|
|
136
|
+
`llmCalls.pricing` for LLM-call cost display; built-in costs ignore span
|
|
137
|
+
`costUsd` attributes.
|
|
142
138
|
|
|
143
139
|
Use `captureEvalSpanError(error)` for recoverable errors on the active
|
|
144
140
|
`evalTracer.span(...)`, such as optional model/tool failures that fall back and
|
|
@@ -261,18 +257,28 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
|
|
|
261
257
|
See the `TraceDisplayInputConfig` type.
|
|
262
258
|
- `llmCalls` (in `agent-evals.config.ts`) configures how LLM-call spans are
|
|
263
259
|
summarized for review. Defaults to `kind: 'llm'` spans with `model`,
|
|
264
|
-
`usage.*`, `
|
|
265
|
-
attribute paths.
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
`
|
|
269
|
-
|
|
260
|
+
`usage.*`, `latencyMs`, `input`, `output`, etc. read from conventional
|
|
261
|
+
attribute paths. `latencyMs` is time to first token; duration, total tokens,
|
|
262
|
+
tokens/sec, and USD costs are derived. Override `kinds` to broaden the filter,
|
|
263
|
+
override `attributes.<field>` for non-default primitive span shapes, configure
|
|
264
|
+
`pricing` to derive USD costs from token counts by model/provider, and add
|
|
265
|
+
entries to `metrics` to surface arbitrary user metrics (`format: 'string' |
|
|
266
|
+
'number' | 'duration' | 'json' | 'boolean'`, `placements: ['header' |
|
|
267
|
+
'body']`).
|
|
270
268
|
- Default usage config derives missing eval outputs from matching LLM/API spans
|
|
271
269
|
before `outputsSchema` and scores run: `apiCalls`, `costUsd`, `llmTurns`,
|
|
272
270
|
`inputTokens`, `outputTokens`, `totalTokens`, `cachedInputTokens`,
|
|
273
|
-
`cacheCreationInputTokens`, `reasoningTokens`, and `
|
|
274
|
-
outputs and column overrides win.
|
|
275
|
-
|
|
271
|
+
`cacheCreationInputTokens`, `reasoningTokens`, and `llmDurationMs`. Authored
|
|
272
|
+
outputs and column overrides win. `totalTokens` is input + output only; cache
|
|
273
|
+
read/write tokens stay separate and affect `costUsd` at their own rates.
|
|
274
|
+
Derived base input cost uses `inputTokens - cachedInputTokens -
|
|
275
|
+
cacheCreationInputTokens` so cache details are not double-counted.
|
|
276
|
+
`cacheCreationInputTokens` is the total cache-write count; optional
|
|
277
|
+
`cacheCreationInput1hTokens` only splits that total for 1-hour write pricing
|
|
278
|
+
via `cacheCreationInput1hUsdPerMillion`. `llmDurationMs` sums the elapsed
|
|
279
|
+
durations of matched LLM spans; it is not time-to-first-token latency.
|
|
280
|
+
Remove defaults globally or per eval with `removeDefaultConfig: true` or a
|
|
281
|
+
key list such as
|
|
276
282
|
`removeDefaultConfig: ['apiCalls', 'reasoningTokens']`.
|
|
277
283
|
- `apiCalls` (in `agent-evals.config.ts`) configures how API-call spans are
|
|
278
284
|
summarized for review. Defaults to `kind: 'api'`, `'http'`, `'http.client'`,
|
|
@@ -287,9 +293,13 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
|
|
|
287
293
|
are still captured.
|
|
288
294
|
|
|
289
295
|
Stats rows and history charts on the eval card can be authored via `stats` /
|
|
290
|
-
`charts` on the eval definition. Usage stats
|
|
291
|
-
unless removed with `removeDefaultConfig`.
|
|
292
|
-
|
|
296
|
+
`charts` on the eval definition. Usage stats and LLM usage charts are added by
|
|
297
|
+
default unless removed with `removeDefaultConfig`. Column stats can override
|
|
298
|
+
`format` and `numberFormat`, otherwise they inherit from the matching column.
|
|
299
|
+
Number formats use `maxDecimalPlaces` to cap decimals and `minDecimalPlaces`
|
|
300
|
+
to pad trailing zeroes. Without `maxDecimalPlaces`, they render up to 3 decimal
|
|
301
|
+
places.
|
|
302
|
+
The `stats` and `charts` shapes live in the types; no need to memorize the option set.
|
|
293
303
|
|
|
294
304
|
## Cached operations
|
|
295
305
|
|