@ls-stack/agent-eval 0.21.0 → 0.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-CmwmcUgG.mjs → app-Cw79dJDr.mjs} +3 -3
- package/dist/apps/web/dist/assets/index-AUDD3rNB.js +118 -0
- package/dist/apps/web/dist/index.html +1 -1
- package/dist/bin.mjs +1 -1
- package/dist/{cli-DumvanQI.mjs → cli-D3QNOcPN.mjs} +3 -3
- package/dist/index.d.mts +102 -56
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +1 -1
- package/dist/{runOrchestration-zYAcAPtS.mjs → runOrchestration-CimthgI7.mjs} +248 -90
- package/dist/{runner-Dy_PECaf.mjs → runner-4yNYRvmF.mjs} +2 -2
- package/dist/{runner-BcwyX9CO.mjs → runner-B-SYzW8w.mjs} +1 -1
- package/dist/src-CcXfWT4M.mjs +3 -0
- package/package.json +1 -1
- package/skills/agent-eval/SKILL.md +36 -17
- package/dist/apps/web/dist/assets/index-EXO08yya.js +0 -118
- package/dist/src-BoAJb4wC.mjs +0 -3
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-Dy_PECaf.mjs";
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-4yNYRvmF.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
package/package.json
CHANGED
|
package/skills/agent-eval/SKILL.md
CHANGED
|
@@ -92,20 +92,16 @@ export async function runRefundWorkflow(input: RefundInput) {
|
|
|
92
92
|
async () => {
|
|
93
93
|
let text: string;
|
|
94
94
|
let usage: { inputTokens: number; outputTokens: number };
|
|
95
|
-
let costUsd: number;
|
|
96
95
|
try {
|
|
97
|
-
({ text, usage, costUsd } = await llm.complete(input.message));
|
|
96
|
+
({ text, usage } = await llm.complete(input.message));
|
|
98
97
|
} catch (error) {
|
|
99
98
|
captureEvalSpanError(error);
|
|
100
|
-
({ text, usage, costUsd } = await llm.completeWithFallback(
|
|
101
|
-
input.message,
|
|
102
|
-
));
|
|
99
|
+
({ text, usage } = await llm.completeWithFallback(input.message));
|
|
103
100
|
}
|
|
104
101
|
evalSpan.setAttributes({
|
|
105
102
|
model: 'gpt-4o-mini',
|
|
106
103
|
provider: 'openai',
|
|
107
104
|
usage,
|
|
108
|
-
costUsd,
|
|
109
105
|
});
|
|
110
106
|
const expectedLocale = getEvalCaseInput('locale');
|
|
111
107
|
if (typeof expectedLocale === 'string') {
|
|
@@ -137,8 +133,8 @@ are more specific. Only the `input` and `output` span attributes are promoted
|
|
|
137
133
|
automatically in the trace tree; use `traceDisplay` for other span attributes
|
|
138
134
|
such as `model` or `usage`. Eval-level LLM usage outputs, columns, stats, and
|
|
139
135
|
charts are derived from matching LLM spans by default. Prefer
|
|
140
|
-
`llmCalls.pricing` for LLM-call cost display
|
|
141
|
-
|
|
136
|
+
`llmCalls.pricing` for LLM-call cost display; built-in costs ignore span
|
|
137
|
+
`costUsd` attributes.
|
|
142
138
|
|
|
143
139
|
Use `captureEvalSpanError(error)` for recoverable errors on the active
|
|
144
140
|
`evalTracer.span(...)`, such as optional model/tool failures that fall back and
|
|
@@ -160,6 +156,19 @@ for settlement; promise and span errors keep their normal behavior. Use
|
|
|
160
156
|
`waitForBackgroundJob: false` on a span, or `waitForBackgroundJobs: false` on an
|
|
161
157
|
eval definition, when background work should not delay finalization.
|
|
162
158
|
|
|
159
|
+
Eval Date APIs use a shifted wall clock by default: `new Date()` and
|
|
160
|
+
`Date.now()` start at `2026-04-10T00:00:00.000Z` during case generation,
|
|
161
|
+
execution, tracing, derived outputs, and scorers, then continue advancing with
|
|
162
|
+
real elapsed time. Set `startTime` on a specific `defineEval(...)` to use
|
|
163
|
+
another initial clock value, or set `startTime: 'now'` for that eval to use the
|
|
164
|
+
real current clock. Timers are not faked, so async waits still run normally.
|
|
165
|
+
Set `freezeTime: true` to keep Date APIs frozen until they are moved manually.
|
|
166
|
+
Use `getEvalStartTime()` to read the captured wall-clock start as a `Date`.
|
|
167
|
+
Use `advanceEvalTime(unit, amount)` inside an eval to move the shifted clock
|
|
168
|
+
forward; supported units are `millisecond(s)`, `second(s)`, `minute(s)`,
|
|
169
|
+
`hour(s)`, and `day(s)`. It throws for evals with `startTime: 'now'`, unless
|
|
170
|
+
`freezeTime: true` is also set.
|
|
171
|
+
|
|
163
172
|
For libraries or observability exporters that already emit span lifecycle
|
|
164
173
|
events, use `evalTracer.startSpan(...)`, `evalTracer.updateSpan(...)`,
|
|
165
174
|
`evalTracer.endSpan(...)`, or `evalTracer.recordSpan(...)` to translate those
|
|
@@ -261,18 +270,28 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
|
|
|
261
270
|
See the `TraceDisplayInputConfig` type.
|
|
262
271
|
- `llmCalls` (in `agent-evals.config.ts`) configures how LLM-call spans are
|
|
263
272
|
summarized for review. Defaults to `kind: 'llm'` spans with `model`,
|
|
264
|
-
`usage.*`, `
|
|
265
|
-
attribute paths.
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
`
|
|
269
|
-
|
|
273
|
+
`usage.*`, `latencyMs`, `input`, `output`, etc. read from conventional
|
|
274
|
+
attribute paths. `latencyMs` is time to first token; duration, total tokens,
|
|
275
|
+
tokens/sec, and USD costs are derived. Override `kinds` to broaden the filter,
|
|
276
|
+
override `attributes.<field>` for non-default primitive span shapes, configure
|
|
277
|
+
`pricing` to derive USD costs from token counts by model/provider, and add
|
|
278
|
+
entries to `metrics` to surface arbitrary user metrics (`format: 'string' |
|
|
279
|
+
'number' | 'duration' | 'json' | 'boolean'`, `placements: ['header' |
|
|
280
|
+
'body']`).
|
|
270
281
|
- Default usage config derives missing eval outputs from matching LLM/API spans
|
|
271
282
|
before `outputsSchema` and scores run: `apiCalls`, `costUsd`, `llmTurns`,
|
|
272
283
|
`inputTokens`, `outputTokens`, `totalTokens`, `cachedInputTokens`,
|
|
273
|
-
`cacheCreationInputTokens`, `reasoningTokens`, and `
|
|
274
|
-
outputs and column overrides win.
|
|
275
|
-
|
|
284
|
+
`cacheCreationInputTokens`, `reasoningTokens`, and `llmDurationMs`. Authored
|
|
285
|
+
outputs and column overrides win. `totalTokens` is input + output only; cache
|
|
286
|
+
read/write tokens stay separate and affect `costUsd` at their own rates.
|
|
287
|
+
Derived base input cost uses `inputTokens - cachedInputTokens -
|
|
288
|
+
cacheCreationInputTokens` so cache details are not double-counted.
|
|
289
|
+
`cacheCreationInputTokens` is the total cache-write count; optional
|
|
290
|
+
`cacheCreationInput1hTokens` only splits that total for 1-hour write pricing
|
|
291
|
+
via `cacheCreationInput1hUsdPerMillion`. `llmDurationMs` sums elapsed matched
|
|
292
|
+
LLM span durations; it is not time-to-first-token latency.
|
|
293
|
+
Remove defaults globally or per eval with `removeDefaultConfig: true` or a
|
|
294
|
+
key list such as
|
|
276
295
|
`removeDefaultConfig: ['apiCalls', 'reasoningTokens']`.
|
|
277
296
|
- `apiCalls` (in `agent-evals.config.ts`) configures how API-call spans are
|
|
278
297
|
summarized for review. Defaults to `kind: 'api'`, `'http'`, `'http.client'`,
|