@ls-stack/agent-eval 0.21.0 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,2 +1,2 @@
1
- import { n as initRunner, t as getRunnerInstance } from "./runner-Dy_PECaf.mjs";
1
+ import { n as initRunner, t as getRunnerInstance } from "./runner-4yNYRvmF.mjs";
2
2
  export { getRunnerInstance, initRunner };
@@ -0,0 +1,3 @@
1
+ import "./runOrchestration-CimthgI7.mjs";
2
+ import "./cli-D3QNOcPN.mjs";
3
+ export {};
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ls-stack/agent-eval",
3
- "version": "0.21.0",
3
+ "version": "0.23.0",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "agent-evals": "./dist/bin.mjs"
@@ -92,20 +92,16 @@ export async function runRefundWorkflow(input: RefundInput) {
92
92
  async () => {
93
93
  let text: string;
94
94
  let usage: { inputTokens: number; outputTokens: number };
95
- let costUsd: number;
96
95
  try {
97
- ({ text, usage, costUsd } = await llm.complete(input.message));
96
+ ({ text, usage } = await llm.complete(input.message));
98
97
  } catch (error) {
99
98
  captureEvalSpanError(error);
100
- ({ text, usage, costUsd } = await llm.completeWithFallback(
101
- input.message,
102
- ));
99
+ ({ text, usage } = await llm.completeWithFallback(input.message));
103
100
  }
104
101
  evalSpan.setAttributes({
105
102
  model: 'gpt-4o-mini',
106
103
  provider: 'openai',
107
104
  usage,
108
- costUsd,
109
105
  });
110
106
  const expectedLocale = getEvalCaseInput('locale');
111
107
  if (typeof expectedLocale === 'string') {
@@ -137,8 +133,8 @@ are more specific. Only the `input` and `output` span attributes are promoted
137
133
  automatically in the trace tree; use `traceDisplay` for other span attributes
138
134
  such as `model` or `usage`. Eval-level LLM usage outputs, columns, stats, and
139
135
  charts are derived from matching LLM spans by default. Prefer
140
- `llmCalls.pricing` for LLM-call cost display instead of writing `costUsd` on
141
- each span.
136
+ `llmCalls.pricing` for LLM-call cost display; built-in costs ignore span
137
+ `costUsd` attributes.
142
138
 
143
139
  Use `captureEvalSpanError(error)` for recoverable errors on the active
144
140
  `evalTracer.span(...)`, such as optional model/tool failures that fall back and
@@ -160,6 +156,19 @@ for settlement; promise and span errors keep their normal behavior. Use
160
156
  `waitForBackgroundJob: false` on a span, or `waitForBackgroundJobs: false` on an
161
157
  eval definition, when background work should not delay finalization.
162
158
 
159
+ Eval Date APIs use a shifted wall clock by default: `new Date()` and
160
+ `Date.now()` start at `2026-04-10T00:00:00.000Z` during case generation,
161
+ execution, tracing, derived outputs, and scorers, then continue advancing with
162
+ real elapsed time. Set `startTime` on a specific `defineEval(...)` to use
163
+ another initial clock value, or set `startTime: 'now'` for that eval to use the
164
+ real current clock. Timers are not faked, so async waits still run normally.
165
+ Set `freezeTime: true` to keep Date APIs frozen until they are moved manually.
166
+ Use `getEvalStartTime()` to read the captured wall-clock start as a `Date`.
167
+ Use `advanceEvalTime(unit, amount)` inside an eval to move the shifted clock
168
+ forward; supported units are `millisecond(s)`, `second(s)`, `minute(s)`,
169
+ `hour(s)`, and `day(s)`. It throws for evals with `startTime: 'now'`, unless
170
+ `freezeTime: true` is also set.
171
+
163
172
  For libraries or observability exporters that already emit span lifecycle
164
173
  events, use `evalTracer.startSpan(...)`, `evalTracer.updateSpan(...)`,
165
174
  `evalTracer.endSpan(...)`, or `evalTracer.recordSpan(...)` to translate those
@@ -261,18 +270,28 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
261
270
  See the `TraceDisplayInputConfig` type.
262
271
  - `llmCalls` (in `agent-evals.config.ts`) configures how LLM-call spans are
263
272
  summarized for review. Defaults to `kind: 'llm'` spans with `model`,
264
- `usage.*`, `tokensPerSecond`, `input`, `output`, etc. read from conventional
265
- attribute paths. Override `kinds` to broaden the filter, override
266
- `attributes.<field>` for non-default span shapes, configure `pricing` to
267
- derive USD costs from token counts by model/provider, and add entries to
268
- `metrics` to surface arbitrary user metrics (`format: 'string' | 'number' |
269
- 'duration' | 'json' | 'boolean'`, `placements: ['header' | 'body']`).
273
+ `usage.*`, `latencyMs`, `input`, `output`, etc. read from conventional
274
+ attribute paths. `latencyMs` is time to first token; duration, total tokens,
275
+ tokens/sec, and USD costs are derived. Override `kinds` to broaden the filter,
276
+ override `attributes.<field>` for non-default primitive span shapes, configure
277
+ `pricing` to derive USD costs from token counts by model/provider, and add
278
+ entries to `metrics` to surface arbitrary user metrics (`format: 'string' |
279
+ 'number' | 'duration' | 'json' | 'boolean'`, `placements: ['header' |
280
+ 'body']`).
270
281
  - Default usage config derives missing eval outputs from matching LLM/API spans
271
282
  before `outputsSchema` and scores run: `apiCalls`, `costUsd`, `llmTurns`,
272
283
  `inputTokens`, `outputTokens`, `totalTokens`, `cachedInputTokens`,
273
- `cacheCreationInputTokens`, `reasoningTokens`, and `llmLatencyMs`. Authored
274
- outputs and column overrides win. Remove defaults globally or per eval with
275
- `removeDefaultConfig: true` or a key list such as
284
+ `cacheCreationInputTokens`, `reasoningTokens`, and `llmDurationMs`. Authored
285
+ outputs and column overrides win. `totalTokens` is input + output only; cache
286
+ read/write tokens stay separate and affect `costUsd` at their own rates.
287
+ Derived base input cost uses `inputTokens - cachedInputTokens -
288
+ cacheCreationInputTokens` so cache details are not double-counted.
289
+ `cacheCreationInputTokens` is the total cache-write count; optional
290
+ `cacheCreationInput1hTokens` only splits that total for 1-hour write pricing
291
+ via `cacheCreationInput1hUsdPerMillion`. `llmDurationMs` sums elapsed matched
292
+ LLM span durations; it is not time-to-first-token latency.
293
+ Remove defaults globally or per eval with `removeDefaultConfig: true` or a
294
+ key list such as
276
295
  `removeDefaultConfig: ['apiCalls', 'reasoningTokens']`.
277
296
  - `apiCalls` (in `agent-evals.config.ts`) configures how API-call spans are
278
297
  summarized for review. Defaults to `kind: 'api'`, `'http'`, `'http.client'`,