npm - @ls-stack/agent-eval - Versions diffs - 0.21.0 → 0.23.0 - Mend

@ls-stack/agent-eval 0.21.0 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

package/dist/{app-CmwmcUgG.mjs → app-Cw79dJDr.mjs} +3 -3
package/dist/apps/web/dist/assets/index-AUDD3rNB.js +118 -0
package/dist/apps/web/dist/index.html +1 -1
package/dist/bin.mjs +1 -1
package/dist/{cli-DumvanQI.mjs → cli-D3QNOcPN.mjs} +3 -3
package/dist/index.d.mts +102 -56
package/dist/index.mjs +4 -4
package/dist/runChild.mjs +1 -1
package/dist/{runOrchestration-zYAcAPtS.mjs → runOrchestration-CimthgI7.mjs} +248 -90
package/dist/{runner-Dy_PECaf.mjs → runner-4yNYRvmF.mjs} +2 -2
package/dist/{runner-BcwyX9CO.mjs → runner-B-SYzW8w.mjs} +1 -1
package/dist/src-CcXfWT4M.mjs +3 -0
package/package.json +1 -1
package/skills/agent-eval/SKILL.md +36 -17
package/dist/apps/web/dist/assets/index-EXO08yya.js +0 -118
package/dist/src-BoAJb4wC.mjs +0 -3

package/dist/{runner-BcwyX9CO.mjs → runner-B-SYzW8w.mjs} RENAMED Viewed

@@ -1,2 +1,2 @@
-import { n as initRunner, t as getRunnerInstance } from "./runner-Dy_PECaf.mjs";
+import { n as initRunner, t as getRunnerInstance } from "./runner-4yNYRvmF.mjs";
 export { getRunnerInstance, initRunner };

package/dist/src-CcXfWT4M.mjs ADDED Viewed

@@ -0,0 +1,3 @@
+import "./runOrchestration-CimthgI7.mjs";
+import "./cli-D3QNOcPN.mjs";
+export {};

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@ls-stack/agent-eval",
-  "version": "0.21.0",
+  "version": "0.23.0",
   "type": "module",
   "bin": {
     "agent-evals": "./dist/bin.mjs"

package/skills/agent-eval/SKILL.md CHANGED Viewed

@@ -92,20 +92,16 @@ export async function runRefundWorkflow(input: RefundInput) {
         async () => {
           let text: string;
           let usage: { inputTokens: number; outputTokens: number };
-          let costUsd: number;
           try {
-            ({ text, usage, costUsd } = await llm.complete(input.message));
+            ({ text, usage } = await llm.complete(input.message));
           } catch (error) {
             captureEvalSpanError(error);
-            ({ text, usage, costUsd } = await llm.completeWithFallback(
-              input.message,
-            ));
+            ({ text, usage } = await llm.completeWithFallback(input.message));
           }
           evalSpan.setAttributes({
             model: 'gpt-4o-mini',
             provider: 'openai',
             usage,
-            costUsd,
           });
           const expectedLocale = getEvalCaseInput('locale');
           if (typeof expectedLocale === 'string') {
@@ -137,8 +133,8 @@ are more specific. Only the `input` and `output` span attributes are promoted
 automatically in the trace tree; use `traceDisplay` for other span attributes
 such as `model` or `usage`. Eval-level LLM usage outputs, columns, stats, and
 charts are derived from matching LLM spans by default. Prefer
-`llmCalls.pricing` for LLM-call cost display instead of writing `costUsd` on
-each span.
+`llmCalls.pricing` for LLM-call cost display; built-in costs ignore span
+`costUsd` attributes.
 Use `captureEvalSpanError(error)` for recoverable errors on the active
 `evalTracer.span(...)`, such as optional model/tool failures that fall back and
@@ -160,6 +156,19 @@ for settlement; promise and span errors keep their normal behavior. Use
 `waitForBackgroundJob: false` on a span, or `waitForBackgroundJobs: false` on an
 eval definition, when background work should not delay finalization.
+Eval Date APIs use a shifted wall clock by default: `new Date()` and
+`Date.now()` start at `2026-04-10T00:00:00.000Z` during case generation,
+execution, tracing, derived outputs, and scorers, then continue advancing with
+real elapsed time. Set `startTime` on a specific `defineEval(...)` to use
+another initial clock value, or set `startTime: 'now'` for that eval to use the
+real current clock. Timers are not faked, so async waits still run normally.
+Set `freezeTime: true` to keep Date APIs frozen until they are moved manually.
+Use `getEvalStartTime()` to read the captured wall-clock start as a `Date`.
+Use `advanceEvalTime(unit, amount)` inside an eval to move the shifted clock
+forward; supported units are `millisecond(s)`, `second(s)`, `minute(s)`,
+`hour(s)`, and `day(s)`. It throws for evals with `startTime: 'now'`, unless
+`freezeTime: true` is also set.
 For libraries or observability exporters that already emit span lifecycle
 events, use `evalTracer.startSpan(...)`, `evalTracer.updateSpan(...)`,
 `evalTracer.endSpan(...)`, or `evalTracer.recordSpan(...)` to translate those
@@ -261,18 +270,28 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
   See the `TraceDisplayInputConfig` type.
 - `llmCalls` (in `agent-evals.config.ts`) configures how LLM-call spans are
   summarized for review. Defaults to `kind: 'llm'` spans with `model`,
-  `usage.*`, `tokensPerSecond`, `input`, `output`, etc. read from conventional
-  attribute paths. Override `kinds` to broaden the filter, override
-  `attributes.<field>` for non-default span shapes, configure `pricing` to
-  derive USD costs from token counts by model/provider, and add entries to
-  `metrics` to surface arbitrary user metrics (`format: 'string' | 'number' |
-'duration' | 'json' | 'boolean'`, `placements: ['header' | 'body']`).
+  `usage.*`, `latencyMs`, `input`, `output`, etc. read from conventional
+  attribute paths. `latencyMs` is time to first token; duration, total tokens,
+  tokens/sec, and USD costs are derived. Override `kinds` to broaden the filter,
+  override `attributes.<field>` for non-default primitive span shapes, configure
+  `pricing` to derive USD costs from token counts by model/provider, and add
+  entries to `metrics` to surface arbitrary user metrics (`format: 'string' |
+'number' | 'duration' | 'json' | 'boolean'`, `placements: ['header' |
+'body']`).
 - Default usage config derives missing eval outputs from matching LLM/API spans
   before `outputsSchema` and scores run: `apiCalls`, `costUsd`, `llmTurns`,
   `inputTokens`, `outputTokens`, `totalTokens`, `cachedInputTokens`,
-  `cacheCreationInputTokens`, `reasoningTokens`, and `llmLatencyMs`. Authored
-  outputs and column overrides win. Remove defaults globally or per eval with
-  `removeDefaultConfig: true` or a key list such as
+  `cacheCreationInputTokens`, `reasoningTokens`, and `llmDurationMs`. Authored
+  outputs and column overrides win. `totalTokens` is input + output only; cache
+  read/write tokens stay separate and affect `costUsd` at their own rates.
+  Derived base input cost uses `inputTokens - cachedInputTokens -
+cacheCreationInputTokens` so cache details are not double-counted.
+  `cacheCreationInputTokens` is the total cache-write count; optional
+  `cacheCreationInput1hTokens` only splits that total for 1-hour write pricing
+  via `cacheCreationInput1hUsdPerMillion`. `llmDurationMs` sums elapsed matched
+  LLM span durations; it is not time-to-first-token latency.
+  Remove defaults globally or per eval with `removeDefaultConfig: true` or a
+  key list such as
   `removeDefaultConfig: ['apiCalls', 'reasoningTokens']`.
 - `apiCalls` (in `agent-evals.config.ts`) configures how API-call spans are
   summarized for review. Defaults to `kind: 'api'`, `'http'`, `'http.client'`,