@ls-stack/agent-eval 0.13.0 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,400 @@
1
+ ---
2
+ name: agent-eval
3
+ description: Create, run, and maintain TypeScript evals with @ls-stack/agent-eval. Use when adding eval coverage for an LLM or agent workflow, updating *.eval.ts files, checking eval results, configuring agent-evals.config.ts, inspecting saved .agent-evals run artifacts, or wiring product source code with evalTracer spans.
4
+ ---
5
+
6
+ # Agent Eval
7
+
8
+ Local-first, UI-first eval runner for LLM and agent systems. Evals are strict
9
+ TypeScript modules named `*.eval.ts`, discovered from `agent-evals.config.ts`,
10
+ and executed through the CLI (`agent-evals run`) or the web UI
11
+ (`agent-evals app`). Runs persist to `.agent-evals/` so results, traces, and
12
+ caches survive across processes.
13
+
14
+ This skill covers the mental model and conventions. For exhaustive field lists
15
+ (config options, eval shape, column formats, score/chart/stats options, trace
16
+ display rules), read the TypeScript declarations shipped with the package:
17
+
18
+ - `AgentEvalsConfig`, `EvalDefinition`, `EvalCase`, `EvalOutputs`,
19
+ `EvalColumnOverride`, `EvalScoreDef`, `EvalManualScoreDef`,
20
+ `EvalTraceTree`, `TraceSpanInfo`, and `z` are exported from
21
+ `@ls-stack/agent-eval`.
22
+ - `.d.ts` files land in `node_modules/@ls-stack/agent-eval/dist/`.
23
+ - CLI surface: `agent-evals --help` and `agent-evals <command> --help`.
24
+ Unknown help targets exit non-zero instead of falling back to global help.
25
+ - The CLI automatically loads `.env` from the current workspace. Shell-provided
26
+ environment variables win; pass `--no-env` to skip `.env` loading for that invocation.
27
+
28
+ Assume that enumerated tables in this document may lag behind the types —
29
+ treat the types as source of truth when they disagree.
30
+
31
+ ## Where tracing lives
32
+
33
+ **Tracing belongs in the product source code, not in the eval file.** The eval
34
+ file wires up cases and scoring; the real `evalTracer.span(...)` calls sit
35
+ inside the workflow, agent, or tool functions that both production and evals
36
+ invoke.
37
+
38
+ `evalTracer`, `evalSpan`, output helpers, and `evalAssert` are ambient no-ops
39
+ when called outside an eval case scope, so leaving them in production paths is
40
+ safe — they only record anything when the product code runs inside an eval's
41
+ `execute`. Use `isInEvalScope()` to branch on eval-only behavior in shared code
42
+ (e.g. skip a real network side effect): it returns `null` outside eval-owned
43
+ work and returns `'env'`, `'cases'`, `'eval'`, `'derive'`, `'outputsSchema'`, or
44
+ `'scorer'` during runner phases. Top-level modules imported while a run is being
45
+ prepared see `'env'`; code called from `execute` sees `'eval'`. Use
46
+ `getEvalCaseInput()` to read the current case input, or
47
+ `getEvalCaseInput('customer.tier')` for nested dot-path access; outside a case
48
+ scope it returns `undefined`. Use `nextEvalId()` inside eval-scoped code when a
49
+ stable generated id is needed; it includes the eval file, eval id, case id, and
50
+ a per-case sequence number, and throws outside an eval case scope.
51
+
52
+ ### Product code (instrumented once, reused everywhere)
53
+
54
+ ```ts
55
+ // src/workflows/refundWorkflow.ts
56
+ import {
57
+ appendToEvalOutput,
58
+ captureEvalSpanError,
59
+ evalAssert,
60
+ evalSpan,
61
+ evalTracer,
62
+ getEvalCaseInput,
63
+ incrementEvalOutput,
64
+ mergeEvalOutput,
65
+ nextEvalId,
66
+ setEvalOutput,
67
+ startEvalBackgroundJob,
68
+ } from '@ls-stack/agent-eval';
69
+
70
+ export async function runRefundWorkflow(input: RefundInput) {
71
+ return evalTracer.span(
72
+ { kind: 'agent', name: 'refund-workflow' },
73
+ async () => {
74
+ evalSpan.setAttribute('input', input);
75
+
76
+ const plan = await evalTracer.span(
77
+ {
78
+ kind: 'llm',
79
+ name: 'plan-refund',
80
+ cache: { key: { prompt: input.message, model: 'gpt-4o-mini' } },
81
+ },
82
+ async () => {
83
+ let text: string;
84
+ let usage: { inputTokens: number; outputTokens: number };
85
+ let costUsd: number;
86
+ try {
87
+ ({ text, usage, costUsd } = await llm.complete(input.message));
88
+ } catch (error) {
89
+ captureEvalSpanError(error);
90
+ ({ text, usage, costUsd } = await llm.completeWithFallback(
91
+ input.message,
92
+ ));
93
+ }
94
+ evalSpan.setAttributes({ model: 'gpt-4o-mini', usage });
95
+ const expectedLocale = getEvalCaseInput('locale');
96
+ if (typeof expectedLocale === 'string') {
97
+ evalSpan.setAttribute('expectedLocale', expectedLocale);
98
+ }
99
+ evalSpan.incrementAttribute('llmCalls', 1);
100
+ evalSpan.appendToAttribute('models', 'gpt-4o-mini');
101
+ incrementEvalOutput('costUsd', costUsd);
102
+ appendToEvalOutput('modelCalls', { model: 'gpt-4o-mini', costUsd });
103
+ return text;
104
+ },
105
+ );
106
+
107
+ const result = await applyRefund(plan);
108
+ const reviewId = nextEvalId();
109
+ setEvalOutput('response', result.finalText);
110
+ setEvalOutput('reviewId', reviewId);
111
+ mergeEvalOutput('metadata', { approved: result.approved });
112
+ evalAssert(result.approved, 'refund workflow should approve the case');
113
+ evalSpan.setAttribute('output', { result, reviewId });
114
+ return result;
115
+ },
116
+ );
117
+ }
118
+ ```
119
+
120
+ Span `kind` values are open-ended strings and are color-coded automatically in
121
+ the UI for every kind used during the app session. Use familiar kinds such as
122
+ `agent`, `tool`, `llm`, `api`, `retrieval`, `scorer`, or `checkpoint` when they
123
+ fit, and preserve external tracer kinds such as `mastra.workflow.step` when they
124
+ are more specific. The UI automatically promotes only the `input` and `output` span
125
+ attributes. Use `traceDisplay` for other span attributes such as `model`,
126
+ `usage`, or `costUsd`.
127
+
128
+ Use `captureEvalSpanError(error)` for recoverable errors on the active
129
+ `evalTracer.span(...)`, such as optional model/tool failures that fall back and
130
+ continue. You can pass one error, multiple error arguments, or an array. The
131
+ span is still marked `error`, and the UI renders captured errors in a dedicated
132
+ span detail block with timing relative to the span. Pass `'warning'` or
133
+ `{ level: 'warning' }` as the final argument for diagnostics that should be
134
+ visible in span detail without changing an otherwise successful span's status.
135
+
136
+ If a span callback throws, the SDK automatically marks that span as `error`,
137
+ stores the thrown error on it, and rethrows so the case errors. Use that for
138
+ terminal failures; use `captureEvalSpanError(...)` for recoverable failures that
139
+ continue through fallback logic.
140
+
141
+ Fire-and-forget spans started during `execute` are awaited before outputs,
142
+ `deriveFromTracing`, scores, and trace data are finalized, so `void
143
+ evalTracer.span(...)` is safe when the span result is not needed. Register
144
+ non-span promises with `startEvalBackgroundJob(promise)`. The runner only waits
145
+ for settlement; promise and span errors keep their normal behavior. Use
146
+ `waitForBackgroundJob: false` on a span, or `waitForBackgroundJobs: false` on an
147
+ eval definition, when background work should not delay finalization.
148
+
149
+ For libraries or observability exporters that already emit span lifecycle
150
+ events, use `evalTracer.startSpan(...)`, `evalTracer.updateSpan(...)`,
151
+ `evalTracer.endSpan(...)`, or `evalTracer.recordSpan(...)` to translate those
152
+ events into the eval trace tree without wrapping the upstream work in a
153
+ callback. Pass the upstream span id and parent id when available so the UI keeps
154
+ the original hierarchy.
155
+
156
+ ### Eval file (thin)
157
+
158
+ ```ts
159
+ // evals/refund-workflow.eval.ts
160
+ import { defineEval, z } from '@ls-stack/agent-eval';
161
+ import { runRefundWorkflow } from '../src/workflows/refundWorkflow.ts';
162
+
163
+ const outputsSchema = z.object({
164
+ response: z.string(),
165
+ costUsd: z.number().optional(),
166
+ toolCalls: z.number(),
167
+ llmTurns: z.number(),
168
+ });
169
+ type RefundOutputs = z.infer<typeof outputsSchema>;
170
+
171
+ defineEval<RefundInput, RefundOutputs>({
172
+ id: 'refund-workflow',
173
+ cases: [
174
+ { id: 'simple-text', input: { message: 'I want a refund for order #123' } },
175
+ ],
176
+ outputsSchema,
177
+ execute: async ({ input }) => {
178
+ await runRefundWorkflow(input);
179
+ },
180
+ deriveFromTracing: ({ trace }) => ({
181
+ toolCalls: trace.findSpansByKind('tool').length,
182
+ llmTurns: trace.findSpansByKind('llm').length,
183
+ }),
184
+ scores: {
185
+ mentionsRefund: {
186
+ passThreshold: 1,
187
+ compute: ({ outputs }) => (/refund/i.test(outputs.response) ? 1 : 0),
188
+ },
189
+ },
190
+ });
191
+ ```
192
+
193
+ `execute` usually just calls the product code. Push any placeholder
194
+ `evalTracer.span(...)` wrappers out of the eval and into the product module
195
+ they describe so production runs get the same trajectory. Only keep tracing
196
+ inside `execute` when the behavior being measured is eval-specific (e.g. a
197
+ judge-only sub-step with no production analogue).
198
+
199
+ Case `id` values anchor historical runs, caches, and manual scores — keep them
200
+ stable. See `EvalDefinition` / `EvalCase` in the types for every supported
201
+ field.
202
+
203
+ ## Scoring
204
+
205
+ Every score returns a normalized `0..1` value. Pass/fail is per-score: a case
206
+ fails if any score with `passThreshold` falls below it, if an assertion fails,
207
+ or if the case errors. Scores without `passThreshold` are informational.
208
+
209
+ Score functions run in their own trace scope, separate from the execution
210
+ trace, so LLM-as-judge scorers can use `evalTracer.span(...)` and cached spans
211
+ without polluting the agent trajectory. The case detail UI shows execution
212
+ spans on **Trace** and scorer spans on **Scoring**. Outputs set inside a scorer
213
+ stay private to that score.
214
+
215
+ `manualScores` declares score columns that reviewers fill in the web UI after
216
+ a run. Pending values keep the eval in an `unscored` state instead of failing.
217
+
218
+ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
219
+ (format, threshold, column overrides).
220
+
221
+ ## Outputs, columns, trace display
222
+
223
+ - `setEvalOutput(key, value)` writes reviewable data for the case. Values are
224
+ plain data (strings, numbers, booleans, JSON-safe objects) plus native
225
+ `Blob`/`File` or `FileRef` variants for media columns. Inside `execute`,
226
+ prefer the context `setOutput(key, value)` helper when writing schema-backed
227
+ outputs; it is typed from the eval's outputs generic. Keep `setEvalOutput`
228
+ for shared workflow code that does not receive the execute context.
229
+ - Use `incrementEvalOutput(key, delta)` for numeric totals,
230
+ `appendToEvalOutput(key, value)` for arrays that preserve existing scalar
231
+ values, and `mergeEvalOutput(key, patch)` for shallow object updates.
232
+ `evalSpan` has matching `incrementAttribute`, `appendToAttribute`, and
233
+ `mergeAttribute` helpers for span attributes.
234
+ - `outputsSchema` validates final outputs after `execute` and
235
+ `deriveFromTracing`, before computed scores. For Zod object schemas, only
236
+ declared keys are passed to the schema; parsed fields merge back into the raw
237
+ output map, so defaults/transforms apply to configured fields and
238
+ unconfigured outputs stay visible as before. Validation failures fail the case
239
+ and skip computed scores. When you pass a narrowed outputs type as the second
240
+ `defineEval` generic, `outputsSchema` is required.
241
+ - `columns` overrides the display for output and score keys (label, format,
242
+ alignment, visibility). The set of supported formats is declared by the
243
+ `ColumnFormat` union and `EvalColumnOverride` in the types.
244
+ - `traceDisplay` promotes selected span attributes into the trace tree and
245
+ detail pane; it supports aggregation across subtrees (`scope`, `mode`) and
246
+ user-defined `transform(...)` for derived views (e.g. currency conversion).
247
+ See the `TraceDisplayInputConfig` type.
248
+ - `llmCalls` (in `agent-evals.config.ts`) configures the LLM calls tab in the
249
+ case-run drawer. Defaults to `kind: 'llm'` spans with `model`, `usage.*`,
250
+ `costUsd`, `input`, `output`, etc. read from conventional attribute paths.
251
+ Override `kinds` to broaden the filter, override `attributes.<field>` for
252
+ non-default span shapes, and add entries to `metrics` to surface arbitrary
253
+ user metrics (`format: 'string' | 'number' | 'duration' | 'json' |
254
+ 'boolean'`, `placements: ['header' | 'body']`). The tab auto-hides when no
255
+ matching spans exist.
256
+ - `apiCalls` (in `agent-evals.config.ts`) configures the API calls tab in the
257
+ case-run drawer. Defaults to `kind: 'api'`, `'http'`, `'http.client'`, and
258
+ `'fetch'` spans with `method`, `url`, `statusCode`, `request`, `response`,
259
+ `requestBody`, `responseBody`, `headers`, `durationMs`, and `error` read
260
+ from conventional attribute paths. Override `kinds` or
261
+ `attributes.<field>` for external tracers, and add `metrics` with the same
262
+ formats and placements as LLM-call metrics. The tab auto-hides when no
263
+ matching spans exist.
264
+
265
+ Stats rows and history charts on the eval card are opt-in via `stats` /
266
+ `charts` on the eval definition. Their shapes live in the types; no need to
267
+ memorize the option set.
268
+
269
+ ## Cached operations
270
+
271
+ Wrap a costly pure span in `cache: { key }` so later runs replay its recorded
272
+ effects without re-executing:
273
+
274
+ ```ts
275
+ await evalTracer.span(
276
+ {
277
+ kind: 'llm',
278
+ name: 'plan-refund',
279
+ cache: { key: { prompt: input.message, model: 'gpt-4o-mini' } },
280
+ },
281
+ async () => {
282
+ const result = await llm.complete(input.message);
283
+ evalSpan.setAttributes({ model: 'gpt-4o-mini', output: result });
284
+ incrementEvalOutput('costUsd', computeCost(result));
285
+ appendToEvalOutput('llmCalls', { model: 'gpt-4o-mini' });
286
+ return result;
287
+ },
288
+ );
289
+ ```
290
+
291
+ Use `evalTracer.cache(...)` for pure values that should not create their own
292
+ trace span:
293
+
294
+ ```ts
295
+ const context = await evalTracer.cache(
296
+ { name: 'receipt-audit-context', key: { orderId: input.orderId } },
297
+ async () => {
298
+ const result = await loadReceiptContext(input);
299
+ evalSpan.setAttribute('receiptContext', result);
300
+ evalSpan.mergeAttribute('receiptSummary', { orderId: input.orderId });
301
+ return result;
302
+ },
303
+ );
304
+ ```
305
+
306
+ Mental model:
307
+
308
+ - Only SDK-mediated effects replay on a hit: sub-spans, checkpoints,
309
+ output helper calls, span attributes. External side
310
+ effects (HTTP, DB writes, file I/O) **do not** replay — cache only pure
311
+ functions of the key.
312
+ - `evalTracer.cache(...)` does not create a span. When it runs inside an active
313
+ span, that span gets a `cache.refs` entry with the value cache name, key,
314
+ namespace, and hit/miss status. When called directly from the case body
315
+ (no surrounding span), the ref is recorded on the case detail's `cacheRefs`
316
+ array so spanless caches still appear in the UI's **Cache hits** tab, where
317
+ each hit can be expanded for inspection or deleted by namespace/key.
318
+ - The cache key folds in a source-file fingerprint, so editing the eval busts
319
+ the cache automatically.
320
+ - `cache.namespace` on spans or `namespace` on value caches can share entries
321
+ across operations/evals, but the source-file fingerprint still participates
322
+ in the final key. Shared namespaces are reusable across evals in the same
323
+ file; evals in different files miss even with the same namespace and key.
324
+ - Cache keys should be deterministic primitives, arrays, and plain objects.
325
+ `Buffer`, `ArrayBuffer`, and typed arrays hash by bytes. Native `Blob`/`File`
326
+ keys use stable metadata by default (`type`, `size`, plus
327
+ `name`/`lastModified` for `File`) and do not read file bytes. Add
328
+ `serializeFileBytes: true` to a cached span or `evalTracer.cache(...)` call
329
+ when byte-level cache invalidation is required.
330
+ - Cache entries are stored in inspectable owner files under
331
+ `.agent-evals/cache/<owner>.json`; each namespace is capped at 100 entries by
332
+ default. Configure `cache.maxEntriesPerNamespace` for the default cap and
333
+ `cache.maxEntriesByNamespace` for exact namespace-specific caps.
334
+ - Cached payloads use advanced serialization/deserialization with the Web API plugin set, so return values and
335
+ recorded SDK effects preserve richer built-ins such as `Date`, `Map`, `Set`,
336
+ typed arrays, `URL`, `Headers`, `Blob`, and `File` on hits. Cache keys still
337
+ use the deterministic key-hashing rules above.
338
+ - Cache mode per run is controlled by CLI flags (see `agent-evals run --help`)
339
+ and by a chevron menu on each eval card in the UI.
340
+ - The UI Stop action cancels the whole active run by terminating that run's
341
+ isolated execution process.
342
+
343
+ ## Artifacts
344
+
345
+ Run output lives under `.agent-evals/runs/<run-id>/` and cache entries under
346
+ `.agent-evals/cache/<owner>.json`. Files in a run directory include run
347
+ metadata, a run summary, per-case results, and per-case trace JSON. Inspect
348
+ these when debugging persisted output, costs, columns, traces, or failures —
349
+ the filenames are stable even when their internal schema evolves, so pick the
350
+ one whose name matches what you are debugging and read it directly.
351
+
352
+ ## Module mocking
353
+
354
+ For true module replacement inside an eval, register `mock.module(...)` from
355
+ `node:test` before dynamically importing the module graph. The CLI enables
356
+ Node's `--experimental-test-module-mocks` flag automatically. Use dynamic
357
+ `import(...)` inside `execute` — static imports happen too early.
358
+
359
+ ```ts
360
+ import { mock } from 'node:test';
361
+ import { defineEval } from '@ls-stack/agent-eval';
362
+
363
+ defineEval({
364
+ id: 'module-mock-demo',
365
+ cases: [{ id: 'mocked-dependency', input: { customerId: 'vip-100' } }],
366
+ execute: async ({ input, setOutput }) => {
367
+ mock.module('../src/customerLookup.ts', {
368
+ namedExports: { lookupCustomer: async () => ({ segment: 'vip' }) },
369
+ });
370
+ const { runWorkflow } = await import('../src/workflow.ts');
371
+ const result = await runWorkflow(input);
372
+ setOutput('segment', result.segment);
373
+ },
374
+ });
375
+ ```
376
+
377
+ ## Workflow checklist
378
+
379
+ When adding or changing evals:
380
+
381
+ 1. Put the tracing + ambient SDK calls in the product code that runs in both
382
+ production and evals. Keep eval files thin.
383
+ 2. Use realistic cases drawn from real product flows; avoid placeholder inputs.
384
+ 3. `evalAssert` for hard invariants, `scores` for graded signals,
385
+ `passThreshold` only on scores that should gate pass/fail.
386
+ 4. Surface reviewable values through execute-context `setOutput` or ambient
387
+ `setEvalOutput` in shared workflow code, and shape them with `columns`
388
+ formats from the `ColumnFormat` type.
389
+ 5. Promote high-signal span attributes with `traceDisplay` so the UI
390
+ highlights them in the trace tree and detail pane.
391
+ 6. Cache costly pure spans with `cache: { key }` and pure spanless values with
392
+ `evalTracer.cache(...)`; never cache operations whose external side effects
393
+ you depend on.
394
+ 7. Sanity-check after changes: `agent-evals list`, then
395
+ `agent-evals run --eval <id>`. Open the UI only when you need to inspect
396
+ traces, trends, or fill manual scores. From an eval page, the eval actions
397
+ menu can copy package-manager-specific CLI run and debug commands.
398
+ 8. To debug a focused run, use
399
+ `agent-evals run --inspect-brk --eval <id> --case <case-id>` and attach a
400
+ Node.js debugger before continuing execution.