@ls-stack/agent-eval 0.27.1 → 0.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
- import { n as createRunner } from "./cli-Clf8xUFa.mjs";
2
- import "./src-BBwT7_cy.mjs";
1
+ import { n as createRunner } from "./cli-CIc_gBNM.mjs";
2
+ import "./src-CkWT1iSu.mjs";
3
3
  //#region ../../apps/server/src/runner.ts
4
4
  let runnerInstance = null;
5
5
  function getRunnerInstance() {
@@ -1,2 +1,2 @@
1
- import { n as initRunner, t as getRunnerInstance } from "./runner-zqKwTlNj.mjs";
1
+ import { n as initRunner, t as getRunnerInstance } from "./runner-1F8MeY5V.mjs";
2
2
  export { getRunnerInstance, initRunner };
@@ -0,0 +1,3 @@
1
+ import "./runOrchestration-CIARrLs6.mjs";
2
+ import "./cli-CIc_gBNM.mjs";
3
+ export {};
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ls-stack/agent-eval",
3
- "version": "0.27.1",
3
+ "version": "0.29.0",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "agent-evals": "./dist/bin.mjs"
@@ -16,33 +16,6 @@
16
16
  "dist",
17
17
  "skills"
18
18
  ],
19
- "tsdown": {
20
- "clean": true,
21
- "copy": [
22
- {
23
- "from": "../../apps/web/dist/**/*",
24
- "to": "dist/web",
25
- "flatten": false
26
- }
27
- ],
28
- "deps": {
29
- "alwaysBundle": [
30
- "@agent-evals/runner",
31
- "@agent-evals/sdk",
32
- "@agent-evals/shared"
33
- ]
34
- },
35
- "dts": {
36
- "eager": true
37
- },
38
- "entry": [
39
- "src/index.ts",
40
- "src/bin.ts",
41
- "src/runChild.ts"
42
- ],
43
- "format": "esm",
44
- "platform": "node"
45
- },
46
19
  "dependencies": {
47
20
  "@hono/node-server": "^1.14.3",
48
21
  "@hono/zod-validator": "^0.7.3",
@@ -63,7 +36,7 @@
63
36
  "@agent-evals/shared": "0.0.1"
64
37
  },
65
38
  "scripts": {
66
- "build": "pnpm --filter @agent-evals/web build && tsdown",
39
+ "build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",
67
40
  "test": "vitest run",
68
41
  "tsc": "tsgo --noEmit",
69
42
  "lint": "tsgo --noEmit && eslint .",
@@ -16,9 +16,9 @@ This skill covers the mental model and conventions. For exhaustive field lists
16
16
  display rules), read the TypeScript declarations shipped with the package:
17
17
 
18
18
  - `AgentEvalsConfig`, `EvalDefinition`, `EvalCase`, `EvalOutputs`,
19
- `EvalColumnOverride`, `EvalScoreDef`, `EvalManualScoreDef`,
20
- `EvalTraceTree`, `TraceSpanInfo`, and `z` are exported from
21
- `@ls-stack/agent-eval`.
19
+ `EvalColumnOverride`, `EvalDeriveConfig`, `EvalScoreDef`,
20
+ `EvalManualScoreDef`, `EvalTraceTree`, `TraceSpanInfo`, and `z` are exported
21
+ from `@ls-stack/agent-eval`.
22
22
  - `.d.ts` files land in `node_modules/@ls-stack/agent-eval/dist/`.
23
23
  - CLI surface: `agent-evals --help` and `agent-evals <command> --help`.
24
24
  Unknown help targets exit non-zero instead of falling back to global help.
@@ -28,6 +28,10 @@ display rules), read the TypeScript declarations shipped with the package:
28
28
  for targeted CLI runs. Set `allowCliRunAll: true` in
29
29
  `agent-evals.config.ts` to opt into run-all CLI behavior. The web UI can
30
30
  still run grouped evals and confirms before starting more than five.
31
+ - `agent-evals app` watches `agent-evals.config.ts` and reloads config in
32
+ place when the runner is idle. If config changes during an active run, the UI
33
+ shows a pending reload banner and blocks new runs until the current run
34
+ reaches a terminal state and the reload applies.
31
35
 
32
36
  Assume that enumerated tables in this document may lag behind the types —
33
37
  treat the types as source of truth when they disagree.
@@ -230,6 +234,50 @@ Case `id` values anchor historical runs, caches, and manual scores — keep them
230
234
  stable. See `EvalDefinition` / `EvalCase` in the types for every supported
231
235
  field.
232
236
 
237
+ ### Manual input
238
+
239
+ Use `manualInput` instead of `cases` when each run should pause for the user
240
+ to type values:
241
+
242
+ ```ts
243
+ const inputSchema = z.object({
244
+ name: z.string().min(1),
245
+ tone: z.enum(['friendly', 'formal']),
246
+ notes: z.string().max(500).optional(),
247
+ sendEmail: z.boolean().default(false),
248
+ });
249
+
250
+ defineEval<z.infer<typeof inputSchema>>({
251
+ id: 'manual-input-greeting',
252
+ manualInput: {
253
+ schema: inputSchema,
254
+ title: 'Greet someone',
255
+ submitLabel: 'Greet',
256
+ fields: { notes: { multiline: true, rows: 4 } },
257
+ },
258
+ execute: ({ input, setOutput }) => {
259
+ setOutput('greeting', `Hi, ${input.name}!`);
260
+ },
261
+ });
262
+ ```
263
+
264
+ The web UI opens a modal driven by the descriptor derived from the schema
265
+ (`z.string` → text, `z.enum` → select, `z.boolean` → checkbox, etc.; nested
266
+ shapes fall back to a JSON textarea). The CLI accepts `--input '<json>'` for a
267
+ single targeted eval or `--input-file <path>` mapping eval keys/ids to inputs.
268
+ Each run produces one synthetic case `<evalId>-manual` with the validated
269
+ submission; mixing `manualInput` with `cases` is rejected at discovery time.
270
+
271
+ For file or image fields, set `{ asFile: true, accept?, maxSizeBytes? }` and
272
+ type the field with `manualInputFileValueSchema`. The widget supports click,
273
+ drag-and-drop, and clipboard paste (so a screenshot capture flows in
274
+ directly). The runtime value carries `{ name, mimeType, sizeBytes, sha256,
275
+ path }`, where `path` points to a workspace-relative run artifact. Use
276
+ `readManualInputFile(value)` when bytes, `Blob`, `File`, text, or parsed JSON
277
+ are needed. In CLI runs, provide path objects such as
278
+ `{ "image": { "path": "./screenshot.png" } }`; the CLI stages the file before
279
+ starting the run.
280
+
233
281
  ## Scoring
234
282
 
235
283
  Every score returns a normalized `0..1` value. Pass/fail is per-score: a case
@@ -269,7 +317,19 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
269
317
  `defineEval` generic, `outputsSchema` is required.
270
318
  - `columns` overrides the display for output and score keys (label, format,
271
319
  alignment, visibility). The set of supported formats is declared by the
272
- `ColumnFormat` union and `EvalColumnOverride` in the types.
320
+ `ColumnFormat` union and `EvalColumnOverride` in the types. Global
321
+ `columns` in `agent-evals.config.ts` apply to every eval; eval-level
322
+ `columns` override matching global keys. Use `hideIfNoValue: true` to hide a
323
+ column from the runs table when every rendered row is missing the value,
324
+ `null`, or an empty string; `0` and `false` still count as values, and the
325
+ value remains available in case details and raw output data.
326
+ - `deriveFromTracing` can be authored globally in `agent-evals.config.ts` or
327
+ locally on one eval. Prefer the keyed map form for shared metrics:
328
+ `deriveFromTracing: { toolCalls: ({ trace }) => trace.findSpansByKind('tool').length }`.
329
+ The older object-returning function form remains supported. Global
330
+ derivations run first; runtime outputs are never overwritten, and eval-level
331
+ derivations only fill keys still missing after global derivations. In keyed
332
+ form, return `undefined` to omit one output for that case.
273
333
  - `traceDisplay` promotes selected span attributes into the trace tree and
274
334
  detail pane; it supports aggregation across subtrees (`scope`, `mode`) and
275
335
  user-defined `transform(...)` for derived views (e.g. currency conversion).
@@ -280,18 +340,26 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
280
340
  attribute paths. `latencyMs` is time to first token; duration, total tokens,
281
341
  tokens/sec, and USD costs are derived. Override `kinds` to broaden the filter,
282
342
  override `attributes.<field>` for non-default primitive span shapes, configure
283
- `pricing` to derive USD costs from token counts by model/provider, add
284
- `derivedAttributes` to persist computed values back onto matching LLM spans
285
- before trace consumers run, and add entries to `metrics` to surface arbitrary
286
- user metrics (`format: 'string' | 'number' | 'duration' | 'json' |
287
- 'boolean'`, `placements: ['header' | 'body']`). `derivedAttributes` keys are
288
- dot-paths under `span.attributes`; return `undefined` to skip one span.
343
+ model-keyed `pricing` to derive USD costs from token counts, with nested
344
+ `providers` entries for provider-specific rates, add `derivedAttributes` to
345
+ persist computed values back onto matching LLM spans before trace consumers
346
+ run, and add entries to `metrics` to surface arbitrary user metrics
347
+ (`format: 'string' | 'number' | 'duration' | 'json' | 'boolean'`,
348
+ `placements: ['header' | 'body']`). `derivedAttributes` keys are dot-paths
349
+ under `span.attributes`; return `undefined` to skip one span. For saved runs,
350
+ the case drawer's "more" menu can recalculate configured LLM/API derived
351
+ attributes for one case and persist the updated trace artifacts without
352
+ re-running the eval.
289
353
  - Default usage config derives missing eval outputs from matching LLM/API spans
290
354
  before `outputsSchema` and scores run: `apiCalls`, `costUsd`, `llmTurns`,
291
355
  `inputTokens`, `outputTokens`, `totalTokens`, `cachedInputTokens`,
292
356
  `cacheCreationInputTokens`, `reasoningTokens`, and `llmDurationMs`. Authored
293
- outputs and column overrides win. `totalTokens` is input + output only; cache
294
- read/write tokens stay separate and affect `costUsd` at their own rates.
357
+ outputs and column overrides win. Default usage columns, stats, and charts
358
+ use `hideIfNoValue: true`, so the UI hides them until matching LLM/API span
359
+ data exists. Default LLM usage charts render cost, input tokens, and output
360
+ tokens separately and use `dedupeConsecutiveValues: true` to skip repeated
361
+ adjacent chart values. `totalTokens` is input + output only; cache read/write
362
+ tokens stay separate and affect `costUsd` at their own rates.
295
363
  Derived base input cost uses `inputTokens - cachedInputTokens -
296
364
  cacheCreationInputTokens` so cache details are not double-counted.
297
365
  `cacheCreationInputTokens` is the total cache-write count; optional
@@ -315,12 +383,17 @@ cacheCreationInputTokens` so cache details are not double-counted.
315
383
  are still captured.
316
384
 
317
385
  Stats rows and history charts on the eval card can be authored via `stats` /
318
- `charts` on the eval definition. Usage stats and LLM usage charts are added by
386
+ `charts` on the eval definition. Global `stats` in `agent-evals.config.ts`
387
+ render before eval-level stats. Usage stats and LLM usage charts are added by
319
388
  default unless removed with `removeDefaultConfig`. Column stats can override
320
389
  `format` and `numberFormat`, otherwise they inherit from the matching column.
321
390
  Number formats use `maxDecimalPlaces` to cap decimals and `minDecimalPlaces`
322
391
  to pad trailing zeroes. Without `maxDecimalPlaces`, they render up to 3 decimal
323
- places.
392
+ places. Stats and charts support `hideIfNoValue: true`; stats hide when they
393
+ would otherwise render an empty value, and charts hide when no plotted metric or
394
+ tooltip extra has a numeric value in the rendered history window. Charts support
395
+ `dedupeConsecutiveValues: true` to omit consecutive points whose plotted metrics
396
+ and tooltip extras match the previous kept point.
324
397
  Their shapes live in the types; no need to memorize the option set.
325
398
 
326
399
  ## Cached operations
@@ -378,12 +451,18 @@ Mental model:
378
451
  (no surrounding span), the ref is recorded on the case detail's `cacheRefs`
379
452
  array.
380
453
  - Cache identity is the namespace plus the authored key. Source-file
381
- fingerprints are stored as metadata for inspection, but do not participate in
382
- cache-key hashing.
454
+ fingerprints are tracked for run freshness separately, but do not participate
455
+ in cache-key hashing.
383
456
  - Cached spans require an explicit `cache.namespace`; value caches default to
384
457
  `${evalId}__${name}` and can be overridden with `namespace`. Matching
385
458
  namespaces share entries across operations/evals that use the same authored
386
459
  key.
460
+ - Per eval, `cache: { read?: boolean; store?: boolean }` controls whether
461
+ authored cached operations may read or persist entries. Both default to
462
+ `true`. Use `read: false` to always execute instead of replaying hits, and
463
+ `store: false` to allow reads while preventing misses/refreshes from writing
464
+ cache or raw-key debug files. Run-level bypass/refresh controls still take
465
+ precedence.
387
466
  - Authored eval ids are unique within one eval file. The exact eval identity is
388
467
  the workspace-relative file path plus eval id, so the same id can be reused in
389
468
  different files. Case ids must be unique within one eval; duplicate case ids
@@ -403,10 +482,15 @@ Mental model:
403
482
  user inputs, or other sensitive data, should be gitignored, and is not needed
404
483
  for cache reuse. The UI Cache tab shows the raw key when it is available and
405
484
  can be filtered to hits or new entries added by cache misses/refreshes.
406
- - Cached payloads use advance serialization/deserialization with the Web API plugin set, so return values and
407
- recorded SDK effects preserve richer built-ins such as `Date`, `Map`, `Set`,
408
- typed arrays, `URL`, `Headers`, `Blob`, and `File` on hits. Cache keys still
409
- use the deterministic key-hashing rules above.
485
+ Misses/refreshes with `cache.store: false` are shown as non-stored activity
486
+ without fetch/delete controls.
487
+ - Cached payloads use advanced serialization/deserialization with the Web API
488
+ plugin set, so return values and recorded SDK effects preserve richer
489
+ built-ins such as `Date`, `Map`, `Set`, typed arrays, `URL`, `Headers`,
490
+ `Blob`, and `File` on hits. Undefined values are omitted by default instead
491
+ of being written to cache files; direct serializer callers can pass
492
+ `{ preserveUndefined: true }` when explicit undefined wrappers are needed.
493
+ Cache keys still use the deterministic key-hashing rules above.
410
494
  - Cache mode per run is controlled by CLI flags (see `agent-evals run --help`).
411
495
 
412
496
  ## Artifacts