@ls-stack/agent-eval 0.55.1 → 0.56.0

This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- import { Et as caseRowSchema, Pt as runWithEvalRegistry, Tt as caseDetailSchema, X as runWithEvalClock, _t as validateEvalTagName, bt as runSummarySchema, d as loadEvalModule, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, g as commitPendingCacheWrites, gt as matchesTagsFilter, ht as dedupeEvalTags, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromChildStatuses, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveStatusFromCaseRows, q as runInEvalRuntimeScope, t as filterEvalCases, u as runWithModuleIsolation, vt as validateTagsFilterExpression, wt as getCaseRowCaseKey, yt as runManifestSchema } from "./runExecution-Sw38bCaq.mjs";
1
+ import { Et as caseRowSchema, Pt as runWithEvalRegistry, Tt as caseDetailSchema, X as runWithEvalClock, _t as validateEvalTagName, bt as runSummarySchema, d as loadEvalModule, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, g as commitPendingCacheWrites, gt as matchesTagsFilter, ht as dedupeEvalTags, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromChildStatuses, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveStatusFromCaseRows, q as runInEvalRuntimeScope, t as filterEvalCases, u as runWithModuleIsolation, vt as validateTagsFilterExpression, wt as getCaseRowCaseKey, yt as runManifestSchema } from "./runExecution-6lrtj48K.mjs";
2
2
  import { readFile, readdir, rm, writeFile } from "node:fs/promises";
3
3
  import { dirname, join } from "node:path";
4
4
  import { existsSync } from "node:fs";
@@ -1,2 +1,2 @@
1
- import { n as initRunner, t as getRunnerInstance } from "./runner-72rsqJRq.mjs";
1
+ import { n as initRunner, t as getRunnerInstance } from "./runner-DYlwuAT3.mjs";
2
2
  export { getRunnerInstance, initRunner };
@@ -1,5 +1,5 @@
1
- import { n as createRunner } from "./cli-BR3wMZMx.mjs";
2
- import "./src-hBGtzWuA.mjs";
1
+ import { n as createRunner } from "./cli-DQO2Fpt2.mjs";
2
+ import "./src-DCGrFAmO.mjs";
3
3
  //#region ../../apps/server/src/runner.ts
4
4
  let runnerInstance = null;
5
5
  function getRunnerInstance() {
@@ -1,5 +1,5 @@
1
- import { Mt as defineEval$1, W as matchesEvalTags$1 } from "./runExecution-Sw38bCaq.mjs";
2
- import "./cli-BR3wMZMx.mjs";
1
+ import { Mt as defineEval$1, W as matchesEvalTags$1 } from "./runExecution-6lrtj48K.mjs";
2
+ import "./cli-DQO2Fpt2.mjs";
3
3
  //#region src/index.ts
4
4
  /** Register an eval definition with typed tag support. */
5
5
  function defineEval(definition) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ls-stack/agent-eval",
3
- "version": "0.55.1",
3
+ "version": "0.56.0",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "agent-evals": "./dist/bin.mjs"
@@ -33,8 +33,8 @@
33
33
  "@types/node": "^24.7.2",
34
34
  "typescript": "^5.9.2",
35
35
  "@agent-evals/runner": "0.0.1",
36
- "@agent-evals/shared": "0.0.1",
37
- "@agent-evals/sdk": "0.0.1"
36
+ "@agent-evals/sdk": "0.0.1",
37
+ "@agent-evals/shared": "0.0.1"
38
38
  },
39
39
  "scripts": {
40
40
  "build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",
@@ -318,6 +318,11 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
318
318
  - `setEvalOutput(key, value)` writes reviewable data for the case. Values are
319
319
  stored as received: primitives, objects/arrays, explicit file refs, and
320
320
  native `Blob`/`File` values. `columns.format` only controls visualization.
321
+ Inside `execute`, `setOutput(key, value, formatOrOverride)` can attach a
322
+ display hint directly to a runtime output, e.g. `'markdown'` or
323
+ `{ label: 'Receipt', format: 'image', hideInTable: true }`. Authored
324
+ global/eval `columns` for the same key take precedence over that runtime
325
+ hint.
321
326
  Non-JSON runtime values such as `Date`, `Map`, `Set`, `BigInt`, typed arrays,
322
327
  and class instances use the tagged value serializer instead of a string
323
328
  fallback. Native `Blob`/`File` values are copied to run artifacts because
@@ -363,8 +368,11 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
363
368
  - `llmCalls` (in `agent-evals.config.ts`) configures how LLM-call spans are
364
369
  summarized for review. Defaults to `kind: 'llm'` spans with `model`,
365
370
  `usage.*`, `latencyMs`, `input`, `output`, etc. read from conventional
366
- attribute paths. `latencyMs` is time to first token; duration, total tokens,
367
- output tokens/sec, and USD costs are derived. Override `kinds` to broaden the filter,
371
+ attribute paths. The default `steps` path reads an array from
372
+ `span.attributes.steps`; if it is missing, direct child `model_step` spans are
373
+ shown as that call's steps. `latencyMs` is time to first token; duration,
374
+ total tokens, output tokens/sec, and USD costs are derived. Override `kinds`
375
+ to broaden the filter,
368
376
  override `attributes.<field>` for non-default primitive span shapes, configure
369
377
  model-keyed `pricing` to derive USD costs from token counts, with nested
370
378
  `providers` entries for provider-specific rates, add `costCurrencies` to show
@@ -513,12 +521,21 @@ Mental model:
513
521
  `serializeFileBytes: true` to a cached span or `evalTracer.cache(...)` call
514
522
  when byte-level cache invalidation is required.
515
523
  - Cache entries are stored as one Brotli-compressed JSON file per key under
516
- `.agent-evals/cache/<sanitizedNamespace>/<keyHash>.json.br`; each namespace
517
- is capped at 100 entries by default. Configure
524
+ `.agent-evals/cache/<sanitizedNamespace>/<keyHash>.json.br`, with a small
525
+ namespace index sidecar at
526
+ `.agent-evals/cache/<sanitizedNamespace>/.index-<namespaceHash>.json`.
527
+ Listing and retention use the index without opening cached payloads. Index
528
+ rows intentionally stay minimal: stored time, last access time, and external
529
+ JSON blob refs. Each namespace is capped at 100 entries by default. The runner
530
+ prunes the least recently accessed indexed entries once a run has finished and
531
+ the runner has then stayed idle for `cache.pruneIdleDelayMs ?? 5000` milliseconds. Configure
518
532
  `cache.maxEntriesPerNamespace` for the default cap and
519
533
  `cache.maxEntriesByNamespace` for exact namespace-specific caps.
534
+ - Unindexed legacy cache files are ignored by normal lookup/listing. Use
535
+ `agent-evals cache repair` to remove unindexed cache files, stale index rows,
536
+ debug sidecars, and unreferenced blob files.
520
537
  - Nested cached JSON values at or above roughly 10K JSON characters are stored
521
- as content-addressed Brotli blobs under `.agent-evals/cache-blobs/` and
538
+ as content-addressed Brotli blobs under `.agent-evals/cache/cache-blobs/` and
522
539
  referenced from cache JSON by sha256. Identical large payloads share the same
523
540
  blob.
524
541
  - Authored raw cache keys are stored for debugging under
@@ -536,11 +553,13 @@ Mental model:
536
553
 
537
554
  ## Artifacts
538
555
 
539
- Run output lives under `.agent-evals/runs/<run-id>/`. Cache metadata lives under
540
- `.agent-evals/cache/<sanitizedNamespace>/<keyHash>.json.br`. Do not rely on a
541
- specific cache filename when authoring evals; configure cache namespaces
542
- manually in eval code, then use `agent-evals cache list` to inspect the
543
- persisted namespace/key entries. Files in a run directory include run metadata,
556
+ Run output lives under `.agent-evals/runs/<run-id>/`. Cache payloads live under
557
+ `.agent-evals/cache/<sanitizedNamespace>/<keyHash>.json.br` with namespace
558
+ index sidecars next to them. Do not rely on a specific cache filename when
559
+ authoring evals; configure cache namespaces manually in eval code, then use
560
+ `agent-evals cache list` to inspect persisted namespace/key entries or
561
+ `agent-evals cache repair` to clean orphaned cache artifacts. Files in a run
562
+ directory include run metadata,
544
563
  a run summary, per-case results, and per-case trace JSON. Inspect run files when
545
564
  debugging persisted output, costs, columns, traces, or failures; inspect cache
546
565
  entries when debugging replayed span/value-cache results.