npm - @ls-stack/agent-eval - Versions diffs - 0.55.1 → 0.56.0 - Mend

@ls-stack/agent-eval 0.55.1 → 0.56.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (18)

package/dist/{app-BD0D9-7k.mjs → app-Bpe6Monh.mjs} +4 -4
package/dist/apps/web/dist/assets/index-CfSiAVmi.js +377 -0
package/dist/apps/web/dist/assets/{index-2I-eWzVL.css → index-Xa_7PteQ.css} +1 -1
package/dist/apps/web/dist/index.html +2 -2
package/dist/bin.mjs +1 -1
package/dist/caseChild.mjs +2 -1
package/dist/{cli-BR3wMZMx.mjs → cli-DQO2Fpt2.mjs} +60 -13
package/dist/index.d.mts +610 -62
package/dist/index.mjs +3 -3
package/dist/runChild.mjs +2 -2
package/dist/{runExecution-Sw38bCaq.mjs → runExecution-6lrtj48K.mjs} +458 -121
package/dist/{runOrchestration-DJsdLYeZ.mjs → runOrchestration-BYaN2mzS.mjs} +1 -1
package/dist/{runner-dB69WsnM.mjs → runner-C3CiS2o7.mjs} +1 -1
package/dist/{runner-72rsqJRq.mjs → runner-DYlwuAT3.mjs} +2 -2
package/dist/{src-hBGtzWuA.mjs → src-DCGrFAmO.mjs} +2 -2
package/package.json +3 -3
package/skills/agent-eval/SKILL.md +29 -10
package/dist/apps/web/dist/assets/index-CvsPmlHl.js +0 -377

package/dist/{runOrchestration-DJsdLYeZ.mjs → runOrchestration-BYaN2mzS.mjs} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { Et as caseRowSchema, Pt as runWithEvalRegistry, Tt as caseDetailSchema, X as runWithEvalClock, _t as validateEvalTagName, bt as runSummarySchema, d as loadEvalModule, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, g as commitPendingCacheWrites, gt as matchesTagsFilter, ht as dedupeEvalTags, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromChildStatuses, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveStatusFromCaseRows, q as runInEvalRuntimeScope, t as filterEvalCases, u as runWithModuleIsolation, vt as validateTagsFilterExpression, wt as getCaseRowCaseKey, yt as runManifestSchema } from "./runExecution-Sw38bCaq.mjs";
+import { Et as caseRowSchema, Pt as runWithEvalRegistry, Tt as caseDetailSchema, X as runWithEvalClock, _t as validateEvalTagName, bt as runSummarySchema, d as loadEvalModule, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, g as commitPendingCacheWrites, gt as matchesTagsFilter, ht as dedupeEvalTags, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromChildStatuses, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveStatusFromCaseRows, q as runInEvalRuntimeScope, t as filterEvalCases, u as runWithModuleIsolation, vt as validateTagsFilterExpression, wt as getCaseRowCaseKey, yt as runManifestSchema } from "./runExecution-6lrtj48K.mjs";
 import { readFile, readdir, rm, writeFile } from "node:fs/promises";
 import { dirname, join } from "node:path";
 import { existsSync } from "node:fs";

package/dist/{runner-dB69WsnM.mjs → runner-C3CiS2o7.mjs} RENAMED Viewed

@@ -1,2 +1,2 @@
-import { n as initRunner, t as getRunnerInstance } from "./runner-72rsqJRq.mjs";
+import { n as initRunner, t as getRunnerInstance } from "./runner-DYlwuAT3.mjs";
 export { getRunnerInstance, initRunner };

package/dist/{runner-72rsqJRq.mjs → runner-DYlwuAT3.mjs} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { n as createRunner } from "./cli-BR3wMZMx.mjs";
-import "./src-hBGtzWuA.mjs";
+import { n as createRunner } from "./cli-DQO2Fpt2.mjs";
+import "./src-DCGrFAmO.mjs";
 //#region ../../apps/server/src/runner.ts
 let runnerInstance = null;
 function getRunnerInstance() {

package/dist/{src-hBGtzWuA.mjs → src-DCGrFAmO.mjs} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { Mt as defineEval$1, W as matchesEvalTags$1 } from "./runExecution-Sw38bCaq.mjs";
-import "./cli-BR3wMZMx.mjs";
+import { Mt as defineEval$1, W as matchesEvalTags$1 } from "./runExecution-6lrtj48K.mjs";
+import "./cli-DQO2Fpt2.mjs";
 //#region src/index.ts
 /** Register an eval definition with typed tag support. */
 function defineEval(definition) {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@ls-stack/agent-eval",
-  "version": "0.55.1",
+  "version": "0.56.0",
   "type": "module",
   "bin": {
     "agent-evals": "./dist/bin.mjs"
@@ -33,8 +33,8 @@
     "@types/node": "^24.7.2",
     "typescript": "^5.9.2",
     "@agent-evals/runner": "0.0.1",
-    "@agent-evals/shared": "0.0.1",
-    "@agent-evals/sdk": "0.0.1"
+    "@agent-evals/sdk": "0.0.1",
+    "@agent-evals/shared": "0.0.1"
   },
   "scripts": {
     "build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",

package/skills/agent-eval/SKILL.md CHANGED Viewed

@@ -318,6 +318,11 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
 - `setEvalOutput(key, value)` writes reviewable data for the case. Values are
   stored as received: primitives, objects/arrays, explicit file refs, and
   native `Blob`/`File` values. `columns.format` only controls visualization.
+  Inside `execute`, `setOutput(key, value, formatOrOverride)` can attach a
+  display hint directly to a runtime output, e.g. `'markdown'` or
+  `{ label: 'Receipt', format: 'image', hideInTable: true }`. Authored
+  global/eval `columns` for the same key take precedence over that runtime
+  hint.
   Non-JSON runtime values such as `Date`, `Map`, `Set`, `BigInt`, typed arrays,
   and class instances use the tagged value serializer instead of a string
   fallback. Native `Blob`/`File` values are copied to run artifacts because
@@ -363,8 +368,11 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
 - `llmCalls` (in `agent-evals.config.ts`) configures how LLM-call spans are
   summarized for review. Defaults to `kind: 'llm'` spans with `model`,
   `usage.*`, `latencyMs`, `input`, `output`, etc. read from conventional
-  attribute paths. `latencyMs` is time to first token; duration, total tokens,
-  output tokens/sec, and USD costs are derived. Override `kinds` to broaden the filter,
+  attribute paths. The default `steps` path reads an array from
+  `span.attributes.steps`; if it is missing, direct child `model_step` spans are
+  shown as that call's steps. `latencyMs` is time to first token; duration,
+  total tokens, output tokens/sec, and USD costs are derived. Override `kinds`
+  to broaden the filter,
   override `attributes.<field>` for non-default primitive span shapes, configure
   model-keyed `pricing` to derive USD costs from token counts, with nested
   `providers` entries for provider-specific rates, add `costCurrencies` to show
@@ -513,12 +521,21 @@ Mental model:
   `serializeFileBytes: true` to a cached span or `evalTracer.cache(...)` call
   when byte-level cache invalidation is required.
 - Cache entries are stored as one Brotli-compressed JSON file per key under
-  `.agent-evals/cache/<sanitizedNamespace>/<keyHash>.json.br`; each namespace
-  is capped at 100 entries by default. Configure
+  `.agent-evals/cache/<sanitizedNamespace>/<keyHash>.json.br`, with a small
+  namespace index sidecar at
+  `.agent-evals/cache/<sanitizedNamespace>/.index-<namespaceHash>.json`.
+  Listing and retention use the index without opening cached payloads. Index
+  rows intentionally stay minimal: stored time, last access time, and external
+  JSON blob refs. Each namespace is capped at 100 entries by default. The runner
+  prunes least recently accessed indexed entries after a run finishes and the
+  runner stays idle for `cache.pruneIdleDelayMs ?? 5000` milliseconds. Configure
   `cache.maxEntriesPerNamespace` for the default cap and
   `cache.maxEntriesByNamespace` for exact namespace-specific caps.
+- Unindexed legacy cache files are ignored by normal lookup/listing. Use
+  `agent-evals cache repair` to remove unindexed cache files, stale index rows,
+  debug sidecars, and unreferenced blob files.
 - Nested cached JSON values at or above roughly 10K JSON characters are stored
-  as content-addressed Brotli blobs under `.agent-evals/cache-blobs/` and
+  as content-addressed Brotli blobs under `.agent-evals/cache/cache-blobs/` and
   referenced from cache JSON by sha256. Identical large payloads share the same
   blob.
 - Authored raw cache keys are stored for debugging under
@@ -536,11 +553,13 @@ Mental model:
 ## Artifacts
-Run output lives under `.agent-evals/runs/<run-id>/`. Cache metadata lives under
-`.agent-evals/cache/<sanitizedNamespace>/<keyHash>.json.br`. Do not rely on a
-specific cache filename when authoring evals; configure cache namespaces
-manually in eval code, then use `agent-evals cache list` to inspect the
-persisted namespace/key entries. Files in a run directory include run metadata,
+Run output lives under `.agent-evals/runs/<run-id>/`. Cache payloads live under
+`.agent-evals/cache/<sanitizedNamespace>/<keyHash>.json.br` with namespace
+index sidecars next to them. Do not rely on a specific cache filename when
+authoring evals; configure cache namespaces manually in eval code, then use
+`agent-evals cache list` to inspect persisted namespace/key entries or
+`agent-evals cache repair` to clean orphaned cache artifacts. Files in a run
+directory include run metadata,
 a run summary, per-case results, and per-case trace JSON. Inspect run files when
 debugging persisted output, costs, columns, traces, or failures; inspect cache
 entries when debugging replayed span/value-cache results.