npm - @ls-stack/agent-eval - Versions diffs - 0.36.0 → 0.38.0 - Mend

@ls-stack/agent-eval 0.36.0 → 0.38.0

This diff compares the contents of two publicly available versions of a package, each released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (18)

package/dist/{app-BlNzXWDM.mjs → app-DD-8kx5H.mjs} +50 -6
package/dist/apps/web/dist/assets/index-C6PgBOfP.css +1 -0
package/dist/apps/web/dist/assets/index-CO86PsY-.js +140 -0
package/dist/apps/web/dist/index.html +2 -2
package/dist/bin.mjs +14 -3
package/dist/{cli-Dg3abrOv.mjs → cli-BUX6tr9J.mjs} +106 -25
package/dist/index.d.mts +159 -150
package/dist/index.mjs +3 -3
package/dist/runChild.mjs +73 -2
package/dist/{runOrchestration-V1TxX8es.mjs → runOrchestration-BhUFWvq9.mjs} +293 -121
package/dist/{runner-BCs5rzej.mjs → runner-B1wfPKNH.mjs} +2 -2
package/dist/{runner-znY6PY1M.mjs → runner-CoRf7Vef.mjs} +1 -1
package/dist/src-BwJ5tod2.mjs +3 -0
package/package.json +3 -3
package/skills/agent-eval/SKILL.md +18 -6
package/dist/apps/web/dist/assets/index-BYtcGddU.js +0 -140
package/dist/apps/web/dist/assets/index-D0rC5MSS.css +0 -1
package/dist/src-DBypR4TV.mjs +0 -3

package/dist/{runner-BCs5rzej.mjs → runner-B1wfPKNH.mjs} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { n as createRunner } from "./cli-Dg3abrOv.mjs";
-import "./src-DBypR4TV.mjs";
+import { n as createRunner } from "./cli-BUX6tr9J.mjs";
+import "./src-BwJ5tod2.mjs";
 //#region ../../apps/server/src/runner.ts
 let runnerInstance = null;
 function getRunnerInstance() {

package/dist/{runner-znY6PY1M.mjs → runner-CoRf7Vef.mjs} RENAMED Viewed

@@ -1,2 +1,2 @@
-import { n as initRunner, t as getRunnerInstance } from "./runner-BCs5rzej.mjs";
+import { n as initRunner, t as getRunnerInstance } from "./runner-B1wfPKNH.mjs";
 export { getRunnerInstance, initRunner };

package/dist/src-BwJ5tod2.mjs ADDED Viewed

@@ -0,0 +1,3 @@
+import "./runOrchestration-BhUFWvq9.mjs";
+import "./cli-BUX6tr9J.mjs";
+export {};

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@ls-stack/agent-eval",
-  "version": "0.36.0",
+  "version": "0.38.0",
   "type": "module",
   "bin": {
     "agent-evals": "./dist/bin.mjs"
@@ -31,9 +31,9 @@
   "devDependencies": {
     "@types/node": "^24.7.2",
     "typescript": "^5.9.2",
+    "@agent-evals/sdk": "0.0.1",
     "@agent-evals/runner": "0.0.1",
-    "@agent-evals/shared": "0.0.1",
-    "@agent-evals/sdk": "0.0.1"
+    "@agent-evals/shared": "0.0.1"
   },
   "scripts": {
     "build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",

package/skills/agent-eval/SKILL.md CHANGED Viewed

@@ -27,7 +27,13 @@ display rules), read the TypeScript declarations shipped with the package:
 - Unfiltered `agent-evals run` is disabled by default; use `--eval` or `--case`
   for targeted CLI runs. Set `allowCliRunAll: true` in
   `agent-evals.config.ts` to opt into run-all CLI behavior. The web UI can
-  still run grouped evals and confirms before starting more than five.
+  still run grouped evals and confirms before starting more than five. On a
+  single eval page, the Run chevron can open a picker to run specific authored
+  case ids; those case-picked runs are temporary by default and can be made
+  durable in the modal.
+- `agent-evals run --temporary` persists a run like normal history, but deletes
+  it before the next run starts. Temporary runs appear in `show-runs` and the UI
+  while present; normal runs are never deleted by temporary-run cleanup.
 - `agent-evals app` watches `agent-evals.config.ts` and reloads config in
   place when the runner is idle. If config changes during an active run, the UI
   shows a pending reload banner and blocks new runs until the current run
@@ -485,6 +491,10 @@ Mental model:
   `.agent-evals/cache/<owner>.json`; each namespace is capped at 100 entries by
   default. Configure `cache.maxEntriesPerNamespace` for the default cap and
   `cache.maxEntriesByNamespace` for exact namespace-specific caps.
+- Nested cached JSON values at or above roughly 10K JSON characters are stored
+  as content-addressed Brotli blobs under `.agent-evals/cache-blobs/` and
+  referenced from cache JSON by sha256. Identical large payloads share the same
+  blob.
 - Authored raw cache keys are stored for debugging under
   `.agent-evals/cache-debug/<owner>.json`. This folder may include prompts,
   user inputs, or other sensitive data, should be gitignored, and is not needed
@@ -492,11 +502,11 @@ Mental model:
   can be filtered to hits or new entries added by cache misses/refreshes.
   Misses/refreshes with `cache.store: false` are shown as non-stored activity
   without fetch/delete controls.
-- Cached payloads use advanced serialization/deserialization with the Web API
-  plugin set, so return values and recorded SDK effects preserve richer
-  built-ins such as `Date`, `Map`, `Set`, typed arrays, `URL`, `Headers`,
-  `Blob`, and `File` on hits. Undefined values are omitted by default instead
-  of being written to cache files; direct serializer callers can pass
+- Cached payloads use JSON-safe tagged serialization, so return values and
+  recorded SDK effects preserve richer built-ins such as `Date`, `Map`, `Set`,
+  typed arrays, `URL`, `Headers`, `Blob`, and `File` on hits. Undefined values
+  are omitted by default instead of being written to cache files; direct
+  serializer callers can pass
   `{ preserveUndefined: true }` when explicit undefined wrappers are needed.
   Cache keys still use the deterministic key-hashing rules above.
 - Cache mode per run is controlled by CLI flags (see `agent-evals run --help`).
@@ -507,6 +517,8 @@ Run output lives under `.agent-evals/runs/<run-id>/` and cache entries under
 `.agent-evals/cache/<eval-id>.json`. Files in a run directory include run
 metadata, a run summary, per-case results, and per-case trace JSON. Inspect
 these when debugging persisted output, costs, columns, traces, or failures.
+Temporary runs use the same directory layout, but are removed before the next
+run of any kind starts.
 Use `agent-evals show-runs` when you need stable file
 paths before reading saved output: