@ls-stack/agent-eval 0.36.0 → 0.38.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
- import { n as createRunner } from "./cli-Dg3abrOv.mjs";
2
- import "./src-DBypR4TV.mjs";
1
+ import { n as createRunner } from "./cli-BUX6tr9J.mjs";
2
+ import "./src-BwJ5tod2.mjs";
3
3
  //#region ../../apps/server/src/runner.ts
4
4
  let runnerInstance = null;
5
5
  function getRunnerInstance() {
@@ -1,2 +1,2 @@
1
- import { n as initRunner, t as getRunnerInstance } from "./runner-BCs5rzej.mjs";
1
+ import { n as initRunner, t as getRunnerInstance } from "./runner-B1wfPKNH.mjs";
2
2
  export { getRunnerInstance, initRunner };
@@ -0,0 +1,3 @@
1
+ import "./runOrchestration-BhUFWvq9.mjs";
2
+ import "./cli-BUX6tr9J.mjs";
3
+ export {};
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ls-stack/agent-eval",
3
- "version": "0.36.0",
3
+ "version": "0.38.0",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "agent-evals": "./dist/bin.mjs"
@@ -31,9 +31,9 @@
31
31
  "devDependencies": {
32
32
  "@types/node": "^24.7.2",
33
33
  "typescript": "^5.9.2",
34
+ "@agent-evals/sdk": "0.0.1",
34
35
  "@agent-evals/runner": "0.0.1",
35
- "@agent-evals/shared": "0.0.1",
36
- "@agent-evals/sdk": "0.0.1"
36
+ "@agent-evals/shared": "0.0.1"
37
37
  },
38
38
  "scripts": {
39
39
  "build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",
@@ -27,7 +27,13 @@ display rules), read the TypeScript declarations shipped with the package:
27
27
  - Unfiltered `agent-evals run` is disabled by default; use `--eval` or `--case`
28
28
  for targeted CLI runs. Set `allowCliRunAll: true` in
29
29
  `agent-evals.config.ts` to opt into run-all CLI behavior. The web UI can
30
- still run grouped evals and confirms before starting more than five.
30
+ still run grouped evals and confirms before starting more than five. On a
31
+ single eval page, the Run chevron can open a picker to run specific authored
32
+ case ids; those case-picked runs are temporary by default and can be made
33
+ durable in the modal.
34
+ - `agent-evals run --temporary` persists a run like normal history, but deletes
35
+ it before the next run starts. Temporary runs appear in `show-runs` and the UI
36
+ while present; normal runs are never deleted by temporary-run cleanup.
31
37
  - `agent-evals app` watches `agent-evals.config.ts` and reloads config in
32
38
  place when the runner is idle. If config changes during an active run, the UI
33
39
  shows a pending reload banner and blocks new runs until the current run
@@ -485,6 +491,10 @@ Mental model:
485
491
  `.agent-evals/cache/<owner>.json`; each namespace is capped at 100 entries by
486
492
  default. Configure `cache.maxEntriesPerNamespace` for the default cap and
487
493
  `cache.maxEntriesByNamespace` for exact namespace-specific caps.
494
+ - Nested cached JSON values at or above roughly 10K JSON characters are stored
495
+ as content-addressed Brotli blobs under `.agent-evals/cache-blobs/` and
496
+ referenced from cache JSON by sha256. Identical large payloads share the same
497
+ blob.
488
498
  - Authored raw cache keys are stored for debugging under
489
499
  `.agent-evals/cache-debug/<owner>.json`. This folder may include prompts,
490
500
  user inputs, or other sensitive data, should be gitignored, and is not needed
@@ -492,11 +502,11 @@ Mental model:
492
502
  can be filtered to hits or new entries added by cache misses/refreshes.
493
503
  Misses/refreshes with `cache.store: false` are shown as non-stored activity
494
504
  without fetch/delete controls.
495
- - Cached payloads use advanced serialization/deserialization with the Web API
496
- plugin set, so return values and recorded SDK effects preserve richer
497
- built-ins such as `Date`, `Map`, `Set`, typed arrays, `URL`, `Headers`,
498
- `Blob`, and `File` on hits. Undefined values are omitted by default instead
499
- of being written to cache files; direct serializer callers can pass
505
+ - Cached payloads use JSON-safe tagged serialization, so return values and
506
+ recorded SDK effects preserve richer built-ins such as `Date`, `Map`, `Set`,
507
+ typed arrays, `URL`, `Headers`, `Blob`, and `File` on hits. Undefined values
508
+ are omitted by default instead of being written to cache files; direct
509
+ serializer callers can pass
500
510
  `{ preserveUndefined: true }` when explicit undefined wrappers are needed.
501
511
  Cache keys still use the deterministic key-hashing rules above.
502
512
  - Cache mode per run is controlled by CLI flags (see `agent-evals run --help`).
@@ -507,6 +517,8 @@ Run output lives under `.agent-evals/runs/<run-id>/` and cache entries under
507
517
  `.agent-evals/cache/<eval-id>.json`. Files in a run directory include run
508
518
  metadata, a run summary, per-case results, and per-case trace JSON. Inspect
509
519
  these when debugging persisted output, costs, columns, traces, or failures.
520
+ Temporary runs use the same directory layout, but are removed before the next
521
+ run of any kind starts.
510
522
 
511
523
  Use `agent-evals show-runs` when you need stable file
512
524
  paths before reading saved output: