@ls-stack/agent-eval 0.42.1 → 0.42.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
- import { n as createRunner } from "./cli-DbVfkr9T.mjs";
2
- import "./src-CANi3gpd.mjs";
1
+ import { n as createRunner } from "./cli-BeJCJMQo.mjs";
2
+ import "./src-D7_xKo7h.mjs";
3
3
  //#region ../../apps/server/src/runner.ts
4
4
  let runnerInstance = null;
5
5
  function getRunnerInstance() {
@@ -1,2 +1,2 @@
1
- import { n as initRunner, t as getRunnerInstance } from "./runner-DiCQ57JQ.mjs";
1
+ import { n as initRunner, t as getRunnerInstance } from "./runner-B1KygirW.mjs";
2
2
  export { getRunnerInstance, initRunner };
@@ -1,5 +1,5 @@
1
- import { Lt as defineEval$1, rt as matchesEvalTags$1 } from "./runOrchestration-SPaHx-SC.mjs";
2
- import "./cli-DbVfkr9T.mjs";
1
+ import { It as defineEval$1, rt as matchesEvalTags$1 } from "./runOrchestration-OVUFw1fL.mjs";
2
+ import "./cli-BeJCJMQo.mjs";
3
3
  //#region src/index.ts
4
4
  /** Register an eval definition with typed tag support. */
5
5
  function defineEval(definition) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ls-stack/agent-eval",
3
- "version": "0.42.1",
3
+ "version": "0.42.3",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "agent-evals": "./dist/bin.mjs"
@@ -32,9 +32,9 @@
32
32
  "devDependencies": {
33
33
  "@types/node": "^24.7.2",
34
34
  "typescript": "^5.9.2",
35
+ "@agent-evals/sdk": "0.0.1",
35
36
  "@agent-evals/runner": "0.0.1",
36
- "@agent-evals/shared": "0.0.1",
37
- "@agent-evals/sdk": "0.0.1"
37
+ "@agent-evals/shared": "0.0.1"
38
38
  },
39
39
  "scripts": {
40
40
  "build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",
@@ -490,10 +490,10 @@ Mental model:
490
490
  - Cache identity is the namespace plus the authored key. Source-file
491
491
  fingerprints are tracked for run freshness separately, but do not participate
492
492
  in cache-key hashing.
493
- - Cached spans require an explicit `cache.namespace`; value caches default to
494
- `${evalId}__${name}` and can be overridden with `namespace`. Matching
495
- namespaces share entries across operations/evals that use the same authored
496
- key.
493
+ - Cached spans require an explicit `cache.namespace`. Value caches can also set
494
+ an explicit `namespace`; prefer doing that when the cache is part of a
495
+ documented workflow. Matching namespaces share entries across operations/evals
496
+ that use the same authored key.
497
497
  - Per eval, `cache: { read?: boolean; store?: boolean }` controls whether
498
498
  authored cached operations may read or persist entries. Both default to
499
499
  `true`. Use `read: false` to always execute instead of replaying hits, and
@@ -536,10 +536,17 @@ Mental model:
536
536
 
537
537
  ## Artifacts
538
538
 
539
- Run output lives under `.agent-evals/runs/<run-id>/` and cache entries under
540
- `.agent-evals/cache/<eval-id>.json`. Files in a run directory include run
541
- metadata, a run summary, per-case results, and per-case trace JSON. Inspect
542
- these when debugging persisted output, costs, columns, traces, or failures.
539
+ Run output lives under `.agent-evals/runs/<run-id>/`. Cache metadata lives under
540
+ `.agent-evals/cache/`, grouped into runner-managed owner files. Do not rely on a
541
+ specific cache filename when authoring evals; configure cache namespaces
542
+ manually in eval code, then use `agent-evals cache list` or the UI Cache tab to
543
+ inspect the persisted namespace/key entries. Files in a run directory include
544
+ run metadata, a run summary, per-case results, and per-case trace JSON. Inspect
545
+ run files when debugging persisted output, costs, columns, traces, or failures;
546
+ inspect cache entries when debugging replayed span/value-cache results.
547
+ Targeted evals in `run.json` are recorded by exact `evalKeys`
548
+ (`filePath + evalId`) rather than authored eval ids, so duplicate eval ids stay
549
+ unambiguous in saved history.
543
550
  Temporary runs use the same directory layout, but are removed before the next
544
551
  run of any kind starts.
545
552