@ls-stack/agent-eval 0.42.2 → 0.45.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
- import { n as createRunner } from "./cli-BeJCJMQo.mjs";
2
- import "./src-D7_xKo7h.mjs";
1
+ import { n as createRunner } from "./cli-vdJYkEVk.mjs";
2
+ import "./src-BRqs3kSA.mjs";
3
3
  //#region ../../apps/server/src/runner.ts
4
4
  let runnerInstance = null;
5
5
  function getRunnerInstance() {
@@ -1,5 +1,5 @@
1
- import { It as defineEval$1, rt as matchesEvalTags$1 } from "./runOrchestration-OVUFw1fL.mjs";
2
- import "./cli-BeJCJMQo.mjs";
1
+ import { It as defineEval$1, rt as matchesEvalTags$1 } from "./runOrchestration-BFdxG9ws.mjs";
2
+ import "./cli-vdJYkEVk.mjs";
3
3
  //#region src/index.ts
4
4
  /** Register an eval definition with typed tag support. */
5
5
  function defineEval(definition) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ls-stack/agent-eval",
3
- "version": "0.42.2",
3
+ "version": "0.45.0",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "agent-evals": "./dist/bin.mjs"
@@ -33,8 +33,8 @@
33
33
  "@types/node": "^24.7.2",
34
34
  "typescript": "^5.9.2",
35
35
  "@agent-evals/runner": "0.0.1",
36
- "@agent-evals/shared": "0.0.1",
37
- "@agent-evals/sdk": "0.0.1"
36
+ "@agent-evals/sdk": "0.0.1",
37
+ "@agent-evals/shared": "0.0.1"
38
38
  },
39
39
  "scripts": {
40
40
  "build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",
@@ -120,7 +120,7 @@ export async function runRefundWorkflow(input: RefundInput) {
120
120
  kind: 'llm',
121
121
  name: 'plan-refund',
122
122
  cache: {
123
- namespace: 'refund-workflow__plan-refund',
123
+ namespace: 'refund-workflow.plan-refund',
124
124
  key: { prompt: input.message, model: 'gpt-4o-mini' },
125
125
  },
126
126
  },
@@ -444,7 +444,7 @@ await evalTracer.span(
444
444
  kind: 'llm',
445
445
  name: 'plan-refund',
446
446
  cache: {
447
- namespace: 'refund-workflow__plan-refund',
447
+ namespace: 'refund-workflow.plan-refund',
448
448
  key: { prompt: input.message, model: 'gpt-4o-mini' },
449
449
  },
450
450
  },
@@ -510,21 +510,23 @@ Mental model:
510
510
  `name`/`lastModified` for `File`) and do not read file bytes. Add
511
511
  `serializeFileBytes: true` to a cached span or `evalTracer.cache(...)` call
512
512
  when byte-level cache invalidation is required.
513
- - Cache entries are stored in inspectable owner files under
514
- `.agent-evals/cache/<owner>.json`; each namespace is capped at 100 entries by
515
- default. Configure `cache.maxEntriesPerNamespace` for the default cap and
513
+ - Cache entries are stored as one Brotli-compressed JSON file per key under
514
+ `.agent-evals/cache/<sanitizedNamespace>/<keyHash>.json.br`; each namespace
515
+ is capped at 100 entries by default. Configure
516
+ `cache.maxEntriesPerNamespace` for the default cap and
516
517
  `cache.maxEntriesByNamespace` for exact namespace-specific caps.
517
518
  - Nested cached JSON values at or above roughly 10K JSON characters are stored
518
519
  as content-addressed Brotli blobs under `.agent-evals/cache-blobs/` and
519
520
  referenced from cache JSON by sha256. Identical large payloads share the same
520
521
  blob.
521
522
  - Authored raw cache keys are stored for debugging under
522
- `.agent-evals/cache-debug/<owner>.json`. This folder may include prompts,
523
- user inputs, or other sensitive data, should be gitignored, and is not needed
524
- for cache reuse. The UI Cache tab shows the raw key when it is available and
525
- can be filtered to hits or new entries added by cache misses/refreshes.
526
- Misses/refreshes with `cache.store: false` are shown as non-stored activity
527
- without fetch/delete controls.
523
+ `.agent-evals/cache-debug/<sanitizedNamespace>/<keyHash>.json`. This folder
524
+ may include prompts, user inputs, full serialized cache payloads, or other
525
+ sensitive data, should be gitignored, and is not needed for cache reuse. The
526
+ UI Cache tab shows the raw key when it is available and can be filtered to
527
+ hits or new entries added by cache misses/refreshes. Misses/refreshes with
528
+ `cache.store: false` are shown as non-stored activity without fetch/delete
529
+ controls.
528
530
  - Cached payloads use JSON-safe tagged serialization, so return values and
529
531
  recorded SDK effects preserve richer built-ins such as `Date`, `Map`, `Set`,
530
532
  typed arrays, `URL`, `Headers`, `Blob`, and `File` on hits. Undefined values
@@ -537,7 +539,7 @@ Mental model:
537
539
  ## Artifacts
538
540
 
539
541
  Run output lives under `.agent-evals/runs/<run-id>/`. Cache metadata lives under
540
- `.agent-evals/cache/`, grouped into runner-managed owner files. Do not rely on a
542
+ `.agent-evals/cache/<sanitizedNamespace>/<keyHash>.json.br`. Do not rely on a
541
543
  specific cache filename when authoring evals; configure cache namespaces
542
544
  manually in eval code, then use `agent-evals cache list` or the UI Cache tab to
543
545
  inspect the persisted namespace/key entries. Files in a run directory include