@ls-stack/agent-eval 0.55.1 → 0.55.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-BD0D9-7k.mjs → app-NI4to6lp.mjs} +4 -4
- package/dist/apps/web/dist/assets/{index-CvsPmlHl.js → index-C7QjETk8.js} +54 -54
- package/dist/apps/web/dist/index.html +1 -1
- package/dist/bin.mjs +1 -1
- package/dist/caseChild.mjs +1 -1
- package/dist/{cli-BR3wMZMx.mjs → cli-Bu9347r1.mjs} +60 -13
- package/dist/index.d.mts +37 -24
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +2 -2
- package/dist/{runExecution-Sw38bCaq.mjs → runExecution-C31dpemR.mjs} +363 -114
- package/dist/{runOrchestration-DJsdLYeZ.mjs → runOrchestration-3RoHLW4U.mjs} +1 -1
- package/dist/{runner-72rsqJRq.mjs → runner-B4EfMn1d.mjs} +2 -2
- package/dist/{runner-dB69WsnM.mjs → runner-CTp9zHbM.mjs} +1 -1
- package/dist/{src-hBGtzWuA.mjs → src-FR60ZR_4.mjs} +2 -2
- package/package.json +3 -3
- package/skills/agent-eval/SKILL.md +19 -8
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { Mt as defineEval$1, W as matchesEvalTags$1 } from "./runExecution-Sw38bCaq.mjs";
|
|
2
|
-
import "./cli-BR3wMZMx.mjs";
|
|
1
|
+
import { Mt as defineEval$1, W as matchesEvalTags$1 } from "./runExecution-C31dpemR.mjs";
|
|
2
|
+
import "./cli-Bu9347r1.mjs";
|
|
3
3
|
//#region src/index.ts
|
|
4
4
|
/** Register an eval definition with typed tag support. */
|
|
5
5
|
function defineEval(definition) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ls-stack/agent-eval",
|
|
3
|
-
"version": "0.55.1",
|
|
3
|
+
"version": "0.55.2",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"bin": {
|
|
6
6
|
"agent-evals": "./dist/bin.mjs"
|
|
@@ -33,8 +33,8 @@
|
|
|
33
33
|
"@types/node": "^24.7.2",
|
|
34
34
|
"typescript": "^5.9.2",
|
|
35
35
|
"@agent-evals/runner": "0.0.1",
|
|
36
|
-
"@agent-evals/
|
|
37
|
-
"@agent-evals/
|
|
36
|
+
"@agent-evals/sdk": "0.0.1",
|
|
37
|
+
"@agent-evals/shared": "0.0.1"
|
|
38
38
|
},
|
|
39
39
|
"scripts": {
|
|
40
40
|
"build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",
|
|
@@ -513,12 +513,21 @@ Mental model:
|
|
|
513
513
|
`serializeFileBytes: true` to a cached span or `evalTracer.cache(...)` call
|
|
514
514
|
when byte-level cache invalidation is required.
|
|
515
515
|
- Cache entries are stored as one Brotli-compressed JSON file per key under
|
|
516
|
-
`.agent-evals/cache/<sanitizedNamespace>/<keyHash>.json.br
|
|
517
|
-
|
|
516
|
+
`.agent-evals/cache/<sanitizedNamespace>/<keyHash>.json.br`, with a small
|
|
517
|
+
namespace index sidecar at
|
|
518
|
+
`.agent-evals/cache/<sanitizedNamespace>/.index-<namespaceHash>.json`.
|
|
519
|
+
Listing and retention use the index without opening cached payloads. Index
|
|
520
|
+
rows intentionally stay minimal: stored time, last access time, and external
|
|
521
|
+
JSON blob refs. Each namespace is capped at 100 entries by default. The runner
|
|
522
|
+
prunes least recently accessed indexed entries after a run finishes and the
|
|
523
|
+
runner stays idle for `cache.pruneIdleDelayMs ?? 5000` milliseconds. Configure
|
|
518
524
|
`cache.maxEntriesPerNamespace` for the default cap and
|
|
519
525
|
`cache.maxEntriesByNamespace` for exact namespace-specific caps.
|
|
526
|
+
- Unindexed legacy cache files are ignored by normal lookup/listing. Use
|
|
527
|
+
`agent-evals cache repair` to remove unindexed cache files, stale index rows,
|
|
528
|
+
debug sidecars, and unreferenced blob files.
|
|
520
529
|
- Nested cached JSON values at or above roughly 10K JSON characters are stored
|
|
521
|
-
as content-addressed Brotli blobs under `.agent-evals/cache-blobs/` and
|
|
530
|
+
as content-addressed Brotli blobs under `.agent-evals/cache/cache-blobs/` and
|
|
522
531
|
referenced from cache JSON by sha256. Identical large payloads share the same
|
|
523
532
|
blob.
|
|
524
533
|
- Authored raw cache keys are stored for debugging under
|
|
@@ -536,11 +545,13 @@ Mental model:
|
|
|
536
545
|
|
|
537
546
|
## Artifacts
|
|
538
547
|
|
|
539
|
-
Run output lives under `.agent-evals/runs/<run-id>/`. Cache
|
|
540
|
-
`.agent-evals/cache/<sanitizedNamespace>/<keyHash>.json.br
|
|
541
|
-
|
|
542
|
-
manually in eval code, then use
|
|
543
|
-
persisted namespace/key entries
|
|
548
|
+
Run output lives under `.agent-evals/runs/<run-id>/`. Cache payloads live under
|
|
549
|
+
`.agent-evals/cache/<sanitizedNamespace>/<keyHash>.json.br` with namespace
|
|
550
|
+
index sidecars next to them. Do not rely on a specific cache filename when
|
|
551
|
+
authoring evals; configure cache namespaces manually in eval code, then use
|
|
552
|
+
`agent-evals cache list` to inspect persisted namespace/key entries or
|
|
553
|
+
`agent-evals cache repair` to clean orphaned cache artifacts. Files in a run
|
|
554
|
+
directory include run metadata,
|
|
544
555
|
a run summary, per-case results, and per-case trace JSON. Inspect run files when
|
|
545
556
|
debugging persisted output, costs, columns, traces, or failures; inspect cache
|
|
546
557
|
entries when debugging replayed span/value-cache results.
|