@ls-stack/agent-eval 0.42.2 → 0.45.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-DPamBr5R.mjs → app-BZmhhSFZ.mjs} +10 -4
- package/dist/apps/web/dist/assets/index-BU3IqUso.css +1 -0
- package/dist/apps/web/dist/assets/index-am2Knit6.js +140 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/{cli-BeJCJMQo.mjs → cli-vdJYkEVk.mjs} +28 -5
- package/dist/index.d.mts +368 -75
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +1 -1
- package/dist/{runOrchestration-OVUFw1fL.mjs → runOrchestration-BFdxG9ws.mjs} +259 -250
- package/dist/{runner-BJQq7cpd.mjs → runner--aH0jO4Z.mjs} +1 -1
- package/dist/{runner-B1KygirW.mjs → runner-DJWn_7p0.mjs} +2 -2
- package/dist/{src-D7_xKo7h.mjs → src-BRqs3kSA.mjs} +2 -2
- package/package.json +3 -3
- package/skills/agent-eval/SKILL.md +14 -12
- package/dist/apps/web/dist/assets/index-DH0qYD76.js +0 -140
- package/dist/apps/web/dist/assets/index-eFM9VIsz.css +0 -1
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-BeJCJMQo.mjs";
|
|
2
|
-
import "./src-D7_xKo7h.mjs";
|
|
1
|
+
import { n as createRunner } from "./cli-vdJYkEVk.mjs";
|
|
2
|
+
import "./src-BRqs3kSA.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { It as defineEval$1, rt as matchesEvalTags$1 } from "./runOrchestration-OVUFw1fL.mjs";
|
|
2
|
-
import "./cli-BeJCJMQo.mjs";
|
|
1
|
+
import { It as defineEval$1, rt as matchesEvalTags$1 } from "./runOrchestration-BFdxG9ws.mjs";
|
|
2
|
+
import "./cli-vdJYkEVk.mjs";
|
|
3
3
|
//#region src/index.ts
|
|
4
4
|
/** Register an eval definition with typed tag support. */
|
|
5
5
|
function defineEval(definition) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ls-stack/agent-eval",
|
|
3
|
-
"version": "0.42.2",
|
|
3
|
+
"version": "0.45.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"bin": {
|
|
6
6
|
"agent-evals": "./dist/bin.mjs"
|
|
@@ -33,8 +33,8 @@
|
|
|
33
33
|
"@types/node": "^24.7.2",
|
|
34
34
|
"typescript": "^5.9.2",
|
|
35
35
|
"@agent-evals/runner": "0.0.1",
|
|
36
|
-
"@agent-evals/
|
|
37
|
-
"@agent-evals/
|
|
36
|
+
"@agent-evals/sdk": "0.0.1",
|
|
37
|
+
"@agent-evals/shared": "0.0.1"
|
|
38
38
|
},
|
|
39
39
|
"scripts": {
|
|
40
40
|
"build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",
|
|
@@ -120,7 +120,7 @@ export async function runRefundWorkflow(input: RefundInput) {
|
|
|
120
120
|
kind: 'llm',
|
|
121
121
|
name: 'plan-refund',
|
|
122
122
|
cache: {
|
|
123
|
-
namespace: 'refund-
|
|
123
|
+
namespace: 'refund-workflow.plan-refund',
|
|
124
124
|
key: { prompt: input.message, model: 'gpt-4o-mini' },
|
|
125
125
|
},
|
|
126
126
|
},
|
|
@@ -444,7 +444,7 @@ await evalTracer.span(
|
|
|
444
444
|
kind: 'llm',
|
|
445
445
|
name: 'plan-refund',
|
|
446
446
|
cache: {
|
|
447
|
-
namespace: 'refund-
|
|
447
|
+
namespace: 'refund-workflow.plan-refund',
|
|
448
448
|
key: { prompt: input.message, model: 'gpt-4o-mini' },
|
|
449
449
|
},
|
|
450
450
|
},
|
|
@@ -510,21 +510,23 @@ Mental model:
|
|
|
510
510
|
`name`/`lastModified` for `File`) and do not read file bytes. Add
|
|
511
511
|
`serializeFileBytes: true` to a cached span or `evalTracer.cache(...)` call
|
|
512
512
|
when byte-level cache invalidation is required.
|
|
513
|
-
- Cache entries are stored
|
|
514
|
-
`.agent-evals/cache/<
|
|
515
|
-
|
|
513
|
+
- Cache entries are stored as one Brotli-compressed JSON file per key under
|
|
514
|
+
`.agent-evals/cache/<sanitizedNamespace>/<keyHash>.json.br`; each namespace
|
|
515
|
+
is capped at 100 entries by default. Configure
|
|
516
|
+
`cache.maxEntriesPerNamespace` for the default cap and
|
|
516
517
|
`cache.maxEntriesByNamespace` for exact namespace-specific caps.
|
|
517
518
|
- Nested cached JSON values at or above roughly 10K JSON characters are stored
|
|
518
519
|
as content-addressed Brotli blobs under `.agent-evals/cache-blobs/` and
|
|
519
520
|
referenced from cache JSON by sha256. Identical large payloads share the same
|
|
520
521
|
blob.
|
|
521
522
|
- Authored raw cache keys are stored for debugging under
|
|
522
|
-
`.agent-evals/cache-debug/<
|
|
523
|
-
user inputs,
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
without fetch/delete
|
|
523
|
+
`.agent-evals/cache-debug/<sanitizedNamespace>/<keyHash>.json`. This folder
|
|
524
|
+
may include prompts, user inputs, full serialized cache payloads, or other
|
|
525
|
+
sensitive data, should be gitignored, and is not needed for cache reuse. The
|
|
526
|
+
UI Cache tab shows the raw key when it is available and can be filtered to
|
|
527
|
+
hits or new entries added by cache misses/refreshes. Misses/refreshes with
|
|
528
|
+
`cache.store: false` are shown as non-stored activity without fetch/delete
|
|
529
|
+
controls.
|
|
528
530
|
- Cached payloads use JSON-safe tagged serialization, so return values and
|
|
529
531
|
recorded SDK effects preserve richer built-ins such as `Date`, `Map`, `Set`,
|
|
530
532
|
typed arrays, `URL`, `Headers`, `Blob`, and `File` on hits. Undefined values
|
|
@@ -537,7 +539,7 @@ Mental model:
|
|
|
537
539
|
## Artifacts
|
|
538
540
|
|
|
539
541
|
Run output lives under `.agent-evals/runs/<run-id>/`. Cache metadata lives under
|
|
540
|
-
`.agent-evals/cache
|
|
542
|
+
`.agent-evals/cache/<sanitizedNamespace>/<keyHash>.json.br`. Do not rely on a
|
|
541
543
|
specific cache filename when authoring evals; configure cache namespaces
|
|
542
544
|
manually in eval code, then use `agent-evals cache list` or the UI Cache tab to
|
|
543
545
|
inspect the persisted namespace/key entries. Files in a run directory include
|