@ls-stack/agent-eval 0.36.0 → 0.38.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-BlNzXWDM.mjs → app-DD-8kx5H.mjs} +50 -6
- package/dist/apps/web/dist/assets/index-C6PgBOfP.css +1 -0
- package/dist/apps/web/dist/assets/index-CO86PsY-.js +140 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +14 -3
- package/dist/{cli-Dg3abrOv.mjs → cli-BUX6tr9J.mjs} +106 -25
- package/dist/index.d.mts +159 -150
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +73 -2
- package/dist/{runOrchestration-V1TxX8es.mjs → runOrchestration-BhUFWvq9.mjs} +293 -121
- package/dist/{runner-BCs5rzej.mjs → runner-B1wfPKNH.mjs} +2 -2
- package/dist/{runner-znY6PY1M.mjs → runner-CoRf7Vef.mjs} +1 -1
- package/dist/src-BwJ5tod2.mjs +3 -0
- package/package.json +3 -3
- package/skills/agent-eval/SKILL.md +18 -6
- package/dist/apps/web/dist/assets/index-BYtcGddU.js +0 -140
- package/dist/apps/web/dist/assets/index-D0rC5MSS.css +0 -1
- package/dist/src-DBypR4TV.mjs +0 -3
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-Dg3abrOv.mjs";
|
|
2
|
-
import "./src-DBypR4TV.mjs";
|
|
1
|
+
import { n as createRunner } from "./cli-BUX6tr9J.mjs";
|
|
2
|
+
import "./src-BwJ5tod2.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-BCs5rzej.mjs";
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-B1wfPKNH.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ls-stack/agent-eval",
|
|
3
|
-
"version": "0.36.0",
|
|
3
|
+
"version": "0.38.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"bin": {
|
|
6
6
|
"agent-evals": "./dist/bin.mjs"
|
|
@@ -31,9 +31,9 @@
|
|
|
31
31
|
"devDependencies": {
|
|
32
32
|
"@types/node": "^24.7.2",
|
|
33
33
|
"typescript": "^5.9.2",
|
|
34
|
+
"@agent-evals/sdk": "0.0.1",
|
|
34
35
|
"@agent-evals/runner": "0.0.1",
|
|
35
|
-
"@agent-evals/shared": "0.0.1"
|
|
36
|
-
"@agent-evals/sdk": "0.0.1"
|
|
36
|
+
"@agent-evals/shared": "0.0.1"
|
|
37
37
|
},
|
|
38
38
|
"scripts": {
|
|
39
39
|
"build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",
|
|
@@ -27,7 +27,13 @@ display rules), read the TypeScript declarations shipped with the package:
|
|
|
27
27
|
- Unfiltered `agent-evals run` is disabled by default; use `--eval` or `--case`
|
|
28
28
|
for targeted CLI runs. Set `allowCliRunAll: true` in
|
|
29
29
|
`agent-evals.config.ts` to opt into run-all CLI behavior. The web UI can
|
|
30
|
-
still run grouped evals and confirms before starting more than five.
|
|
30
|
+
still run grouped evals and confirms before starting more than five. On a
|
|
31
|
+
single eval page, the Run chevron can open a picker to run specific authored
|
|
32
|
+
case ids; those case-picked runs are temporary by default and can be made
|
|
33
|
+
durable in the modal.
|
|
34
|
+
- `agent-evals run --temporary` persists a run like normal history, but deletes
|
|
35
|
+
it before the next run starts. Temporary runs appear in `show-runs` and the UI
|
|
36
|
+
while present; normal runs are never deleted by temporary-run cleanup.
|
|
31
37
|
- `agent-evals app` watches `agent-evals.config.ts` and reloads config in
|
|
32
38
|
place when the runner is idle. If config changes during an active run, the UI
|
|
33
39
|
shows a pending reload banner and blocks new runs until the current run
|
|
@@ -485,6 +491,10 @@ Mental model:
|
|
|
485
491
|
`.agent-evals/cache/<owner>.json`; each namespace is capped at 100 entries by
|
|
486
492
|
default. Configure `cache.maxEntriesPerNamespace` for the default cap and
|
|
487
493
|
`cache.maxEntriesByNamespace` for exact namespace-specific caps.
|
|
494
|
+
- Nested cached JSON values at or above roughly 10K JSON characters are stored
|
|
495
|
+
as content-addressed Brotli blobs under `.agent-evals/cache-blobs/` and
|
|
496
|
+
referenced from cache JSON by sha256. Identical large payloads share the same
|
|
497
|
+
blob.
|
|
488
498
|
- Authored raw cache keys are stored for debugging under
|
|
489
499
|
`.agent-evals/cache-debug/<owner>.json`. This folder may include prompts,
|
|
490
500
|
user inputs, or other sensitive data, should be gitignored, and is not needed
|
|
@@ -492,11 +502,11 @@ Mental model:
|
|
|
492
502
|
can be filtered to hits or new entries added by cache misses/refreshes.
|
|
493
503
|
Misses/refreshes with `cache.store: false` are shown as non-stored activity
|
|
494
504
|
without fetch/delete controls.
|
|
495
|
-
- Cached payloads use
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
505
|
+
- Cached payloads use JSON-safe tagged serialization, so return values and
|
|
506
|
+
recorded SDK effects preserve richer built-ins such as `Date`, `Map`, `Set`,
|
|
507
|
+
typed arrays, `URL`, `Headers`, `Blob`, and `File` on hits. Undefined values
|
|
508
|
+
are omitted by default instead of being written to cache files; direct
|
|
509
|
+
serializer callers can pass
|
|
500
510
|
`{ preserveUndefined: true }` when explicit undefined wrappers are needed.
|
|
501
511
|
Cache keys still use the deterministic key-hashing rules above.
|
|
502
512
|
- Cache mode per run is controlled by CLI flags (see `agent-evals run --help`).
|
|
@@ -507,6 +517,8 @@ Run output lives under `.agent-evals/runs/<run-id>/` and cache entries under
|
|
|
507
517
|
`.agent-evals/cache/<eval-id>.json`. Files in a run directory include run
|
|
508
518
|
metadata, a run summary, per-case results, and per-case trace JSON. Inspect
|
|
509
519
|
these when debugging persisted output, costs, columns, traces, or failures.
|
|
520
|
+
Temporary runs use the same directory layout, but are removed before the next
|
|
521
|
+
run of any kind starts.
|
|
510
522
|
|
|
511
523
|
Use `agent-evals show-runs` when you need stable file
|
|
512
524
|
paths before reading saved output:
|