@ls-stack/agent-eval 0.42.1 → 0.42.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-mOYjX9zq.mjs → app-DPamBr5R.mjs} +4 -4
- package/dist/apps/web/dist/assets/{index-eFM9VIsz.css → index-S3J5Nm0o.css} +1 -1
- package/dist/apps/web/dist/assets/{index-CANDLTsq.js → index-XLJByNnS.js} +2 -2
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/{cli-DbVfkr9T.mjs → cli-BeJCJMQo.mjs} +49 -23
- package/dist/index.d.mts +42 -43
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +1 -1
- package/dist/{runOrchestration-SPaHx-SC.mjs → runOrchestration-OVUFw1fL.mjs} +12 -23
- package/dist/{runner-DiCQ57JQ.mjs → runner-B1KygirW.mjs} +2 -2
- package/dist/{runner-BYOdLBle.mjs → runner-BJQq7cpd.mjs} +1 -1
- package/dist/{src-CANi3gpd.mjs → src-D7_xKo7h.mjs} +2 -2
- package/package.json +3 -3
- package/skills/agent-eval/SKILL.md +15 -8
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-DbVfkr9T.mjs";
|
|
2
|
-
import "./src-CANi3gpd.mjs";
|
|
1
|
+
import { n as createRunner } from "./cli-BeJCJMQo.mjs";
|
|
2
|
+
import "./src-D7_xKo7h.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-DiCQ57JQ.mjs";
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-B1KygirW.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import "./cli-DbVfkr9T.mjs";
|
|
1
|
+
import { It as defineEval$1, rt as matchesEvalTags$1 } from "./runOrchestration-OVUFw1fL.mjs";
|
|
2
|
+
import "./cli-BeJCJMQo.mjs";
|
|
3
3
|
//#region src/index.ts
|
|
4
4
|
/** Register an eval definition with typed tag support. */
|
|
5
5
|
function defineEval(definition) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ls-stack/agent-eval",
|
|
3
|
-
"version": "0.42.1",
|
|
3
|
+
"version": "0.42.3",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"bin": {
|
|
6
6
|
"agent-evals": "./dist/bin.mjs"
|
|
@@ -32,9 +32,9 @@
|
|
|
32
32
|
"devDependencies": {
|
|
33
33
|
"@types/node": "^24.7.2",
|
|
34
34
|
"typescript": "^5.9.2",
|
|
35
|
+
"@agent-evals/sdk": "0.0.1",
|
|
35
36
|
"@agent-evals/runner": "0.0.1",
|
|
36
|
-
"@agent-evals/shared": "0.0.1"
|
|
37
|
-
"@agent-evals/sdk": "0.0.1"
|
|
37
|
+
"@agent-evals/shared": "0.0.1"
|
|
38
38
|
},
|
|
39
39
|
"scripts": {
|
|
40
40
|
"build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",
|
|
@@ -490,10 +490,10 @@ Mental model:
|
|
|
490
490
|
- Cache identity is the namespace plus the authored key. Source-file
|
|
491
491
|
fingerprints are tracked for run freshness separately, but do not participate
|
|
492
492
|
in cache-key hashing.
|
|
493
|
-
- Cached spans require an explicit `cache.namespace
|
|
494
|
-
|
|
495
|
-
namespaces share entries across operations/evals
|
|
496
|
-
key.
|
|
493
|
+
- Cached spans require an explicit `cache.namespace`. Value caches can also set
|
|
494
|
+
an explicit `namespace`; prefer doing that when the cache is part of a
|
|
495
|
+
documented workflow. Matching namespaces share entries across operations/evals
|
|
496
|
+
that use the same authored key.
|
|
497
497
|
- Per eval, `cache: { read?: boolean; store?: boolean }` controls whether
|
|
498
498
|
authored cached operations may read or persist entries. Both default to
|
|
499
499
|
`true`. Use `read: false` to always execute instead of replaying hits, and
|
|
@@ -536,10 +536,17 @@ Mental model:
|
|
|
536
536
|
|
|
537
537
|
## Artifacts
|
|
538
538
|
|
|
539
|
-
Run output lives under `.agent-evals/runs/<run-id
|
|
540
|
-
`.agent-evals/cache
|
|
541
|
-
|
|
542
|
-
|
|
539
|
+
Run output lives under `.agent-evals/runs/<run-id>/`. Cache metadata lives under
|
|
540
|
+
`.agent-evals/cache/`, grouped into runner-managed owner files. Do not rely on a
|
|
541
|
+
specific cache filename when authoring evals; configure cache namespaces
|
|
542
|
+
manually in eval code, then use `agent-evals cache list` or the UI Cache tab to
|
|
543
|
+
inspect the persisted namespace/key entries. Files in a run directory include
|
|
544
|
+
run metadata, a run summary, per-case results, and per-case trace JSON. Inspect
|
|
545
|
+
run files when debugging persisted output, costs, columns, traces, or failures;
|
|
546
|
+
inspect cache entries when debugging replayed span/value-cache results.
|
|
547
|
+
Targeted evals in `run.json` are recorded by exact `evalKeys`
|
|
548
|
+
(`filePath + evalId`) rather than authored eval ids, so duplicate eval ids stay
|
|
549
|
+
unambiguous in saved history.
|
|
543
550
|
Temporary runs use the same directory layout, but are removed before the next
|
|
544
551
|
run of any kind starts.
|
|
545
552
|
|