@ls-stack/agent-eval 0.40.0 → 0.42.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-0k8Y1OBk.mjs → app-1vE5Ryry.mjs} +5 -5
- package/dist/apps/web/dist/assets/index-5CB9eJZy.js +140 -0
- package/dist/apps/web/dist/assets/index-eFM9VIsz.css +1 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/{cli-Sg8UdOnm.mjs → cli-Bk5g-bat.mjs} +43 -11
- package/dist/index.d.mts +3035 -3369
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +5 -3
- package/dist/{runOrchestration-CTzVNrDP.mjs → runOrchestration-DhTiT4V0.mjs} +4635 -4262
- package/dist/{runner-njK_CtXC.mjs → runner-B1Cyevvr.mjs} +1 -1
- package/dist/{runner-Bb9JdFkg.mjs → runner-BG0L4yId.mjs} +2 -2
- package/dist/src-t6OVp1li.mjs +13 -0
- package/package.json +3 -3
- package/skills/agent-eval/SKILL.md +24 -1
- package/dist/apps/web/dist/assets/index-C4v6dWcv.js +0 -140
- package/dist/apps/web/dist/assets/index-C58_zLA9.css +0 -1
- package/dist/src-BZzPFS8r.mjs +0 -3
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-Bb9JdFkg.mjs";
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-BG0L4yId.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-Sg8UdOnm.mjs";
|
|
2
|
-
import "./src-BZzPFS8r.mjs";
|
|
1
|
+
import { n as createRunner } from "./cli-Bk5g-bat.mjs";
|
|
2
|
+
import "./src-t6OVp1li.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import { Lt as defineEval$1, rt as matchesEvalTags$1 } from "./runOrchestration-DhTiT4V0.mjs";
|
|
2
|
+
import "./cli-Bk5g-bat.mjs";
|
|
3
|
+
//#region src/index.ts
|
|
4
|
+
/** Register an eval definition with typed tag support. */
|
|
5
|
+
function defineEval(definition) {
|
|
6
|
+
defineEval$1(definition);
|
|
7
|
+
}
|
|
8
|
+
/** Return whether the active eval case has tags matching the typed input. */
|
|
9
|
+
function matchesEvalTags(input) {
|
|
10
|
+
return matchesEvalTags$1(input);
|
|
11
|
+
}
|
|
12
|
+
//#endregion
|
|
13
|
+
export { matchesEvalTags as n, defineEval as t };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ls-stack/agent-eval",
|
|
3
|
-
"version": "0.40.0",
|
|
3
|
+
"version": "0.42.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"bin": {
|
|
6
6
|
"agent-evals": "./dist/bin.mjs"
|
|
@@ -33,8 +33,8 @@
|
|
|
33
33
|
"@types/node": "^24.7.2",
|
|
34
34
|
"typescript": "^5.9.2",
|
|
35
35
|
"@agent-evals/runner": "0.0.1",
|
|
36
|
-
"@agent-evals/
|
|
37
|
-
"@agent-evals/
|
|
36
|
+
"@agent-evals/shared": "0.0.1",
|
|
37
|
+
"@agent-evals/sdk": "0.0.1"
|
|
38
38
|
},
|
|
39
39
|
"scripts": {
|
|
40
40
|
"build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",
|
|
@@ -25,7 +25,8 @@ display rules), read the TypeScript declarations shipped with the package:
|
|
|
25
25
|
- The CLI automatically loads `.env` from the current workspace. Shell-provided
|
|
26
26
|
environment variables win; pass `--no-env` to disable `.env` loading once.
|
|
27
27
|
- Unfiltered `agent-evals run` is disabled by default; use `--eval` or `--case`
|
|
28
|
-
for targeted CLI runs. Set `allowCliRunAll: true` in
|
|
28
|
+
for targeted CLI runs, or `--tags-filter <expr>` to run cases matching tags.
|
|
29
|
+
Set `allowCliRunAll: true` in
|
|
29
30
|
`agent-evals.config.ts` to opt into run-all CLI behavior. The web UI can
|
|
30
31
|
still run grouped evals and confirms before starting more than five. On a
|
|
31
32
|
single eval page, the Run chevron can open a picker to run specific authored
|
|
@@ -69,6 +70,25 @@ during case-owned phases by default; log arguments are stored as JSON-safe
|
|
|
69
70
|
values and rendered with the JSON viewer, collapsed previews include best-effort
|
|
70
71
|
code locations when stack data is available, previews are capped, and logs
|
|
71
72
|
inside cached operations are not replayed from cache hits.
|
|
73
|
+
Use eval tags to target related coverage without naming every case:
|
|
74
|
+
`AgentEvalsConfig.tags` applies workspace-wide tags, `defineEval({ tags })`
|
|
75
|
+
adds eval tags, `case.tags` adds case-only tags, and `removeTags` disables a
|
|
76
|
+
configured global tag for one eval. CLI filters support Vitest-style tag
|
|
77
|
+
expressions such as `agent-evals run --tags-filter "refunds && !slow"`.
|
|
78
|
+
Inside eval-scoped code, use `matchesEvalTags('tag')` or
|
|
79
|
+
`matchesEvalTags({ all, any, not })`; it uses typed exact tag names and returns
|
|
80
|
+
`false` outside a case scope. Projects can narrow tag names with a `.d.ts`
|
|
81
|
+
module augmentation:
|
|
82
|
+
|
|
83
|
+
```ts
|
|
84
|
+
import '@ls-stack/agent-eval';
|
|
85
|
+
|
|
86
|
+
declare module '@ls-stack/agent-eval' {
|
|
87
|
+
interface AgentEvalTagRegistry {
|
|
88
|
+
tags: 'refunds' | 'media' | 'manual' | 'slow';
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
```
|
|
72
92
|
|
|
73
93
|
### Product code (instrumented once, reused everywhere)
|
|
74
94
|
|
|
@@ -333,6 +353,9 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
|
|
|
333
353
|
column from the runs table when every rendered row is missing the value,
|
|
334
354
|
`null`, or an empty string; `0` and `false` still count as values, and the
|
|
335
355
|
value remains available in case details and raw output data.
|
|
356
|
+
In the case detail Output tab, string outputs that look like Markdown render
|
|
357
|
+
as Markdown even without `format: 'markdown'`, with a Preview/Raw toggle for
|
|
358
|
+
inspecting the original text.
|
|
336
359
|
- `deriveFromTracing` can be authored globally in `agent-evals.config.ts` or
|
|
337
360
|
locally on one eval. Prefer the keyed map form for shared metrics:
|
|
338
361
|
`deriveFromTracing: { toolCalls: ({ trace }) => trace.findSpansByKind('tool').length }`.
|