@ls-stack/agent-eval 0.27.1 → 0.29.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-CJj1yPPD.mjs → app-D6-msfKP.mjs} +45 -6
- package/dist/apps/web/dist/assets/index-BCr6J8Uj.js +118 -0
- package/dist/apps/web/dist/assets/index-DjUTm3M-.css +1 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/{cli-Clf8xUFa.mjs → cli-CIc_gBNM.mjs} +965 -167
- package/dist/index.d.mts +5828 -3368
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +4 -2
- package/dist/{runOrchestration-FEvBwwJI.mjs → runOrchestration-CIARrLs6.mjs} +1046 -228
- package/dist/{runner-zqKwTlNj.mjs → runner-1F8MeY5V.mjs} +2 -2
- package/dist/{runner-KbDKLSU4.mjs → runner-Bq1f9B9d.mjs} +1 -1
- package/dist/src-CkWT1iSu.mjs +3 -0
- package/package.json +2 -29
- package/skills/agent-eval/SKILL.md +104 -20
- package/dist/apps/web/dist/assets/index-6YqV9t4k.js +0 -118
- package/dist/apps/web/dist/assets/index-C-OiMSQD.css +0 -1
- package/dist/bin.d.mts +0 -1
- package/dist/runChild.d.mts +0 -1
- package/dist/src-BBwT7_cy.mjs +0 -3
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-Clf8xUFa.mjs";
|
|
2
|
-
import "./src-BBwT7_cy.mjs";
|
|
1
|
+
import { n as createRunner } from "./cli-CIc_gBNM.mjs";
|
|
2
|
+
import "./src-CkWT1iSu.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-zqKwTlNj.mjs";
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-1F8MeY5V.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ls-stack/agent-eval",
|
|
3
|
-
"version": "0.27.1",
|
|
3
|
+
"version": "0.29.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"bin": {
|
|
6
6
|
"agent-evals": "./dist/bin.mjs"
|
|
@@ -16,33 +16,6 @@
|
|
|
16
16
|
"dist",
|
|
17
17
|
"skills"
|
|
18
18
|
],
|
|
19
|
-
"tsdown": {
|
|
20
|
-
"clean": true,
|
|
21
|
-
"copy": [
|
|
22
|
-
{
|
|
23
|
-
"from": "../../apps/web/dist/**/*",
|
|
24
|
-
"to": "dist/web",
|
|
25
|
-
"flatten": false
|
|
26
|
-
}
|
|
27
|
-
],
|
|
28
|
-
"deps": {
|
|
29
|
-
"alwaysBundle": [
|
|
30
|
-
"@agent-evals/runner",
|
|
31
|
-
"@agent-evals/sdk",
|
|
32
|
-
"@agent-evals/shared"
|
|
33
|
-
]
|
|
34
|
-
},
|
|
35
|
-
"dts": {
|
|
36
|
-
"eager": true
|
|
37
|
-
},
|
|
38
|
-
"entry": [
|
|
39
|
-
"src/index.ts",
|
|
40
|
-
"src/bin.ts",
|
|
41
|
-
"src/runChild.ts"
|
|
42
|
-
],
|
|
43
|
-
"format": "esm",
|
|
44
|
-
"platform": "node"
|
|
45
|
-
},
|
|
46
19
|
"dependencies": {
|
|
47
20
|
"@hono/node-server": "^1.14.3",
|
|
48
21
|
"@hono/zod-validator": "^0.7.3",
|
|
@@ -63,7 +36,7 @@
|
|
|
63
36
|
"@agent-evals/shared": "0.0.1"
|
|
64
37
|
},
|
|
65
38
|
"scripts": {
|
|
66
|
-
"build": "pnpm --filter @agent-evals/web build && tsdown",
|
|
39
|
+
"build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",
|
|
67
40
|
"test": "vitest run",
|
|
68
41
|
"tsc": "tsgo --noEmit",
|
|
69
42
|
"lint": "tsgo --noEmit && eslint .",
|
|
@@ -16,9 +16,9 @@ This skill covers the mental model and conventions. For exhaustive field lists
|
|
|
16
16
|
display rules), read the TypeScript declarations shipped with the package:
|
|
17
17
|
|
|
18
18
|
- `AgentEvalsConfig`, `EvalDefinition`, `EvalCase`, `EvalOutputs`,
|
|
19
|
-
`EvalColumnOverride`, `
|
|
20
|
-
`EvalTraceTree`, `TraceSpanInfo`, and `z` are exported
|
|
21
|
-
`@ls-stack/agent-eval`.
|
|
19
|
+
`EvalColumnOverride`, `EvalDeriveConfig`, `EvalScoreDef`,
|
|
20
|
+
`EvalManualScoreDef`, `EvalTraceTree`, `TraceSpanInfo`, and `z` are exported
|
|
21
|
+
from `@ls-stack/agent-eval`.
|
|
22
22
|
- `.d.ts` files land in `node_modules/@ls-stack/agent-eval/dist/`.
|
|
23
23
|
- CLI surface: `agent-evals --help` and `agent-evals <command> --help`.
|
|
24
24
|
Unknown help targets exit non-zero instead of falling back to global help.
|
|
@@ -28,6 +28,10 @@ display rules), read the TypeScript declarations shipped with the package:
|
|
|
28
28
|
for targeted CLI runs. Set `allowCliRunAll: true` in
|
|
29
29
|
`agent-evals.config.ts` to opt into run-all CLI behavior. The web UI can
|
|
30
30
|
still run grouped evals and confirms before starting more than five.
|
|
31
|
+
- `agent-evals app` watches `agent-evals.config.ts` and reloads config in
|
|
32
|
+
place when the runner is idle. If config changes during an active run, the UI
|
|
33
|
+
shows a pending reload banner and blocks new runs until the current run
|
|
34
|
+
reaches a terminal state and the reload applies.
|
|
31
35
|
|
|
32
36
|
Assume that enumerated tables in this document may lag behind the types —
|
|
33
37
|
treat the types as source of truth when they disagree.
|
|
@@ -230,6 +234,50 @@ Case `id` values anchor historical runs, caches, and manual scores — keep them
|
|
|
230
234
|
stable. See `EvalDefinition` / `EvalCase` in the types for every supported
|
|
231
235
|
field.
|
|
232
236
|
|
|
237
|
+
### Manual input
|
|
238
|
+
|
|
239
|
+
Use `manualInput` instead of `cases` when each run should pause for the user
|
|
240
|
+
to type values:
|
|
241
|
+
|
|
242
|
+
```ts
|
|
243
|
+
const inputSchema = z.object({
|
|
244
|
+
name: z.string().min(1),
|
|
245
|
+
tone: z.enum(['friendly', 'formal']),
|
|
246
|
+
notes: z.string().max(500).optional(),
|
|
247
|
+
sendEmail: z.boolean().default(false),
|
|
248
|
+
});
|
|
249
|
+
|
|
250
|
+
defineEval<z.infer<typeof inputSchema>>({
|
|
251
|
+
id: 'manual-input-greeting',
|
|
252
|
+
manualInput: {
|
|
253
|
+
schema: inputSchema,
|
|
254
|
+
title: 'Greet someone',
|
|
255
|
+
submitLabel: 'Greet',
|
|
256
|
+
fields: { notes: { multiline: true, rows: 4 } },
|
|
257
|
+
},
|
|
258
|
+
execute: ({ input, setOutput }) => {
|
|
259
|
+
setOutput('greeting', `Hi, ${input.name}!`);
|
|
260
|
+
},
|
|
261
|
+
});
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
The web UI opens a modal driven by the descriptor derived from the schema
|
|
265
|
+
(`z.string` → text, `z.enum` → select, `z.boolean` → checkbox, etc.; nested
|
|
266
|
+
shapes fall back to a JSON textarea). The CLI accepts `--input '<json>'` for a
|
|
267
|
+
single targeted eval or `--input-file <path>` mapping eval keys/ids to inputs.
|
|
268
|
+
Each run produces one synthetic case `<evalId>-manual` with the validated
|
|
269
|
+
submission; mixing `manualInput` with `cases` is rejected at discovery time.
|
|
270
|
+
|
|
271
|
+
For file or image fields, set `{ asFile: true, accept?, maxSizeBytes? }` and
|
|
272
|
+
type the field with `manualInputFileValueSchema`. The widget supports click,
|
|
273
|
+
drag-and-drop, and clipboard paste (so a screenshot capture flows in
|
|
274
|
+
directly). The runtime value carries `{ name, mimeType, sizeBytes, sha256,
|
|
275
|
+
path }`, where `path` is a workspace-relative run artifact. Use
|
|
276
|
+
`readManualInputFile(value)` when bytes, `Blob`, `File`, text, or parsed JSON
|
|
277
|
+
are needed. In CLI runs, provide path objects such as
|
|
278
|
+
`{ "image": { "path": "./screenshot.png" } }`; the CLI stages the file before
|
|
279
|
+
starting the run.
|
|
280
|
+
|
|
233
281
|
## Scoring
|
|
234
282
|
|
|
235
283
|
Every score returns a normalized `0..1` value. Pass/fail is per-score: a case
|
|
@@ -269,7 +317,19 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
|
|
|
269
317
|
`defineEval` generic, `outputsSchema` is required.
|
|
270
318
|
- `columns` overrides the display for output and score keys (label, format,
|
|
271
319
|
alignment, visibility). The set of supported formats is declared by the
|
|
272
|
-
`ColumnFormat` union and `EvalColumnOverride` in the types.
|
|
320
|
+
`ColumnFormat` union and `EvalColumnOverride` in the types. Global
|
|
321
|
+
`columns` in `agent-evals.config.ts` apply to every eval; eval-level
|
|
322
|
+
`columns` override matching global keys. Use `hideIfNoValue: true` to hide a
|
|
323
|
+
column from the runs table when every rendered row is missing the value,
|
|
324
|
+
`null`, or an empty string; `0` and `false` still count as values, and the
|
|
325
|
+
value remains available in case details and raw output data.
|
|
326
|
+
- `deriveFromTracing` can be authored globally in `agent-evals.config.ts` or
|
|
327
|
+
locally on one eval. Prefer the keyed map form for shared metrics:
|
|
328
|
+
`deriveFromTracing: { toolCalls: ({ trace }) => trace.findSpansByKind('tool').length }`.
|
|
329
|
+
The older object-returning function form remains supported. Global
|
|
330
|
+
derivations run first; runtime outputs are never overwritten, and eval-level
|
|
331
|
+
derivations only fill keys still missing after global derivations. In keyed
|
|
332
|
+
form, return `undefined` to omit one output for that case.
|
|
273
333
|
- `traceDisplay` promotes selected span attributes into the trace tree and
|
|
274
334
|
detail pane; it supports aggregation across subtrees (`scope`, `mode`) and
|
|
275
335
|
user-defined `transform(...)` for derived views (e.g. currency conversion).
|
|
@@ -280,18 +340,26 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
|
|
|
280
340
|
attribute paths. `latencyMs` is time to first token; duration, total tokens,
|
|
281
341
|
tokens/sec, and USD costs are derived. Override `kinds` to broaden the filter,
|
|
282
342
|
override `attributes.<field>` for non-default primitive span shapes, configure
|
|
283
|
-
`pricing` to derive USD costs from token counts
|
|
284
|
-
`
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
'
|
|
288
|
-
|
|
343
|
+
model-keyed `pricing` to derive USD costs from token counts, with nested
|
|
344
|
+
`providers` entries for provider-specific rates, add `derivedAttributes` to
|
|
345
|
+
persist computed values back onto matching LLM spans before trace consumers
|
|
346
|
+
run, and add entries to `metrics` to surface arbitrary user metrics
|
|
347
|
+
(`format: 'string' | 'number' | 'duration' | 'json' | 'boolean'`,
|
|
348
|
+
`placements: ['header' | 'body']`). `derivedAttributes` keys are dot-paths
|
|
349
|
+
under `span.attributes`; return `undefined` to skip one span. For saved runs,
|
|
350
|
+
the case drawer more menu can recalculate configured LLM/API derived
|
|
351
|
+
attributes for one case and persist the updated trace artifacts without
|
|
352
|
+
re-running the eval.
|
|
289
353
|
- Default usage config derives missing eval outputs from matching LLM/API spans
|
|
290
354
|
before `outputsSchema` and scores run: `apiCalls`, `costUsd`, `llmTurns`,
|
|
291
355
|
`inputTokens`, `outputTokens`, `totalTokens`, `cachedInputTokens`,
|
|
292
356
|
`cacheCreationInputTokens`, `reasoningTokens`, and `llmDurationMs`. Authored
|
|
293
|
-
outputs and column overrides win.
|
|
294
|
-
|
|
357
|
+
outputs and column overrides win. Default usage columns, stats, and charts
|
|
358
|
+
use `hideIfNoValue: true`, so the UI hides them until matching LLM/API span
|
|
359
|
+
data exists. Default LLM usage charts render cost, input tokens, and output
|
|
360
|
+
tokens separately and use `dedupeConsecutiveValues: true` to skip repeated
|
|
361
|
+
adjacent chart values. `totalTokens` is input + output only; cache read/write
|
|
362
|
+
tokens stay separate and affect `costUsd` at their own rates.
|
|
295
363
|
Derived base input cost uses `inputTokens - cachedInputTokens -
|
|
296
364
|
cacheCreationInputTokens` so cache details are not double-counted.
|
|
297
365
|
`cacheCreationInputTokens` is the total cache-write count; optional
|
|
@@ -315,12 +383,17 @@ cacheCreationInputTokens` so cache details are not double-counted.
|
|
|
315
383
|
are still captured.
|
|
316
384
|
|
|
317
385
|
Stats rows and history charts on the eval card can be authored via `stats` /
|
|
318
|
-
`charts` on the eval definition.
|
|
386
|
+
`charts` on the eval definition. Global `stats` in `agent-evals.config.ts`
|
|
387
|
+
render before eval-level stats. Usage stats and LLM usage charts are added by
|
|
319
388
|
default unless removed with `removeDefaultConfig`. Column stats can override
|
|
320
389
|
`format` and `numberFormat`, otherwise they inherit from the matching column.
|
|
321
390
|
Number formats use `maxDecimalPlaces` to cap decimals and `minDecimalPlaces`
|
|
322
391
|
to pad trailing zeroes. Without `maxDecimalPlaces`, they render up to 3 decimal
|
|
323
|
-
places.
|
|
392
|
+
places. Stats and charts support `hideIfNoValue: true`; stats hide when they
|
|
393
|
+
would otherwise render an empty value, and charts hide when no plotted metric or
|
|
394
|
+
tooltip extra has a numeric value in the rendered history window. Charts support
|
|
395
|
+
`dedupeConsecutiveValues: true` to omit consecutive points whose plotted metrics
|
|
396
|
+
and tooltip extras match the previous kept point.
|
|
324
397
|
Their shapes live in the types; no need to memorize the option set.
|
|
325
398
|
|
|
326
399
|
## Cached operations
|
|
@@ -378,12 +451,18 @@ Mental model:
|
|
|
378
451
|
(no surrounding span), the ref is recorded on the case detail's `cacheRefs`
|
|
379
452
|
array.
|
|
380
453
|
- Cache identity is the namespace plus the authored key. Source-file
|
|
381
|
-
fingerprints are
|
|
382
|
-
cache-key hashing.
|
|
454
|
+
fingerprints are tracked for run freshness separately, but do not participate
|
|
455
|
+
in cache-key hashing.
|
|
383
456
|
- Cached spans require an explicit `cache.namespace`; value caches default to
|
|
384
457
|
`${evalId}__${name}` and can be overridden with `namespace`. Matching
|
|
385
458
|
namespaces share entries across operations/evals that use the same authored
|
|
386
459
|
key.
|
|
460
|
+
- Per eval, `cache: { read?: boolean; store?: boolean }` controls whether
|
|
461
|
+
authored cached operations may read or persist entries. Both default to
|
|
462
|
+
`true`. Use `read: false` to always execute instead of replaying hits, and
|
|
463
|
+
`store: false` to allow reads while preventing misses/refreshes from writing
|
|
464
|
+
cache or raw-key debug files. Run-level bypass/refresh controls still take
|
|
465
|
+
precedence.
|
|
387
466
|
- Authored eval ids are unique within one eval file. The exact eval identity is
|
|
388
467
|
the workspace-relative file path plus eval id, so the same id can be reused in
|
|
389
468
|
different files. Case ids must be unique within one eval; duplicate case ids
|
|
@@ -403,10 +482,15 @@ Mental model:
|
|
|
403
482
|
user inputs, or other sensitive data, should be gitignored, and is not needed
|
|
404
483
|
for cache reuse. The UI Cache tab shows the raw key when it is available and
|
|
405
484
|
can be filtered to hits or new entries added by cache misses/refreshes.
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
485
|
+
Misses/refreshes with `cache.store: false` are shown as non-stored activity
|
|
486
|
+
without fetch/delete controls.
|
|
487
|
+
- Cached payloads use advanced serialization/deserialization with the Web API
|
|
488
|
+
plugin set, so return values and recorded SDK effects preserve richer
|
|
489
|
+
built-ins such as `Date`, `Map`, `Set`, typed arrays, `URL`, `Headers`,
|
|
490
|
+
`Blob`, and `File` on hits. Undefined values are omitted by default instead
|
|
491
|
+
of being written to cache files; direct serializer callers can pass
|
|
492
|
+
`{ preserveUndefined: true }` when explicit undefined wrappers are needed.
|
|
493
|
+
Cache keys still use the deterministic key-hashing rules above.
|
|
410
494
|
- Cache mode per run is controlled by CLI flags (see `agent-evals run --help`).
|
|
411
495
|
|
|
412
496
|
## Artifacts
|