@ls-stack/agent-eval 0.55.1 → 0.56.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-BD0D9-7k.mjs → app-Bpe6Monh.mjs} +4 -4
- package/dist/apps/web/dist/assets/index-CfSiAVmi.js +377 -0
- package/dist/apps/web/dist/assets/{index-2I-eWzVL.css → index-Xa_7PteQ.css} +1 -1
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/caseChild.mjs +2 -1
- package/dist/{cli-BR3wMZMx.mjs → cli-DQO2Fpt2.mjs} +60 -13
- package/dist/index.d.mts +610 -62
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +2 -2
- package/dist/{runExecution-Sw38bCaq.mjs → runExecution-6lrtj48K.mjs} +458 -121
- package/dist/{runOrchestration-DJsdLYeZ.mjs → runOrchestration-BYaN2mzS.mjs} +1 -1
- package/dist/{runner-dB69WsnM.mjs → runner-C3CiS2o7.mjs} +1 -1
- package/dist/{runner-72rsqJRq.mjs → runner-DYlwuAT3.mjs} +2 -2
- package/dist/{src-hBGtzWuA.mjs → src-DCGrFAmO.mjs} +2 -2
- package/package.json +3 -3
- package/skills/agent-eval/SKILL.md +29 -10
- package/dist/apps/web/dist/assets/index-CvsPmlHl.js +0 -377
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { Et as caseRowSchema, Pt as runWithEvalRegistry, Tt as caseDetailSchema, X as runWithEvalClock, _t as validateEvalTagName, bt as runSummarySchema, d as loadEvalModule, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, g as commitPendingCacheWrites, gt as matchesTagsFilter, ht as dedupeEvalTags, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromChildStatuses, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveStatusFromCaseRows, q as runInEvalRuntimeScope, t as filterEvalCases, u as runWithModuleIsolation, vt as validateTagsFilterExpression, wt as getCaseRowCaseKey, yt as runManifestSchema } from "./runExecution-Sw38bCaq.mjs";
|
|
1
|
+
import { Et as caseRowSchema, Pt as runWithEvalRegistry, Tt as caseDetailSchema, X as runWithEvalClock, _t as validateEvalTagName, bt as runSummarySchema, d as loadEvalModule, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, g as commitPendingCacheWrites, gt as matchesTagsFilter, ht as dedupeEvalTags, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromChildStatuses, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveStatusFromCaseRows, q as runInEvalRuntimeScope, t as filterEvalCases, u as runWithModuleIsolation, vt as validateTagsFilterExpression, wt as getCaseRowCaseKey, yt as runManifestSchema } from "./runExecution-6lrtj48K.mjs";
|
|
2
2
|
import { readFile, readdir, rm, writeFile } from "node:fs/promises";
|
|
3
3
|
import { dirname, join } from "node:path";
|
|
4
4
|
import { existsSync } from "node:fs";
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-72rsqJRq.mjs";
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-DYlwuAT3.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-BR3wMZMx.mjs";
|
|
2
|
-
import "./src-hBGtzWuA.mjs";
|
|
1
|
+
import { n as createRunner } from "./cli-DQO2Fpt2.mjs";
|
|
2
|
+
import "./src-DCGrFAmO.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { Mt as defineEval$1, W as matchesEvalTags$1 } from "./runExecution-Sw38bCaq.mjs";
|
|
2
|
-
import "./cli-BR3wMZMx.mjs";
|
|
1
|
+
import { Mt as defineEval$1, W as matchesEvalTags$1 } from "./runExecution-6lrtj48K.mjs";
|
|
2
|
+
import "./cli-DQO2Fpt2.mjs";
|
|
3
3
|
//#region src/index.ts
|
|
4
4
|
/** Register an eval definition with typed tag support. */
|
|
5
5
|
function defineEval(definition) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ls-stack/agent-eval",
|
|
3
|
-
"version": "0.55.1",
|
|
3
|
+
"version": "0.56.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"bin": {
|
|
6
6
|
"agent-evals": "./dist/bin.mjs"
|
|
@@ -33,8 +33,8 @@
|
|
|
33
33
|
"@types/node": "^24.7.2",
|
|
34
34
|
"typescript": "^5.9.2",
|
|
35
35
|
"@agent-evals/runner": "0.0.1",
|
|
36
|
-
"@agent-evals/
|
|
37
|
-
"@agent-evals/
|
|
36
|
+
"@agent-evals/sdk": "0.0.1",
|
|
37
|
+
"@agent-evals/shared": "0.0.1"
|
|
38
38
|
},
|
|
39
39
|
"scripts": {
|
|
40
40
|
"build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",
|
|
@@ -318,6 +318,11 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
|
|
|
318
318
|
- `setEvalOutput(key, value)` writes reviewable data for the case. Values are
|
|
319
319
|
stored as received: primitives, objects/arrays, explicit file refs, and
|
|
320
320
|
native `Blob`/`File` values. `columns.format` only controls visualization.
|
|
321
|
+
Inside `execute`, `setOutput(key, value, formatOrOverride)` can attach a
|
|
322
|
+
display hint directly to a runtime output, e.g. `'markdown'` or
|
|
323
|
+
`{ label: 'Receipt', format: 'image', hideInTable: true }`. Authored
|
|
324
|
+
global/eval `columns` for the same key take precedence over that runtime
|
|
325
|
+
hint.
|
|
321
326
|
Non-JSON runtime values such as `Date`, `Map`, `Set`, `BigInt`, typed arrays,
|
|
322
327
|
and class instances use the tagged value serializer instead of a string
|
|
323
328
|
fallback. Native `Blob`/`File` values are copied to run artifacts because
|
|
@@ -363,8 +368,11 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
|
|
|
363
368
|
- `llmCalls` (in `agent-evals.config.ts`) configures how LLM-call spans are
|
|
364
369
|
summarized for review. Defaults to `kind: 'llm'` spans with `model`,
|
|
365
370
|
`usage.*`, `latencyMs`, `input`, `output`, etc. read from conventional
|
|
366
|
-
attribute paths. `
|
|
367
|
-
|
|
371
|
+
attribute paths. The default `steps` path reads an array from
|
|
372
|
+
`span.attributes.steps`; if it is missing, direct child `model_step` spans are
|
|
373
|
+
shown as that call's steps. `latencyMs` is time to first token; duration,
|
|
374
|
+
total tokens, output tokens/sec, and USD costs are derived. Override `kinds`
|
|
375
|
+
to broaden the filter,
|
|
368
376
|
override `attributes.<field>` for non-default primitive span shapes, configure
|
|
369
377
|
model-keyed `pricing` to derive USD costs from token counts, with nested
|
|
370
378
|
`providers` entries for provider-specific rates, add `costCurrencies` to show
|
|
@@ -513,12 +521,21 @@ Mental model:
|
|
|
513
521
|
`serializeFileBytes: true` to a cached span or `evalTracer.cache(...)` call
|
|
514
522
|
when byte-level cache invalidation is required.
|
|
515
523
|
- Cache entries are stored as one Brotli-compressed JSON file per key under
|
|
516
|
-
`.agent-evals/cache/<sanitizedNamespace>/<keyHash>.json.br
|
|
517
|
-
|
|
524
|
+
`.agent-evals/cache/<sanitizedNamespace>/<keyHash>.json.br`, with a small
|
|
525
|
+
namespace index sidecar at
|
|
526
|
+
`.agent-evals/cache/<sanitizedNamespace>/.index-<namespaceHash>.json`.
|
|
527
|
+
Listing and retention use the index without opening cached payloads. Index
|
|
528
|
+
rows intentionally stay minimal: stored time, last access time, and external
|
|
529
|
+
JSON blob refs. Each namespace is capped at 100 entries by default. The runner
|
|
530
|
+
prunes least recently accessed indexed entries after a run finishes and the
|
|
531
|
+
runner stays idle for `cache.pruneIdleDelayMs ?? 5000` milliseconds. Configure
|
|
518
532
|
`cache.maxEntriesPerNamespace` for the default cap and
|
|
519
533
|
`cache.maxEntriesByNamespace` for exact namespace-specific caps.
|
|
534
|
+
- Unindexed legacy cache files are ignored by normal lookup/listing. Use
|
|
535
|
+
`agent-evals cache repair` to remove unindexed cache files, stale index rows,
|
|
536
|
+
debug sidecars, and unreferenced blob files.
|
|
520
537
|
- Nested cached JSON values at or above roughly 10K JSON characters are stored
|
|
521
|
-
as content-addressed Brotli blobs under `.agent-evals/cache-blobs/` and
|
|
538
|
+
as content-addressed Brotli blobs under `.agent-evals/cache/cache-blobs/` and
|
|
522
539
|
referenced from cache JSON by sha256. Identical large payloads share the same
|
|
523
540
|
blob.
|
|
524
541
|
- Authored raw cache keys are stored for debugging under
|
|
@@ -536,11 +553,13 @@ Mental model:
|
|
|
536
553
|
|
|
537
554
|
## Artifacts
|
|
538
555
|
|
|
539
|
-
Run output lives under `.agent-evals/runs/<run-id>/`. Cache
|
|
540
|
-
`.agent-evals/cache/<sanitizedNamespace>/<keyHash>.json.br
|
|
541
|
-
|
|
542
|
-
manually in eval code, then use
|
|
543
|
-
persisted namespace/key entries
|
|
556
|
+
Run output lives under `.agent-evals/runs/<run-id>/`. Cache payloads live under
|
|
557
|
+
`.agent-evals/cache/<sanitizedNamespace>/<keyHash>.json.br` with namespace
|
|
558
|
+
index sidecars next to them. Do not rely on a specific cache filename when
|
|
559
|
+
authoring evals; configure cache namespaces manually in eval code, then use
|
|
560
|
+
`agent-evals cache list` to inspect persisted namespace/key entries or
|
|
561
|
+
`agent-evals cache repair` to clean orphaned cache artifacts. Files in a run
|
|
562
|
+
directory include run metadata,
|
|
544
563
|
a run summary, per-case results, and per-case trace JSON. Inspect run files when
|
|
545
564
|
debugging persisted output, costs, columns, traces, or failures; inspect cache
|
|
546
565
|
entries when debugging replayed span/value-cache results.
|