npm - @ls-stack/agent-eval - Versions diffs - 0.27.1 → 0.29.0 - Mend

@ls-stack/agent-eval 0.27.1 → 0.29.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (20)

package/dist/{app-CJj1yPPD.mjs → app-D6-msfKP.mjs} +45 -6
package/dist/apps/web/dist/assets/index-BCr6J8Uj.js +118 -0
package/dist/apps/web/dist/assets/index-DjUTm3M-.css +1 -0
package/dist/apps/web/dist/index.html +2 -2
package/dist/bin.mjs +1 -1
package/dist/{cli-Clf8xUFa.mjs → cli-CIc_gBNM.mjs} +965 -167
package/dist/index.d.mts +5828 -3368
package/dist/index.mjs +4 -4
package/dist/runChild.mjs +4 -2
package/dist/{runOrchestration-FEvBwwJI.mjs → runOrchestration-CIARrLs6.mjs} +1046 -228
package/dist/{runner-zqKwTlNj.mjs → runner-1F8MeY5V.mjs} +2 -2
package/dist/{runner-KbDKLSU4.mjs → runner-Bq1f9B9d.mjs} +1 -1
package/dist/src-CkWT1iSu.mjs +3 -0
package/package.json +2 -29
package/skills/agent-eval/SKILL.md +104 -20
package/dist/apps/web/dist/assets/index-6YqV9t4k.js +0 -118
package/dist/apps/web/dist/assets/index-C-OiMSQD.css +0 -1
package/dist/bin.d.mts +0 -1
package/dist/runChild.d.mts +0 -1
package/dist/src-BBwT7_cy.mjs +0 -3

package/dist/{runner-zqKwTlNj.mjs → runner-1F8MeY5V.mjs} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { n as createRunner } from "./cli-Clf8xUFa.mjs";
-import "./src-BBwT7_cy.mjs";
+import { n as createRunner } from "./cli-CIc_gBNM.mjs";
+import "./src-CkWT1iSu.mjs";
 //#region ../../apps/server/src/runner.ts
 let runnerInstance = null;
 function getRunnerInstance() {

package/dist/{runner-KbDKLSU4.mjs → runner-Bq1f9B9d.mjs} RENAMED Viewed

@@ -1,2 +1,2 @@
-import { n as initRunner, t as getRunnerInstance } from "./runner-zqKwTlNj.mjs";
+import { n as initRunner, t as getRunnerInstance } from "./runner-1F8MeY5V.mjs";
 export { getRunnerInstance, initRunner };

package/dist/src-CkWT1iSu.mjs ADDED Viewed

@@ -0,0 +1,3 @@
+import "./runOrchestration-CIARrLs6.mjs";
+import "./cli-CIc_gBNM.mjs";
+export {};

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@ls-stack/agent-eval",
-  "version": "0.27.1",
+  "version": "0.29.0",
   "type": "module",
   "bin": {
     "agent-evals": "./dist/bin.mjs"
@@ -16,33 +16,6 @@
     "dist",
     "skills"
   ],
-  "tsdown": {
-    "clean": true,
-    "copy": [
-      {
-        "from": "../../apps/web/dist/**/*",
-        "to": "dist/web",
-        "flatten": false
-      }
-    ],
-    "deps": {
-      "alwaysBundle": [
-        "@agent-evals/runner",
-        "@agent-evals/sdk",
-        "@agent-evals/shared"
-      ]
-    },
-    "dts": {
-      "eager": true
-    },
-    "entry": [
-      "src/index.ts",
-      "src/bin.ts",
-      "src/runChild.ts"
-    ],
-    "format": "esm",
-    "platform": "node"
-  },
   "dependencies": {
     "@hono/node-server": "^1.14.3",
     "@hono/zod-validator": "^0.7.3",
@@ -63,7 +36,7 @@
     "@agent-evals/shared": "0.0.1"
   },
   "scripts": {
-    "build": "pnpm --filter @agent-evals/web build && tsdown",
+    "build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",
     "test": "vitest run",
     "tsc": "tsgo --noEmit",
     "lint": "tsgo --noEmit && eslint .",

package/skills/agent-eval/SKILL.md CHANGED Viewed

@@ -16,9 +16,9 @@ This skill covers the mental model and conventions. For exhaustive field lists
 display rules), read the TypeScript declarations shipped with the package:
 - `AgentEvalsConfig`, `EvalDefinition`, `EvalCase`, `EvalOutputs`,
-  `EvalColumnOverride`, `EvalScoreDef`, `EvalManualScoreDef`,
-  `EvalTraceTree`, `TraceSpanInfo`, and `z` are exported from
-  `@ls-stack/agent-eval`.
+  `EvalColumnOverride`, `EvalDeriveConfig`, `EvalScoreDef`,
+  `EvalManualScoreDef`, `EvalTraceTree`, `TraceSpanInfo`, and `z` are exported
+  from `@ls-stack/agent-eval`.
 - `.d.ts` files land in `node_modules/@ls-stack/agent-eval/dist/`.
 - CLI surface: `agent-evals --help` and `agent-evals <command> --help`.
   Unknown help targets exit non-zero instead of falling back to global help.
@@ -28,6 +28,10 @@ display rules), read the TypeScript declarations shipped with the package:
   for targeted CLI runs. Set `allowCliRunAll: true` in
   `agent-evals.config.ts` to opt into run-all CLI behavior. The web UI can
   still run grouped evals and confirms before starting more than five.
+- `agent-evals app` watches `agent-evals.config.ts` and reloads config in
+  place when the runner is idle. If config changes during an active run, the UI
+  shows a pending reload banner and blocks new runs until the current run
+  reaches a terminal state and the reload applies.
 Assume that enumerated tables in this document may lag behind the types —
 treat the types as source of truth when they disagree.
@@ -230,6 +234,50 @@ Case `id` values anchor historical runs, caches, and manual scores — keep them
 stable. See `EvalDefinition` / `EvalCase` in the types for every supported
 field.
+### Manual input
+Use `manualInput` instead of `cases` when each run should pause for the user
+to type values:
+```ts
+const inputSchema = z.object({
+  name: z.string().min(1),
+  tone: z.enum(['friendly', 'formal']),
+  notes: z.string().max(500).optional(),
+  sendEmail: z.boolean().default(false),
+});
+defineEval<z.infer<typeof inputSchema>>({
+  id: 'manual-input-greeting',
+  manualInput: {
+    schema: inputSchema,
+    title: 'Greet someone',
+    submitLabel: 'Greet',
+    fields: { notes: { multiline: true, rows: 4 } },
+  },
+  execute: ({ input, setOutput }) => {
+    setOutput('greeting', `Hi, ${input.name}!`);
+  },
+});
+```
+The web UI opens a modal driven by the descriptor derived from the schema
+(`z.string` → text, `z.enum` → select, `z.boolean` → checkbox, etc.; nested
+shapes fall back to a JSON textarea). The CLI accepts `--input '<json>'` for a
+single targeted eval or `--input-file <path>` mapping eval keys/ids to inputs.
+Each run produces one synthetic case `<evalId>-manual` with the validated
+submission; mixing `manualInput` with `cases` is rejected at discovery time.
+For file or image fields, set `{ asFile: true, accept?, maxSizeBytes? }` and
+type the field with `manualInputFileValueSchema`. The widget supports click,
+drag-and-drop, and clipboard paste (so a screenshot capture flows in
+directly). The runtime value carries `{ name, mimeType, sizeBytes, sha256,
+path }`, where `path` is a workspace-relative run artifact. Use
+`readManualInputFile(value)` when bytes, `Blob`, `File`, text, or parsed JSON
+are needed. In CLI runs, provide path objects such as
+`{ "image": { "path": "./screenshot.png" } }`; the CLI stages the file before
+starting the run.
 ## Scoring
 Every score returns a normalized `0..1` value. Pass/fail is per-score: a case
@@ -269,7 +317,19 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
   `defineEval` generic, `outputsSchema` is required.
 - `columns` overrides the display for output and score keys (label, format,
   alignment, visibility). The set of supported formats is declared by the
-  `ColumnFormat` union and `EvalColumnOverride` in the types.
+  `ColumnFormat` union and `EvalColumnOverride` in the types. Global
+  `columns` in `agent-evals.config.ts` apply to every eval; eval-level
+  `columns` override matching global keys. Use `hideIfNoValue: true` to hide a
+  column from the runs table when every rendered row is missing the value,
+  `null`, or an empty string; `0` and `false` still count as values, and the
+  value remains available in case details and raw output data.
+- `deriveFromTracing` can be authored globally in `agent-evals.config.ts` or
+  locally on one eval. Prefer the keyed map form for shared metrics:
+  `deriveFromTracing: { toolCalls: ({ trace }) => trace.findSpansByKind('tool').length }`.
+  The older object-returning function form remains supported. Global
+  derivations run first; runtime outputs are never overwritten, and eval-level
+  derivations only fill keys still missing after global derivations. In keyed
+  form, return `undefined` to omit one output for that case.
 - `traceDisplay` promotes selected span attributes into the trace tree and
   detail pane; it supports aggregation across subtrees (`scope`, `mode`) and
   user-defined `transform(...)` for derived views (e.g. currency conversion).
@@ -280,18 +340,26 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
   attribute paths. `latencyMs` is time to first token; duration, total tokens,
   tokens/sec, and USD costs are derived. Override `kinds` to broaden the filter,
   override `attributes.<field>` for non-default primitive span shapes, configure
-  `pricing` to derive USD costs from token counts by model/provider, add
-  `derivedAttributes` to persist computed values back onto matching LLM spans
-  before trace consumers run, and add entries to `metrics` to surface arbitrary
-  user metrics (`format: 'string' | 'number' | 'duration' | 'json' |
-'boolean'`, `placements: ['header' | 'body']`). `derivedAttributes` keys are
-  dot-paths under `span.attributes`; return `undefined` to skip one span.
+  model-keyed `pricing` to derive USD costs from token counts, with nested
+  `providers` entries for provider-specific rates, add `derivedAttributes` to
+  persist computed values back onto matching LLM spans before trace consumers
+  run, and add entries to `metrics` to surface arbitrary user metrics
+  (`format: 'string' | 'number' | 'duration' | 'json' | 'boolean'`,
+  `placements: ['header' | 'body']`). `derivedAttributes` keys are dot-paths
+  under `span.attributes`; return `undefined` to skip one span. For saved runs,
+  the case drawer more menu can recalculate configured LLM/API derived
+  attributes for one case and persist the updated trace artifacts without
+  re-running the eval.
 - Default usage config derives missing eval outputs from matching LLM/API spans
   before `outputsSchema` and scores run: `apiCalls`, `costUsd`, `llmTurns`,
   `inputTokens`, `outputTokens`, `totalTokens`, `cachedInputTokens`,
   `cacheCreationInputTokens`, `reasoningTokens`, and `llmDurationMs`. Authored
-  outputs and column overrides win. `totalTokens` is input + output only; cache
-  read/write tokens stay separate and affect `costUsd` at their own rates.
+  outputs and column overrides win. Default usage columns, stats, and charts
+  use `hideIfNoValue: true`, so the UI hides them until matching LLM/API span
+  data exists. Default LLM usage charts render cost, input tokens, and output
+  tokens separately and use `dedupeConsecutiveValues: true` to skip repeated
+  adjacent chart values. `totalTokens` is input + output only; cache read/write
+  tokens stay separate and affect `costUsd` at their own rates.
   Derived base input cost uses `inputTokens - cachedInputTokens -
 cacheCreationInputTokens` so cache details are not double-counted.
   `cacheCreationInputTokens` is the total cache-write count; optional
@@ -315,12 +383,17 @@ cacheCreationInputTokens` so cache details are not double-counted.
   are still captured.
 Stats rows and history charts on the eval card can be authored via `stats` /
-`charts` on the eval definition. Usage stats and LLM usage charts are added by
+`charts` on the eval definition. Global `stats` in `agent-evals.config.ts`
+render before eval-level stats. Usage stats and LLM usage charts are added by
 default unless removed with `removeDefaultConfig`. Column stats can override
 `format` and `numberFormat`, otherwise they inherit from the matching column.
 Number formats use `maxDecimalPlaces` to cap decimals and `minDecimalPlaces`
 to pad trailing zeroes. Without `maxDecimalPlaces`, they render up to 3 decimal
-places.
+places. Stats and charts support `hideIfNoValue: true`; stats hide when they
+would otherwise render an empty value, and charts hide when no plotted metric or
+tooltip extra has a numeric value in the rendered history window. Charts support
+`dedupeConsecutiveValues: true` to omit consecutive points whose plotted metrics
+and tooltip extras match the previous kept point.
 Their shapes live in the types; no need to memorize the option set.
 ## Cached operations
@@ -378,12 +451,18 @@ Mental model:
   (no surrounding span), the ref is recorded on the case detail's `cacheRefs`
   array.
 - Cache identity is the namespace plus the authored key. Source-file
-  fingerprints are stored as metadata for inspection, but do not participate in
-  cache-key hashing.
+  fingerprints are tracked for run freshness separately, but do not participate
+  in cache-key hashing.
 - Cached spans require an explicit `cache.namespace`; value caches default to
   `${evalId}__${name}` and can be overridden with `namespace`. Matching
   namespaces share entries across operations/evals that use the same authored
   key.
+- Per eval, `cache: { read?: boolean; store?: boolean }` controls whether
+  authored cached operations may read or persist entries. Both default to
+  `true`. Use `read: false` to always execute instead of replaying hits, and
+  `store: false` to allow reads while preventing misses/refreshes from writing
+  cache or raw-key debug files. Run-level bypass/refresh controls still take
+  precedence.
 - Authored eval ids are unique within one eval file. The exact eval identity is
   the workspace-relative file path plus eval id, so the same id can be reused in
   different files. Case ids must be unique within one eval; duplicate case ids
@@ -403,10 +482,15 @@ Mental model:
   user inputs, or other sensitive data, should be gitignored, and is not needed
   for cache reuse. The UI Cache tab shows the raw key when it is available and
   can be filtered to hits or new entries added by cache misses/refreshes.
-- Cached payloads use advance serialization/deserialization with the Web API plugin set, so return values and
-  recorded SDK effects preserve richer built-ins such as `Date`, `Map`, `Set`,
-  typed arrays, `URL`, `Headers`, `Blob`, and `File` on hits. Cache keys still
-  use the deterministic key-hashing rules above.
+  Misses/refreshes with `cache.store: false` are shown as non-stored activity
+  without fetch/delete controls.
+- Cached payloads use advanced serialization/deserialization with the Web API
+  plugin set, so return values and recorded SDK effects preserve richer
+  built-ins such as `Date`, `Map`, `Set`, typed arrays, `URL`, `Headers`,
+  `Blob`, and `File` on hits. Undefined values are omitted by default instead
+  of being written to cache files; direct serializer callers can pass
+  `{ preserveUndefined: true }` when explicit undefined wrappers are needed.
+  Cache keys still use the deterministic key-hashing rules above.
 - Cache mode per run is controlled by CLI flags (see `agent-evals run --help`).
 ## Artifacts