@ls-stack/agent-eval 0.15.0 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { $ as caseRowSchema, $t as appendToEvalOutput, A as getEvalTitle, At as traceDisplayConfigSchema, B as apiCallMetricFormatSchema, Bt as fileRefSchema, C as createRunRequestSchema, Ct as serializedCacheSpanSchema, D as extractApiCalls, Dt as traceAttributeDisplayInputSchema, E as extractCacheHits, Et as traceAttributeDisplayFormatSchema, F as runManifestSchema, Ft as traceSpanWarningSchema, G as llmCallMetricPlacementSchema, Gt as z, H as apiCallMetricSchema, Ht as numberDisplayOptionsSchema, I as runSummarySchema, It as cellValueSchema, J as resolveApiCallsConfig, Jt as evalSpan, K as llmCallMetricSchema, Kt as buildTraceTree, L as DEFAULT_API_CALLS_CONFIG, Lt as columnDefSchema, M as deriveScopedSummaryFromCases, Mt as traceSpanErrorSchema, N as deriveStatusFromCaseRows, Nt as traceSpanKindSchema, O as extractLlmCalls, Ot as traceAttributeDisplayPlacementSchema, P as deriveStatusFromChildStatuses, Pt as traceSpanSchema, Q as caseDetailSchema, Qt as EvalAssertionError, R as DEFAULT_LLM_CALLS_CONFIG, Rt as columnFormatSchema, St as cacheStatusSchema, T as sseEnvelopeSchema, Tt as traceCacheRefSchema, U as apiCallsConfigSchema, Ut as repoFileRefSchema, V as apiCallMetricPlacementSchema, Vt as jsonCellSchema, W as llmCallMetricFormatSchema, Wt as runArtifactRefSchema, X as trialSelectionModeSchema, Xt as hashCacheKey, Y as resolveLlmCallsConfig, Yt as evalTracer, Z as assertionFailureSchema, Zt as hashCacheKeySync, _t as cacheListItemSchema, an as mergeEvalOutput, at as scoreTraceSchema, bt as cacheRecordingOpSchema, cn as runInEvalScope, ct as evalChartBuiltinMetricSchema, dn as setScopeCacheContext, dt as evalChartMetricSchema, en as evalAssert, et as evalFreshnessStatusSchema, fn as startEvalBackgroundJob, ft as evalChartTooltipExtraSchema, gt as cacheFileSchema, hn as getEvalRegistry, ht as cacheEntrySchema, in as isInEvalScope, it as evalSummarySchema, j as getEvalDisplayStatus, jt as traceDisplayInputConfigSchema, k as getNestedAttribute, kt as traceAttributeDisplaySchema, ln as runInExistingEvalScope, lt as evalChartColorSchema, mn as defineEval, mt as evalChartsConfigSchema, nn as getEvalCaseInput, nt as evalStatItemSchema, on as nextEvalId, ot as evalChartAggregateSchema, pn as repoFile, pt as evalChartTypeSchema, q as llmCallsConfigSchema, qt as captureEvalSpanError, rn as incrementEvalOutput, rt as evalStatsConfigSchema, sn as runInEvalRuntimeScope, st as evalChartAxisSchema, tn as getCurrentScope, tt as evalStatAggregateSchema, un as setEvalOutput, ut as evalChartConfigSchema, vt as cacheModeSchema, w as updateManualScoreRequestSchema, wt as spanCacheOptionsSchema, xt as cacheRecordingSchema, yt as cacheOperationTypeSchema, z as agentEvalsConfigSchema, zt as columnKindSchema } from "./runOrchestration-B3fYtpKo.mjs";
2
- import { n as createRunner, t as runCli } from "./cli-B-sCTyz8.mjs";
3
- import "./src-jaOlXwb5.mjs";
1
+ import { $ as caseRowSchema, $t as appendToEvalOutput, A as getEvalTitle, At as traceDisplayConfigSchema, B as apiCallMetricFormatSchema, Bt as fileRefSchema, C as createRunRequestSchema, Ct as serializedCacheSpanSchema, D as extractApiCalls, Dt as traceAttributeDisplayInputSchema, E as extractCacheHits, Et as traceAttributeDisplayFormatSchema, F as runManifestSchema, Ft as traceSpanWarningSchema, G as llmCallMetricPlacementSchema, Gt as z, H as apiCallMetricSchema, Ht as numberDisplayOptionsSchema, I as runSummarySchema, It as cellValueSchema, J as resolveApiCallsConfig, Jt as evalSpan, K as llmCallMetricSchema, Kt as buildTraceTree, L as DEFAULT_API_CALLS_CONFIG, Lt as columnDefSchema, M as deriveScopedSummaryFromCases, Mt as traceSpanErrorSchema, N as deriveStatusFromCaseRows, Nt as traceSpanKindSchema, O as extractLlmCalls, Ot as traceAttributeDisplayPlacementSchema, P as deriveStatusFromChildStatuses, Pt as traceSpanSchema, Q as caseDetailSchema, Qt as EvalAssertionError, R as DEFAULT_LLM_CALLS_CONFIG, Rt as columnFormatSchema, St as cacheStatusSchema, T as sseEnvelopeSchema, Tt as traceCacheRefSchema, U as apiCallsConfigSchema, Ut as repoFileRefSchema, V as apiCallMetricPlacementSchema, Vt as jsonCellSchema, W as llmCallMetricFormatSchema, Wt as runArtifactRefSchema, X as trialSelectionModeSchema, Xt as hashCacheKey, Y as resolveLlmCallsConfig, Yt as evalTracer, Z as assertionFailureSchema, Zt as hashCacheKeySync, _t as cacheListItemSchema, an as mergeEvalOutput, at as scoreTraceSchema, bt as cacheRecordingOpSchema, cn as runInEvalScope, ct as evalChartBuiltinMetricSchema, dn as setScopeCacheContext, dt as evalChartMetricSchema, en as evalAssert, et as evalFreshnessStatusSchema, fn as startEvalBackgroundJob, ft as evalChartTooltipExtraSchema, gt as cacheFileSchema, hn as getEvalRegistry, ht as cacheEntrySchema, in as isInEvalScope, it as evalSummarySchema, j as getEvalDisplayStatus, jt as traceDisplayInputConfigSchema, k as getNestedAttribute, kt as traceAttributeDisplaySchema, ln as runInExistingEvalScope, lt as evalChartColorSchema, mn as defineEval, mt as evalChartsConfigSchema, nn as getEvalCaseInput, nt as evalStatItemSchema, on as nextEvalId, ot as evalChartAggregateSchema, pn as repoFile, pt as evalChartTypeSchema, q as llmCallsConfigSchema, qt as captureEvalSpanError, rn as incrementEvalOutput, rt as evalStatsConfigSchema, sn as runInEvalRuntimeScope, st as evalChartAxisSchema, tn as getCurrentScope, tt as evalStatAggregateSchema, un as setEvalOutput, ut as evalChartConfigSchema, vt as cacheModeSchema, w as updateManualScoreRequestSchema, wt as spanCacheOptionsSchema, xt as cacheRecordingSchema, yt as cacheOperationTypeSchema, z as agentEvalsConfigSchema, zt as columnKindSchema } from "./runOrchestration-BDyNrRQT.mjs";
2
+ import { n as createRunner, t as runCli } from "./cli-BmrtjQj_.mjs";
3
+ import "./src-CEAJYN_X.mjs";
4
4
  export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
package/dist/runChild.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { C as createRunRequestSchema, F as runManifestSchema, I as runSummarySchema, Lt as columnDefSchema, S as createFsCacheStore, mt as evalChartsConfigSchema, rt as evalStatsConfigSchema, t as executeRun, v as parseEvalMetas, y as loadConfig } from "./runOrchestration-B3fYtpKo.mjs";
1
+ import { C as createRunRequestSchema, F as runManifestSchema, I as runSummarySchema, Lt as columnDefSchema, S as createFsCacheStore, mt as evalChartsConfigSchema, rt as evalStatsConfigSchema, t as executeRun, v as parseEvalMetas, y as loadConfig } from "./runOrchestration-BDyNrRQT.mjs";
2
2
  import { createHash } from "node:crypto";
3
3
  import { readFile } from "node:fs/promises";
4
4
  import { relative } from "node:path";
@@ -3913,6 +3913,7 @@ const agentEvalsConfigSchema = z.object({
3913
3913
  trialSelection: trialSelectionModeSchema.optional(),
3914
3914
  concurrency: z.number().optional(),
3915
3915
  staleAfterDays: z.number().optional(),
3916
+ allowCliRunAll: z.boolean().optional(),
3916
3917
  traceDisplay: traceDisplayInputConfigSchema.optional(),
3917
3918
  llmCalls: llmCallsConfigSchema.optional(),
3918
3919
  apiCalls: apiCallsConfigSchema.optional(),
@@ -4881,6 +4882,7 @@ const defaultConfig = {
4881
4882
  trialSelection: "lowestScore",
4882
4883
  concurrency: 2,
4883
4884
  staleAfterDays: 14,
4885
+ allowCliRunAll: false,
4884
4886
  traceDisplay: { attributes: [{
4885
4887
  path: "input",
4886
4888
  label: "Input",
@@ -1,5 +1,5 @@
1
- import { n as createRunner } from "./cli-B-sCTyz8.mjs";
2
- import "./src-jaOlXwb5.mjs";
1
+ import { n as createRunner } from "./cli-BmrtjQj_.mjs";
2
+ import "./src-CEAJYN_X.mjs";
3
3
  //#region ../../apps/server/src/runner.ts
4
4
  let runnerInstance = null;
5
5
  function getRunnerInstance() {
@@ -1,2 +1,2 @@
1
- import { n as initRunner, t as getRunnerInstance } from "./runner-vunKoSBu.mjs";
1
+ import { n as initRunner, t as getRunnerInstance } from "./runner-CsZqhbiA.mjs";
2
2
  export { getRunnerInstance, initRunner };
@@ -0,0 +1,3 @@
1
+ import "./runOrchestration-BDyNrRQT.mjs";
2
+ import "./cli-BmrtjQj_.mjs";
3
+ export {};
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ls-stack/agent-eval",
3
- "version": "0.15.0",
3
+ "version": "0.16.0",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "agent-evals": "./dist/bin.mjs"
@@ -59,8 +59,8 @@
59
59
  "@types/node": "^24.7.2",
60
60
  "typescript": "^5.9.2",
61
61
  "@agent-evals/runner": "0.0.1",
62
- "@agent-evals/sdk": "0.0.1",
63
- "@agent-evals/shared": "0.0.1"
62
+ "@agent-evals/shared": "0.0.1",
63
+ "@agent-evals/sdk": "0.0.1"
64
64
  },
65
65
  "scripts": {
66
66
  "build": "pnpm --filter @agent-evals/web build && tsdown",
@@ -24,6 +24,10 @@ display rules), read the TypeScript declarations shipped with the package:
24
24
  Unknown help targets exit non-zero instead of falling back to global help.
25
25
  - The CLI automatically loads `.env` from the current workspace. Shell-provided
26
26
  environment variables win; pass `--no-env` to disable `.env` loading once.
27
+ - Unfiltered `agent-evals run` is disabled by default; use `--eval` or `--case`
28
+ for targeted CLI runs. Set `allowCliRunAll: true` in
29
+ `agent-evals.config.ts` to opt into run-all CLI behavior. The web UI can
30
+ still run grouped evals and confirms before starting more than five.
27
31
 
28
32
  Assume that enumerated tables in this document may lag behind the types —
29
33
  treat the types as source of truth when they disagree.
@@ -117,21 +121,19 @@ export async function runRefundWorkflow(input: RefundInput) {
117
121
  }
118
122
  ```
119
123
 
120
- Span `kind` values are open-ended strings and are color-coded automatically in
121
- the UI for every kind used during the app session. Use familiar kinds such as
124
+ Span `kind` values are open-ended strings. Use familiar kinds such as
122
125
  `agent`, `tool`, `llm`, `api`, `retrieval`, `scorer`, or `checkpoint` when they
123
126
  fit, and preserve external tracer kinds such as `mastra.workflow.step` when they
124
- are more specific. The UI automatically promotes only the `input` and `output` span
125
- attributes. Use `traceDisplay` for other span attributes such as `model`,
127
+ are more specific. Only the `input` and `output` span attributes are promoted
128
+ automatically; use `traceDisplay` for other span attributes such as `model`,
126
129
  `usage`, or `costUsd`.
127
130
 
128
131
  Use `captureEvalSpanError(error)` for recoverable errors on the active
129
132
  `evalTracer.span(...)`, such as optional model/tool failures that fall back and
130
133
  continue. You can pass one error, multiple error arguments, or an array. The
131
- span is still marked `error`, and the UI renders captured errors in a dedicated
132
- span detail block with timing relative to the span. Pass `'warning'` or
133
- `{ level: 'warning' }` as the final argument for diagnostics that should be
134
- visible in span detail without changing an otherwise successful span's status.
134
+ span is still marked `error`. Pass `'warning'` or `{ level: 'warning' }` as the
135
+ final argument for diagnostics that should not change an otherwise successful
136
+ span's status.
135
137
 
136
138
  If a span callback throws, the SDK automatically marks that span as `error`,
137
139
  stores the thrown error on it, and rethrows so the case errors. Use that for
@@ -208,12 +210,11 @@ or if the case errors. Scores without `passThreshold` are informational.
208
210
 
209
211
  Score functions run in their own trace scope, separate from the execution
210
212
  trace, so LLM-as-judge scorers can use `evalTracer.span(...)` and cached spans
211
- without polluting the agent trajectory. The case detail UI shows execution
212
- spans on **Trace** and scorer spans on **Scoring**. Outputs set inside a scorer
213
- stay private to that score.
213
+ without polluting the agent trajectory. Outputs set inside a scorer stay
214
+ private to that score.
214
215
 
215
- `manualScores` declares score columns that reviewers fill in the web UI after
216
- a run. Pending values keep the eval in an `unscored` state instead of failing.
216
+ `manualScores` declares score columns that reviewers fill in after a run.
217
+ Pending values keep the eval in an `unscored` state instead of failing.
217
218
 
218
219
  See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
219
220
  (format, threshold, column overrides).
@@ -245,22 +246,20 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
245
246
  detail pane; it supports aggregation across subtrees (`scope`, `mode`) and
246
247
  user-defined `transform(...)` for derived views (e.g. currency conversion).
247
248
  See the `TraceDisplayInputConfig` type.
248
- - `llmCalls` (in `agent-evals.config.ts`) configures the LLM calls tab in the
249
- case-run drawer. Defaults to `kind: 'llm'` spans with `model`, `usage.*`,
250
- `costUsd`, `input`, `output`, etc. read from conventional attribute paths.
251
- Override `kinds` to broaden the filter, override `attributes.<field>` for
252
- non-default span shapes, and add entries to `metrics` to surface arbitrary
253
- user metrics (`format: 'string' | 'number' | 'duration' | 'json' |
254
- 'boolean'`, `placements: ['header' | 'body']`). The tab auto-hides when no
255
- matching spans exist.
256
- - `apiCalls` (in `agent-evals.config.ts`) configures the API calls tab in the
257
- case-run drawer. Defaults to `kind: 'api'`, `'http'`, `'http.client'`, and
258
- `'fetch'` spans with `method`, `url`, `statusCode`, `request`, `response`,
259
- `requestBody`, `responseBody`, `headers`, `durationMs`, and `error` read
260
- from conventional attribute paths. Override `kinds` or
249
+ - `llmCalls` (in `agent-evals.config.ts`) configures how LLM-call spans are
250
+ summarized for review. Defaults to `kind: 'llm'` spans with `model`,
251
+ `usage.*`, `costUsd`, `input`, `output`, etc. read from conventional
252
+ attribute paths. Override `kinds` to broaden the filter, override
253
+ `attributes.<field>` for non-default span shapes, and add entries to
254
+ `metrics` to surface arbitrary user metrics (`format: 'string' | 'number' |
255
+ 'duration' | 'json' | 'boolean'`, `placements: ['header' | 'body']`).
256
+ - `apiCalls` (in `agent-evals.config.ts`) configures how API-call spans are
257
+ summarized for review. Defaults to `kind: 'api'`, `'http'`, `'http.client'`,
258
+ and `'fetch'` spans with `method`, `url`, `statusCode`, `request`,
259
+ `response`, `requestBody`, `responseBody`, `headers`, `durationMs`, and
260
+ `error` read from conventional attribute paths. Override `kinds` or
261
261
  `attributes.<field>` for external tracers, and add `metrics` with the same
262
- formats and placements as LLM-call metrics. The tab auto-hides when no
263
- matching spans exist.
262
+ formats and placements as LLM-call metrics.
264
263
 
265
264
  Stats rows and history charts on the eval card are opt-in via `stats` /
266
265
  `charts` on the eval definition. Their shapes live in the types; no need to
@@ -313,8 +312,7 @@ Mental model:
313
312
  span, that span gets a `cache.refs` entry with the value cache name, key,
314
313
  namespace, and hit/miss status. When called directly from the case body
315
314
  (no surrounding span), the ref is recorded on the case detail's `cacheRefs`
316
- array so spanless caches still appear in the UI's **Cache hits** tab, where
317
- each hit can be expanded for inspection or deleted by namespace/key.
315
+ array.
318
316
  - The cache key folds in a source-file fingerprint, so editing the eval busts
319
317
  the cache automatically.
320
318
  - `cache.namespace` on spans or `namespace` on value caches can share entries
@@ -335,19 +333,31 @@ Mental model:
335
333
  recorded SDK effects preserve richer built-ins such as `Date`, `Map`, `Set`,
336
334
  typed arrays, `URL`, `Headers`, `Blob`, and `File` on hits. Cache keys still
337
335
  use the deterministic key-hashing rules above.
338
- - Cache mode per run is controlled by CLI flags (see `agent-evals run --help`)
339
- and by a chevron menu on each eval card in the UI.
340
- - The UI Stop action cancels the whole active run by terminating that run's
341
- isolated execution process.
336
+ - Cache mode per run is controlled by CLI flags (see `agent-evals run --help`).
342
337
 
343
338
  ## Artifacts
344
339
 
345
340
  Run output lives under `.agent-evals/runs/<run-id>/` and cache entries under
346
341
  `.agent-evals/cache/<eval-id>.json`. Files in a run directory include run
347
342
  metadata, a run summary, per-case results, and per-case trace JSON. Inspect
348
- these when debugging persisted output, costs, columns, traces, or failures
349
- the filenames are stable even when their internal schema evolves, so pick the
350
- one whose name matches what you are debugging and read it directly.
343
+ these when debugging persisted output, costs, columns, traces, or failures.
344
+
345
+ Use `agent-evals show-runs` when you need stable file
346
+ paths before reading saved output:
347
+
348
+ ```sh
349
+ agent-evals show-runs
350
+ agent-evals show-runs latest --json
351
+ jq . .agent-evals/runs/<run-id>/summary.json
352
+ jq -s . .agent-evals/runs/<run-id>/cases.jsonl
353
+ jq . .agent-evals/runs/<run-id>/case-details/<case-id>.json
354
+ jq . .agent-evals/runs/<run-id>/traces/<case-id>.json
355
+ ```
356
+
357
+ Run ids can be full timestamp ids, short ids such as `r0` from
358
+ `agent-evals show-runs`, or `latest`. `show-runs` is only an artifact index;
359
+ the files themselves remain the source of truth for detailed results and
360
+ traces.
351
361
 
352
362
  ## Module mocking
353
363
 
@@ -386,15 +396,13 @@ When adding or changing evals:
386
396
  4. Surface reviewable values through execute-context `setOutput` or ambient
387
397
  `setEvalOutput` in shared workflow code, and shape them with `columns`
388
398
  formats from the `ColumnFormat` type.
389
- 5. Promote high-signal span attributes with `traceDisplay` so the UI
390
- highlights them in the trace tree and detail pane.
399
+ 5. Promote high-signal span attributes with `traceDisplay` so they surface in
400
+ the trace tree and detail pane.
391
401
  6. Cache costly pure spans with `cache: { key }` and pure spanless values with
392
402
  `evalTracer.cache(...)`; never cache operations whose external side effects
393
403
  you depend on.
394
404
  7. Sanity-check after changes: `agent-evals list`, then
395
- `agent-evals run --eval <id>`. Open the UI only when you need to inspect
396
- traces, trends, or fill manual scores. From an eval page, the eval actions
397
- menu can copy package-manager-specific CLI run and debug commands.
398
- 8. To debug a focused run, use
399
- `agent-evals run --inspect-brk --eval <id> --case <case-id>` and attach a
400
- Node.js debugger before continuing execution.
405
+ `agent-evals run --eval <id>`.
406
+ 8. Locate saved artifacts with `agent-evals show-runs latest --json`, then read
407
+ the relevant `summary.json`, `cases.jsonl`, `case-details/<case-id>.json`,
408
+ or `traces/<case-id>.json` file directly.