npm - @ls-stack/agent-eval - Versions diffs - 0.15.0 → 0.16.0 - Mend

@ls-stack/agent-eval 0.15.0 → 0.16.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (17)

package/dist/{app-B7FUWsVm.mjs → app-B8e-oWYc.mjs} +3 -3
package/dist/apps/web/dist/assets/index-9hO8NpgZ.js +117 -0
package/dist/apps/web/dist/assets/{index-BVnLr79e.css → index-MARPw1bH.css} +1 -1
package/dist/apps/web/dist/index.html +2 -2
package/dist/bin.mjs +1 -1
package/dist/{cli-B-sCTyz8.mjs → cli-BmrtjQj_.mjs} +150 -4
package/dist/index.d.mts +14 -0
package/dist/index.mjs +3 -3
package/dist/runChild.mjs +1 -1
package/dist/{runOrchestration-B3fYtpKo.mjs → runOrchestration-BDyNrRQT.mjs} +2 -0
package/dist/{runner-vunKoSBu.mjs → runner-CsZqhbiA.mjs} +2 -2
package/dist/{runner-Dt-Ynv6s.mjs → runner-DABFPXkx.mjs} +1 -1
package/dist/src-CEAJYN_X.mjs +3 -0
package/package.json +3 -3
package/skills/agent-eval/SKILL.md +53 -45
package/dist/apps/web/dist/assets/index-Cx2CvM6a.js +0 -117
package/dist/src-jaOlXwb5.mjs +0 -3

package/dist/index.mjs CHANGED Viewed

@@ -1,4 +1,4 @@
-import { $ as caseRowSchema, $t as appendToEvalOutput, A as getEvalTitle, At as traceDisplayConfigSchema, B as apiCallMetricFormatSchema, Bt as fileRefSchema, C as createRunRequestSchema, Ct as serializedCacheSpanSchema, D as extractApiCalls, Dt as traceAttributeDisplayInputSchema, E as extractCacheHits, Et as traceAttributeDisplayFormatSchema, F as runManifestSchema, Ft as traceSpanWarningSchema, G as llmCallMetricPlacementSchema, Gt as z, H as apiCallMetricSchema, Ht as numberDisplayOptionsSchema, I as runSummarySchema, It as cellValueSchema, J as resolveApiCallsConfig, Jt as evalSpan, K as llmCallMetricSchema, Kt as buildTraceTree, L as DEFAULT_API_CALLS_CONFIG, Lt as columnDefSchema, M as deriveScopedSummaryFromCases, Mt as traceSpanErrorSchema, N as deriveStatusFromCaseRows, Nt as traceSpanKindSchema, O as extractLlmCalls, Ot as traceAttributeDisplayPlacementSchema, P as deriveStatusFromChildStatuses, Pt as traceSpanSchema, Q as caseDetailSchema, Qt as EvalAssertionError, R as DEFAULT_LLM_CALLS_CONFIG, Rt as columnFormatSchema, St as cacheStatusSchema, T as sseEnvelopeSchema, Tt as traceCacheRefSchema, U as apiCallsConfigSchema, Ut as repoFileRefSchema, V as apiCallMetricPlacementSchema, Vt as jsonCellSchema, W as llmCallMetricFormatSchema, Wt as runArtifactRefSchema, X as trialSelectionModeSchema, Xt as hashCacheKey, Y as resolveLlmCallsConfig, Yt as evalTracer, Z as assertionFailureSchema, Zt as hashCacheKeySync, _t as cacheListItemSchema, an as mergeEvalOutput, at as scoreTraceSchema, bt as cacheRecordingOpSchema, cn as runInEvalScope, ct as evalChartBuiltinMetricSchema, dn as setScopeCacheContext, dt as evalChartMetricSchema, en as evalAssert, et as evalFreshnessStatusSchema, fn as startEvalBackgroundJob, ft as evalChartTooltipExtraSchema, gt as cacheFileSchema, hn as getEvalRegistry, ht as cacheEntrySchema, in as isInEvalScope, it as evalSummarySchema, j as getEvalDisplayStatus, jt as traceDisplayInputConfigSchema, k as getNestedAttribute, kt as 
traceAttributeDisplaySchema, ln as runInExistingEvalScope, lt as evalChartColorSchema, mn as defineEval, mt as evalChartsConfigSchema, nn as getEvalCaseInput, nt as evalStatItemSchema, on as nextEvalId, ot as evalChartAggregateSchema, pn as repoFile, pt as evalChartTypeSchema, q as llmCallsConfigSchema, qt as captureEvalSpanError, rn as incrementEvalOutput, rt as evalStatsConfigSchema, sn as runInEvalRuntimeScope, st as evalChartAxisSchema, tn as getCurrentScope, tt as evalStatAggregateSchema, un as setEvalOutput, ut as evalChartConfigSchema, vt as cacheModeSchema, w as updateManualScoreRequestSchema, wt as spanCacheOptionsSchema, xt as cacheRecordingSchema, yt as cacheOperationTypeSchema, z as agentEvalsConfigSchema, zt as columnKindSchema } from "./runOrchestration-B3fYtpKo.mjs";
-import { n as createRunner, t as runCli } from "./cli-B-sCTyz8.mjs";
-import "./src-jaOlXwb5.mjs";
+import { $ as caseRowSchema, $t as appendToEvalOutput, A as getEvalTitle, At as traceDisplayConfigSchema, B as apiCallMetricFormatSchema, Bt as fileRefSchema, C as createRunRequestSchema, Ct as serializedCacheSpanSchema, D as extractApiCalls, Dt as traceAttributeDisplayInputSchema, E as extractCacheHits, Et as traceAttributeDisplayFormatSchema, F as runManifestSchema, Ft as traceSpanWarningSchema, G as llmCallMetricPlacementSchema, Gt as z, H as apiCallMetricSchema, Ht as numberDisplayOptionsSchema, I as runSummarySchema, It as cellValueSchema, J as resolveApiCallsConfig, Jt as evalSpan, K as llmCallMetricSchema, Kt as buildTraceTree, L as DEFAULT_API_CALLS_CONFIG, Lt as columnDefSchema, M as deriveScopedSummaryFromCases, Mt as traceSpanErrorSchema, N as deriveStatusFromCaseRows, Nt as traceSpanKindSchema, O as extractLlmCalls, Ot as traceAttributeDisplayPlacementSchema, P as deriveStatusFromChildStatuses, Pt as traceSpanSchema, Q as caseDetailSchema, Qt as EvalAssertionError, R as DEFAULT_LLM_CALLS_CONFIG, Rt as columnFormatSchema, St as cacheStatusSchema, T as sseEnvelopeSchema, Tt as traceCacheRefSchema, U as apiCallsConfigSchema, Ut as repoFileRefSchema, V as apiCallMetricPlacementSchema, Vt as jsonCellSchema, W as llmCallMetricFormatSchema, Wt as runArtifactRefSchema, X as trialSelectionModeSchema, Xt as hashCacheKey, Y as resolveLlmCallsConfig, Yt as evalTracer, Z as assertionFailureSchema, Zt as hashCacheKeySync, _t as cacheListItemSchema, an as mergeEvalOutput, at as scoreTraceSchema, bt as cacheRecordingOpSchema, cn as runInEvalScope, ct as evalChartBuiltinMetricSchema, dn as setScopeCacheContext, dt as evalChartMetricSchema, en as evalAssert, et as evalFreshnessStatusSchema, fn as startEvalBackgroundJob, ft as evalChartTooltipExtraSchema, gt as cacheFileSchema, hn as getEvalRegistry, ht as cacheEntrySchema, in as isInEvalScope, it as evalSummarySchema, j as getEvalDisplayStatus, jt as traceDisplayInputConfigSchema, k as getNestedAttribute, kt as 
traceAttributeDisplaySchema, ln as runInExistingEvalScope, lt as evalChartColorSchema, mn as defineEval, mt as evalChartsConfigSchema, nn as getEvalCaseInput, nt as evalStatItemSchema, on as nextEvalId, ot as evalChartAggregateSchema, pn as repoFile, pt as evalChartTypeSchema, q as llmCallsConfigSchema, qt as captureEvalSpanError, rn as incrementEvalOutput, rt as evalStatsConfigSchema, sn as runInEvalRuntimeScope, st as evalChartAxisSchema, tn as getCurrentScope, tt as evalStatAggregateSchema, un as setEvalOutput, ut as evalChartConfigSchema, vt as cacheModeSchema, w as updateManualScoreRequestSchema, wt as spanCacheOptionsSchema, xt as cacheRecordingSchema, yt as cacheOperationTypeSchema, z as agentEvalsConfigSchema, zt as columnKindSchema } from "./runOrchestration-BDyNrRQT.mjs";
+import { n as createRunner, t as runCli } from "./cli-BmrtjQj_.mjs";
+import "./src-CEAJYN_X.mjs";
 export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, 
traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };

package/dist/runChild.mjs CHANGED Viewed

@@ -1,4 +1,4 @@
-import { C as createRunRequestSchema, F as runManifestSchema, I as runSummarySchema, Lt as columnDefSchema, S as createFsCacheStore, mt as evalChartsConfigSchema, rt as evalStatsConfigSchema, t as executeRun, v as parseEvalMetas, y as loadConfig } from "./runOrchestration-B3fYtpKo.mjs";
+import { C as createRunRequestSchema, F as runManifestSchema, I as runSummarySchema, Lt as columnDefSchema, S as createFsCacheStore, mt as evalChartsConfigSchema, rt as evalStatsConfigSchema, t as executeRun, v as parseEvalMetas, y as loadConfig } from "./runOrchestration-BDyNrRQT.mjs";
 import { createHash } from "node:crypto";
 import { readFile } from "node:fs/promises";
 import { relative } from "node:path";

package/dist/{runOrchestration-B3fYtpKo.mjs → runOrchestration-BDyNrRQT.mjs} RENAMED Viewed

@@ -3913,6 +3913,7 @@ const agentEvalsConfigSchema = z.object({
 	trialSelection: trialSelectionModeSchema.optional(),
 	concurrency: z.number().optional(),
 	staleAfterDays: z.number().optional(),
+	allowCliRunAll: z.boolean().optional(),
 	traceDisplay: traceDisplayInputConfigSchema.optional(),
 	llmCalls: llmCallsConfigSchema.optional(),
 	apiCalls: apiCallsConfigSchema.optional(),
@@ -4881,6 +4882,7 @@ const defaultConfig = {
 	trialSelection: "lowestScore",
 	concurrency: 2,
 	staleAfterDays: 14,
+	allowCliRunAll: false,
 	traceDisplay: { attributes: [{
 		path: "input",
 		label: "Input",

package/dist/{runner-vunKoSBu.mjs → runner-CsZqhbiA.mjs} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { n as createRunner } from "./cli-B-sCTyz8.mjs";
-import "./src-jaOlXwb5.mjs";
+import { n as createRunner } from "./cli-BmrtjQj_.mjs";
+import "./src-CEAJYN_X.mjs";
 //#region ../../apps/server/src/runner.ts
 let runnerInstance = null;
 function getRunnerInstance() {

package/dist/{runner-Dt-Ynv6s.mjs → runner-DABFPXkx.mjs} RENAMED Viewed

@@ -1,2 +1,2 @@
-import { n as initRunner, t as getRunnerInstance } from "./runner-vunKoSBu.mjs";
+import { n as initRunner, t as getRunnerInstance } from "./runner-CsZqhbiA.mjs";
 export { getRunnerInstance, initRunner };

package/dist/src-CEAJYN_X.mjs ADDED Viewed

@@ -0,0 +1,3 @@
+import "./runOrchestration-BDyNrRQT.mjs";
+import "./cli-BmrtjQj_.mjs";
+export {};

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@ls-stack/agent-eval",
-  "version": "0.15.0",
+  "version": "0.16.0",
   "type": "module",
   "bin": {
     "agent-evals": "./dist/bin.mjs"
@@ -59,8 +59,8 @@
     "@types/node": "^24.7.2",
     "typescript": "^5.9.2",
     "@agent-evals/runner": "0.0.1",
-    "@agent-evals/sdk": "0.0.1",
-    "@agent-evals/shared": "0.0.1"
+    "@agent-evals/shared": "0.0.1",
+    "@agent-evals/sdk": "0.0.1"
   },
   "scripts": {
     "build": "pnpm --filter @agent-evals/web build && tsdown",

package/skills/agent-eval/SKILL.md CHANGED Viewed

@@ -24,6 +24,10 @@ display rules), read the TypeScript declarations shipped with the package:
   Unknown help targets exit non-zero instead of falling back to global help.
 - The CLI automatically loads `.env` from the current workspace. Shell-provided
   environment variables win; pass `--no-env` to disable `.env` loading once.
+- Unfiltered `agent-evals run` is disabled by default; use `--eval` or `--case`
+  for targeted CLI runs. Set `allowCliRunAll: true` in
+  `agent-evals.config.ts` to opt into run-all CLI behavior. The web UI can
+  still run grouped evals and confirms before starting more than five.
 Assume that enumerated tables in this document may lag behind the types —
 treat the types as source of truth when they disagree.
@@ -117,21 +121,19 @@ export async function runRefundWorkflow(input: RefundInput) {
 }
 ```
-Span `kind` values are open-ended strings and are color-coded automatically in
-the UI for every kind used during the app session. Use familiar kinds such as
+Span `kind` values are open-ended strings. Use familiar kinds such as
 `agent`, `tool`, `llm`, `api`, `retrieval`, `scorer`, or `checkpoint` when they
 fit, and preserve external tracer kinds such as `mastra.workflow.step` when they
-are more specific. The UI automatically promotes only the `input` and `output` span
-attributes. Use `traceDisplay` for other span attributes such as `model`,
+are more specific. Only the `input` and `output` span attributes are promoted
+automatically; use `traceDisplay` for other span attributes such as `model`,
 `usage`, or `costUsd`.
 Use `captureEvalSpanError(error)` for recoverable errors on the active
 `evalTracer.span(...)`, such as optional model/tool failures that fall back and
 continue. You can pass one error, multiple error arguments, or an array. The
-span is still marked `error`, and the UI renders captured errors in a dedicated
-span detail block with timing relative to the span. Pass `'warning'` or
-`{ level: 'warning' }` as the final argument for diagnostics that should be
-visible in span detail without changing an otherwise successful span's status.
+span is still marked `error`. Pass `'warning'` or `{ level: 'warning' }` as the
+final argument for diagnostics that should not change an otherwise successful
+span's status.
 If a span callback throws, the SDK automatically marks that span as `error`,
 stores the thrown error on it, and rethrows so the case errors. Use that for
@@ -208,12 +210,11 @@ or if the case errors. Scores without `passThreshold` are informational.
 Score functions run in their own trace scope, separate from the execution
 trace, so LLM-as-judge scorers can use `evalTracer.span(...)` and cached spans
-without polluting the agent trajectory. The case detail UI shows execution
-spans on **Trace** and scorer spans on **Scoring**. Outputs set inside a scorer
-stay private to that score.
+without polluting the agent trajectory. Outputs set inside a scorer stay
+private to that score.
-`manualScores` declares score columns that reviewers fill in the web UI after
-a run. Pending values keep the eval in an `unscored` state instead of failing.
+`manualScores` declares score columns that reviewers fill in after a run.
+Pending values keep the eval in an `unscored` state instead of failing.
 See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
 (format, threshold, column overrides).
@@ -245,22 +246,20 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
   detail pane; it supports aggregation across subtrees (`scope`, `mode`) and
   user-defined `transform(...)` for derived views (e.g. currency conversion).
   See the `TraceDisplayInputConfig` type.
-- `llmCalls` (in `agent-evals.config.ts`) configures the LLM calls tab in the
-  case-run drawer. Defaults to `kind: 'llm'` spans with `model`, `usage.*`,
-  `costUsd`, `input`, `output`, etc. read from conventional attribute paths.
-  Override `kinds` to broaden the filter, override `attributes.<field>` for
-  non-default span shapes, and add entries to `metrics` to surface arbitrary
-  user metrics (`format: 'string' | 'number' | 'duration' | 'json' |
-'boolean'`, `placements: ['header' | 'body']`). The tab auto-hides when no
-  matching spans exist.
-- `apiCalls` (in `agent-evals.config.ts`) configures the API calls tab in the
-  case-run drawer. Defaults to `kind: 'api'`, `'http'`, `'http.client'`, and
-  `'fetch'` spans with `method`, `url`, `statusCode`, `request`, `response`,
-  `requestBody`, `responseBody`, `headers`, `durationMs`, and `error` read
-  from conventional attribute paths. Override `kinds` or
+- `llmCalls` (in `agent-evals.config.ts`) configures how LLM-call spans are
+  summarized for review. Defaults to `kind: 'llm'` spans with `model`,
+  `usage.*`, `costUsd`, `input`, `output`, etc. read from conventional
+  attribute paths. Override `kinds` to broaden the filter, override
+  `attributes.<field>` for non-default span shapes, and add entries to
+  `metrics` to surface arbitrary user metrics (`format: 'string' | 'number' |
+'duration' | 'json' | 'boolean'`, `placements: ['header' | 'body']`).
+- `apiCalls` (in `agent-evals.config.ts`) configures how API-call spans are
+  summarized for review. Defaults to `kind: 'api'`, `'http'`, `'http.client'`,
+  and `'fetch'` spans with `method`, `url`, `statusCode`, `request`,
+  `response`, `requestBody`, `responseBody`, `headers`, `durationMs`, and
+  `error` read from conventional attribute paths. Override `kinds` or
   `attributes.<field>` for external tracers, and add `metrics` with the same
-  formats and placements as LLM-call metrics. The tab auto-hides when no
-  matching spans exist.
+  formats and placements as LLM-call metrics.
 Stats rows and history charts on the eval card are opt-in via `stats` /
 `charts` on the eval definition. Their shapes live in the types; no need to
@@ -313,8 +312,7 @@ Mental model:
   span, that span gets a `cache.refs` entry with the value cache name, key,
   namespace, and hit/miss status. When called directly from the case body
   (no surrounding span), the ref is recorded on the case detail's `cacheRefs`
-  array so spanless caches still appear in the UI's **Cache hits** tab, where
-  each hit can be expanded for inspection or deleted by namespace/key.
+  array.
 - The cache key folds in a source-file fingerprint, so editing the eval busts
   the cache automatically.
 - `cache.namespace` on spans or `namespace` on value caches can share entries
@@ -335,19 +333,31 @@ Mental model:
   recorded SDK effects preserve richer built-ins such as `Date`, `Map`, `Set`,
   typed arrays, `URL`, `Headers`, `Blob`, and `File` on hits. Cache keys still
   use the deterministic key-hashing rules above.
-- Cache mode per run is controlled by CLI flags (see `agent-evals run --help`)
-  and by a chevron menu on each eval card in the UI.
-- The UI Stop action cancels the whole active run by terminating that run's
-  isolated execution process.
+- Cache mode per run is controlled by CLI flags (see `agent-evals run --help`).
 ## Artifacts
 Run output lives under `.agent-evals/runs/<run-id>/` and cache entries under
 `.agent-evals/cache/<eval-id>.json`. Files in a run directory include run
 metadata, a run summary, per-case results, and per-case trace JSON. Inspect
-these when debugging persisted output, costs, columns, traces, or failures —
-the filenames are stable even when their internal schema evolves, so pick the
-one whose name matches what you are debugging and read it directly.
+these when debugging persisted output, costs, columns, traces, or failures.
+Use `agent-evals show-runs` when you need stable file
+paths before reading saved output:
+```sh
+agent-evals show-runs
+agent-evals show-runs latest --json
+jq . .agent-evals/runs/<run-id>/summary.json
+jq -s . .agent-evals/runs/<run-id>/cases.jsonl
+jq . .agent-evals/runs/<run-id>/case-details/<case-id>.json
+jq . .agent-evals/runs/<run-id>/traces/<case-id>.json
+```
+Run ids can be full timestamp ids, short ids such as `r0` from
+`agent-evals show-runs`, or `latest`. `show-runs` is only an artifact index;
+the files themselves remain the source of truth for detailed results and
+traces.
 ## Module mocking
@@ -386,15 +396,13 @@ When adding or changing evals:
 4. Surface reviewable values through execute-context `setOutput` or ambient
    `setEvalOutput` in shared workflow code, and shape them with `columns`
    formats from the `ColumnFormat` type.
-5. Promote high-signal span attributes with `traceDisplay` so the UI
-   highlights them in the trace tree and detail pane.
+5. Promote high-signal span attributes with `traceDisplay` so they surface in
+   the trace tree and detail pane.
 6. Cache costly pure spans with `cache: { key }` and pure spanless values with
    `evalTracer.cache(...)`; never cache operations whose external side effects
    you depend on.
 7. Sanity-check after changes: `agent-evals list`, then
-   `agent-evals run --eval <id>`. Open the UI only when you need to inspect
-   traces, trends, or fill manual scores. From an eval page, the eval actions
-   menu can copy package-manager-specific CLI run and debug commands.
-8. To debug a focused run, use
-   `agent-evals run --inspect-brk --eval <id> --case <case-id>` and attach a
-   Node.js debugger before continuing execution.
+   `agent-evals run --eval <id>`.
+8. Locate saved artifacts with `agent-evals show-runs latest --json`, then read
+   the relevant `summary.json`, `cases.jsonl`, `case-details/<case-id>.json`,
+   or `traces/<case-id>.json` file directly.