npm - @ls-stack/agent-eval - Versions diffs - 0.58.0 → 0.58.2 - Mend

@ls-stack/agent-eval 0.58.0 → 0.58.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

package/dist/{app-L9GdY28I.mjs → app-BxD6aHbp.mjs} +52 -7
package/dist/apps/web/dist/assets/index-BMWBZw_u.js +377 -0
package/dist/apps/web/dist/assets/index-CHH7m5Cv.css +1 -0
package/dist/apps/web/dist/index.html +2 -2
package/dist/bin.mjs +1 -1
package/dist/caseChild.mjs +2 -1
package/dist/{cli-Cf37PZKi.mjs → cli-HBwXIJsg.mjs} +31 -5
package/dist/index.d.mts +136 -80
package/dist/index.mjs +4 -4
package/dist/runChild.mjs +2 -2
package/dist/{runExecution-C4kAOhC1.mjs → runExecution-pHJ0_TzH.mjs} +188 -89
package/dist/{runOrchestration-5xEiQxiS.mjs → runOrchestration-ngVXShH4.mjs} +73 -6
package/dist/{runner-JIykMlve.mjs → runner-BnZMGBla.mjs} +1 -1
package/dist/{runner-bjd_UB9i.mjs → runner-D_pz2NON.mjs} +2 -2
package/dist/{src-303BocMW.mjs → src-AeXGBJ26.mjs} +2 -2
package/package.json +1 -1
package/skills/agent-eval/SKILL.md +18 -3
package/dist/apps/web/dist/assets/index-Cz9p4l-t.js +0 -377
package/dist/apps/web/dist/assets/index-DtARRwsS.css +0 -1

package/dist/index.d.mts (changed)

@@ -268,6 +268,7 @@ declare const runLogEntrySchema$1: z$1.ZodObject<{
   phase: z$1.ZodEnum<{
     eval: "eval";
     derive: "derive";
+    tracingAssertions: "tracingAssertions";
     outputsSchema: "outputsSchema";
     scorer: "scorer";
   }>;
@@ -399,10 +400,17 @@ type EvalCase$1$1<TInput = unknown> = {
 };
 /** Query helpers built from the flattened trace recorded for one eval case. */
 type EvalTraceTree = {
-  spans: EvalTraceSpan$2[];
-  rootSpans: EvalTraceSpan$2[];
-  findSpan: (name: string) => EvalTraceSpan$2 | undefined;
-  findSpansByKind: (kind: string) => EvalTraceSpan$2[];
+  /** Flat span list in creation order. */spans: EvalTraceSpan$2[]; /** Top-level spans whose `parentId` is `null`. */
+  rootSpans: EvalTraceSpan$2[]; /** Return the first span whose name exactly matches `name`. */
+  findSpan: (name: string) => EvalTraceSpan$2 | undefined; /** Return every span whose name exactly matches `name`. */
+  findSpans: (name: string) => EvalTraceSpan$2[]; /** Return whether any span name exactly matches `name`. */
+  hasSpan: (name: string) => boolean; /** Return every span whose kind exactly matches `kind`. */
+  findSpansByKind: (kind: string) => EvalTraceSpan$2[]; /** Return every span with `kind: 'tool'`. */
+  findToolCallSpans: () => EvalTraceSpan$2[]; /** Return the names of every span with `kind: 'tool'`. */
+  listToolCallSpanNames: () => string[]; /** Return whether a `kind: 'tool'` span has a name exactly matching `name`. */
+  hasToolCallSpan: (name: string) => boolean; /** Return span names in creation order, optionally filtered by kind. */
+  listSpanNames: (kind?: string) => string[]; /** Return span names in depth-first tree order, optionally filtered by kind. */
+  listSpanNamesDfs: (kind?: string) => string[]; /** Return all spans in depth-first tree order. */
   flattenDfs: () => EvalTraceSpan$2[];
   checkpoints: Map<string, unknown>;
 };
@@ -421,6 +429,12 @@ type EvalDeriveMap<TInput = unknown> = Record<string, EvalDeriveValueFn<TInput>>
 type EvalDeriveFn<TInput = unknown> = (ctx: EvalDeriveContext<TInput>) => Record<string, unknown> | Promise<Record<string, unknown>>;
 /** Trace-derived output config accepted globally and on eval definitions. */
 type EvalDeriveConfig<TInput = unknown> = EvalDeriveMap<TInput> | EvalDeriveFn<TInput>;
+/** Function that records trace-derived assertions for one case. */
+type EvalTracingAssertionsFn<TInput = unknown> = (ctx: EvalDeriveContext<TInput>) => MaybePromise$1<void>;
+/** Keyed trace-derived assertion config for grouping related checks. */
+type EvalTracingAssertionsMap<TInput = unknown> = Record<string, EvalTracingAssertionsFn<TInput>>;
+/** Trace-derived assertion config accepted globally and on eval definitions. */
+type EvalTracingAssertionsConfig<TInput = unknown> = EvalTracingAssertionsMap<TInput> | EvalTracingAssertionsFn<TInput>;
 /** UI overrides for a derived or scored column emitted by an eval. */
 type EvalColumnOverride = {
   /** Display label shown for the column in tables and detail views. */label?: string;
@@ -1142,9 +1156,18 @@ type EvalDefinitionBase<TInput = unknown, TOutputs extends EvalOutputs = EvalOut
    *
    * Prefer the keyed map form when each key has one derivation. The
    * object-returning callback form is also supported. Derived values only fill
-   * keys not already recorded during execution.
+   * keys not already recorded during execution. Assertion helpers are not
+   * allowed here; use `tracingAssertions` for trace-derived pass/fail checks.
    */
   deriveFromTracing?: EvalDeriveConfig<TInput>;
+  /**
+   * Record assertions from the finished execution trace.
+   *
+   * Runs after `deriveFromTracing` and before output schema validation and
+   * scores. Use `evalAssert(...)` or `evalExpect(...)` inside the callback to
+   * write normal assertion results without creating score columns.
+   */
+  tracingAssertions?: EvalTracingAssertionsConfig<TInput>;
   /**
    * Computed score columns for each case.
    *
@@ -1455,7 +1478,9 @@ type CacheScopeContext = {
 /** Active recording frame captured while a cached operation body executes. */
 type CacheRecordingFrame = {
   /** Length of `scope.spans` immediately before the cached body started. */baseSpanIndex: number; /** Parent id used when recording and replaying direct child spans. */
-  replayParentSpanId: string | null; /** Ordered observable effects recorded during the cached body. */
+  replayParentSpanId: string | null; /** Spans created by this cache body's async execution branch. */
+  spanIds: Set<string>; /** Non-cache attributes written to the replay parent by this async branch. */
+  finalAttributes: Record<string, unknown>; /** Ordered observable effects recorded during the cached body. */
   ops: CacheRecordingOp$1[];
 };
 /** Mutable per-case runtime state stored in async local storage. */
@@ -1480,11 +1505,6 @@ type EvalCaseScope = {
   logs: RunLogEntry$1[];
   spans: EvalTraceSpan$2[];
   checkpoints: Map<string, unknown>;
-  /**
-   * Stack of active cache recorders. Ops are written to the top-most frame
-   * when it exists and `replayingDepth === 0`.
-   */
-  recordingStack: CacheRecordingFrame[];
   /**
    * Incremented while replaying a cached operation, so nested SDK calls do not
    * accidentally double-record ops into outer recorders.
@@ -1506,12 +1526,16 @@ type EvalCaseScope = {
  * covers run-time module/environment loading, including top-level code in
  * modules imported while a run is being prepared.
  */
-type EvalRuntimeScope = 'env' | 'cases' | 'eval' | 'derive' | 'outputsSchema' | 'scorer';
+type EvalRuntimeScope = 'env' | 'cases' | 'eval' | 'derive' | 'tracingAssertions' | 'outputsSchema' | 'scorer';
 type EvalLogLevelInput = RunLogLevel$1 | 'warning';
 /** Error thrown when an eval assertion fails during case execution. */
 declare class EvalAssertionError extends Error {
   constructor(message: string);
 }
+/** Error thrown when an SDK helper is used in an unsupported runner phase. */
+declare class EvalRuntimeUsageError extends Error {
+  constructor(message: string);
+}
 /** Return the host process clock, bypassing the eval Date shim. */
 /**
  * Eval time helpers for reading and moving the active eval clock.
@@ -1542,8 +1566,10 @@ declare function getCurrentScope(): EvalCaseScope | undefined;
  *
  * Returns `null` outside eval-owned work, `env` while the runner is loading
  * eval modules for a run, `cases` while generating cases, `eval` while running
- * case `execute`, `derive` while deriving outputs from traces, `outputsSchema`
- * while validating outputs, and `scorer` while computing scores.
+ * case `execute`, `derive` while deriving outputs from traces,
+ * `tracingAssertions` while checking trace-derived assertions,
+ * `outputsSchema` while validating outputs, and `scorer` while computing
+ * scores.
  */
 declare function isInEvalScope(): EvalRuntimeScope | null;
 /**
@@ -1659,7 +1685,8 @@ declare function incrementEvalOutput(key: string, delta: number): void;
  * Calls made outside `runInEvalScope(...)` are ignored so shared workflow code
  * can safely reuse `evalAssert(...)` when it also runs outside an eval. The
  * TypeScript assertion signature still narrows the checked value after the
- * call.
+ * call. Calls inside `deriveFromTracing` throw because derivations must only
+ * write outputs; use `tracingAssertions` for trace-derived pass/fail checks.
  */
 declare function evalAssert(condition: unknown, message: string): asserts condition; //#endregion
 //#region src/valueCache.d.ts
@@ -2017,8 +2044,8 @@ declare const traceAttributeDisplaySchema: z$1.ZodObject<{
     subtree: "subtree";
   }>>;
   mode: z$1.ZodOptional<z$1.ZodEnum<{
-    all: "all";
     sum: "sum";
+    all: "all";
     last: "last";
   }>>;
 }, z$1.core.$strip>;
@@ -2053,8 +2080,8 @@ declare const traceDisplayConfigSchema: z$1.ZodObject<{
       subtree: "subtree";
     }>>;
     mode: z$1.ZodOptional<z$1.ZodEnum<{
-      all: "all";
       sum: "sum";
+      all: "all";
       last: "last";
     }>>;
   }, z$1.core.$strip>>>;
@@ -2093,8 +2120,8 @@ declare const traceAttributeDisplayInputSchema: z$1.ZodObject<{
     subtree: "subtree";
   }>>;
   mode: z$1.ZodOptional<z$1.ZodEnum<{
-    all: "all";
     sum: "sum";
+    all: "all";
     last: "last";
   }>>;
   transform: z$1.ZodOptional<z$1.ZodCustom<TraceAttributeTransform, TraceAttributeTransform>>;
@@ -2131,8 +2158,8 @@ declare const traceDisplayInputConfigSchema: z$1.ZodObject<{
       subtree: "subtree";
     }>>;
     mode: z$1.ZodOptional<z$1.ZodEnum<{
-      all: "all";
       sum: "sum";
+      all: "all";
       last: "last";
     }>>;
     transform: z$1.ZodOptional<z$1.ZodCustom<TraceAttributeTransform, TraceAttributeTransform>>;
@@ -2217,9 +2244,9 @@ type EvalFreshnessStatus = z$1.infer<typeof evalFreshnessStatusSchema>;
  */
 declare const evalStatAggregateSchema: z$1.ZodEnum<{
   avg: "avg";
-  sum: "sum";
   min: "min";
   max: "max";
+  sum: "sum";
   best: "best";
   worst: "worst";
 }>;
@@ -2249,9 +2276,9 @@ declare const evalStatItemSchema: z$1.ZodDiscriminatedUnion<[z$1.ZodObject<{
   kind: z$1.ZodLiteral<"duration">;
   aggregate: z$1.ZodOptional<z$1.ZodEnum<{
     avg: "avg";
-    sum: "sum";
     min: "min";
     max: "max";
+    sum: "sum";
     best: "best";
     worst: "worst";
   }>>;
@@ -2260,9 +2287,9 @@ declare const evalStatItemSchema: z$1.ZodDiscriminatedUnion<[z$1.ZodObject<{
   kind: z$1.ZodLiteral<"cacheHits">;
   aggregate: z$1.ZodOptional<z$1.ZodEnum<{
     avg: "avg";
-    sum: "sum";
     min: "min";
     max: "max";
+    sum: "sum";
     best: "best";
     worst: "worst";
   }>>;
@@ -2273,9 +2300,9 @@ declare const evalStatItemSchema: z$1.ZodDiscriminatedUnion<[z$1.ZodObject<{
   label: z$1.ZodOptional<z$1.ZodString>;
   aggregate: z$1.ZodEnum<{
     avg: "avg";
-    sum: "sum";
     min: "min";
     max: "max";
+    sum: "sum";
     best: "best";
     worst: "worst";
   }>;
@@ -2313,9 +2340,9 @@ declare const evalStatsConfigSchema: z$1.ZodArray<z$1.ZodDiscriminatedUnion<[z$1
   kind: z$1.ZodLiteral<"duration">;
   aggregate: z$1.ZodOptional<z$1.ZodEnum<{
     avg: "avg";
-    sum: "sum";
     min: "min";
     max: "max";
+    sum: "sum";
     best: "best";
     worst: "worst";
   }>>;
@@ -2324,9 +2351,9 @@ declare const evalStatsConfigSchema: z$1.ZodArray<z$1.ZodDiscriminatedUnion<[z$1
   kind: z$1.ZodLiteral<"cacheHits">;
   aggregate: z$1.ZodOptional<z$1.ZodEnum<{
     avg: "avg";
-    sum: "sum";
     min: "min";
     max: "max";
+    sum: "sum";
     best: "best";
     worst: "worst";
   }>>;
@@ -2337,9 +2364,9 @@ declare const evalStatsConfigSchema: z$1.ZodArray<z$1.ZodDiscriminatedUnion<[z$1
   label: z$1.ZodOptional<z$1.ZodString>;
   aggregate: z$1.ZodEnum<{
     avg: "avg";
-    sum: "sum";
     min: "min";
     max: "max";
+    sum: "sum";
     best: "best";
     worst: "worst";
   }>;
@@ -2422,10 +2449,10 @@ declare const evalSummarySchema$1: z$1.ZodObject<{
   caseIds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
   lastRunStatus: z$1.ZodNullable<z$1.ZodEnum<{
     error: "error";
-    running: "running";
-    cancelled: "cancelled";
     pass: "pass";
     fail: "fail";
+    running: "running";
+    cancelled: "cancelled";
     unscored: "unscored";
   }>>;
   stats: z$1.ZodOptional<z$1.ZodArray<z$1.ZodDiscriminatedUnion<[z$1.ZodObject<{
@@ -2440,9 +2467,9 @@ declare const evalSummarySchema$1: z$1.ZodObject<{
     kind: z$1.ZodLiteral<"duration">;
     aggregate: z$1.ZodOptional<z$1.ZodEnum<{
       avg: "avg";
-      sum: "sum";
       min: "min";
       max: "max";
+      sum: "sum";
       best: "best";
       worst: "worst";
     }>>;
@@ -2451,9 +2478,9 @@ declare const evalSummarySchema$1: z$1.ZodObject<{
     kind: z$1.ZodLiteral<"cacheHits">;
     aggregate: z$1.ZodOptional<z$1.ZodEnum<{
       avg: "avg";
-      sum: "sum";
       min: "min";
       max: "max";
+      sum: "sum";
       best: "best";
       worst: "worst";
     }>>;
@@ -2464,9 +2491,9 @@ declare const evalSummarySchema$1: z$1.ZodObject<{
     label: z$1.ZodOptional<z$1.ZodString>;
     aggregate: z$1.ZodEnum<{
       avg: "avg";
-      sum: "sum";
       min: "min";
       max: "max";
+      sum: "sum";
       best: "best";
       worst: "worst";
     }>;
@@ -2491,9 +2518,9 @@ declare const evalSummarySchema$1: z$1.ZodObject<{
   }, z$1.core.$strip>], "kind">>>;
   defaultStatAggregate: z$1.ZodOptional<z$1.ZodEnum<{
     avg: "avg";
-    sum: "sum";
     min: "min";
     max: "max";
+    sum: "sum";
     best: "best";
     worst: "worst";
   }>>;
@@ -2530,9 +2557,9 @@ declare const evalSummarySchema$1: z$1.ZodObject<{
       key: z$1.ZodString;
       aggregate: z$1.ZodEnum<{
         avg: "avg";
-        sum: "sum";
         min: "min";
         max: "max";
+        sum: "sum";
         latest: "latest";
         passThresholdRate: "passThresholdRate";
       }>;
@@ -2572,9 +2599,9 @@ declare const evalSummarySchema$1: z$1.ZodObject<{
       key: z$1.ZodString;
       aggregate: z$1.ZodEnum<{
         avg: "avg";
-        sum: "sum";
         min: "min";
         max: "max";
+        sum: "sum";
         latest: "latest";
         passThresholdRate: "passThresholdRate";
       }>;
@@ -2671,11 +2698,11 @@ declare const caseRowSchema$1: z$1.ZodObject<{
   tags: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
   status: z$1.ZodEnum<{
     error: "error";
-    pending: "pending";
-    running: "running";
-    cancelled: "cancelled";
     pass: "pass";
     fail: "fail";
+    running: "running";
+    cancelled: "cancelled";
+    pending: "pending";
   }>;
   durationMs: z$1.ZodNullable<z$1.ZodNumber>;
   cacheHits: z$1.ZodOptional<z$1.ZodNumber>;
@@ -2756,6 +2783,7 @@ type RunLogLevel = z$1.infer<typeof runLogLevelSchema>;
 declare const runLogPhaseSchema: z$1.ZodEnum<{
   eval: "eval";
   derive: "derive";
+  tracingAssertions: "tracingAssertions";
   outputsSchema: "outputsSchema";
   scorer: "scorer";
 }>;
@@ -2782,6 +2810,7 @@ declare const runLogEntrySchema: z$1.ZodObject<{
   phase: z$1.ZodEnum<{
     eval: "eval";
     derive: "derive";
+    tracingAssertions: "tracingAssertions";
     outputsSchema: "outputsSchema";
     scorer: "scorer";
   }>;
@@ -2862,8 +2891,8 @@ declare const scoreTraceSchema: z$1.ZodObject<{
         subtree: "subtree";
       }>>;
       mode: z$1.ZodOptional<z$1.ZodEnum<{
-        all: "all";
         sum: "sum";
+        all: "all";
         last: "last";
       }>>;
     }, z$1.core.$strip>>>;
@@ -2874,10 +2903,10 @@ declare const scoreTraceSchema: z$1.ZodObject<{
     namespace: z$1.ZodString;
     key: z$1.ZodString;
     status: z$1.ZodEnum<{
-      bypass: "bypass";
-      refresh: "refresh";
       hit: "hit";
       miss: "miss";
+      refresh: "refresh";
+      bypass: "bypass";
     }>;
     read: z$1.ZodOptional<z$1.ZodBoolean>;
     stored: z$1.ZodOptional<z$1.ZodBoolean>;
@@ -2896,11 +2925,11 @@ declare const caseDetailSchema$1: z$1.ZodObject<{
   tags: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
   status: z$1.ZodEnum<{
     error: "error";
-    pending: "pending";
-    running: "running";
-    cancelled: "cancelled";
     pass: "pass";
     fail: "fail";
+    running: "running";
+    cancelled: "cancelled";
+    pending: "pending";
   }>;
   input: z$1.ZodUnknown;
   trace: z$1.ZodArray<z$1.ZodObject<{
@@ -2965,8 +2994,8 @@ declare const caseDetailSchema$1: z$1.ZodObject<{
         subtree: "subtree";
       }>>;
       mode: z$1.ZodOptional<z$1.ZodEnum<{
-        all: "all";
         sum: "sum";
+        all: "all";
         last: "last";
       }>>;
     }, z$1.core.$strip>>>;
@@ -3034,8 +3063,8 @@ declare const caseDetailSchema$1: z$1.ZodObject<{
           subtree: "subtree";
         }>>;
         mode: z$1.ZodOptional<z$1.ZodEnum<{
-          all: "all";
           sum: "sum";
+          all: "all";
           last: "last";
         }>>;
       }, z$1.core.$strip>>>;
@@ -3046,10 +3075,10 @@ declare const caseDetailSchema$1: z$1.ZodObject<{
       namespace: z$1.ZodString;
       key: z$1.ZodString;
       status: z$1.ZodEnum<{
-        bypass: "bypass";
-        refresh: "refresh";
         hit: "hit";
         miss: "miss";
+        refresh: "refresh";
+        bypass: "bypass";
       }>;
       read: z$1.ZodOptional<z$1.ZodBoolean>;
       stored: z$1.ZodOptional<z$1.ZodBoolean>;
@@ -3140,6 +3169,7 @@ declare const caseDetailSchema$1: z$1.ZodObject<{
     phase: z$1.ZodEnum<{
       eval: "eval";
       derive: "derive";
+      tracingAssertions: "tracingAssertions";
       outputsSchema: "outputsSchema";
       scorer: "scorer";
     }>;
@@ -3166,10 +3196,10 @@ declare const caseDetailSchema$1: z$1.ZodObject<{
     namespace: z$1.ZodString;
     key: z$1.ZodString;
     status: z$1.ZodEnum<{
-      bypass: "bypass";
-      refresh: "refresh";
       hit: "hit";
       miss: "miss";
+      refresh: "refresh";
+      bypass: "bypass";
     }>;
     read: z$1.ZodOptional<z$1.ZodBoolean>;
     stored: z$1.ZodOptional<z$1.ZodBoolean>;
@@ -3223,9 +3253,9 @@ type EvalChartBuiltinMetric = z$1.infer<typeof evalChartBuiltinMetricSchema>;
 /** Reducer applied to a numeric column across all cases of a single run. */
 declare const evalChartAggregateSchema: z$1.ZodEnum<{
   avg: "avg";
-  sum: "sum";
   min: "min";
   max: "max";
+  sum: "sum";
   latest: "latest";
   passThresholdRate: "passThresholdRate";
 }>;
@@ -3281,9 +3311,9 @@ declare const evalChartMetricSchema: z$1.ZodDiscriminatedUnion<[z$1.ZodObject<{
   key: z$1.ZodString;
   aggregate: z$1.ZodEnum<{
     avg: "avg";
-    sum: "sum";
     min: "min";
     max: "max";
+    sum: "sum";
     latest: "latest";
     passThresholdRate: "passThresholdRate";
   }>;
@@ -3316,9 +3346,9 @@ declare const evalChartTooltipExtraSchema: z$1.ZodDiscriminatedUnion<[z$1.ZodObj
   key: z$1.ZodString;
   aggregate: z$1.ZodEnum<{
     avg: "avg";
-    sum: "sum";
     min: "min";
     max: "max";
+    sum: "sum";
     latest: "latest";
     passThresholdRate: "passThresholdRate";
   }>;
@@ -3364,9 +3394,9 @@ declare const evalChartConfigSchema: z$1.ZodObject<{
     key: z$1.ZodString;
     aggregate: z$1.ZodEnum<{
       avg: "avg";
-      sum: "sum";
       min: "min";
       max: "max";
+      sum: "sum";
       latest: "latest";
       passThresholdRate: "passThresholdRate";
     }>;
@@ -3406,9 +3436,9 @@ declare const evalChartConfigSchema: z$1.ZodObject<{
     key: z$1.ZodString;
     aggregate: z$1.ZodEnum<{
       avg: "avg";
-      sum: "sum";
       min: "min";
       max: "max";
+      sum: "sum";
       latest: "latest";
       passThresholdRate: "passThresholdRate";
     }>;
@@ -3454,9 +3484,9 @@ declare const evalChartsConfigSchema: z$1.ZodArray<z$1.ZodObject<{
     key: z$1.ZodString;
     aggregate: z$1.ZodEnum<{
       avg: "avg";
-      sum: "sum";
       min: "min";
       max: "max";
+      sum: "sum";
       latest: "latest";
       passThresholdRate: "passThresholdRate";
     }>;
@@ -3496,9 +3526,9 @@ declare const evalChartsConfigSchema: z$1.ZodArray<z$1.ZodObject<{
     key: z$1.ZodString;
     aggregate: z$1.ZodEnum<{
       avg: "avg";
-      sum: "sum";
       min: "min";
       max: "max";
+      sum: "sum";
       latest: "latest";
       passThresholdRate: "passThresholdRate";
     }>;
@@ -3514,10 +3544,10 @@ declare const runManifestSchema$1: z$1.ZodObject<{
   shortId: z$1.ZodString;
   status: z$1.ZodEnum<{
     error: "error";
-    pending: "pending";
     running: "running";
-    completed: "completed";
     cancelled: "cancelled";
+    pending: "pending";
+    completed: "completed";
   }>;
   temporary: z$1.ZodDefault<z$1.ZodOptional<z$1.ZodBoolean>>;
   startedAt: z$1.ZodString;
@@ -3526,9 +3556,9 @@ declare const runManifestSchema$1: z$1.ZodObject<{
   evalSourceFingerprints: z$1.ZodDefault<z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodString>>>;
   target: z$1.ZodObject<{
     mode: z$1.ZodEnum<{
+      caseIds: "caseIds";
       all: "all";
       evalIds: "evalIds";
-      caseIds: "caseIds";
     }>;
     evalKeys: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
     files: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
@@ -3542,9 +3572,9 @@ declare const runManifestSchema$1: z$1.ZodObject<{
     median: "median";
   }>>>;
   cacheMode: z$1.ZodOptional<z$1.ZodEnum<{
-    use: "use";
-    bypass: "bypass";
     refresh: "refresh";
+    bypass: "bypass";
+    use: "use";
   }>>;
 }, z$1.core.$strip>;
 /** Persisted lifecycle metadata for a single eval run. */
@@ -3554,10 +3584,10 @@ declare const runSummarySchema$1: z$1.ZodObject<{
   runId: z$1.ZodString;
   status: z$1.ZodEnum<{
     error: "error";
-    pending: "pending";
     running: "running";
-    completed: "completed";
     cancelled: "cancelled";
+    pending: "pending";
+    completed: "completed";
   }>;
   totalCases: z$1.ZodNumber;
   passedCases: z$1.ZodNumber;
@@ -3613,7 +3643,7 @@ type ScopedCaseSummary = {
 //#endregion
 //#region src/evalStatus.d.ts
 /** Display status used for eval, file, and folder UI surfaces. */
-type EvalDisplayStatus = DerivedStatus | 'stale' | 'outdated' | 'unscored';
+type EvalDisplayStatus = DerivedStatus | 'enqueued' | 'stale' | 'outdated' | 'unscored';
 /**
  * Derive the user-facing eval status from the raw latest run result plus
  * freshness state.
@@ -3661,10 +3691,17 @@ type EvalCase$1<TInput = unknown> = {
 };
 /** Query helpers built from the flattened trace recorded for one eval case. */
 type EvalTraceTree$1 = {
-  spans: EvalTraceSpan$1[];
-  rootSpans: EvalTraceSpan$1[];
-  findSpan: (name: string) => EvalTraceSpan$1 | undefined;
-  findSpansByKind: (kind: string) => EvalTraceSpan$1[];
+  /** Flat span list in creation order. */spans: EvalTraceSpan$1[]; /** Top-level spans whose `parentId` is `null`. */
+  rootSpans: EvalTraceSpan$1[]; /** Return the first span whose name exactly matches `name`. */
+  findSpan: (name: string) => EvalTraceSpan$1 | undefined; /** Return every span whose name exactly matches `name`. */
+  findSpans: (name: string) => EvalTraceSpan$1[]; /** Return whether any span name exactly matches `name`. */
+  hasSpan: (name: string) => boolean; /** Return every span whose kind exactly matches `kind`. */
+  findSpansByKind: (kind: string) => EvalTraceSpan$1[]; /** Return every span with `kind: 'tool'`. */
+  findToolCallSpans: () => EvalTraceSpan$1[]; /** Return the names of every span with `kind: 'tool'`. */
+  listToolCallSpanNames: () => string[]; /** Return whether a `kind: 'tool'` span has a name exactly matching `name`. */
+  hasToolCallSpan: (name: string) => boolean; /** Return span names in creation order, optionally filtered by kind. */
+  listSpanNames: (kind?: string) => string[]; /** Return span names in depth-first tree order, optionally filtered by kind. */
+  listSpanNamesDfs: (kind?: string) => string[]; /** Return all spans in depth-first tree order. */
   flattenDfs: () => EvalTraceSpan$1[];
   checkpoints: Map<string, unknown>;
 };
@@ -3684,6 +3721,13 @@ type EvalDeriveFn$1<TInput = unknown> = (ctx: EvalDeriveContext$1<TInput>) => Re
 /** Trace-derived output config accepted globally and on eval definitions. */
 type EvalDeriveConfig$1<TInput = unknown> = EvalDeriveMap$1<TInput> | EvalDeriveFn$1<TInput>;
 /** Schema for keyed or object-returning trace-derived output config. */
+/** Function that records trace-derived assertions for one case. */
+type EvalTracingAssertionsFn$1<TInput = unknown> = (ctx: EvalDeriveContext$1<TInput>) => MaybePromise<void>;
+/** Keyed trace-derived assertion config for grouping related checks. */
+type EvalTracingAssertionsMap$1<TInput = unknown> = Record<string, EvalTracingAssertionsFn$1<TInput>>;
+/** Trace-derived assertion config accepted globally and on eval definitions. */
+type EvalTracingAssertionsConfig$1<TInput = unknown> = EvalTracingAssertionsMap$1<TInput> | EvalTracingAssertionsFn$1<TInput>;
+/** Schema for function or keyed trace-derived assertion config. */
 /** UI overrides for a derived or scored column emitted by an eval. */
 type EvalColumnOverride$1 = {
   /** Display label shown for the column in tables and detail views. */label?: string;
@@ -4136,9 +4180,19 @@ type AgentEvalsConfig$1 = {
    * Prefer the keyed map form for shared metrics:
    * `{ toolCalls: ({ trace }) => trace.findSpansByKind('tool').length }`.
    * The object-returning function form is also supported. Derived outputs
-   * only fill keys that were not already recorded by eval execution.
+   * only fill keys that were not already recorded by eval execution. Do not
+   * call assertion helpers here; use `tracingAssertions` for trace-derived
+   * pass/fail checks.
    */
   deriveFromTracing?: EvalDeriveConfig$1;
+  /**
+   * Workspace-wide assertions derived from the finished execution trace.
+   *
+   * These run after `deriveFromTracing` and before output schema validation and
+   * scores. Use `evalAssert(...)` or `evalExpect(...)` inside the callback to
+   * record normal assertion results without creating fake score columns.
+   */
+  tracingAssertions?: EvalTracingAssertionsConfig$1;
   /**
    * Workspace-wide stats prepended to every eval's stats row.
    *
@@ -4469,9 +4523,9 @@ declare function extractApiCalls(spans: EvalTraceSpan$1[], config: ResolvedApiCa
  * - `refresh`: never read, always write (forces re-execution and overwrites).
  */
 declare const cacheModeSchema: z$1.ZodEnum<{
-  use: "use";
-  bypass: "bypass";
   refresh: "refresh";
+  bypass: "bypass";
+  use: "use";
 }>;
 /** Mode controlling how cached spans behave during a run. */
 type CacheMode = z$1.infer<typeof cacheModeSchema>;
@@ -4492,10 +4546,10 @@ declare const cacheOperationTypeSchema: z$1.ZodEnum<{
 type CacheOperationType = z$1.infer<typeof cacheOperationTypeSchema>;
 /** Status of a cache lookup recorded on a span or case scope. */
 declare const cacheStatusSchema: z$1.ZodEnum<{
-  bypass: "bypass";
-  refresh: "refresh";
   hit: "hit";
   miss: "miss";
+  refresh: "refresh";
+  bypass: "bypass";
 }>;
 /** Status of a cache lookup recorded on a span or case scope. */
 type CacheStatus = z$1.infer<typeof cacheStatusSchema>;
@@ -4512,10 +4566,10 @@ declare const traceCacheRefSchema: z$1.ZodObject<{
   namespace: z$1.ZodString;
   key: z$1.ZodString;
   status: z$1.ZodEnum<{
-    bypass: "bypass";
-    refresh: "refresh";
     hit: "hit";
     miss: "miss";
+    refresh: "refresh";
+    bypass: "bypass";
   }>;
   read: z$1.ZodOptional<z$1.ZodBoolean>;
   stored: z$1.ZodOptional<z$1.ZodBoolean>;
@@ -5467,9 +5521,9 @@ type ConfigReloadState = z$1.infer<typeof configReloadStateSchema$1>;
 declare const createRunRequestSchema$1: z$1.ZodObject<{
   target: z$1.ZodObject<{
     mode: z$1.ZodEnum<{
+      caseIds: "caseIds";
       all: "all";
       evalIds: "evalIds";
-      caseIds: "caseIds";
     }>;
     evalKeys: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
     files: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
@@ -5481,9 +5535,9 @@ declare const createRunRequestSchema$1: z$1.ZodObject<{
   temporary: z$1.ZodOptional<z$1.ZodBoolean>;
   cache: z$1.ZodOptional<z$1.ZodObject<{
     mode: z$1.ZodDefault<z$1.ZodEnum<{
-      use: "use";
-      bypass: "bypass";
       refresh: "refresh";
+      bypass: "bypass";
+      use: "use";
     }>>;
   }, z$1.core.$strip>>;
   manualInputs: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>>;
@@ -6369,6 +6423,7 @@ declare const caseDetailSchema: z$1.ZodObject<{
     phase: z$1.ZodEnum<{
       eval: "eval";
       derive: "derive";
+      tracingAssertions: "tracingAssertions";
       outputsSchema: "outputsSchema";
       scorer: "scorer";
     }>;
@@ -6995,7 +7050,8 @@ type EvalRunner = {
   getEvals(): EvalSummary$1[]; /** Look up one discovered eval by id. */
   getEval(id: string): EvalSummary$1 | undefined; /** Return discovery errors that should be shown before running evals. */
   getDiscoveryIssues(): DiscoveryIssue$1[]; /** Return current config-reload state for the long-running app server. */
-  getConfigReloadState(): ConfigReloadState$1; /** Re-scan configured eval files and emit a discovery update to listeners. */
+  getConfigReloadState(): ConfigReloadState$1; /** Return the effective per-run case concurrency after applying defaults. */
+  getConfiguredConcurrency(): number; /** Re-scan configured eval files and emit a discovery update to listeners. */
   refreshDiscovery(): Promise<void>;
   startRun(request: CreateRunRequest$1): Promise<{
     manifest: RunManifest$1;
@@ -7230,4 +7286,4 @@ declare function defineEval<TInput = unknown, TOutputs extends EvalOutputs = Eva
 /** Return whether the active eval case has tags matching the typed input. */
 declare function matchesEvalTags(input: EvalTagMatchInput): boolean;
 //#endregion
-export { AgentEvalTagRegistry, AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheRepairSummary, type CacheScopeContext, type CacheSerializationOptions, type CacheStatus, type CallDerivedAttribute, type CallDerivedAttributeContext, type CallDerivedAttributesConfig, type CallDerivedAttributesFn, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type ConfigReloadState, type ConfigReloadStatus, type CreateRunRequest, type DefaultConfigKey, type DerivedStatus, type DiscoveryIssue, EvalAssertionError, type EvalCacheConfig, EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, EvalDefinition, type EvalDeriveConfig, type EvalDeriveContext, type EvalDeriveFn, type EvalDeriveMap, type EvalDeriveValueFn, type EvalDisplayStatus, type EvalExecuteContext, type EvalExpectation, type EvalFreshnessStatus, type EvalManualInputConfig, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStartTime, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, EvalTag, 
EvalTagMatchInput, type EvalTraceTree, type JsonCell, type LlmCallCostBreakdown, type LlmCallCostCurrency, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallPricing, type LlmCallPricingRate, type LlmCallPricingRegistry, type LlmCallSimulatedTokens, type LlmCallsConfigInput, type LlmCostScenario, type ManualInputDescriptor, type ManualInputFieldDescriptor, type ManualInputFieldKind, type ManualInputFieldOverride, type ManualInputFieldsConfig, type ManualInputFileValue, type ManualInputSelectOption, type MaterializeManualInputFilesResult, type NumberDisplayOptions, type ReadManualInputFileResult, type RemoveDefaultConfig, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedCallDerivedAttribute, type ResolvedLlmCallCostCurrency, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, 
getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
+export { AgentEvalTagRegistry, AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheRepairSummary, type CacheScopeContext, type CacheSerializationOptions, type CacheStatus, type CallDerivedAttribute, type CallDerivedAttributeContext, type CallDerivedAttributesConfig, type CallDerivedAttributesFn, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type ConfigReloadState, type ConfigReloadStatus, type CreateRunRequest, type DefaultConfigKey, type DerivedStatus, type DiscoveryIssue, EvalAssertionError, type EvalCacheConfig, EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, EvalDefinition, type EvalDeriveConfig, type EvalDeriveContext, type EvalDeriveFn, type EvalDeriveMap, type EvalDeriveValueFn, type EvalDisplayStatus, type EvalExecuteContext, type EvalExpectation, type EvalFreshnessStatus, type EvalManualInputConfig, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, EvalRuntimeUsageError, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStartTime, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type 
EvalSummary, EvalTag, EvalTagMatchInput, type EvalTraceTree, type EvalTracingAssertionsConfig, type EvalTracingAssertionsFn, type EvalTracingAssertionsMap, type JsonCell, type LlmCallCostBreakdown, type LlmCallCostCurrency, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallPricing, type LlmCallPricingRate, type LlmCallPricingRegistry, type LlmCallSimulatedTokens, type LlmCallsConfigInput, type LlmCostScenario, type ManualInputDescriptor, type ManualInputFieldDescriptor, type ManualInputFieldKind, type ManualInputFieldOverride, type ManualInputFieldsConfig, type ManualInputFileValue, type ManualInputSelectOption, type MaterializeManualInputFilesResult, type NumberDisplayOptions, type ReadManualInputFileResult, type RemoveDefaultConfig, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedCallDerivedAttribute, type ResolvedLlmCallCostCurrency, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, 
extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };