npm - @ls-stack/agent-eval - Versions diffs - 0.58.1 → 0.58.3 - Mend

@ls-stack/agent-eval 0.58.1 → 0.58.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

package/dist/{app-DhMIbjlE.mjs → app-ROCEce9X.mjs} +52 -7
package/dist/apps/web/dist/assets/index-CHH7m5Cv.css +1 -0
package/dist/apps/web/dist/assets/index-PTikBbhf.js +377 -0
package/dist/apps/web/dist/index.html +2 -2
package/dist/bin.mjs +1 -1
package/dist/caseChild.mjs +2 -1
package/dist/{cli-_g2qOMK6.mjs → cli-SP4kEtYL.mjs} +31 -5
package/dist/index.d.mts +184 -129
package/dist/index.mjs +4 -4
package/dist/runChild.mjs +2 -2
package/dist/{runExecution-d42Lm0i5.mjs → runExecution-CFw0MQFs.mjs} +114 -21
package/dist/{runOrchestration-CvmFeOmT.mjs → runOrchestration-CxjiQmof.mjs} +73 -6
package/dist/{runner-BKogjiYd.mjs → runner-BlFQyvN2.mjs} +1 -1
package/dist/{runner-MSr8sAWm.mjs → runner-CY3bgsjU.mjs} +2 -2
package/dist/{src-CdZsOn6y.mjs → src-7GbQj1sb.mjs} +2 -2
package/package.json +3 -3
package/skills/agent-eval/SKILL.md +19 -3
package/dist/apps/web/dist/assets/index-Cz9p4l-t.js +0 -377
package/dist/apps/web/dist/assets/index-DtARRwsS.css +0 -1

package/dist/index.mjs CHANGED Viewed

@@ -1,4 +1,4 @@
-import { $ as startEvalBackgroundJob, A as repoFile, B as getCurrentScope, C as evalTracer, D as deserializeCacheValue, E as deserializeCacheRecording, F as appendToEvalOutput, G as mergeEvalOutput, H as incrementEvalOutput, J as runInEvalScope, K as nextEvalId, L as evalAssert, M as readManualInputFile, N as evalExpect, Nt as getEvalRegistry, O as serializeCacheRecording, P as EvalAssertionError, Q as setScopeCacheContext, R as evalLog, S as evalSpan, T as hashCacheKeySync, U as isInEvalScope, V as getEvalCaseInput, Y as runInExistingEvalScope, Z as setEvalOutput, at as extractLlmCalls, b as buildTraceTree, it as extractApiCalls, j as manualInputFileValueSchema, k as serializeCacheValue, lt as getNestedAttribute, nt as extractCacheEntries, ot as simulateLlmCallCost, q as runInEvalRuntimeScope, rt as extractCacheHits, st as simulateTokenAllocation, w as hashCacheKey, x as captureEvalSpanError, y as z, z as evalTime } from "./runExecution-d42Lm0i5.mjs";
-import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-_g2qOMK6.mjs";
-import { n as matchesEvalTags, t as defineEval } from "./src-CdZsOn6y.mjs";
-export { EvalAssertionError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
+import { $ as setScopeCacheContext, A as repoFile, B as evalTime, C as evalTracer, D as deserializeCacheValue, E as deserializeCacheRecording, F as EvalRuntimeUsageError, Ft as getEvalRegistry, H as getEvalCaseInput, I as appendToEvalOutput, J as runInEvalRuntimeScope, K as mergeEvalOutput, M as readManualInputFile, N as evalExpect, O as serializeCacheRecording, P as EvalAssertionError, Q as setEvalOutput, R as evalAssert, S as evalSpan, T as hashCacheKeySync, U as incrementEvalOutput, V as getCurrentScope, W as isInEvalScope, X as runInExistingEvalScope, Y as runInEvalScope, at as extractApiCalls, b as buildTraceTree, ct as simulateTokenAllocation, et as startEvalBackgroundJob, it as extractCacheHits, j as manualInputFileValueSchema, k as serializeCacheValue, ot as extractLlmCalls, q as nextEvalId, rt as extractCacheEntries, st as simulateLlmCallCost, ut as getNestedAttribute, w as hashCacheKey, x as captureEvalSpanError, y as z, z as evalLog } from "./runExecution-CFw0MQFs.mjs";
+import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-SP4kEtYL.mjs";
+import { n as matchesEvalTags, t as defineEval } from "./src-7GbQj1sb.mjs";
+export { EvalAssertionError, EvalRuntimeUsageError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };

package/dist/runChild.mjs CHANGED Viewed

@@ -1,5 +1,5 @@
-import { At as evalChartsConfigSchema, Ct as buildEvalKey, Dt as evalStatAggregateSchema, I as configureEvalRunLogs, Ot as evalStatsConfigSchema, bt as runSummarySchema, et as createRunRequestSchema, jt as columnDefSchema, kt as manualInputDescriptorSchema, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, v as createFsCacheStore, yt as runManifestSchema } from "./runExecution-d42Lm0i5.mjs";
-import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-CvmFeOmT.mjs";
+import { At as evalStatsConfigSchema, L as configureEvalRunLogs, Mt as evalChartsConfigSchema, Nt as columnDefSchema, Tt as buildEvalKey, bt as runManifestSchema, jt as manualInputDescriptorSchema, kt as evalStatAggregateSchema, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, tt as createRunRequestSchema, v as createFsCacheStore, xt as runSummarySchema } from "./runExecution-CFw0MQFs.mjs";
+import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-CxjiQmof.mjs";
 import { z } from "zod/v4";
 import { readFile } from "node:fs/promises";
 import { relative } from "node:path";

package/dist/{runExecution-d42Lm0i5.mjs → runExecution-CFw0MQFs.mjs} RENAMED Viewed

@@ -851,6 +851,7 @@ const runLogLevelSchema = z.enum([
 const runLogPhaseSchema = z.enum([
 	"eval",
 	"derive",
+	"tracingAssertions",
 	"outputsSchema",
 	"scorer"
 ]);
@@ -1008,6 +1009,8 @@ const removeDefaultConfigSchema = z.union([z.literal(true), z.array(defaultConfi
 const evalDeriveValueFnSchema = z.custom((value) => typeof value === "function", { message: "Expected a derive output function" });
 /** Schema for keyed or object-returning trace-derived output config. */
 const evalDeriveConfigSchema = z.union([z.custom((value) => typeof value === "function", { message: "Expected a deriveFromTracing function" }), z.record(z.string().min(1), evalDeriveValueFnSchema)]);
+/** Schema for trace-derived assertion config. */
+const evalTracingAssertionsConfigSchema = z.custom((value) => typeof value === "function", { message: "Expected a tracingAssertions function" });
 /** Schema for UI overrides on derived or scored columns. */
 const evalColumnOverrideSchema = z.object({
 	label: z.string().optional(),
@@ -1411,6 +1414,7 @@ const agentEvalsConfigSchema = z.object({
 	traceDisplay: traceDisplayInputConfigSchema.optional(),
 	columns: evalColumnsSchema.optional(),
 	deriveFromTracing: evalDeriveConfigSchema.optional(),
+	tracingAssertions: evalTracingAssertionsConfigSchema.optional(),
 	stats: evalStatsConfigSchema.optional(),
 	defaultStatAggregate: evalStatAggregateSchema.optional(),
 	llmCalls: llmCallsConfigSchema.optional(),
@@ -1847,8 +1851,9 @@ function deriveScopedSummaryFromCases(params) {
 * freshness state.
 */
 function getEvalDisplayStatus(params) {
-	const { stale, outdated, lastRunStatus, isRunning = false } = params;
+	const { stale, outdated, lastRunStatus, isRunning = false, isEnqueued = false } = params;
 	if (isRunning || lastRunStatus === "running") return "running";
+	if (isEnqueued) return "enqueued";
 	if (lastRunStatus === "pass") {
 		if (stale) return "stale";
 		if (outdated) return "outdated";
@@ -2718,6 +2723,17 @@ var EvalAssertionError = class extends Error {
 		this.name = "EvalAssertionError";
 	}
 };
+/** Error thrown when an SDK helper is used in an unsupported runner phase. */
+var EvalRuntimeUsageError = class extends Error {
+	constructor(message) {
+		super(message);
+		this.name = "EvalRuntimeUsageError";
+	}
+};
+/** Throw when assertion helpers are used in a runner phase that forbids them. */
+function assertEvalAssertionsAllowed(apiName) {
+	if (getCurrentScope() && runtimeScopeStorage.getStore() === "derive") throw new EvalRuntimeUsageError(`${apiName} cannot be used inside deriveFromTracing. Use tracingAssertions for trace-derived assertions.`);
+}
 function getEvalClockStateNowMs(state) {
 	const elapsedMs = state.frozen ? 0 : realDate.now() - state.realStartMs;
 	return state.startMs + elapsedMs + state.offsetMs;
@@ -2823,8 +2839,10 @@ function recordSpanForActiveCacheRecording(scope, spanId) {
 *
 * Returns `null` outside eval-owned work, `env` while the runner is loading
 * eval modules for a run, `cases` while generating cases, `eval` while running
-* case `execute`, `derive` while deriving outputs from traces, `outputsSchema`
-* while validating outputs, and `scorer` while computing scores.
+* case `execute`, `derive` while deriving outputs from traces,
+* `tracingAssertions` while checking trace-derived assertions,
+* `outputsSchema` while validating outputs, and `scorer` while computing
+* scores.
 */
 function isInEvalScope() {
 	if (activeEvalRuntimeScopeCount === 0) return null;
@@ -2845,7 +2863,7 @@ function normalizeLogLevel(level) {
 }
 function getCurrentLogPhase() {
 	const runtimeScope = runtimeScopeStorage.getStore();
-	if (runtimeScope === "eval" || runtimeScope === "derive" || runtimeScope === "outputsSchema" || runtimeScope === "scorer") return runtimeScope;
+	if (runtimeScope === "eval" || runtimeScope === "derive" || runtimeScope === "tracingAssertions" || runtimeScope === "outputsSchema" || runtimeScope === "scorer") return runtimeScope;
 	return null;
 }
 function formatLogArgs(args) {
@@ -3300,10 +3318,12 @@ function incrementEvalOutput(key, delta) {
 * Calls made outside `runInEvalScope(...)` are ignored so shared workflow code
 * can safely reuse `evalAssert(...)` when it also runs outside an eval. The
 * TypeScript assertion signature still narrows the checked value after the
-* call.
+* call. Calls inside `deriveFromTracing` throw because derivations must only
+* write outputs; use `tracingAssertions` for trace-derived pass/fail checks.
 */
 function evalAssert(condition, message) {
 	const scope = getCurrentScope();
+	assertEvalAssertionsAllowed("evalAssert(...)");
 	if (condition) {
 		if (scope) scope.assertions.push({
 			message,
@@ -3454,6 +3474,7 @@ var EvalExpectationImpl = class EvalExpectationImpl {
 * case scope is active, matching `evalAssert(...)`.
 */
 function evalExpect(value) {
+	assertEvalAssertionsAllowed("evalExpect(...)");
 	return new EvalExpectationImpl(value, false);
 }
 //#endregion
@@ -5005,25 +5026,53 @@ const evalTracer = {
 };
 /** Build a queryable trace tree helper from a flat span list and checkpoints. */
 function buildTraceTree(spans, checkpoints) {
+	const rootSpans = spans.filter((s) => s.parentId === null);
+	const flattenDfs = () => {
+		const result = [];
+		function visit(parentId) {
+			for (const childSpan of spans) if (childSpan.parentId === parentId) {
+				result.push(childSpan);
+				visit(childSpan.id);
+			}
+		}
+		visit(null);
+		return result;
+	};
+	const filterSpanNames = (sourceSpans, kind) => {
+		return sourceSpans.filter((span) => kind === void 0 || span.kind === kind).map((span) => span.name);
+	};
 	return {
 		spans,
-		rootSpans: spans.filter((s) => s.parentId === null),
+		rootSpans,
 		findSpan(name) {
 			return spans.find((s) => s.name === name);
 		},
+		findSpans(name) {
+			return spans.filter((s) => s.name === name);
+		},
+		hasSpan(name) {
+			return spans.some((s) => s.name === name);
+		},
 		findSpansByKind(kind) {
 			return spans.filter((s) => s.kind === kind);
 		},
+		findToolCallSpans() {
+			return spans.filter((s) => s.kind === "tool");
+		},
+		listToolCallSpanNames() {
+			return filterSpanNames(spans, "tool");
+		},
+		hasToolCallSpan(name) {
+			return spans.some((s) => s.kind === "tool" && s.name === name);
+		},
+		listSpanNames(kind) {
+			return filterSpanNames(spans, kind);
+		},
+		listSpanNamesDfs(kind) {
+			return filterSpanNames(flattenDfs(), kind);
+		},
 		flattenDfs() {
-			const result = [];
-			function visit(parentId) {
-				for (const childSpan of spans) if (childSpan.parentId === parentId) {
-					result.push(childSpan);
-					visit(childSpan.id);
-				}
-			}
-			visit(null);
-			return result;
+			return flattenDfs();
 		},
 		checkpoints
 	};
@@ -6665,7 +6714,7 @@ async function resolveDeriveFromTracingConfig(params) {
 	return derived;
 }
 async function runDeriveFromTracingConfig(params) {
-	if (params.deriveFromTracing === void 0) return;
+	if (params.deriveFromTracing === void 0) return null;
 	const { deriveFromTracing } = params;
 	try {
 		const derived = await runInExistingEvalScope(params.scope, "derive", async () => await resolveDeriveFromTracingConfig({
@@ -6677,13 +6726,43 @@ async function runDeriveFromTracingConfig(params) {
 			outputs: params.scope.outputs,
 			derived
 		});
+		return null;
 	} catch (e) {
+		if (e instanceof EvalRuntimeUsageError) return e;
 		const message = `deriveFromTracing threw: ${e instanceof Error ? e.message : String(e)}`;
 		recordAssertionFailure(params.scope, toAssertionFailure(message, e instanceof Error ? e : void 0));
+		return null;
 	}
 }
+async function runOneTracingAssertion(params) {
+	const { label, tracingAssertion, scope, traceTree, evalCase } = params;
+	const failureCountBefore = scope.assertionFailures.length;
+	const ctx = {
+		trace: traceTree,
+		input: evalCase.input,
+		case: evalCase
+	};
+	try {
+		await runInExistingEvalScope(scope, "tracingAssertions", async () => {
+			await callUnknownFunction(tracingAssertion, [ctx]);
+		});
+	} catch (e) {
+		if (e instanceof EvalAssertionError && scope.assertionFailures.length > failureCountBefore) return;
+		recordAssertionFailure(scope, toAssertionFailure(`${label} threw: ${e instanceof Error ? e.message : String(e)}`, e instanceof Error ? e : void 0));
+	}
+}
+async function runTracingAssertionsConfig(params) {
+	if (params.tracingAssertions === void 0) return;
+	await runOneTracingAssertion({
+		label: "tracingAssertions",
+		tracingAssertion: params.tracingAssertions,
+		scope: params.scope,
+		traceTree: params.traceTree,
+		evalCase: params.evalCase
+	});
+}
 async function runCase(params) {
-	const { evalDef, evalId, evalKey = evalId, evalCase, globalTraceDisplay, globalColumns, globalDeriveFromTracing, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode, moduleIsolation, evalFilePath, evalFileRelativePath = evalFilePath, workspaceRoot, artifactDir, runId } = params;
+	const { evalDef, evalId, evalKey = evalId, evalCase, globalTraceDisplay, globalColumns, globalDeriveFromTracing, globalTracingAssertions, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode, moduleIsolation, evalFilePath, evalFileRelativePath = evalFilePath, workspaceRoot, artifactDir, runId } = params;
 	const scopedIdPrefix = buildScopedEvalIdPrefix({
 		evalId,
 		evalFilePath,
@@ -6728,22 +6807,36 @@ async function runCase(params) {
 		apiCallsConfig
 	});
 	const traceTree = buildTraceTree(spansWithDerivedAttributes, scope.checkpoints);
-	const nonAssertError = executeError && !(executeError instanceof EvalAssertionError) ? executeError : null;
+	let nonAssertError = executeError && !(executeError instanceof EvalAssertionError) ? executeError : null;
 	if (executeError instanceof EvalAssertionError && scope.assertionFailures.length === 0) recordAssertionFailure(scope, toAssertionFailure(executeError.message, executeError));
 	if (!nonAssertError) {
-		await runDeriveFromTracingConfig({
+		nonAssertError = await runDeriveFromTracingConfig({
 			deriveFromTracing: globalDeriveFromTracing,
 			scope,
 			traceTree,
 			evalCase
 		});
-		await runDeriveFromTracingConfig({
+		if (!nonAssertError) nonAssertError = await runDeriveFromTracingConfig({
 			deriveFromTracing: evalDef.deriveFromTracing,
 			scope,
 			traceTree,
 			evalCase
 		});
 	}
+	if (!nonAssertError) {
+		await runTracingAssertionsConfig({
+			tracingAssertions: globalTracingAssertions,
+			scope,
+			traceTree,
+			evalCase
+		});
+		await runTracingAssertionsConfig({
+			tracingAssertions: evalDef.tracingAssertions,
+			scope,
+			traceTree,
+			evalCase
+		});
+	}
 	if (!nonAssertError) addDefaultOutputs({
 		outputs: scope.outputs,
 		spans: spansWithDerivedAttributes,
@@ -6933,4 +7026,4 @@ function recordAssertionFailure(scope, failure) {
 	});
 }
 //#endregion
-export { startEvalBackgroundJob as $, repoFile as A, evalChartsConfigSchema as At, getCurrentScope as B, evalTracer as C, buildEvalKey as Ct, deserializeCacheValue as D, evalStatAggregateSchema as Dt, deserializeCacheRecording as E, caseRowSchema as Et, appendToEvalOutput as F, mergeEvalOutput as G, incrementEvalOutput as H, configureEvalRunLogs as I, runInEvalScope as J, nextEvalId as K, evalAssert as L, readManualInputFile as M, defineEval as Mt, evalExpect as N, getEvalRegistry as Nt, serializeCacheRecording as O, evalStatsConfigSchema as Ot, EvalAssertionError as P, runWithEvalRegistry as Pt, setScopeCacheContext as Q, evalLog as R, evalSpan as S, resolveLlmCallsConfig as St, hashCacheKeySync as T, caseDetailSchema as Tt, isInEvalScope as U, getEvalCaseInput as V, matchesEvalTags as W, runWithEvalClock as X, runInExistingEvalScope as Y, setEvalOutput as Z, createBufferedCacheStore as _, validateEvalTagName as _t, isCaseChildParentMessage as a, extractLlmCalls as at, buildTraceTree as b, runSummarySchema as bt, resolveArtifactPath as c, applyDerivedCallAttributes as ct, loadEvalModule as d, getEvalDisplayStatus as dt, createRunRequestSchema as et, resolveEvalDefaultConfig as f, deriveScopedSummaryFromCases as ft, commitPendingCacheWrites as g, matchesTagsFilter as gt, normalizeScoreDef as h, dedupeEvalTags as ht, isCaseChildMessage as i, extractApiCalls as it, manualInputFileValueSchema as j, columnDefSchema as jt, serializeCacheValue as k, manualInputDescriptorSchema as kt, registerAgentEvalsPackageResolutionHooks as l, getNestedAttribute as lt, buildDeclaredColumnDefs as m, deriveStatusFromChildStatuses as mt, resolveRunnableEvalCases as n, extractCacheEntries as nt, stripTerminalControlCodes as o, simulateLlmCallCost as ot, loadConfig as p, deriveStatusFromCaseRows as pt, runInEvalRuntimeScope as q, runCase as r, extractCacheHits as rt, resolveTracePresentation as s, simulateTokenAllocation as st, filterEvalCases as t, updateManualScoreRequestSchema as tt, 
runWithModuleIsolation as u, getEvalTitle as ut, createFsCacheStore as v, validateTagsFilterExpression as vt, hashCacheKey as w, getCaseRowCaseKey as wt, captureEvalSpanError as x, resolveApiCallsConfig as xt, z$1 as y, runManifestSchema as yt, evalTime as z };
+export { setScopeCacheContext as $, repoFile as A, evalStatsConfigSchema as At, evalTime as B, evalTracer as C, resolveLlmCallsConfig as Ct, deserializeCacheValue as D, caseDetailSchema as Dt, deserializeCacheRecording as E, getCaseRowCaseKey as Et, EvalRuntimeUsageError as F, getEvalRegistry as Ft, matchesEvalTags as G, getEvalCaseInput as H, appendToEvalOutput as I, runWithEvalRegistry as It, runInEvalRuntimeScope as J, mergeEvalOutput as K, configureEvalRunLogs as L, readManualInputFile as M, evalChartsConfigSchema as Mt, evalExpect as N, columnDefSchema as Nt, serializeCacheRecording as O, caseRowSchema as Ot, EvalAssertionError as P, defineEval as Pt, setEvalOutput as Q, evalAssert as R, evalSpan as S, resolveApiCallsConfig as St, hashCacheKeySync as T, buildEvalKey as Tt, incrementEvalOutput as U, getCurrentScope as V, isInEvalScope as W, runInExistingEvalScope as X, runInEvalScope as Y, runWithEvalClock as Z, createBufferedCacheStore as _, matchesTagsFilter as _t, isCaseChildParentMessage as a, extractApiCalls as at, buildTraceTree as b, runManifestSchema as bt, resolveArtifactPath as c, simulateTokenAllocation as ct, loadEvalModule as d, getEvalTitle as dt, startEvalBackgroundJob as et, resolveEvalDefaultConfig as f, getEvalDisplayStatus as ft, commitPendingCacheWrites as g, dedupeEvalTags as gt, normalizeScoreDef as h, deriveStatusFromChildStatuses as ht, isCaseChildMessage as i, extractCacheHits as it, manualInputFileValueSchema as j, manualInputDescriptorSchema as jt, serializeCacheValue as k, evalStatAggregateSchema as kt, registerAgentEvalsPackageResolutionHooks as l, applyDerivedCallAttributes as lt, buildDeclaredColumnDefs as m, deriveStatusFromCaseRows as mt, resolveRunnableEvalCases as n, updateManualScoreRequestSchema as nt, stripTerminalControlCodes as o, extractLlmCalls as ot, loadConfig as p, deriveScopedSummaryFromCases as pt, nextEvalId as q, runCase as r, extractCacheEntries as rt, resolveTracePresentation as s, simulateLlmCallCost as st, 
filterEvalCases as t, createRunRequestSchema as tt, runWithModuleIsolation as u, getNestedAttribute as ut, createFsCacheStore as v, validateEvalTagName as vt, hashCacheKey as w, buildCaseKey as wt, captureEvalSpanError as x, runSummarySchema as xt, z$1 as y, validateTagsFilterExpression as yt, evalLog as z };

package/dist/{runOrchestration-CvmFeOmT.mjs → runOrchestration-CxjiQmof.mjs} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { Et as caseRowSchema, Pt as runWithEvalRegistry, Tt as caseDetailSchema, X as runWithEvalClock, _t as validateEvalTagName, bt as runSummarySchema, d as loadEvalModule, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, g as commitPendingCacheWrites, gt as matchesTagsFilter, ht as dedupeEvalTags, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromChildStatuses, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveStatusFromCaseRows, q as runInEvalRuntimeScope, t as filterEvalCases, u as runWithModuleIsolation, vt as validateTagsFilterExpression, wt as getCaseRowCaseKey, yt as runManifestSchema } from "./runExecution-d42Lm0i5.mjs";
+import { Dt as caseDetailSchema, Et as getCaseRowCaseKey, It as runWithEvalRegistry, J as runInEvalRuntimeScope, Ot as caseRowSchema, Z as runWithEvalClock, _t as matchesTagsFilter, bt as runManifestSchema, d as loadEvalModule, f as resolveEvalDefaultConfig, g as commitPendingCacheWrites, gt as dedupeEvalTags, ht as deriveStatusFromChildStatuses, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromCaseRows, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveScopedSummaryFromCases, t as filterEvalCases, u as runWithModuleIsolation, vt as validateEvalTagName, wt as buildCaseKey, xt as runSummarySchema, yt as validateTagsFilterExpression } from "./runExecution-CFw0MQFs.mjs";
 import { readFile, readdir, rm, writeFile } from "node:fs/promises";
 import { dirname, join } from "node:path";
 import { existsSync } from "node:fs";
@@ -1030,6 +1030,7 @@ async function executeQueuedCases(params) {
 async function executeQueuedCase(params) {
 	const { queuedCase, globalTraceDisplay } = params;
 	const startTime = Date.now();
+	await queuedCase.onStart?.();
 	const result = await queuedCase.execute({
 		globalTraceDisplay,
 		startTime
@@ -1249,6 +1250,32 @@ function buildRunErrorMessage(errors) {
 		return `[${entry.evalId}] ${messageLine}\n${details}`;
 	}).join("\n");
 }
+function upsertCaseRow(caseRows, nextCaseRow) {
+	const existingIndex = caseRows.findIndex((caseRow) => getCaseRowCaseKey(caseRow) === getCaseRowCaseKey(nextCaseRow) && caseRow.trial === nextCaseRow.trial);
+	if (existingIndex === -1) {
+		caseRows.push(nextCaseRow);
+		return;
+	}
+	caseRows[existingIndex] = nextCaseRow;
+}
+function removeLiveCaseRows(caseRows, nextCaseRow) {
+	const caseKey = getCaseRowCaseKey(nextCaseRow);
+	for (let i = caseRows.length - 1; i >= 0; i--) {
+		const caseRow = caseRows[i];
+		if (caseRow === void 0) continue;
+		if (getCaseRowCaseKey(caseRow) !== caseKey) continue;
+		if (caseRow.status !== "pending" && caseRow.status !== "running") continue;
+		caseRows.splice(i, 1);
+	}
+}
+function emitCaseRowEvent(params) {
+	params.emitEvent(params.runState, {
+		type: params.type,
+		runId: params.runState.manifest.id,
+		timestamp: (/* @__PURE__ */ new Date()).toISOString(),
+		payload: params.caseRow
+	});
+}
 async function finalizePreparedCase(params) {
 	const { runState, runDir, preparedEval, preparedCase, onCaseFinished, emitEvent } = params;
 	if (preparedCase.finalized || preparedCase.trialResults.length === 0) return;
@@ -1263,7 +1290,8 @@ async function finalizePreparedCase(params) {
 		pendingWrites: winningTrial.pendingCacheWrites
 	});
 	const artifactFileId = getCaseArtifactFileId(runState, winningTrial.caseRow);
-	runState.cases.push(winningTrial.caseRow);
+	removeLiveCaseRows(runState.cases, winningTrial.caseRow);
+	upsertCaseRow(runState.cases, winningTrial.caseRow);
 	runState.caseDetails.set(getCaseRowCaseKey(winningTrial.caseRow), winningTrial.caseDetail);
 	if (winningTrial.caseRow.status === "pass") runState.summary.passedCases++;
 	else if (winningTrial.caseRow.status === "error") runState.summary.errorCases++;
@@ -1271,11 +1299,11 @@ async function finalizePreparedCase(params) {
 	await writeFile(join(runDir, "traces", `${encodeURIComponent(artifactFileId)}.json`), JSON.stringify(winningTrial.caseDetail.trace, null, 2));
 	await persistCaseDetail(runDir, winningTrial.caseDetail, artifactFileId);
 	onCaseFinished?.(winningTrial.caseDetail, winningTrial.caseRow);
-	emitEvent(runState, {
+	emitCaseRowEvent({
+		runState,
+		emitEvent,
 		type: "case.finished",
-		runId: runState.manifest.id,
-		timestamp: (/* @__PURE__ */ new Date()).toISOString(),
-		payload: winningTrial.caseRow
+		caseRow: winningTrial.caseRow
 	});
 	preparedEval.evalCaseRows.push(winningTrial.caseRow);
 }
@@ -1437,13 +1465,52 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
 							preparedEvals.push(preparedEval);
 							for (const evalCase of cases) {
 								const trialResults = [];
+								const liveCaseRow = {
+									caseId: evalCase.id,
+									evalId: evalMeta.id,
+									evalKey: evalMeta.key,
+									caseKey: buildCaseKey({
+										filePath: evalMeta.filePath,
+										evalId: evalMeta.id,
+										caseId: evalCase.id
+									}),
+									tags: evalCase.tags,
+									status: "pending",
+									durationMs: null,
+									cacheHits: 0,
+									cacheOperations: 0,
+									columns: {},
+									trial: 0
+								};
 								const preparedCase = {
 									caseId: evalCase.id,
+									liveCaseRow,
 									trialResults,
 									finalized: false
 								};
 								preparedCases.push(preparedCase);
+								upsertCaseRow(runState.cases, liveCaseRow);
+								emitCaseRowEvent({
+									runState,
+									emitEvent,
+									type: "case.updated",
+									caseRow: liveCaseRow
+								});
 								for (let trial = 0; trial < request.trials; trial++) queuedCases.push({
+									onStart: () => {
+										if (preparedCase.finalized) return;
+										preparedCase.liveCaseRow = {
+											...preparedCase.liveCaseRow,
+											status: "running"
+										};
+										upsertCaseRow(runState.cases, preparedCase.liveCaseRow);
+										emitCaseRowEvent({
+											runState,
+											emitEvent,
+											type: "case.started",
+											caseRow: preparedCase.liveCaseRow
+										});
+									},
 									execute: async ({ startTime, globalTraceDisplay }) => await executeCaseChild({
 										evalId: evalMeta.id,
 										evalKey: evalMeta.key,

package/dist/{runner-BKogjiYd.mjs → runner-BlFQyvN2.mjs} RENAMED Viewed

@@ -1,2 +1,2 @@
-import { n as initRunner, t as getRunnerInstance } from "./runner-MSr8sAWm.mjs";
+import { n as initRunner, t as getRunnerInstance } from "./runner-CY3bgsjU.mjs";
 export { getRunnerInstance, initRunner };

package/dist/{runner-MSr8sAWm.mjs → runner-CY3bgsjU.mjs} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { n as createRunner } from "./cli-_g2qOMK6.mjs";
-import "./src-CdZsOn6y.mjs";
+import { n as createRunner } from "./cli-SP4kEtYL.mjs";
+import "./src-7GbQj1sb.mjs";
 //#region ../../apps/server/src/runner.ts
 let runnerInstance = null;
 function getRunnerInstance() {

package/dist/{src-CdZsOn6y.mjs → src-7GbQj1sb.mjs} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { Mt as defineEval$1, W as matchesEvalTags$1 } from "./runExecution-d42Lm0i5.mjs";
-import "./cli-_g2qOMK6.mjs";
+import { G as matchesEvalTags$1, Pt as defineEval$1 } from "./runExecution-CFw0MQFs.mjs";
+import "./cli-SP4kEtYL.mjs";
 //#region src/index.ts
 /** Register an eval definition with typed tag support. */
 function defineEval(definition) {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@ls-stack/agent-eval",
-  "version": "0.58.1",
+  "version": "0.58.3",
   "type": "module",
   "bin": {
     "agent-evals": "./dist/bin.mjs"
@@ -33,8 +33,8 @@
     "@types/node": "^24.7.2",
     "typescript": "^5.9.2",
     "@agent-evals/runner": "0.0.1",
-    "@agent-evals/shared": "0.0.1",
-    "@agent-evals/sdk": "0.0.1"
+    "@agent-evals/sdk": "0.0.1",
+    "@agent-evals/shared": "0.0.1"
   },
   "scripts": {
     "build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",

package/skills/agent-eval/SKILL.md CHANGED Viewed

@@ -35,6 +35,9 @@ display rules), read the TypeScript declarations shipped with the package:
 - `agent-evals app` watches `agent-evals.config.ts` and reloads config in
   place when the runner is idle. If config changes during an active run, the
   reload applies after the current run reaches a terminal state.
+- App-triggered runs log the queued target evals, resolved case concurrency,
+  each case start for evals that are actually running, and the terminal run
+  summary in the server terminal.
 Assume that enumerated tables in this document may lag behind the types —
 treat the types as source of truth when they disagree.
@@ -360,7 +363,19 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
   The older object-returning function form remains supported. Global
   derivations run first; runtime outputs are never overwritten, and eval-level
   derivations only fill keys still missing after global derivations. In keyed
-  form, return `undefined` to omit one output for that case.
+  form, return `undefined` to omit one output for that case. Do not call
+  `evalAssert(...)` or `evalExpect(...)` from `deriveFromTracing`; use
+  `tracingAssertions` for trace-derived pass/fail checks.
+- `tracingAssertions` is a single function that can be authored globally or
+  locally on one eval when a finished-trace invariant should pass or fail the
+  case without creating a fake score column. It receives the same
+  `{ trace, input, case }` context as `deriveFromTracing`; call
+  `evalAssert(...)` or `evalExpect(...)` inside it.
+  Useful trace helpers include `trace.findSpan(name)`, `trace.findSpans(name)`,
+  `trace.hasSpan(name)`, `trace.findSpansByKind(kind)`,
+  `trace.findToolCallSpans()`, `trace.listToolCallSpanNames()`,
+  `trace.hasToolCallSpan(name)`, `trace.listSpanNames(kind?)`,
+  `trace.listSpanNamesDfs(kind?)`, and `trace.flattenDfs()`.
 - `traceDisplay` promotes selected span attributes into the trace tree and
   detail pane; it supports aggregation across subtrees (`scope`, `mode`) and
   user-defined `transform(...)` for derived views (e.g. currency conversion).
@@ -629,8 +644,9 @@ When adding or changing evals:
 3. `evalAssert` for hard invariants and truthy type narrowing. It records
    pass/fail entries in case-detail `assertions`; failed entries are also kept
    in `assertionFailures` and fail the case. Use `evalExpect` for non-trivial
-   comparisons, `scores` for graded signals, and `passThreshold` only on
-   scores that should gate pass/fail.
+   comparisons, `tracingAssertions` for invariants derived from the finished
+   trace, `scores` for graded signals, and `passThreshold` only on scores that
+   should gate pass/fail.
 4. Surface reviewable values through execute-context `setOutput` or ambient
    `setEvalOutput` in shared workflow code, and shape them with `columns`
    formats from the `ColumnFormat` type.