npm - @ls-stack/agent-eval - Versions diffs - 0.58.1 → 0.58.2 - Mend

@ls-stack/agent-eval 0.58.1 → 0.58.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

package/dist/{app-DhMIbjlE.mjs → app-BxD6aHbp.mjs} +52 -7
package/dist/apps/web/dist/assets/index-BMWBZw_u.js +377 -0
package/dist/apps/web/dist/assets/index-CHH7m5Cv.css +1 -0
package/dist/apps/web/dist/index.html +2 -2
package/dist/bin.mjs +1 -1
package/dist/caseChild.mjs +2 -1
package/dist/{cli-_g2qOMK6.mjs → cli-HBwXIJsg.mjs} +31 -5
package/dist/index.d.mts +76 -17
package/dist/index.mjs +4 -4
package/dist/runChild.mjs +2 -2
package/dist/{runExecution-d42Lm0i5.mjs → runExecution-pHJ0_TzH.mjs} +125 -21
package/dist/{runOrchestration-CvmFeOmT.mjs → runOrchestration-ngVXShH4.mjs} +73 -6
package/dist/{runner-BKogjiYd.mjs → runner-BnZMGBla.mjs} +1 -1
package/dist/{runner-MSr8sAWm.mjs → runner-D_pz2NON.mjs} +2 -2
package/dist/{src-CdZsOn6y.mjs → src-AeXGBJ26.mjs} +2 -2
package/package.json +3 -3
package/skills/agent-eval/SKILL.md +18 -3
package/dist/apps/web/dist/assets/index-Cz9p4l-t.js +0 -377
package/dist/apps/web/dist/assets/index-DtARRwsS.css +0 -1

package/dist/{runOrchestration-CvmFeOmT.mjs → runOrchestration-ngVXShH4.mjs} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { Et as caseRowSchema, Pt as runWithEvalRegistry, Tt as caseDetailSchema, X as runWithEvalClock, _t as validateEvalTagName, bt as runSummarySchema, d as loadEvalModule, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, g as commitPendingCacheWrites, gt as matchesTagsFilter, ht as dedupeEvalTags, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromChildStatuses, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveStatusFromCaseRows, q as runInEvalRuntimeScope, t as filterEvalCases, u as runWithModuleIsolation, vt as validateTagsFilterExpression, wt as getCaseRowCaseKey, yt as runManifestSchema } from "./runExecution-d42Lm0i5.mjs";
+import { Dt as caseDetailSchema, Et as getCaseRowCaseKey, It as runWithEvalRegistry, J as runInEvalRuntimeScope, Ot as caseRowSchema, Z as runWithEvalClock, _t as matchesTagsFilter, bt as runManifestSchema, d as loadEvalModule, f as resolveEvalDefaultConfig, g as commitPendingCacheWrites, gt as dedupeEvalTags, ht as deriveStatusFromChildStatuses, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromCaseRows, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveScopedSummaryFromCases, t as filterEvalCases, u as runWithModuleIsolation, vt as validateEvalTagName, wt as buildCaseKey, xt as runSummarySchema, yt as validateTagsFilterExpression } from "./runExecution-pHJ0_TzH.mjs";
 import { readFile, readdir, rm, writeFile } from "node:fs/promises";
 import { dirname, join } from "node:path";
 import { existsSync } from "node:fs";
@@ -1030,6 +1030,7 @@ async function executeQueuedCases(params) {
 async function executeQueuedCase(params) {
 	const { queuedCase, globalTraceDisplay } = params;
 	const startTime = Date.now();
+	await queuedCase.onStart?.();
 	const result = await queuedCase.execute({
 		globalTraceDisplay,
 		startTime
@@ -1249,6 +1250,32 @@ function buildRunErrorMessage(errors) {
 		return `[${entry.evalId}] ${messageLine}\n${details}`;
 	}).join("\n");
 }
+function upsertCaseRow(caseRows, nextCaseRow) { // Replace the row with the same case key and trial, or append nextCaseRow when none matches; mutates caseRows in place and returns undefined.
+	const existingIndex = caseRows.findIndex((caseRow) => getCaseRowCaseKey(caseRow) === getCaseRowCaseKey(nextCaseRow) && caseRow.trial === nextCaseRow.trial); // identity is (case key, trial); only the first match is considered
+	if (existingIndex === -1) {
+		caseRows.push(nextCaseRow); // no matching row yet: append at the end
+		return;
+	}
+	caseRows[existingIndex] = nextCaseRow; // overwrite keeps the row at its existing position in the array
+}
+function removeLiveCaseRows(caseRows, nextCaseRow) { // Remove every row that shares nextCaseRow's case key and is still "pending" or "running" (trial is not compared); mutates caseRows in place.
+	const caseKey = getCaseRowCaseKey(nextCaseRow);
+	for (let i = caseRows.length - 1; i >= 0; i--) { // walk backwards so splice() never shifts an index that has not been visited yet
+		const caseRow = caseRows[i];
+		if (caseRow === void 0) continue; // skip undefined entries
+		if (getCaseRowCaseKey(caseRow) !== caseKey) continue; // rows belonging to other cases are untouched
+		if (caseRow.status !== "pending" && caseRow.status !== "running") continue; // rows in any other status are kept
+		caseRows.splice(i, 1);
+	}
+}
+function emitCaseRowEvent(params) { // Call params.emitEvent(params.runState, event) with an event of params.type whose payload is params.caseRow.
+	params.emitEvent(params.runState, {
+		type: params.type,
+		runId: params.runState.manifest.id, // run id is read from the run state's manifest
+		timestamp: (/* @__PURE__ */ new Date()).toISOString(), // stamped at emit time as an ISO 8601 string
+		payload: params.caseRow
+	});
+}
 async function finalizePreparedCase(params) {
 	const { runState, runDir, preparedEval, preparedCase, onCaseFinished, emitEvent } = params;
 	if (preparedCase.finalized || preparedCase.trialResults.length === 0) return;
@@ -1263,7 +1290,8 @@ async function finalizePreparedCase(params) {
 		pendingWrites: winningTrial.pendingCacheWrites
 	});
 	const artifactFileId = getCaseArtifactFileId(runState, winningTrial.caseRow);
-	runState.cases.push(winningTrial.caseRow);
+	removeLiveCaseRows(runState.cases, winningTrial.caseRow);
+	upsertCaseRow(runState.cases, winningTrial.caseRow);
 	runState.caseDetails.set(getCaseRowCaseKey(winningTrial.caseRow), winningTrial.caseDetail);
 	if (winningTrial.caseRow.status === "pass") runState.summary.passedCases++;
 	else if (winningTrial.caseRow.status === "error") runState.summary.errorCases++;
@@ -1271,11 +1299,11 @@ async function finalizePreparedCase(params) {
 	await writeFile(join(runDir, "traces", `${encodeURIComponent(artifactFileId)}.json`), JSON.stringify(winningTrial.caseDetail.trace, null, 2));
 	await persistCaseDetail(runDir, winningTrial.caseDetail, artifactFileId);
 	onCaseFinished?.(winningTrial.caseDetail, winningTrial.caseRow);
-	emitEvent(runState, {
+	emitCaseRowEvent({
+		runState,
+		emitEvent,
 		type: "case.finished",
-		runId: runState.manifest.id,
-		timestamp: (/* @__PURE__ */ new Date()).toISOString(),
-		payload: winningTrial.caseRow
+		caseRow: winningTrial.caseRow
 	});
 	preparedEval.evalCaseRows.push(winningTrial.caseRow);
 }
@@ -1437,13 +1465,52 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
 							preparedEvals.push(preparedEval);
 							for (const evalCase of cases) {
 								const trialResults = [];
+								const liveCaseRow = {
+									caseId: evalCase.id,
+									evalId: evalMeta.id,
+									evalKey: evalMeta.key,
+									caseKey: buildCaseKey({
+										filePath: evalMeta.filePath,
+										evalId: evalMeta.id,
+										caseId: evalCase.id
+									}),
+									tags: evalCase.tags,
+									status: "pending",
+									durationMs: null,
+									cacheHits: 0,
+									cacheOperations: 0,
+									columns: {},
+									trial: 0
+								};
 								const preparedCase = {
 									caseId: evalCase.id,
+									liveCaseRow,
 									trialResults,
 									finalized: false
 								};
 								preparedCases.push(preparedCase);
+								upsertCaseRow(runState.cases, liveCaseRow);
+								emitCaseRowEvent({
+									runState,
+									emitEvent,
+									type: "case.updated",
+									caseRow: liveCaseRow
+								});
 								for (let trial = 0; trial < request.trials; trial++) queuedCases.push({
+									onStart: () => {
+										if (preparedCase.finalized) return;
+										preparedCase.liveCaseRow = {
+											...preparedCase.liveCaseRow,
+											status: "running"
+										};
+										upsertCaseRow(runState.cases, preparedCase.liveCaseRow);
+										emitCaseRowEvent({
+											runState,
+											emitEvent,
+											type: "case.started",
+											caseRow: preparedCase.liveCaseRow
+										});
+									},
 									execute: async ({ startTime, globalTraceDisplay }) => await executeCaseChild({
 										evalId: evalMeta.id,
 										evalKey: evalMeta.key,

package/dist/{runner-BKogjiYd.mjs → runner-BnZMGBla.mjs} RENAMED Viewed

@@ -1,2 +1,2 @@
-import { n as initRunner, t as getRunnerInstance } from "./runner-MSr8sAWm.mjs";
+import { n as initRunner, t as getRunnerInstance } from "./runner-D_pz2NON.mjs";
 export { getRunnerInstance, initRunner };

package/dist/{runner-MSr8sAWm.mjs → runner-D_pz2NON.mjs} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { n as createRunner } from "./cli-_g2qOMK6.mjs";
-import "./src-CdZsOn6y.mjs";
+import { n as createRunner } from "./cli-HBwXIJsg.mjs";
+import "./src-AeXGBJ26.mjs";
 //#region ../../apps/server/src/runner.ts
 let runnerInstance = null;
 function getRunnerInstance() {

package/dist/{src-CdZsOn6y.mjs → src-AeXGBJ26.mjs} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { Mt as defineEval$1, W as matchesEvalTags$1 } from "./runExecution-d42Lm0i5.mjs";
-import "./cli-_g2qOMK6.mjs";
+import { G as matchesEvalTags$1, Pt as defineEval$1 } from "./runExecution-pHJ0_TzH.mjs";
+import "./cli-HBwXIJsg.mjs";
 //#region src/index.ts
 /** Register an eval definition with typed tag support. */
 function defineEval(definition) {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@ls-stack/agent-eval",
-  "version": "0.58.1",
+  "version": "0.58.2",
   "type": "module",
   "bin": {
     "agent-evals": "./dist/bin.mjs"
@@ -33,8 +33,8 @@
     "@types/node": "^24.7.2",
     "typescript": "^5.9.2",
     "@agent-evals/runner": "0.0.1",
-    "@agent-evals/shared": "0.0.1",
-    "@agent-evals/sdk": "0.0.1"
+    "@agent-evals/sdk": "0.0.1",
+    "@agent-evals/shared": "0.0.1"
   },
   "scripts": {
     "build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",

package/skills/agent-eval/SKILL.md CHANGED Viewed

@@ -35,6 +35,9 @@ display rules), read the TypeScript declarations shipped with the package:
 - `agent-evals app` watches `agent-evals.config.ts` and reloads config in
   place when the runner is idle. If config changes during an active run, the
   reload applies after the current run reaches a terminal state.
+- App-triggered runs log the queued target evals, resolved case concurrency,
+  each case start for evals that are actually running, and the terminal run
+  summary in the server terminal.
 Assume that enumerated tables in this document may lag behind the types —
 treat the types as source of truth when they disagree.
@@ -360,7 +363,18 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
   The older object-returning function form remains supported. Global
   derivations run first; runtime outputs are never overwritten, and eval-level
   derivations only fill keys still missing after global derivations. In keyed
-  form, return `undefined` to omit one output for that case.
+  form, return `undefined` to omit one output for that case. Do not call
+  `evalAssert(...)` or `evalExpect(...)` from `deriveFromTracing`; use
+  `tracingAssertions` for trace-derived pass/fail checks.
+- `tracingAssertions` can be authored globally or locally on one eval when a
+  finished-trace invariant should pass or fail the case without creating a fake
+  score column. It receives the same `{ trace, input, case }` context as
+  `deriveFromTracing`; call `evalAssert(...)` or `evalExpect(...)` inside it.
+  Useful trace helpers include `trace.findSpan(name)`, `trace.findSpans(name)`,
+  `trace.hasSpan(name)`, `trace.findSpansByKind(kind)`,
+  `trace.findToolCallSpans()`, `trace.listToolCallSpanNames()`,
+  `trace.hasToolCallSpan(name)`, `trace.listSpanNames(kind?)`,
+  `trace.listSpanNamesDfs(kind?)`, and `trace.flattenDfs()`.
 - `traceDisplay` promotes selected span attributes into the trace tree and
   detail pane; it supports aggregation across subtrees (`scope`, `mode`) and
   user-defined `transform(...)` for derived views (e.g. currency conversion).
@@ -629,8 +643,9 @@ When adding or changing evals:
 3. `evalAssert` for hard invariants and truthy type narrowing. It records
    pass/fail entries in case-detail `assertions`; failed entries are also kept
    in `assertionFailures` and fail the case. Use `evalExpect` for non-trivial
-   comparisons, `scores` for graded signals, and `passThreshold` only on
-   scores that should gate pass/fail.
+   comparisons, `tracingAssertions` for invariants derived from the finished
+   trace, `scores` for graded signals, and `passThreshold` only on scores that
+   should gate pass/fail.
 4. Surface reviewable values through execute-context `setOutput` or ambient
    `setEvalOutput` in shared workflow code, and shape them with `columns`
    formats from the `ColumnFormat` type.