npm - @ls-stack/agent-eval - Versions diffs - 0.14.0 → 0.16.0 - Mend

@ls-stack/agent-eval 0.14.0 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

package/dist/{app-DKWm1oxc.mjs → app-B8e-oWYc.mjs} +3 -3
package/dist/apps/web/dist/assets/index-9hO8NpgZ.js +117 -0
package/dist/apps/web/dist/assets/{index-BVnLr79e.css → index-MARPw1bH.css} +1 -1
package/dist/apps/web/dist/index.html +2 -2
package/dist/bin.mjs +1 -1
package/dist/{cli-CMiCEQ-3.mjs → cli-BmrtjQj_.mjs} +155 -99
package/dist/index.d.mts +40 -6
package/dist/index.mjs +4 -4
package/dist/runChild.mjs +41 -3
package/dist/{runOrchestration-H0pSUl3I.mjs → runOrchestration-BDyNrRQT.mjs} +271 -124
package/dist/{runner-DLnj18MO.mjs → runner-CsZqhbiA.mjs} +2 -2
package/dist/{runner-Dx1sMCbh.mjs → runner-DABFPXkx.mjs} +1 -1
package/dist/src-CEAJYN_X.mjs +3 -0
package/package.json +5 -4
package/skills/agent-eval/SKILL.md +408 -0
package/dist/apps/web/dist/assets/index-Cx2CvM6a.js +0 -117
package/dist/src-BgGL7DDp.mjs +0 -3

package/dist/{cli-CMiCEQ-3.mjs → cli-BmrtjQj_.mjs} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { A as getEvalDisplayStatus, F as runSummarySchema, J as resolveLlmCallsConfig, _ as loadEvalModule, a as loadPersistedRunSnapshot, b as normalizeScoreDef, c as persistCaseDetail, d as recomputePersistedCaseStatus, f as runTouchesEval, fn as getEvalRegistry, g as setLatestRunInfoMap, h as getTargetEvalIds, i as getLatestRunInfos, j as deriveScopedSummaryFromCases, k as getEvalTitle, l as persistRunState, m as buildEvalSummary, n as generateRunId, o as loadPersistedRunSnapshots, p as resolveArtifactPath, q as resolveApiCallsConfig, r as getLastRunStatuses, s as nextShortIdFromSnapshots, u as recomputeEvalStatusesInRuns, v as loadConfig, x as createFsCacheStore, y as buildDeclaredColumnDefs } from "./runOrchestration-H0pSUl3I.mjs";
+import { A as getEvalTitle, I as runSummarySchema, J as resolveApiCallsConfig, M as deriveScopedSummaryFromCases, S as createFsCacheStore, Y as resolveLlmCallsConfig, _ as loadEvalModule, a as loadPersistedRunSnapshot, b as buildDeclaredColumnDefs, c as persistCaseDetail, d as recomputePersistedCaseStatus, f as runTouchesEval, g as setLatestRunInfoMap, h as getTargetEvalIds, hn as getEvalRegistry, i as getLatestRunInfos, j as getEvalDisplayStatus, l as persistRunState, m as buildEvalSummary, n as generateRunId, o as loadPersistedRunSnapshots, p as resolveArtifactPath, r as getLastRunStatuses, s as nextShortIdFromSnapshots, u as recomputeEvalStatusesInRuns, v as parseEvalMetas, x as normalizeScoreDef, y as loadConfig } from "./runOrchestration-BDyNrRQT.mjs";
 import { createHash } from "node:crypto";
 import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
 import { dirname, join, relative, resolve } from "node:path";
@@ -82,98 +82,6 @@ function validateCharts(params) {
 	};
 }
 //#endregion
-//#region ../runner/src/discovery.ts
-const evalIdMatchRegex = /\bid\s*:\s*['"]([^'"]+)['"]/;
-const evalTitleMatchRegex = /\btitle\s*:\s*['"]([^'"]+)['"]/;
-function parseEvalMetas(filePath, content) {
-	const metas = [];
-	let searchIndex = 0;
-	while (searchIndex < content.length) {
-		const defineEvalIndex = content.indexOf("defineEval", searchIndex);
-		if (defineEvalIndex === -1) break;
-		const extracted = extractDefineEvalObject(content, defineEvalIndex);
-		if (!extracted) {
-			searchIndex = defineEvalIndex + 10;
-			continue;
-		}
-		const id = evalIdMatchRegex.exec(extracted.objectText)?.[1];
-		if (id !== void 0) {
-			const result = {
-				filePath,
-				id
-			};
-			const title = evalTitleMatchRegex.exec(extracted.objectText)?.[1];
-			if (title !== void 0) result.title = title;
-			metas.push(result);
-		}
-		searchIndex = extracted.nextIndex;
-	}
-	return metas;
-}
-function extractDefineEvalObject(content, defineEvalIndex) {
-	const openParenIndex = content.indexOf("(", defineEvalIndex);
-	if (openParenIndex === -1) return void 0;
-	const objectStartIndex = content.indexOf("{", openParenIndex);
-	if (objectStartIndex === -1) return void 0;
-	let depth = 0;
-	let quote;
-	let inBlockComment = false;
-	let inLineComment = false;
-	let isEscaped = false;
-	for (let index = objectStartIndex; index < content.length; index++) {
-		const currentChar = content[index];
-		const nextChar = content[index + 1];
-		if (inLineComment) {
-			if (currentChar === "\n") inLineComment = false;
-			continue;
-		}
-		if (inBlockComment) {
-			if (currentChar === "*" && nextChar === "/") {
-				inBlockComment = false;
-				index++;
-			}
-			continue;
-		}
-		if (quote) {
-			if (isEscaped) {
-				isEscaped = false;
-				continue;
-			}
-			if (currentChar === "\\") {
-				isEscaped = true;
-				continue;
-			}
-			if (currentChar === quote) quote = void 0;
-			continue;
-		}
-		if (currentChar === "/" && nextChar === "/") {
-			inLineComment = true;
-			index++;
-			continue;
-		}
-		if (currentChar === "/" && nextChar === "*") {
-			inBlockComment = true;
-			index++;
-			continue;
-		}
-		if (currentChar === "\"" || currentChar === "'" || currentChar === "`") {
-			quote = currentChar;
-			continue;
-		}
-		if (currentChar === "{") {
-			depth++;
-			continue;
-		}
-		if (currentChar === "}") {
-			depth--;
-			if (depth === 0) return {
-				nextIndex: index + 1,
-				objectText: content.slice(objectStartIndex, index + 1)
-			};
-		}
-	}
-}
-//#endregion
 //#region ../runner/src/gitState.ts
 function runGitCommand(workspaceRoot, args) {
 	const result = spawnSync("git", args, {
@@ -316,7 +224,10 @@ function upsertFinishedCase(runState, caseDetail, caseRow) {
 function applyChildEvalMetas(evals, childMetas) {
 	for (const childMeta of childMetas) {
 		const evalMeta = evals.get(childMeta.id);
-		if (evalMeta === void 0) continue;
+		if (evalMeta === void 0) {
+			evals.set(childMeta.id, childMeta);
+			continue;
+		}
 		evalMeta.columnDefs = childMeta.columnDefs;
 		evalMeta.caseCount = childMeta.caseCount;
 		evalMeta.stats = childMeta.stats;
@@ -728,8 +639,7 @@ function createRunner({ watchForChanges = true } = {}) {
 				workspaceRoot,
 				runDir,
 				manifest,
-				summary,
-				evals: getSortedEvalMetas()
+				summary
 			};
 			await writeFile(join(runDir, "run-child-context.json"), JSON.stringify(childContext, null, 2));
 			startRunChild({
@@ -820,6 +730,9 @@ function createRunner({ watchForChanges = true } = {}) {
 		getWorkspaceRoot() {
 			return workspaceRoot;
 		},
+		getAllowCliRunAll() {
+			return config.allowCliRunAll === true;
+		},
 		getLlmCallsConfig() {
 			return llmCallsConfig;
 		},
@@ -900,6 +813,7 @@ function parseArgs(argv) {
 	const args = {
 		command: "help",
 		subcommand: void 0,
+		positionals: [],
 		showHelp: false,
 		helpTopic: "global",
 		unknownHelpTarget: void 0,
@@ -933,6 +847,7 @@ function parseArgs(argv) {
 	}
 	for (let i = cursor; i < normalizedArgv.length; i++) {
 		const arg = normalizedArgv[i];
+		if (arg === void 0) continue;
 		const next = normalizedArgv[i + 1];
 		if (arg === "--help" || arg === "-h") args.showHelp = true;
 		else if (arg === "--eval" && next) {
@@ -955,6 +870,7 @@ function parseArgs(argv) {
 		else if (arg === "--refresh-cache") args.cacheMode = "refresh";
 		else if (arg === "--clear-cache") args.clearCache = true;
 		else if (arg === "--all") args.all = true;
+		else if (!arg.startsWith("-")) args.positionals.push(arg);
 	}
 	return args;
 }
@@ -988,6 +904,9 @@ async function runCli(argv) {
 		case "run":
 			await commandRun(args);
 			break;
+		case "show-runs":
+			await commandShowRuns(args);
+			break;
 		case "cache":
 			await commandCache(args);
 			break;
@@ -997,7 +916,7 @@ async function runCli(argv) {
 	}
 }
 function isCliCommand(command) {
-	return command === "app" || command === "list" || command === "run" || command === "cache" || command === "help";
+	return command === "app" || command === "list" || command === "run" || command === "show-runs" || command === "cache" || command === "help";
 }
 function loadWorkspaceEnv() {
 	const envPath = resolve(process.cwd(), ".env");
@@ -1059,8 +978,8 @@ async function commandApp(args) {
 	const { serve } = await import("@hono/node-server");
 	const bundledWebDist = resolve(currentDir, "apps/web/dist");
 	if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
-	const appModule = await import("./app-DKWm1oxc.mjs");
-	const runnerModule = await import("./runner-Dx1sMCbh.mjs");
+	const appModule = await import("./app-B8e-oWYc.mjs");
+	const runnerModule = await import("./runner-DABFPXkx.mjs");
 	if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
 	if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
 	await runnerModule.initRunner();
@@ -1098,6 +1017,11 @@ async function commandList(args_) {
 async function commandRun(args) {
 	const runner = createRunner({ watchForChanges: false });
 	await runner.init();
+	if (args.evalIds.length === 0 && args.caseIds.length === 0 && !runner.getAllowCliRunAll()) {
+		console.error("This workspace disables running all evals from the CLI. Pass --eval <id> or --case <id> to run a targeted subset.");
+		process.exit(1);
+		return;
+	}
 	if (args.clearCache) {
 		await runner.clearCache();
 		if (!args.json) {
@@ -1144,6 +1068,32 @@ async function commandRun(args) {
 	}
 	if (summary.failedCases > 0 || summary.errorCases > 0) process.exit(1);
 }
+async function commandShowRuns(args) {
+	const runner = createRunner({ watchForChanges: false });
+	await runner.init();
+	const runRef = args.positionals[0];
+	if (runRef !== void 0) {
+		const run = resolveRunSnapshot(runner, runRef);
+		if (!run) {
+			printMissingRun(runRef);
+			process.exit(1);
+			return;
+		}
+		const index = buildRunFileIndex(runner.getWorkspaceRoot(), run);
+		if (args.json) {
+			printJson(index);
+			return;
+		}
+		printRunFileIndex(index);
+		return;
+	}
+	const indexes = getSortedRunSnapshots(runner).map((run) => buildRunFileIndex(runner.getWorkspaceRoot(), run));
+	if (args.json) {
+		printJson(indexes);
+		return;
+	}
+	printRunFileIndexes(indexes);
+}
 async function commandCache(args) {
 	const runner = createRunner({ watchForChanges: false });
 	await runner.init();
@@ -1194,6 +1144,93 @@ async function commandCache(args) {
 	}
 	printHelp(args.helpTopic);
 }
+function getSortedRunSnapshots(runner) {
+	return runner.getRuns().toSorted((a, b) => getRunStartTime(a) - getRunStartTime(b)).map((manifest) => runner.getRun(manifest.id)).filter((run) => run !== void 0);
+}
+function buildRunFileIndex(workspaceRoot, run) {
+	const runDir = join(workspaceRoot, ".agent-evals", "runs", run.manifest.id);
+	return {
+		id: run.manifest.id,
+		shortId: run.manifest.shortId,
+		status: run.manifest.status,
+		startedAt: run.manifest.startedAt,
+		endedAt: run.manifest.endedAt,
+		target: run.manifest.target,
+		summary: run.summary,
+		files: {
+			dir: runDir,
+			run: join(runDir, "run.json"),
+			summary: join(runDir, "summary.json"),
+			cases: join(runDir, "cases.jsonl"),
+			caseDetailsDir: join(runDir, "case-details"),
+			tracesDir: join(runDir, "traces")
+		},
+		cases: run.cases.map((caseRow) => {
+			const fileName = `${encodeURIComponent(caseRow.caseId)}.json`;
+			return {
+				caseId: caseRow.caseId,
+				evalId: caseRow.evalId,
+				status: caseRow.status,
+				files: {
+					caseDetail: join(runDir, "case-details", fileName),
+					trace: join(runDir, "traces", fileName)
+				}
+			};
+		})
+	};
+}
+function resolveRunSnapshot(runner, runRef) {
+	const runs = getSortedRunSnapshots(runner);
+	if (runs.length === 0) return void 0;
+	if (runRef === void 0 || runRef === "latest") return runs[runs.length - 1];
+	return runs.find((run) => run.manifest.id === runRef || run.manifest.shortId === runRef);
+}
+function printMissingRun(runRef) {
+	console.error(runRef === void 0 ? "No saved runs found." : `No saved run found for "${runRef}".`);
+}
+function getRunStartTime(manifest) {
+	const parsed = new Date(manifest.startedAt).getTime();
+	return Number.isFinite(parsed) ? parsed : 0;
+}
+function printJson(value) {
+	console.info(JSON.stringify(value, null, 2));
+}
+function printRunFileIndexes(indexes) {
+	if (indexes.length === 0) {
+		console.info("No saved runs.");
+		return;
+	}
+	console.info(`Saved runs (${String(indexes.length)}):\n`);
+	for (const index of indexes) {
+		printRunFileIndex(index);
+		console.info("");
+	}
+}
+function printRunFileIndex(index) {
+	console.info(`${index.shortId} (${index.id})  ${index.status}  ${formatCaseCounts(index.summary)}`);
+	console.info(`  dir: ${index.files.dir}`);
+	console.info(`  run: ${index.files.run}`);
+	console.info(`  summary: ${index.files.summary}`);
+	console.info(`  cases: ${index.files.cases}`);
+	console.info(`  case details: ${index.files.caseDetailsDir}`);
+	console.info(`  traces: ${index.files.tracesDir}`);
+	if (index.cases.length === 0) return;
+	console.info("  case files:");
+	for (const caseEntry of index.cases) {
+		console.info(`    ${caseEntry.caseId} [${caseEntry.evalId}] ${caseEntry.status}`);
+		console.info(`      detail: ${caseEntry.files.caseDetail}`);
+		console.info(`      trace: ${caseEntry.files.trace}`);
+	}
+}
+function formatCaseCounts(summary) {
+	return [
+		`${String(summary.totalCases)} total`,
+		`${String(summary.passedCases)} passed`,
+		`${String(summary.failedCases)} failed`,
+		`${String(summary.errorCases)} errors`,
+		`${String(summary.cancelledCases)} cancelled`
+	].join(", ");
+}
 async function waitForRunCompletion(runner, runId) {
 	return new Promise((resolvePromise) => {
 		const check = () => {
@@ -1255,6 +1292,24 @@ Flags:
   --clear-cache              Clear the cache before starting the run
   --no-env                   Disable automatic .env loading
   --help, -h                 Show this help
+  `);
+		return;
+	}
+	if (topic === "show-runs") {
+		console.info(`
+agent-evals show-runs - Show saved run artifact file paths
+Usage:
+  agent-evals show-runs [<run-id>|latest] [--json]
+Prints the run directory and stable artifact paths for run.json, summary.json,
+cases.jsonl, case detail JSON, and trace JSON files. Run ids can be full
+timestamp ids, short ids such as r0, or latest.
+Flags:
+  --json                     Output the file index as JSON
+  --no-env                   Disable automatic .env loading
+  --help, -h                 Show this help
   `);
 		return;
 	}
@@ -1283,6 +1338,7 @@ Commands:
   app                        Start server with UI
   list                       List discovered evals
   run                        Run evals
+  show-runs [id|latest]      Show saved run artifact file paths
   cache list                 List cached operation entries
   cache clear --eval <id>    Clear cache entries for one eval
   cache clear --all          Clear every cached entry

package/dist/index.d.mts CHANGED Viewed

@@ -1674,6 +1674,12 @@ type AgentEvalsConfig = {
    * considered outdated. Defaults to `14`.
    */
   staleAfterDays?: number;
+  /**
+   * Whether `agent-evals run` may run every discovered eval when no `--eval`
+   * or `--case` filter is provided. Defaults to `false`; set to `true` to
+   * opt into unfiltered CLI runs. Grouped runs in the UI are still allowed.
+   */
+  allowCliRunAll?: boolean;
   /**
    * Global trace attribute display config for the UI.
    *
@@ -1762,6 +1768,7 @@ declare const agentEvalsConfigSchema: z$1.ZodObject<{
   }>>;
   concurrency: z$1.ZodOptional<z$1.ZodNumber>;
   staleAfterDays: z$1.ZodOptional<z$1.ZodNumber>;
+  allowCliRunAll: z$1.ZodOptional<z$1.ZodBoolean>;
   traceDisplay: z$1.ZodOptional<z$1.ZodObject<{
     attributes: z$1.ZodOptional<z$1.ZodArray<z$1.ZodObject<{
       key: z$1.ZodOptional<z$1.ZodString>;
@@ -2720,6 +2727,14 @@ type EvalCaseScope = {
   caseCacheRefs: TraceCacheRef[]; /** Background promises that should settle before the case scope finalizes. */
   pendingBackgroundJobs: Set<Promise<unknown>>;
 };
+/**
+ * Runtime phase currently owned by the eval runner.
+ *
+ * `null` means the current async execution is outside an eval run. `env`
+ * covers run-time module/environment loading, including top-level code in
+ * modules imported while a run is being prepared.
+ */
+type EvalRuntimeScope = 'env' | 'cases' | 'eval' | 'derive' | 'outputsSchema' | 'scorer';
 /** Error thrown when an eval assertion fails during case execution. */
 declare class EvalAssertionError extends Error {
   constructor(message: string);
@@ -2727,12 +2742,14 @@ declare class EvalAssertionError extends Error {
 /** Return the current eval scope for the active async context, if any. */
 declare function getCurrentScope(): EvalCaseScope | undefined;
 /**
- * Return whether the current async execution is inside an active eval case.
+ * Return the current eval runner phase for this async execution.
  *
- * This is useful for shared workflow code that wants to branch on eval-only
- * behavior without importing or inspecting the full eval scope.
+ * Returns `null` outside eval-owned work, `env` while the runner is loading
+ * eval modules for a run, `cases` while generating cases, `eval` while running
+ * case `execute`, `derive` while deriving outputs from traces, `outputsSchema`
+ * while validating outputs, and `scorer` while computing scores.
  */
-declare function isInEvalScope(): boolean;
+declare function isInEvalScope(): EvalRuntimeScope | null;
 /**
  * Register background work that should settle before eval finalization.
  *
@@ -2762,8 +2779,18 @@ type RunInEvalScopeOptions = {
   /** Authored input for the active eval case. */input?: unknown; /** Stable prefix used when generating scoped IDs with `nextEvalId()`. */
   idPrefix?: string; /** Cache adapter + mode attached to the scope before `fn` runs. */
   cacheContext?: CacheScopeContext; /** Whether registered background jobs should settle before scope finalizes. */
-  waitForBackgroundJobs?: boolean;
+  waitForBackgroundJobs?: boolean; /** Eval runner phase exposed through `isInEvalScope()`. Defaults to `eval`. */
+  runtimeScope?: EvalRuntimeScope;
 };
+/** Execute a callback while `isInEvalScope()` reports a runner phase. */
+declare function runInEvalRuntimeScope<T>(runtimeScope: EvalRuntimeScope, fn: () => Promise<T> | T): Promise<T>;
+/**
+ * Execute a callback with an existing case scope and a specific runner phase.
+ *
+ * Runner-internal helper for post-execute phases that still need access to the
+ * completed case scope through output, trace, assertion, and input helpers.
+ */
+declare function runInExistingEvalScope<T>(scope: EvalCaseScope, runtimeScope: EvalRuntimeScope, fn: () => Promise<T> | T): Promise<T>;
 /**
  * Execute a callback inside a fresh eval case scope and capture its outputs,
  * trace data, and terminal error state.
@@ -3058,6 +3085,13 @@ type EvalRunner = {
   subscribeDiscovery(listener: (event: SseEnvelope) => void): () => void; /** Stop background filesystem watchers owned by this runner instance. */
   close(): Promise<void>; /** Resolve the workspace root backing this runner instance. */
   getWorkspaceRoot(): string;
+  /**
+   * Return whether the current workspace allows an unfiltered CLI run.
+   *
+   * `false` means `agent-evals run` must include `--eval` or `--case`.
+   * Programmatic/server runs are intentionally unaffected.
+   */
+  getAllowCliRunAll(): boolean;
   /**
    * Resolved LLM-calls config used by the UI to derive the LLM calls tab.
    *
@@ -3135,4 +3169,4 @@ declare function createRunner({
  */
 declare function runCli(argv: string[]): Promise<void>;
 //#endregion
-export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheAdapter, type CacheEntry, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DerivedStatus, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallsConfigInput, type NumberDisplayOptions, type RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedLlmCallMetric, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
+export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheAdapter, type CacheEntry, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DerivedStatus, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallsConfigInput, type NumberDisplayOptions, type RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedLlmCallMetric, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };

package/dist/index.mjs CHANGED Viewed

@@ -1,4 +1,4 @@
-import { $ as evalFreshnessStatusSchema, $t as evalAssert, A as getEvalDisplayStatus, At as traceDisplayInputConfigSchema, B as apiCallMetricPlacementSchema, Bt as jsonCellSchema, C as updateManualScoreRequestSchema, Ct as spanCacheOptionsSchema, D as extractLlmCalls, Dt as traceAttributeDisplayPlacementSchema, E as extractApiCalls, Et as traceAttributeDisplayInputSchema, F as runSummarySchema, Ft as cellValueSchema, G as llmCallMetricSchema, Gt as buildTraceTree, H as apiCallsConfigSchema, Ht as repoFileRefSchema, I as DEFAULT_API_CALLS_CONFIG, It as columnDefSchema, J as resolveLlmCallsConfig, Jt as evalTracer, K as llmCallsConfigSchema, Kt as captureEvalSpanError, L as DEFAULT_LLM_CALLS_CONFIG, Lt as columnFormatSchema, M as deriveStatusFromCaseRows, Mt as traceSpanKindSchema, N as deriveStatusFromChildStatuses, Nt as traceSpanSchema, O as getNestedAttribute, Ot as traceAttributeDisplaySchema, P as runManifestSchema, Pt as traceSpanWarningSchema, Q as caseRowSchema, Qt as appendToEvalOutput, R as agentEvalsConfigSchema, Rt as columnKindSchema, S as createRunRequestSchema, St as serializedCacheSpanSchema, T as extractCacheHits, Tt as traceAttributeDisplayFormatSchema, U as llmCallMetricFormatSchema, Ut as runArtifactRefSchema, V as apiCallMetricSchema, Vt as numberDisplayOptionsSchema, W as llmCallMetricPlacementSchema, Wt as z, X as assertionFailureSchema, Xt as hashCacheKeySync, Y as trialSelectionModeSchema, Yt as hashCacheKey, Z as caseDetailSchema, Zt as EvalAssertionError, _t as cacheModeSchema, an as nextEvalId, at as evalChartAggregateSchema, bt as cacheRecordingSchema, cn as setScopeCacheContext, ct as evalChartColorSchema, dn as defineEval, dt as evalChartTooltipExtraSchema, en as getCurrentScope, et as evalStatAggregateSchema, fn as getEvalRegistry, ft as evalChartTypeSchema, gt as cacheListItemSchema, ht as cacheFileSchema, in as mergeEvalOutput, it as scoreTraceSchema, j as deriveScopedSummaryFromCases, jt as traceSpanErrorSchema, k as getEvalTitle, kt as traceDisplayConfigSchema, ln as startEvalBackgroundJob, lt as evalChartConfigSchema, mt as cacheEntrySchema, nn as incrementEvalOutput, nt as evalStatsConfigSchema, on as runInEvalScope, ot as evalChartAxisSchema, pt as evalChartsConfigSchema, q as resolveApiCallsConfig, qt as evalSpan, rn as isInEvalScope, rt as evalSummarySchema, sn as setEvalOutput, st as evalChartBuiltinMetricSchema, tn as getEvalCaseInput, tt as evalStatItemSchema, un as repoFile, ut as evalChartMetricSchema, vt as cacheOperationTypeSchema, w as sseEnvelopeSchema, wt as traceCacheRefSchema, xt as cacheStatusSchema, yt as cacheRecordingOpSchema, z as apiCallMetricFormatSchema, zt as fileRefSchema } from "./runOrchestration-H0pSUl3I.mjs";
-import { n as createRunner, t as runCli } from "./cli-CMiCEQ-3.mjs";
-import "./src-BgGL7DDp.mjs";
-export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
+import { $ as caseRowSchema, $t as appendToEvalOutput, A as getEvalTitle, At as traceDisplayConfigSchema, B as apiCallMetricFormatSchema, Bt as fileRefSchema, C as createRunRequestSchema, Ct as serializedCacheSpanSchema, D as extractApiCalls, Dt as traceAttributeDisplayInputSchema, E as extractCacheHits, Et as traceAttributeDisplayFormatSchema, F as runManifestSchema, Ft as traceSpanWarningSchema, G as llmCallMetricPlacementSchema, Gt as z, H as apiCallMetricSchema, Ht as numberDisplayOptionsSchema, I as runSummarySchema, It as cellValueSchema, J as resolveApiCallsConfig, Jt as evalSpan, K as llmCallMetricSchema, Kt as buildTraceTree, L as DEFAULT_API_CALLS_CONFIG, Lt as columnDefSchema, M as deriveScopedSummaryFromCases, Mt as traceSpanErrorSchema, N as deriveStatusFromCaseRows, Nt as traceSpanKindSchema, O as extractLlmCalls, Ot as traceAttributeDisplayPlacementSchema, P as deriveStatusFromChildStatuses, Pt as traceSpanSchema, Q as caseDetailSchema, Qt as EvalAssertionError, R as DEFAULT_LLM_CALLS_CONFIG, Rt as columnFormatSchema, St as cacheStatusSchema, T as sseEnvelopeSchema, Tt as traceCacheRefSchema, U as apiCallsConfigSchema, Ut as repoFileRefSchema, V as apiCallMetricPlacementSchema, Vt as jsonCellSchema, W as llmCallMetricFormatSchema, Wt as runArtifactRefSchema, X as trialSelectionModeSchema, Xt as hashCacheKey, Y as resolveLlmCallsConfig, Yt as evalTracer, Z as assertionFailureSchema, Zt as hashCacheKeySync, _t as cacheListItemSchema, an as mergeEvalOutput, at as scoreTraceSchema, bt as cacheRecordingOpSchema, cn as runInEvalScope, ct as evalChartBuiltinMetricSchema, dn as setScopeCacheContext, dt as evalChartMetricSchema, en as evalAssert, et as evalFreshnessStatusSchema, fn as startEvalBackgroundJob, ft as evalChartTooltipExtraSchema, gt as cacheFileSchema, hn as getEvalRegistry, ht as cacheEntrySchema, in as isInEvalScope, it as evalSummarySchema, j as getEvalDisplayStatus, jt as traceDisplayInputConfigSchema, k as getNestedAttribute, kt as traceAttributeDisplaySchema, ln as runInExistingEvalScope, lt as evalChartColorSchema, mn as defineEval, mt as evalChartsConfigSchema, nn as getEvalCaseInput, nt as evalStatItemSchema, on as nextEvalId, ot as evalChartAggregateSchema, pn as repoFile, pt as evalChartTypeSchema, q as llmCallsConfigSchema, qt as captureEvalSpanError, rn as incrementEvalOutput, rt as evalStatsConfigSchema, sn as runInEvalRuntimeScope, st as evalChartAxisSchema, tn as getCurrentScope, tt as evalStatAggregateSchema, un as setEvalOutput, ut as evalChartConfigSchema, vt as cacheModeSchema, w as updateManualScoreRequestSchema, wt as spanCacheOptionsSchema, xt as cacheRecordingSchema, yt as cacheOperationTypeSchema, z as agentEvalsConfigSchema, zt as columnKindSchema } from "./runOrchestration-BDyNrRQT.mjs";
+import { n as createRunner, t as runCli } from "./cli-BmrtjQj_.mjs";
+import "./src-CEAJYN_X.mjs";
+export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };

package/dist/runChild.mjs CHANGED Viewed

@@ -1,7 +1,9 @@
-import { F as runSummarySchema, It as columnDefSchema, P as runManifestSchema, S as createRunRequestSchema, nt as evalStatsConfigSchema, pt as evalChartsConfigSchema, t as executeRun, v as loadConfig, x as createFsCacheStore } from "./runOrchestration-H0pSUl3I.mjs";
+import { C as createRunRequestSchema, F as runManifestSchema, I as runSummarySchema, Lt as columnDefSchema, S as createFsCacheStore, mt as evalChartsConfigSchema, rt as evalStatsConfigSchema, t as executeRun, v as parseEvalMetas, y as loadConfig } from "./runOrchestration-BDyNrRQT.mjs";
 import { createHash } from "node:crypto";
 import { readFile } from "node:fs/promises";
+import { relative } from "node:path";
 import { z } from "zod/v4";
+import { glob } from "glob";
 //#region ../runner/src/runChild.ts
 const evalMetaSchema = z.object({
 	id: z.string(),
@@ -20,7 +22,7 @@ const runChildContextSchema = z.object({
 	runDir: z.string(),
 	manifest: runManifestSchema,
 	summary: runSummarySchema,
-	evals: z.array(evalMetaSchema)
+	evals: z.array(evalMetaSchema).optional()
 });
 function sendMessage(message) {
 	if (process.send === void 0) return;
@@ -37,6 +39,38 @@ function getTargetEvals(params) {
 	if (params.request.target.evalIds && params.request.target.evalIds.length > 0) return params.request.target.evalIds.map((id) => params.evals.get(id)).filter((entry) => entry !== void 0);
 	return [...params.evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath));
 }
+function toWorkspaceRelativePath(params) {
+	return relative(params.workspaceRoot, params.filePath).replaceAll("\\", "/");
+}
+async function discoverRunEvals(params) {
+	const discovered = [];
+	for (const pattern of params.config.include) {
+		const files = await glob(pattern, {
+			cwd: params.workspaceRoot,
+			absolute: true
+		});
+		discovered.push(...files);
+	}
+	const evals = /* @__PURE__ */ new Map();
+	for (const filePath of discovered) {
+		const source = await readFile(filePath, "utf-8");
+		const sourceFingerprint = getSourceFingerprint(source);
+		const metas = parseEvalMetas(filePath, source);
+		for (const meta of metas) evals.set(meta.id, {
+			id: meta.id,
+			title: meta.title,
+			filePath: toWorkspaceRelativePath({
+				filePath: meta.filePath,
+				workspaceRoot: params.workspaceRoot
+			}),
+			sourceFilePath: meta.filePath,
+			sourceFingerprint,
+			columnDefs: [],
+			caseCount: null
+		});
+	}
+	return [...evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath));
+}
 async function readContext(contextPath) {
 	if (contextPath === void 0) throw new Error("Missing run child context path");
 	return runChildContextSchema.parse(JSON.parse(await readFile(contextPath, "utf-8")));
@@ -54,7 +88,11 @@ async function main() {
 		maxEntriesPerNamespace: config.cache?.maxEntriesPerNamespace ?? config.cache?.maxEntriesPerEval,
 		maxEntriesByNamespace: config.cache?.maxEntriesByNamespace
 	});
-	const evals = new Map(context.evals.map((evalMeta) => [evalMeta.id, evalMeta]));
+	const evalMetas = await discoverRunEvals({
+		config,
+		workspaceRoot: context.workspaceRoot
+	});
+	const evals = new Map(evalMetas.map((evalMeta) => [evalMeta.id, evalMeta]));
 	const lastRunStatusMap = /* @__PURE__ */ new Map();
 	const latestRunInfoMap = /* @__PURE__ */ new Map();
 	await executeRun({