npm - @ls-stack/agent-eval - Versions diffs - 0.13.0 → 0.15.0 - Mend

@ls-stack/agent-eval 0.13.0 → 0.15.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (16) hide show

package/dist/{app-Dg3qYVku.mjs → app-B7FUWsVm.mjs} +3 -3
package/dist/apps/web/dist/assets/index-Cx2CvM6a.js +117 -0
package/dist/apps/web/dist/index.html +1 -1
package/dist/bin.mjs +8 -3
package/dist/{cli-lOZdhO2D.mjs → cli-B-sCTyz8.mjs} +17 -98
package/dist/index.d.mts +26 -6
package/dist/index.mjs +4 -4
package/dist/runChild.mjs +41 -3
package/dist/{runOrchestration-H0pSUl3I.mjs → runOrchestration-B3fYtpKo.mjs} +269 -124
package/dist/{runner-BK1KX2SA.mjs → runner-Dt-Ynv6s.mjs} +1 -1
package/dist/{runner-CmbmfBG2.mjs → runner-vunKoSBu.mjs} +2 -2
package/dist/src-jaOlXwb5.mjs +3 -0
package/package.json +3 -2
package/skills/agent-eval/SKILL.md +400 -0
package/dist/apps/web/dist/assets/index-Drw0IpOd.js +0 -117
package/dist/src-Btb9RCYD.mjs +0 -3

package/dist/apps/web/dist/index.html CHANGED Viewed

@@ -25,7 +25,7 @@
       href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
       rel="stylesheet"
     />
-    <script type="module" crossorigin src="/assets/index-Drw0IpOd.js"></script>
+    <script type="module" crossorigin src="/assets/index-Cx2CvM6a.js"></script>
     <link rel="stylesheet" crossorigin href="/assets/index-BVnLr79e.css">
   </head>
   <body>

package/dist/bin.mjs CHANGED Viewed

@@ -1,10 +1,11 @@
 #!/usr/bin/env node
-import { t as runCli } from "./cli-lOZdhO2D.mjs";
+import { t as runCli } from "./cli-B-sCTyz8.mjs";
 import { spawn } from "node:child_process";
 //#region src/bin.ts
 const moduleMocksFlag = "--experimental-test-module-mocks";
 const inspectFlagPrefix = "--inspect";
 const inspectBrkFlagPrefix = "--inspect-brk";
+const runChildInspectArgEnv = "AGENT_EVALS_RUN_CHILD_INSPECT_ARG";
 function needsModuleMocksFlag() {
 	return !process.execArgv.includes(moduleMocksFlag);
 }
@@ -32,10 +33,13 @@ function isInspectArg(arg) {
 }
 function buildExecArgv(inspectArg) {
 	const nextExecArgv = [moduleMocksFlag, ...process.execArgv.filter((arg) => arg !== moduleMocksFlag && !isInspectArg(arg))];
-	if (inspectArg !== void 0) nextExecArgv.push(inspectArg);
-	else nextExecArgv.push(...process.execArgv.filter(isInspectArg));
+	if (inspectArg === void 0) nextExecArgv.push(...process.execArgv.filter(isInspectArg));
 	return nextExecArgv;
 }
+function setRunChildInspectArg(inspectArg) {
+	if (inspectArg === void 0) return;
+	process.env[runChildInspectArgEnv] = inspectArg;
+}
 function execArgvMatches(nextExecArgv) {
 	return process.execArgv.length === nextExecArgv.length && process.execArgv.every((arg, index) => arg === nextExecArgv[index]);
 }
@@ -67,6 +71,7 @@ async function reexecWithNodeArgs(argv, execArgv) {
 	});
 }
 const { argv, inspectArg } = parseDebugFlags(process.argv.slice(2));
+setRunChildInspectArg(inspectArg);
 const execArgv = buildExecArgv(inspectArg);
 if (needsModuleMocksFlag() || !execArgvMatches(execArgv)) await reexecWithNodeArgs(argv, execArgv);
 else await runCli(argv);

package/dist/{cli-lOZdhO2D.mjs → cli-B-sCTyz8.mjs} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { A as getEvalDisplayStatus, F as runSummarySchema, J as resolveLlmCallsConfig, _ as loadEvalModule, a as loadPersistedRunSnapshot, b as normalizeScoreDef, c as persistCaseDetail, d as recomputePersistedCaseStatus, f as runTouchesEval, fn as getEvalRegistry, g as setLatestRunInfoMap, h as getTargetEvalIds, i as getLatestRunInfos, j as deriveScopedSummaryFromCases, k as getEvalTitle, l as persistRunState, m as buildEvalSummary, n as generateRunId, o as loadPersistedRunSnapshots, p as resolveArtifactPath, q as resolveApiCallsConfig, r as getLastRunStatuses, s as nextShortIdFromSnapshots, u as recomputeEvalStatusesInRuns, v as loadConfig, x as createFsCacheStore, y as buildDeclaredColumnDefs } from "./runOrchestration-H0pSUl3I.mjs";
+import { A as getEvalTitle, I as runSummarySchema, J as resolveApiCallsConfig, M as deriveScopedSummaryFromCases, S as createFsCacheStore, Y as resolveLlmCallsConfig, _ as loadEvalModule, a as loadPersistedRunSnapshot, b as buildDeclaredColumnDefs, c as persistCaseDetail, d as recomputePersistedCaseStatus, f as runTouchesEval, g as setLatestRunInfoMap, h as getTargetEvalIds, hn as getEvalRegistry, i as getLatestRunInfos, j as getEvalDisplayStatus, l as persistRunState, m as buildEvalSummary, n as generateRunId, o as loadPersistedRunSnapshots, p as resolveArtifactPath, r as getLastRunStatuses, s as nextShortIdFromSnapshots, u as recomputeEvalStatusesInRuns, v as parseEvalMetas, x as normalizeScoreDef, y as loadConfig } from "./runOrchestration-B3fYtpKo.mjs";
 import { createHash } from "node:crypto";
 import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
 import { dirname, join, relative, resolve } from "node:path";
@@ -82,98 +82,6 @@ function validateCharts(params) {
 	};
 }
 //#endregion
-//#region ../runner/src/discovery.ts
-const evalIdMatchRegex = /\bid\s*:\s*['"]([^'"]+)['"]/;
-const evalTitleMatchRegex = /\btitle\s*:\s*['"]([^'"]+)['"]/;
-function parseEvalMetas(filePath, content) {
-	const metas = [];
-	let searchIndex = 0;
-	while (searchIndex < content.length) {
-		const defineEvalIndex = content.indexOf("defineEval", searchIndex);
-		if (defineEvalIndex === -1) break;
-		const extracted = extractDefineEvalObject(content, defineEvalIndex);
-		if (!extracted) {
-			searchIndex = defineEvalIndex + 10;
-			continue;
-		}
-		const id = evalIdMatchRegex.exec(extracted.objectText)?.[1];
-		if (id !== void 0) {
-			const result = {
-				filePath,
-				id
-			};
-			const title = evalTitleMatchRegex.exec(extracted.objectText)?.[1];
-			if (title !== void 0) result.title = title;
-			metas.push(result);
-		}
-		searchIndex = extracted.nextIndex;
-	}
-	return metas;
-}
-function extractDefineEvalObject(content, defineEvalIndex) {
-	const openParenIndex = content.indexOf("(", defineEvalIndex);
-	if (openParenIndex === -1) return void 0;
-	const objectStartIndex = content.indexOf("{", openParenIndex);
-	if (objectStartIndex === -1) return void 0;
-	let depth = 0;
-	let quote;
-	let inBlockComment = false;
-	let inLineComment = false;
-	let isEscaped = false;
-	for (let index = objectStartIndex; index < content.length; index++) {
-		const currentChar = content[index];
-		const nextChar = content[index + 1];
-		if (inLineComment) {
-			if (currentChar === "\n") inLineComment = false;
-			continue;
-		}
-		if (inBlockComment) {
-			if (currentChar === "*" && nextChar === "/") {
-				inBlockComment = false;
-				index++;
-			}
-			continue;
-		}
-		if (quote) {
-			if (isEscaped) {
-				isEscaped = false;
-				continue;
-			}
-			if (currentChar === "\\") {
-				isEscaped = true;
-				continue;
-			}
-			if (currentChar === quote) quote = void 0;
-			continue;
-		}
-		if (currentChar === "/" && nextChar === "/") {
-			inLineComment = true;
-			index++;
-			continue;
-		}
-		if (currentChar === "/" && nextChar === "*") {
-			inBlockComment = true;
-			index++;
-			continue;
-		}
-		if (currentChar === "\"" || currentChar === "'" || currentChar === "`") {
-			quote = currentChar;
-			continue;
-		}
-		if (currentChar === "{") {
-			depth++;
-			continue;
-		}
-		if (currentChar === "}") {
-			depth--;
-			if (depth === 0) return {
-				nextIndex: index + 1,
-				objectText: content.slice(objectStartIndex, index + 1)
-			};
-		}
-	}
-}
-//#endregion
 //#region ../runner/src/gitState.ts
 function runGitCommand(workspaceRoot, args) {
 	const result = spawnSync("git", args, {
@@ -208,6 +116,9 @@ function isRunChildMessage(value) {
 }
 //#endregion
 //#region ../runner/src/runChildManager.ts
+const runChildInspectArgEnv = "AGENT_EVALS_RUN_CHILD_INSPECT_ARG";
+const inspectFlagPrefix = "--inspect";
+const inspectBrkFlagPrefix = "--inspect-brk";
 function startRunChild(params) {
 	const child = spawn(process.execPath, [
 		...getRunChildExecArgv(),
@@ -256,10 +167,16 @@ function getRunChildExecArgv() {
 			if (arg === "--input-type") skipNext = true;
 			continue;
 		}
+		if (isInspectArg(arg)) continue;
 		execArgv.push(arg);
 	}
+	const inspectArg = process.env[runChildInspectArgEnv];
+	if (inspectArg !== void 0 && isInspectArg(inspectArg)) execArgv.push(inspectArg);
 	return execArgv;
 }
+function isInspectArg(arg) {
+	return arg === inspectFlagPrefix || arg.startsWith(`${inspectFlagPrefix}=`) || arg === inspectBrkFlagPrefix || arg.startsWith(`${inspectBrkFlagPrefix}=`);
+}
 function killRunChild(runState) {
 	const child = runState.childProcess;
 	runState.childProcess = void 0;
@@ -307,7 +224,10 @@ function upsertFinishedCase(runState, caseDetail, caseRow) {
 function applyChildEvalMetas(evals, childMetas) {
 	for (const childMeta of childMetas) {
 		const evalMeta = evals.get(childMeta.id);
-		if (evalMeta === void 0) continue;
+		if (evalMeta === void 0) {
+			evals.set(childMeta.id, childMeta);
+			continue;
+		}
 		evalMeta.columnDefs = childMeta.columnDefs;
 		evalMeta.caseCount = childMeta.caseCount;
 		evalMeta.stats = childMeta.stats;
@@ -719,8 +639,7 @@ function createRunner({ watchForChanges = true } = {}) {
 				workspaceRoot,
 				runDir,
 				manifest,
-				summary,
-				evals: getSortedEvalMetas()
+				summary
 			};
 			await writeFile(join(runDir, "run-child-context.json"), JSON.stringify(childContext, null, 2));
 			startRunChild({
@@ -1050,8 +969,8 @@ async function commandApp(args) {
 	const { serve } = await import("@hono/node-server");
 	const bundledWebDist = resolve(currentDir, "apps/web/dist");
 	if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
-	const appModule = await import("./app-Dg3qYVku.mjs");
-	const runnerModule = await import("./runner-BK1KX2SA.mjs");
+	const appModule = await import("./app-B7FUWsVm.mjs");
+	const runnerModule = await import("./runner-Dt-Ynv6s.mjs");
 	if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
 	if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
 	await runnerModule.initRunner();

package/dist/index.d.mts CHANGED Viewed

@@ -2720,6 +2720,14 @@ type EvalCaseScope = {
   caseCacheRefs: TraceCacheRef[]; /** Background promises that should settle before the case scope finalizes. */
   pendingBackgroundJobs: Set<Promise<unknown>>;
 };
+/**
+ * Runtime phase currently owned by the eval runner.
+ *
+ * `null` means the current async execution is outside an eval run. `env`
+ * covers run-time module/environment loading, including top-level code in
+ * modules imported while a run is being prepared.
+ */
+type EvalRuntimeScope = 'env' | 'cases' | 'eval' | 'derive' | 'outputsSchema' | 'scorer';
 /** Error thrown when an eval assertion fails during case execution. */
 declare class EvalAssertionError extends Error {
   constructor(message: string);
@@ -2727,12 +2735,14 @@ declare class EvalAssertionError extends Error {
 /** Return the current eval scope for the active async context, if any. */
 declare function getCurrentScope(): EvalCaseScope | undefined;
 /**
- * Return whether the current async execution is inside an active eval case.
+ * Return the current eval runner phase for this async execution.
  *
- * This is useful for shared workflow code that wants to branch on eval-only
- * behavior without importing or inspecting the full eval scope.
+ * Returns `null` outside eval-owned work, `env` while the runner is loading
+ * eval modules for a run, `cases` while generating cases, `eval` while running
+ * case `execute`, `derive` while deriving outputs from traces, `outputsSchema`
+ * while validating outputs, and `scorer` while computing scores.
  */
-declare function isInEvalScope(): boolean;
+declare function isInEvalScope(): EvalRuntimeScope | null;
 /**
  * Register background work that should settle before eval finalization.
  *
@@ -2762,8 +2772,18 @@ type RunInEvalScopeOptions = {
   /** Authored input for the active eval case. */input?: unknown; /** Stable prefix used when generating scoped IDs with `nextEvalId()`. */
   idPrefix?: string; /** Cache adapter + mode attached to the scope before `fn` runs. */
   cacheContext?: CacheScopeContext; /** Whether registered background jobs should settle before scope finalizes. */
-  waitForBackgroundJobs?: boolean;
+  waitForBackgroundJobs?: boolean; /** Eval runner phase exposed through `isInEvalScope()`. Defaults to `eval`. */
+  runtimeScope?: EvalRuntimeScope;
 };
+/** Execute a callback while `isInEvalScope()` reports a runner phase. */
+declare function runInEvalRuntimeScope<T>(runtimeScope: EvalRuntimeScope, fn: () => Promise<T> | T): Promise<T>;
+/**
+ * Execute a callback with an existing case scope and a specific runner phase.
+ *
+ * Runner-internal helper for post-execute phases that still need access to the
+ * completed case scope through output, trace, assertion, and input helpers.
+ */
+declare function runInExistingEvalScope<T>(scope: EvalCaseScope, runtimeScope: EvalRuntimeScope, fn: () => Promise<T> | T): Promise<T>;
 /**
  * Execute a callback inside a fresh eval case scope and capture its outputs,
  * trace data, and terminal error state.
@@ -3135,4 +3155,4 @@ declare function createRunner({
  */
 declare function runCli(argv: string[]): Promise<void>;
 //#endregion
-export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheAdapter, type CacheEntry, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DerivedStatus, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallsConfigInput, type NumberDisplayOptions, type RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedLlmCallMetric, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, 
type SerializedCacheSpan, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalScope, 
runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
+export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheAdapter, type CacheEntry, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DerivedStatus, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallsConfigInput, type NumberDisplayOptions, type RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedLlmCallMetric, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunManifest, type RunSummary, type ScalarCell, type 
ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, 
runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };

package/dist/index.mjs CHANGED Viewed

@@ -1,4 +1,4 @@
-import { $ as evalFreshnessStatusSchema, $t as evalAssert, A as getEvalDisplayStatus, At as traceDisplayInputConfigSchema, B as apiCallMetricPlacementSchema, Bt as jsonCellSchema, C as updateManualScoreRequestSchema, Ct as spanCacheOptionsSchema, D as extractLlmCalls, Dt as traceAttributeDisplayPlacementSchema, E as extractApiCalls, Et as traceAttributeDisplayInputSchema, F as runSummarySchema, Ft as cellValueSchema, G as llmCallMetricSchema, Gt as buildTraceTree, H as apiCallsConfigSchema, Ht as repoFileRefSchema, I as DEFAULT_API_CALLS_CONFIG, It as columnDefSchema, J as resolveLlmCallsConfig, Jt as evalTracer, K as llmCallsConfigSchema, Kt as captureEvalSpanError, L as DEFAULT_LLM_CALLS_CONFIG, Lt as columnFormatSchema, M as deriveStatusFromCaseRows, Mt as traceSpanKindSchema, N as deriveStatusFromChildStatuses, Nt as traceSpanSchema, O as getNestedAttribute, Ot as traceAttributeDisplaySchema, P as runManifestSchema, Pt as traceSpanWarningSchema, Q as caseRowSchema, Qt as appendToEvalOutput, R as agentEvalsConfigSchema, Rt as columnKindSchema, S as createRunRequestSchema, St as serializedCacheSpanSchema, T as extractCacheHits, Tt as traceAttributeDisplayFormatSchema, U as llmCallMetricFormatSchema, Ut as runArtifactRefSchema, V as apiCallMetricSchema, Vt as numberDisplayOptionsSchema, W as llmCallMetricPlacementSchema, Wt as z, X as assertionFailureSchema, Xt as hashCacheKeySync, Y as trialSelectionModeSchema, Yt as hashCacheKey, Z as caseDetailSchema, Zt as EvalAssertionError, _t as cacheModeSchema, an as nextEvalId, at as evalChartAggregateSchema, bt as cacheRecordingSchema, cn as setScopeCacheContext, ct as evalChartColorSchema, dn as defineEval, dt as evalChartTooltipExtraSchema, en as getCurrentScope, et as evalStatAggregateSchema, fn as getEvalRegistry, ft as evalChartTypeSchema, gt as cacheListItemSchema, ht as cacheFileSchema, in as mergeEvalOutput, it as scoreTraceSchema, j as deriveScopedSummaryFromCases, jt as traceSpanErrorSchema, k as getEvalTitle, 
kt as traceDisplayConfigSchema, ln as startEvalBackgroundJob, lt as evalChartConfigSchema, mt as cacheEntrySchema, nn as incrementEvalOutput, nt as evalStatsConfigSchema, on as runInEvalScope, ot as evalChartAxisSchema, pt as evalChartsConfigSchema, q as resolveApiCallsConfig, qt as evalSpan, rn as isInEvalScope, rt as evalSummarySchema, sn as setEvalOutput, st as evalChartBuiltinMetricSchema, tn as getEvalCaseInput, tt as evalStatItemSchema, un as repoFile, ut as evalChartMetricSchema, vt as cacheOperationTypeSchema, w as sseEnvelopeSchema, wt as traceCacheRefSchema, xt as cacheStatusSchema, yt as cacheRecordingOpSchema, z as apiCallMetricFormatSchema, zt as fileRefSchema } from "./runOrchestration-H0pSUl3I.mjs";
-import { n as createRunner, t as runCli } from "./cli-lOZdhO2D.mjs";
-import "./src-Btb9RCYD.mjs";
-export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, 
traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
+import { $ as caseRowSchema, $t as appendToEvalOutput, A as getEvalTitle, At as traceDisplayConfigSchema, B as apiCallMetricFormatSchema, Bt as fileRefSchema, C as createRunRequestSchema, Ct as serializedCacheSpanSchema, D as extractApiCalls, Dt as traceAttributeDisplayInputSchema, E as extractCacheHits, Et as traceAttributeDisplayFormatSchema, F as runManifestSchema, Ft as traceSpanWarningSchema, G as llmCallMetricPlacementSchema, Gt as z, H as apiCallMetricSchema, Ht as numberDisplayOptionsSchema, I as runSummarySchema, It as cellValueSchema, J as resolveApiCallsConfig, Jt as evalSpan, K as llmCallMetricSchema, Kt as buildTraceTree, L as DEFAULT_API_CALLS_CONFIG, Lt as columnDefSchema, M as deriveScopedSummaryFromCases, Mt as traceSpanErrorSchema, N as deriveStatusFromCaseRows, Nt as traceSpanKindSchema, O as extractLlmCalls, Ot as traceAttributeDisplayPlacementSchema, P as deriveStatusFromChildStatuses, Pt as traceSpanSchema, Q as caseDetailSchema, Qt as EvalAssertionError, R as DEFAULT_LLM_CALLS_CONFIG, Rt as columnFormatSchema, St as cacheStatusSchema, T as sseEnvelopeSchema, Tt as traceCacheRefSchema, U as apiCallsConfigSchema, Ut as repoFileRefSchema, V as apiCallMetricPlacementSchema, Vt as jsonCellSchema, W as llmCallMetricFormatSchema, Wt as runArtifactRefSchema, X as trialSelectionModeSchema, Xt as hashCacheKey, Y as resolveLlmCallsConfig, Yt as evalTracer, Z as assertionFailureSchema, Zt as hashCacheKeySync, _t as cacheListItemSchema, an as mergeEvalOutput, at as scoreTraceSchema, bt as cacheRecordingOpSchema, cn as runInEvalScope, ct as evalChartBuiltinMetricSchema, dn as setScopeCacheContext, dt as evalChartMetricSchema, en as evalAssert, et as evalFreshnessStatusSchema, fn as startEvalBackgroundJob, ft as evalChartTooltipExtraSchema, gt as cacheFileSchema, hn as getEvalRegistry, ht as cacheEntrySchema, in as isInEvalScope, it as evalSummarySchema, j as getEvalDisplayStatus, jt as traceDisplayInputConfigSchema, k as getNestedAttribute, kt as 
traceAttributeDisplaySchema, ln as runInExistingEvalScope, lt as evalChartColorSchema, mn as defineEval, mt as evalChartsConfigSchema, nn as getEvalCaseInput, nt as evalStatItemSchema, on as nextEvalId, ot as evalChartAggregateSchema, pn as repoFile, pt as evalChartTypeSchema, q as llmCallsConfigSchema, qt as captureEvalSpanError, rn as incrementEvalOutput, rt as evalStatsConfigSchema, sn as runInEvalRuntimeScope, st as evalChartAxisSchema, tn as getCurrentScope, tt as evalStatAggregateSchema, un as setEvalOutput, ut as evalChartConfigSchema, vt as cacheModeSchema, w as updateManualScoreRequestSchema, wt as spanCacheOptionsSchema, xt as cacheRecordingSchema, yt as cacheOperationTypeSchema, z as agentEvalsConfigSchema, zt as columnKindSchema } from "./runOrchestration-B3fYtpKo.mjs";
+import { n as createRunner, t as runCli } from "./cli-B-sCTyz8.mjs";
+import "./src-jaOlXwb5.mjs";
+export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, 
traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };

package/dist/runChild.mjs CHANGED Viewed

@@ -1,7 +1,9 @@
-import { F as runSummarySchema, It as columnDefSchema, P as runManifestSchema, S as createRunRequestSchema, nt as evalStatsConfigSchema, pt as evalChartsConfigSchema, t as executeRun, v as loadConfig, x as createFsCacheStore } from "./runOrchestration-H0pSUl3I.mjs";
+import { C as createRunRequestSchema, F as runManifestSchema, I as runSummarySchema, Lt as columnDefSchema, S as createFsCacheStore, mt as evalChartsConfigSchema, rt as evalStatsConfigSchema, t as executeRun, v as parseEvalMetas, y as loadConfig } from "./runOrchestration-B3fYtpKo.mjs";
 import { createHash } from "node:crypto";
 import { readFile } from "node:fs/promises";
+import { relative } from "node:path";
 import { z } from "zod/v4";
+import { glob } from "glob";
 //#region ../runner/src/runChild.ts
 const evalMetaSchema = z.object({
 	id: z.string(),
@@ -20,7 +22,7 @@ const runChildContextSchema = z.object({
 	runDir: z.string(),
 	manifest: runManifestSchema,
 	summary: runSummarySchema,
-	evals: z.array(evalMetaSchema)
+	evals: z.array(evalMetaSchema).optional()
 });
 function sendMessage(message) {
 	if (process.send === void 0) return;
@@ -37,6 +39,38 @@ function getTargetEvals(params) {
 	if (params.request.target.evalIds && params.request.target.evalIds.length > 0) return params.request.target.evalIds.map((id) => params.evals.get(id)).filter((entry) => entry !== void 0);
 	return [...params.evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath));
 }
+function toWorkspaceRelativePath(params) {
+	return relative(params.workspaceRoot, params.filePath).replaceAll("\\", "/");
+}
+async function discoverRunEvals(params) {
+	const discovered = [];
+	for (const pattern of params.config.include) {
+		const files = await glob(pattern, {
+			cwd: params.workspaceRoot,
+			absolute: true
+		});
+		discovered.push(...files);
+	}
+	const evals = /* @__PURE__ */ new Map();
+	for (const filePath of discovered) {
+		const source = await readFile(filePath, "utf-8");
+		const sourceFingerprint = getSourceFingerprint(source);
+		const metas = parseEvalMetas(filePath, source);
+		for (const meta of metas) evals.set(meta.id, {
+			id: meta.id,
+			title: meta.title,
+			filePath: toWorkspaceRelativePath({
+				filePath: meta.filePath,
+				workspaceRoot: params.workspaceRoot
+			}),
+			sourceFilePath: meta.filePath,
+			sourceFingerprint,
+			columnDefs: [],
+			caseCount: null
+		});
+	}
+	return [...evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath));
+}
 async function readContext(contextPath) {
 	if (contextPath === void 0) throw new Error("Missing run child context path");
 	return runChildContextSchema.parse(JSON.parse(await readFile(contextPath, "utf-8")));
@@ -54,7 +88,11 @@ async function main() {
 		maxEntriesPerNamespace: config.cache?.maxEntriesPerNamespace ?? config.cache?.maxEntriesPerEval,
 		maxEntriesByNamespace: config.cache?.maxEntriesByNamespace
 	});
-	const evals = new Map(context.evals.map((evalMeta) => [evalMeta.id, evalMeta]));
+	const evalMetas = await discoverRunEvals({
+		config,
+		workspaceRoot: context.workspaceRoot
+	});
+	const evals = new Map(evalMetas.map((evalMeta) => [evalMeta.id, evalMeta]));
 	const lastRunStatusMap = /* @__PURE__ */ new Map();
 	const latestRunInfoMap = /* @__PURE__ */ new Map();
 	await executeRun({