npm - @ls-stack/agent-eval - Versions diffs - 0.28.0 → 0.30.0 - Mend

@ls-stack/agent-eval 0.28.0 → 0.30.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

package/dist/{app-mBbAN-Gt.mjs → app-CbOZBHju.mjs} +33 -6
package/dist/apps/web/dist/assets/index-DEikHy2a.js +118 -0
package/dist/apps/web/dist/assets/index-DjUTm3M-.css +1 -0
package/dist/apps/web/dist/index.html +2 -2
package/dist/bin.mjs +1 -1
package/dist/{cli-BQwRbqsL.mjs → cli-CiFOqMwS.mjs} +893 -166
package/dist/index.d.mts +5758 -3526
package/dist/index.mjs +4 -4
package/dist/runChild.mjs +4 -2
package/dist/{runOrchestration-ClWYWPen.mjs → runOrchestration-CO3Vf0cQ.mjs} +654 -34
package/dist/{runner-BQn_xf36.mjs → runner-4pF_Qrc9.mjs} +1 -1
package/dist/{runner-DbVB66h9.mjs → runner-CXHkf7ih.mjs} +2 -2
package/dist/src-BiPLv9ya.mjs +3 -0
package/package.json +4 -33
package/skills/agent-eval/SKILL.md +63 -8
package/dist/apps/web/dist/assets/index-8VE7b6RK.css +0 -1
package/dist/apps/web/dist/assets/index-Czer_MdN.js +0 -118
package/dist/src-CuirVcPY.mjs +0 -3

package/dist/{runOrchestration-ClWYWPen.mjs → runOrchestration-CO3Vf0cQ.mjs} RENAMED Viewed

@@ -1,15 +1,15 @@
 import { createRequire, registerHooks } from "node:module";
-import { createHash } from "node:crypto";
+import { createHash, randomUUID } from "node:crypto";
 import { mkdir, readFile, readdir, rename, rm, stat, writeFile } from "node:fs/promises";
 import { extname, isAbsolute, join, relative, resolve } from "node:path";
 import { formatWithOptions, isDeepStrictEqual, stripVTControlCharacters } from "node:util";
 import { AsyncLocalStorage } from "node:async_hooks";
 import { z, z as z$1 } from "zod/v4";
-import { Buffer as Buffer$1 } from "node:buffer";
+import { Blob as Blob$1, Buffer as Buffer$1, File as File$1 } from "node:buffer";
 import { gunzipSync, gzipSync } from "node:zlib";
 import { getCompositeKey } from "@ls-stack/utils/getCompositeKey";
 import { existsSync } from "node:fs";
-import { resultify } from "t-result";
+import { Result, resultify } from "t-result";
 import { fileURLToPath, pathToFileURL } from "node:url";
 //#region ../sdk/src/defineEval.ts
 const evalRegistry = /* @__PURE__ */ new Map();
@@ -787,6 +787,67 @@ function evalExpect(value) {
 	return new EvalExpectationImpl(value, false);
 }
 //#endregion
+//#region ../sdk/src/manualInputFile.ts
+/**
+* Zod schema describing one file uploaded through the manual-input modal.
+*
+* Use this as the field type on your `manualInput.schema` whenever you mark
+* a field with `{ asFile: true }` in `manualInput.fields`. The UI / CLI stages
+* the selected file on disk, the runner materializes it into the run artifacts
+* directory, and the server validates this JSON metadata against the schema
+* before flowing it into the case input.
+*
+* @example
+* ```ts
+* const schema = z.object({
+*   image: manualInputFileValueSchema,
+*   note: z.string().optional(),
+* });
+*
+* defineEval({
+*   id: 'image-analyzer',
+*   manualInput: {
+*     schema,
+*     fields: { image: { asFile: true, accept: 'image/*' } },
+*   },
+*   // ...
+* });
+* ```
+*/
+const manualInputFileValueSchema = z.object({
+	/** File name; `readManualInputFile` passes it to the `File` constructor. */
+	name: z.string(),
+	/** MIME type; `readManualInputFile` uses it as the `Blob` / `File` type. */
+	mimeType: z.string(),
+	/** File size in bytes; must be a non-negative integer. */
+	sizeBytes: z.number().int().nonnegative(),
+	/** SHA-256 digest written as exactly 64 lowercase hexadecimal characters. */
+	sha256: z.string().regex(/^[a-f0-9]{64}$/),
+	/** Non-empty file path; `readManualInputFile` resolves it against its `cwd` option. */
+	path: z.string().min(1)
+});
+/**
+* Load the bytes behind a manual-input file artifact and package them in the
+* shapes eval code commonly needs: raw bytes, an `ArrayBuffer`, a `Blob`, a
+* `File`, plus lazy text and JSON decoders.
+*
+* @param value Manual-input file metadata received by an eval.
+* @param options.cwd Base directory used to resolve `value.path`. Falls back to `process.cwd()`.
+* @returns The metadata, the resolved absolute path, and the byte / Blob / File / text / JSON views.
+*/
+async function readManualInputFile(value, options = {}) {
+	const baseDir = options.cwd ?? process.cwd();
+	const absolutePath = resolve(baseDir, value.path);
+	const bytes = new Uint8Array(await readFile(absolutePath));
+	const { buffer, byteOffset, byteLength } = bytes;
+	const blobOptions = { type: value.mimeType };
+	const blob = new Blob$1([bytes], blobOptions);
+	const readText = async () => await blob.text();
+	const readJson = async () => JSON.parse(await blob.text());
+	return {
+		value,
+		absolutePath,
+		bytes,
+		arrayBuffer: buffer.slice(byteOffset, byteOffset + byteLength),
+		blob,
+		file: new File$1([bytes], value.name, { type: value.mimeType }),
+		text: readText,
+		json: readJson
+	};
+}
+//#endregion
 //#region ../sdk/src/repoFile.ts
 /**
 * Create a file reference that can be emitted via `setEvalOutput(...)` and rendered
@@ -2688,6 +2749,11 @@ const evalChartConfigSchema = z.object({
 	* the rendered history window.
 	*/
 	hideIfNoValue: z.boolean().optional(),
+	/**
+	* Drop consecutive history points whose plotted metrics and tooltip extras
+	* have the same values as the previous kept point.
+	*/
+	dedupeConsecutiveValues: z.boolean().optional(),
 	type: evalChartTypeSchema,
 	/** At least one series must be declared. */
 	metrics: z.array(evalChartMetricSchema).min(1),
@@ -2713,6 +2779,122 @@ const evalChartConfigSchema = z.object({
 */
 const evalChartsConfigSchema = z.array(evalChartConfigSchema);
 //#endregion
+//#region ../shared/src/schemas/manualInput.ts
+/**
+* Common metadata shared by every manual-input field descriptor exposed to
+* the web UI. The runner builds these from the eval's authored Zod schema and
+* any per-field overrides, so the client never needs the schema itself.
+*/
+const manualInputFieldBaseSchema = z.object({
+	/** Top-level key on the eval input object that this field writes to. */
+	key: z.string(),
+	/** Human-readable label rendered next to the field in the modal. */
+	label: z.string(),
+	/** Optional helper text rendered under the label. */
+	description: z.string().optional(),
+	/** Optional placeholder rendered inside the input element. */
+	placeholder: z.string().optional(),
+	/** Whether the field must be filled before the run can be submitted. */
+	required: z.boolean(),
+	/**
+	* Default value used to prefill the field. Type matches the underlying
+	* widget kind (`string` for text/multiline/select, `number` for number,
+	* `boolean` for boolean, JSON-serialisable for `json`).
+	*/
+	defaultValue: z.unknown().optional()
+});
+/** One option rendered by the `select` widget. */
+const manualInputSelectOptionSchema = z.object({
+	/** Option value as a string; the runner's schema walker stringifies numeric enum / literal values. */
+	value: z.string(),
+	/** Text for the option; the runner falls back to `value` when an override omits it. */
+	label: z.string()
+});
+/** Single line text widget descriptor. */
+const manualInputTextFieldSchema = manualInputFieldBaseSchema.extend({
+	kind: z.literal("text"),
+	/** Optional minimum character length enforced client-side. */
+	minLength: z.number().int().min(0).optional(),
+	/** Optional maximum character length enforced client-side. */
+	maxLength: z.number().int().min(0).optional()
+});
+/** Multi-line textarea widget descriptor. */
+const manualInputMultilineFieldSchema = manualInputFieldBaseSchema.extend({
+	kind: z.literal("multiline"),
+	/** Optional minimum character length enforced client-side. */
+	minLength: z.number().int().min(0).optional(),
+	/** Optional maximum character length enforced client-side. */
+	maxLength: z.number().int().min(0).optional(),
+	/** Suggested number of visible textarea rows; UI may clamp this. */
+	rows: z.number().int().min(1).optional()
+});
+/** Numeric input widget descriptor. */
+const manualInputNumberFieldSchema = manualInputFieldBaseSchema.extend({
+	kind: z.literal("number"),
+	/** Optional inclusive lower bound. */
+	min: z.number().optional(),
+	/** Optional inclusive upper bound. */
+	max: z.number().optional(),
+	/** Optional UI step increment. */
+	step: z.number().positive().optional(),
+	/** Whether the value must be an integer. */
+	integer: z.boolean().optional()
+});
+/** Boolean checkbox/toggle widget descriptor. */
+const manualInputBooleanFieldSchema = manualInputFieldBaseSchema.extend({ kind: z.literal("boolean") });
+/** Single-select dropdown widget descriptor. */
+const manualInputSelectFieldSchema = manualInputFieldBaseSchema.extend({
+	kind: z.literal("select"),
+	/** Options offered by the dropdown; the schema does not enforce a minimum count. */
+	options: z.array(manualInputSelectOptionSchema)
+});
+/** JSON textarea widget descriptor used for nested objects, arrays, and unions. */
+const manualInputJsonFieldSchema = manualInputFieldBaseSchema.extend({
+	kind: z.literal("json"),
+	/** Suggested number of visible textarea rows; UI may clamp this. */
+	rows: z.number().int().min(1).optional()
+});
+/**
+* File / image upload widget descriptor. The widget supports clicking to
+* pick a file, drag-and-drop onto the dropzone, and pasting an image from
+* the system clipboard. The submitted value references a staged file artifact.
+*/
+const manualInputFileFieldSchema = manualInputFieldBaseSchema.extend({
+	kind: z.literal("file"),
+	/**
+	* Browser `accept` attribute (e.g. `image/*`, `image/png,image/jpeg`,
+	* `.pdf`). When omitted the picker accepts any file type.
+	*/
+	accept: z.string().optional(),
+	/** Optional client-side maximum file size in bytes. */
+	maxSizeBytes: z.number().int().positive().optional()
+});
+/**
+* Discriminated union of all supported manual-input widget kinds. The web UI
+* dispatches to the matching field component based on `kind`.
+*/
+const manualInputFieldDescriptorSchema = z.discriminatedUnion("kind", [
+	manualInputTextFieldSchema,
+	manualInputMultilineFieldSchema,
+	manualInputNumberFieldSchema,
+	manualInputBooleanFieldSchema,
+	manualInputSelectFieldSchema,
+	manualInputJsonFieldSchema,
+	manualInputFileFieldSchema
+]);
+/**
+* Wire-format descriptor attached to an `EvalSummary` when the eval declares
+* `manualInput`. Carries the ordered list of fields the modal renders and
+* basic context shown in the modal header.
+*/
+const manualInputDescriptorSchema = z.object({
+	/** Optional title shown in the modal header. Defaults to the eval title. */
+	title: z.string().optional(),
+	/** Optional helper text shown above the form. */
+	description: z.string().optional(),
+	/** Optional submit button label. Defaults to `Run`. */
+	submitLabel: z.string().optional(),
+	/** Ordered list of fields rendered in the modal. */
+	fields: z.array(manualInputFieldDescriptorSchema)
+});
+//#endregion
 //#region ../shared/src/schemas/eval.ts
 /** Freshness signal derived from the latest relevant run plus git state. */
 const evalFreshnessStatusSchema = z.enum([
@@ -2810,7 +2992,13 @@ const evalSummarySchema = z.object({
 	* Ordered per-eval history chart configuration for the EvalCard. Opt-in:
 	* when omitted or empty, the UI renders no history chart at all.
 	*/
-	charts: evalChartsConfigSchema.optional()
+	charts: evalChartsConfigSchema.optional(),
+	/**
+	* Manual-input form descriptor when the eval declares `manualInput`. The
+	* web UI renders these fields in a modal before kicking off a run; the
+	* runner consumes the validated values as the case input.
+	*/
+	manualInput: manualInputDescriptorSchema.optional()
 });
 /** Schema for one case row in an eval run result table. */
 const caseRowSchema = z.object({
@@ -2950,7 +3138,7 @@ const caseDetailSchema = z.object({
 });
 /** Schema for discovery problems that should be shown before running evals. */
 const discoveryIssueSchema = z.object({
-	type: z.enum(["duplicate-eval-id"]),
+	type: z.enum(["duplicate-eval-id", "manual-input-with-cases"]),
 	severity: z.enum(["error"]),
 	filePath: z.string(),
 	evalId: z.string(),
@@ -3031,6 +3219,8 @@ const llmCallMetricPlacementSchema = z.enum(["header", "body"]);
 /** Where an API-call metric is rendered inside the API calls tab. */
 const apiCallMetricPlacementSchema = llmCallMetricPlacementSchema;
 const callDerivedAttributeSchema = z.custom((value) => typeof value === "function", { message: "Expected a derived attribute function" });
+/** Accepts any function value; used for the single-callback form of `derivedAttributes`. */
+const callDerivedAttributesFnSchema = z.custom((value) => typeof value === "function", { message: "Expected a derived attributes function" });
+/**
+* Accepted shapes for `derivedAttributes`: either a record of per-attribute
+* functions keyed by a non-empty string, or one function for the whole span.
+*/
+const callDerivedAttributesConfigSchema = z.union([z.record(z.string().min(1), callDerivedAttributeSchema), callDerivedAttributesFnSchema]);
 /**
 * Schema for a single user-defined metric attached to LLM call rows.
 *
@@ -3157,10 +3347,11 @@ const llmCallsConfigSchema = z.object({
 	/**
 	* Derived attributes persisted onto every matching LLM span before
 	* `deriveFromTracing`, default outputs, trace display, and call metrics read
-	* the trace. Keys are dot-paths under `span.attributes`; return `undefined`
-	* to skip writing the attribute for one span.
+	* the trace. Use a keyed map for one-off fields, or one callback returning a
+	* path/value object for multiple fields. Keys are dot-paths under
+	* `span.attributes`; return `undefined` to skip one span or one returned key.
 	*/
-	derivedAttributes: z.record(z.string().min(1), callDerivedAttributeSchema).optional(),
+	derivedAttributes: callDerivedAttributesConfigSchema.optional(),
 	/**
 	* Model-keyed pricing registry used to calculate LLM-call costs from token
 	* counts. Built-in LLM cost fields are only derived from this registry.
@@ -3192,11 +3383,12 @@ const apiCallsConfigSchema = z.object({
 	}).optional(),
 	/**
 	* Derived attributes persisted onto every matching API span before trace
-	* display and call metrics read the trace. Keys are dot-paths under
-	* `span.attributes`; return `undefined` to skip writing the attribute for
-	* one span.
+	* display and call metrics read the trace. Use a keyed map for one-off
+	* fields, or one callback returning a path/value object for multiple fields.
+	* Keys are dot-paths under `span.attributes`; return `undefined` to skip one
+	* span or one returned key.
 	*/
-	derivedAttributes: z.record(z.string().min(1), callDerivedAttributeSchema).optional(),
+	derivedAttributes: callDerivedAttributesConfigSchema.optional(),
 	/** Custom user-defined metrics surfaced on each API call. */
 	metrics: z.array(apiCallMetricSchema).optional()
 });
@@ -3256,7 +3448,9 @@ const DEFAULT_API_CALLS_CONFIG = {
 	metrics: []
 };
 function resolveDerivedAttributes(input) {
-	return Object.entries(input ?? {}).map(([path, compute]) => ({
+	if (input === void 0) return [];
+	if (typeof input === "function") return [{ computeMany: input }];
+	return Object.entries(input).map(([path, compute]) => ({
 		path,
 		compute
 	}));
@@ -3621,11 +3815,31 @@ function mergeNestedAttribute$1(value, path, attributeValue) {
 function applyDerivedAttributesForKind(params) {
 	let attributes = params.span.attributes;
 	for (const derivedAttribute of params.derivedAttributes) {
-		if (derivedAttribute.compute === void 0) continue;
 		const span = {
 			...params.span,
 			attributes
 		};
+		if (derivedAttribute.computeMany !== void 0) {
+			const values = (() => {
+				try {
+					return derivedAttribute.computeMany({
+						attributes,
+						span,
+						get: (path) => getNestedAttribute(attributes, path)
+					});
+				} catch {
+					return;
+				}
+			})();
+			if (!isRecord$3(values)) continue;
+			for (const [path, value] of Object.entries(values)) {
+				if (value === void 0) continue;
+				attributes = mergeNestedAttribute$1(attributes, path, value);
+			}
+			continue;
+		}
+		if (derivedAttribute.path === void 0) continue;
+		if (derivedAttribute.compute === void 0) continue;
 		const value = (() => {
 			try {
 				return derivedAttribute.compute({
@@ -4128,6 +4342,7 @@ function isCacheHitEntry(entry) {
 }
 z.enum([
 	"discovery.updated",
+	"config.reload",
 	"run.started",
 	"run.summary",
 	"case.started",
@@ -4147,6 +4362,19 @@ const sseEnvelopeSchema = z.object({
 });
 //#endregion
 //#region ../shared/src/schemas/api.ts
+/** Lifecycle state for an app config reload triggered by `agent-evals.config.ts`. */
+const configReloadStatusSchema = z.enum([
+	"idle",
+	"pending",
+	"reloading"
+]);
+/** UI/API-visible state for config reloads in `agent-evals app`. */
+const configReloadStateSchema = z.object({
+	/** Current reload lifecycle state. */
+	status: configReloadStatusSchema,
+	/** Non-negative integer; presumably the number of runs still in flight — TODO confirm against the server. */
+	activeRunCount: z.number().int().min(0),
+	/** Nullable string; presumably a timestamp of the last detected config change — TODO confirm format. */
+	lastChangedAt: z.string().nullable(),
+	/** Nullable string; presumably a timestamp of the last completed reload — TODO confirm format. */
+	lastReloadedAt: z.string().nullable()
+});
 /** Schema for the API request that starts a new eval run. */
 const createRunRequestSchema = z.object({
 	target: z.object({
@@ -4167,7 +4395,14 @@ const createRunRequestSchema = z.object({
 	* Optional cache controls for the run. When omitted, the cache is used in
 	* its default read-through / write-on-miss mode.
 	*/
-	cache: z.object({ mode: cacheModeSchema.default("use") }).optional()
+	cache: z.object({ mode: cacheModeSchema.default("use") }).optional(),
+	/**
+	* Manual-input values keyed by eval `key` (workspace-relative file path
+	* plus authored eval id). Required for any targeted eval that declares
+	* `manualInput` in its definition; the server validates each entry against
+	* the eval's authored Zod schema before starting the run.
+	*/
+	manualInputs: z.record(z.string(), z.unknown()).optional()
 });
 /** Schema for updating a UI-authored manual score on one persisted case. */
 const updateManualScoreRequestSchema = z.object({ value: z.number().min(0).max(1).nullable() });
@@ -4819,7 +5054,9 @@ async function loadConfig() {
 	const configPath = resolve(process.cwd(), "agent-evals.config.ts");
 	if (!existsSync(configPath)) return defaultConfig;
 	try {
-		const imported = await import(pathToFileURL(configPath).href);
+		const configUrl = pathToFileURL(configPath);
+		configUrl.searchParams.set("v", randomUUID());
+		const imported = await import(configUrl.href);
 		const configModule = configModuleSchema.parse(imported);
 		const userConfig = configModule.default ?? configModule.config;
 		if (!userConfig) return defaultConfig;
@@ -4997,6 +5234,7 @@ function appendDefaultCharts(params) {
 	if (activeKeys.has("costUsd")) defaults.push({
 		heading: "LLM Cost",
 		hideIfNoValue: true,
+		dedupeConsecutiveValues: true,
 		type: "area",
 		metrics: [{
 			source: "column",
@@ -5006,7 +5244,7 @@ function appendDefaultCharts(params) {
 			color: "warning"
 		}]
 	});
-	const tokenMetrics = [
+	const inputTokenMetrics = [
 		activeKeys.has("inputTokens") ? {
 			source: "column",
 			key: "inputTokens",
@@ -5014,13 +5252,6 @@ function appendDefaultCharts(params) {
 			label: "Input",
 			color: "accent"
 		} : null,
-		activeKeys.has("outputTokens") ? {
-			source: "column",
-			key: "outputTokens",
-			aggregate: "avg",
-			label: "Output",
-			color: "success"
-		} : null,
 		activeKeys.has("cachedInputTokens") ? {
 			source: "column",
 			key: "cachedInputTokens",
@@ -5036,17 +5267,25 @@ function appendDefaultCharts(params) {
 			color: "warning"
 		} : null
 	].filter((metric) => metric !== null);
-	if (tokenMetrics.length > 0) defaults.push({
-		heading: "LLM Tokens",
+	if (inputTokenMetrics.length > 0) defaults.push({
+		heading: "LLM Input Tokens",
 		hideIfNoValue: true,
+		dedupeConsecutiveValues: true,
 		type: "bar",
-		metrics: tokenMetrics,
-		tooltipExtras: activeKeys.has("totalTokens") ? [{
+		metrics: inputTokenMetrics
+	});
+	if (activeKeys.has("outputTokens")) defaults.push({
+		heading: "LLM Output Tokens",
+		hideIfNoValue: true,
+		dedupeConsecutiveValues: true,
+		type: "bar",
+		metrics: [{
 			source: "column",
-			key: "totalTokens",
+			key: "outputTokens",
 			aggregate: "avg",
-			label: "Total"
-		}] : void 0
+			label: "Output",
+			color: "success"
+		}]
 	});
 	const merged = [...params.charts ?? [], ...defaults];
 	return merged.length > 0 ? merged : void 0;
@@ -5316,6 +5555,371 @@ function getRunFreshnessTimestamp(manifest) {
 	return manifest.endedAt ?? manifest.startedAt;
 }
 //#endregion
+//#region ../runner/src/manualInput/walker.ts
+/** Report whether `value` is a non-null value whose `typeof` is `"object"` (arrays included, functions excluded). */
+function isObject(value) {
+	return value !== null && typeof value === "object";
+}
+/**
+* Read the internal definition stored at `schema._zod.def`.
+*
+* @returns A shallow copy of the definition, or `null` when `schema`,
+* `schema._zod`, or the definition is not an object, or when `def.type` is
+* not a string.
+*/
+function getZodDef(schema) {
+	const holder = isObject(schema) ? schema._zod : void 0;
+	const def = isObject(holder) ? holder.def : void 0;
+	if (!isObject(def) || typeof def.type !== "string") return null;
+	return {
+		...def,
+		type: def.type
+	};
+}
+/** Return `schema.description` when it is a string; otherwise `undefined`. */
+function getDescription(schema) {
+	const candidate = isObject(schema) ? schema.description : void 0;
+	if (typeof candidate === "string") return candidate;
+	return void 0;
+}
+/** Return the schema stored under `innerType` on a wrapper definition. */
+function getInnerSchema(def) {
+	const { innerType } = def;
+	return innerType;
+}
+/**
+* Collect the check definitions attached to a schema definition.
+*
+* Each entry of `def.checks` contributes a shallow copy of its `_zod.def`
+* when that definition is an object with a string `check` name. Entries that
+* do not match are skipped. Returns `[]` when `def.checks` is not an array.
+*/
+function getChecks(def) {
+	const rawChecks = def.checks;
+	if (!Array.isArray(rawChecks)) return [];
+	return rawChecks.flatMap((entry) => {
+		const holder = isObject(entry) ? entry._zod : void 0;
+		const checkDef = isObject(holder) ? holder.def : void 0;
+		if (!isObject(checkDef) || typeof checkDef.check !== "string") return [];
+		return [{
+			...checkDef,
+			check: checkDef.check
+		}];
+	});
+}
+/** Return the first check whose `check` name equals `name`, or `undefined` when none matches. */
+function findCheck(checks, name) {
+	for (const candidate of checks) if (candidate.check === name) return candidate;
+	return void 0;
+}
+/**
+* Peel wrapper schemas off a field schema until a non-wrapper definition is
+* reached.
+*
+* - `optional` / `nullable` / `nullish` mark the field as not required.
+* - `default` / `prefault` record the default value (calling it when it is a function).
+* - `readonly` / `pipe` continue with `innerType`, falling back to `def.in`.
+*
+* @returns `{ schema, def, required, defaultValue }` for the innermost
+* schema, or `null` when a level cannot be introspected or more than eight
+* wrappers are nested.
+*/
+function unwrap(schema) {
+	let required = true;
+	let defaultValue = void 0;
+	let current = schema;
+	let remaining = 8;
+	while (remaining > 0) {
+		remaining -= 1;
+		const def = getZodDef(current);
+		if (!def) return null;
+		switch (def.type) {
+			case "optional":
+			case "nullable":
+			case "nullish":
+				required = false;
+				current = getInnerSchema(def);
+				break;
+			case "default":
+			case "prefault": {
+				const raw = def.defaultValue;
+				defaultValue = typeof raw === "function" ? Reflect.apply(raw, void 0, []) : raw;
+				current = getInnerSchema(def);
+				break;
+			}
+			case "readonly":
+			case "pipe":
+				current = getInnerSchema(def) ?? def.in;
+				break;
+			default: return {
+				schema: current,
+				def,
+				required,
+				defaultValue
+			};
+		}
+	}
+	return null;
+}
+/**
+* Turn an object key into a sentence-case label: camelCase boundaries and
+* runs of `_` / `-` become spaces, everything is lower-cased, and the first
+* character is upper-cased. Keys that collapse to nothing are returned as-is.
+*/
+function humaniseKey(key) {
+	const withCamelBreaks = key.replace(/([a-z0-9])([A-Z])/g, "$1 $2");
+	const words = withCamelBreaks.replace(/[_-]+/g, " ").trim();
+	if (words.length === 0) return key;
+	const lowered = words.toLowerCase();
+	return `${lowered.charAt(0).toUpperCase()}${lowered.slice(1)}`;
+}
+/**
+* Normalise authored select options into `{ value, label }` pairs. Plain
+* strings are used for both value and label; object entries fall back to
+* `value` when `label` is nullish. Returns `undefined` for a falsy input.
+*/
+function normaliseSelectOptions(raw) {
+	if (!raw) return void 0;
+	const toOption = (entry) => typeof entry === "string" ? {
+		value: entry,
+		label: entry
+	} : {
+		value: entry.value,
+		label: entry.label ?? entry.value
+	};
+	return raw.map((entry) => toOption(entry));
+}
+/**
+* Build select options from an enum definition's `entries` map, using each
+* key as the label and each value (stringified when numeric) as the option
+* value.
+*
+* TypeScript numeric enums compile to objects that also carry reverse
+* mappings (`{ Low: 0, 0: "Low" }`). Those reverse keys are not enum members,
+* so they are skipped instead of being rendered as extra options.
+*
+* @param def Enum definition; only `def.entries` is read.
+* @returns The options in `entries` order, or `null` when `entries` is not
+* an object or a member value is neither a string nor a number.
+*/
+function enumOptionsFromEntries(def) {
+	const entries = def.entries;
+	if (typeof entries !== "object" || entries === null) return null;
+	const pairs = Object.entries(entries);
+	const numericValues = new Set(pairs.map(([, value]) => value).filter((value) => typeof value === "number"));
+	const out = [];
+	for (const [label, value] of pairs) {
+		// A key that parses to one of the numeric member values is a reverse mapping.
+		if (numericValues.has(Number(label))) continue;
+		if (typeof value === "string") out.push({
+			value,
+			label
+		});
+		else if (typeof value === "number") out.push({
+			value: String(value),
+			label
+		});
+		else return null;
+	}
+	return out;
+}
+/**
+* Build select options from a union definition whose members are all
+* single-value string or number literals.
+*
+* @returns The options, or `null` when `def.options` is not an array, any
+* member is not such a literal, or the union has no members.
+*/
+function literalUnionOptions(def) {
+	const members = def.options;
+	if (!Array.isArray(members)) return null;
+	const collected = [];
+	for (const member of members) {
+		const memberDef = getZodDef(member);
+		if (memberDef?.type !== "literal") return null;
+		const literalValues = memberDef.values;
+		if (!Array.isArray(literalValues) || literalValues.length !== 1) return null;
+		const only = literalValues[0];
+		if (typeof only !== "string" && typeof only !== "number") return null;
+		const text = String(only);
+		collected.push({
+			value: text,
+			label: text
+		});
+	}
+	return collected.length === 0 ? null : collected;
+}
+/**
+* Build select options from a literal definition's `values` list. Strings
+* are used as-is and numbers are stringified.
+*
+* @returns The options (possibly empty), or `null` when `def.values` is not
+* an array or contains a value that is neither a string nor a number.
+*/
+function literalSelectOptions(def) {
+	const literals = def.values;
+	if (!Array.isArray(literals)) return null;
+	const options = [];
+	for (const literal of literals) {
+		if (typeof literal !== "string" && typeof literal !== "number") return null;
+		const text = String(literal);
+		options.push({
+			value: text,
+			label: text
+		});
+	}
+	return options;
+}
+/**
+* Extract length limits from a string definition's checks: `minLength` from
+* a `min_length` check's numeric `minimum`, and `maxLength` from a
+* `max_length` check's numeric `maximum`. Absent limits are left unset.
+*/
+function readStringChecks(def) {
+	const checks = getChecks(def);
+	const limits = {};
+	const lower = findCheck(checks, "min_length")?.minimum;
+	if (typeof lower === "number") limits.minLength = lower;
+	const upper = findCheck(checks, "max_length")?.maximum;
+	if (typeof upper === "number") limits.maxLength = upper;
+	return limits;
+}
+/** `number_format` names that mark a numeric field as integer-only. */
+const integerNumberFormats = new Set([
+	"int",
+	"safeint",
+	"int32",
+	"uint32",
+	"int64",
+	"uint64"
+]);
+/**
+* Extract numeric constraints from a number definition's checks. Only
+* inclusive `greater_than` / `less_than` bounds are reported as `min` /
+* `max`. `integer` is set to `true` when a `number_format` check names one of
+* `integerNumberFormats`. Absent constraints are left unset.
+*/
+function readNumberChecks(def) {
+	const checks = getChecks(def);
+	const bounds = {};
+	const lower = findCheck(checks, "greater_than");
+	if (lower?.inclusive === true && typeof lower.value === "number") bounds.min = lower.value;
+	const upper = findCheck(checks, "less_than");
+	if (upper?.inclusive === true && typeof upper.value === "number") bounds.max = upper.value;
+	const formatName = findCheck(checks, "number_format")?.format;
+	if (typeof formatName === "string" && integerNumberFormats.has(formatName)) bounds.integer = true;
+	return bounds;
+}
+/**
+* Build the wire-format descriptor for one manual-input field.
+*
+* Explicit overrides win, in this order: `asJson`, `asFile`, then `options`
+* (select). Otherwise the widget kind follows the unwrapped schema type:
+* strings become `text` (or `multiline` when requested), numeric types become
+* `number`, booleans become `boolean`, and enums, literals, and literal
+* unions become `select` when their options can be listed. Everything else
+* falls back to the `json` widget.
+*
+* @param key Field key on the input object.
+* @param fieldSchema Schema authored for the field.
+* @param override Optional per-field overrides.
+* @returns `Result.ok(descriptor)`, or `Result.err` when the schema cannot be introspected.
+*/
+function buildField(key, fieldSchema, override) {
+	const unwrapped = unwrap(fieldSchema);
+	if (!unwrapped) return Result.err(/* @__PURE__ */ new Error(`manualInput: field "${key}" uses an unsupported Zod schema (could not introspect)`));
+	const { def: inner, required, defaultValue: schemaDefault } = unwrapped;
+	const description = override?.description ?? getDescription(unwrapped.schema);
+	const base = {
+		key,
+		label: override?.label ?? humaniseKey(key),
+		description,
+		placeholder: override?.placeholder,
+		required,
+		defaultValue: override?.defaultValue !== void 0 ? override.defaultValue : schemaDefault
+	};
+	// Shared result builders for the two shapes produced by several branches.
+	const jsonField = () => Result.ok({
+		...base,
+		kind: "json",
+		rows: override?.rows
+	});
+	const selectField = (options) => Result.ok({
+		...base,
+		kind: "select",
+		options
+	});
+	if (override?.asJson === true) return jsonField();
+	if (override?.asFile === true) return Result.ok({
+		...base,
+		kind: "file",
+		accept: override.accept,
+		maxSizeBytes: override.maxSizeBytes
+	});
+	const overrideOptions = normaliseSelectOptions(override?.options);
+	if (overrideOptions) return selectField(overrideOptions);
+	switch (inner.type) {
+		case "string": {
+			const { minLength, maxLength } = readStringChecks(inner);
+			if (override?.multiline === true) return Result.ok({
+				...base,
+				kind: "multiline",
+				rows: override.rows,
+				minLength,
+				maxLength
+			});
+			return Result.ok({
+				...base,
+				kind: "text",
+				minLength,
+				maxLength
+			});
+		}
+		case "number":
+		case "int":
+		case "bigint": {
+			const { min, max, integer } = readNumberChecks(inner);
+			return Result.ok({
+				...base,
+				kind: "number",
+				min,
+				max,
+				integer
+			});
+		}
+		case "boolean": return Result.ok({
+			...base,
+			kind: "boolean"
+		});
+		case "enum": {
+			const options = enumOptionsFromEntries(inner);
+			return options ? selectField(options) : jsonField();
+		}
+		case "literal": {
+			const options = literalSelectOptions(inner);
+			return options && options.length > 0 ? selectField(options) : jsonField();
+		}
+		case "union": {
+			const options = literalUnionOptions(inner);
+			return options ? selectField(options) : jsonField();
+		}
+		default: return jsonField();
+	}
+}
+/**
+* Return the `shape` of an object schema, or `null` when the schema cannot
+* be introspected, is not of type `object`, or has a non-object `shape`.
+*/
+function getObjectShape(schema) {
+	const def = getZodDef(schema);
+	if (def?.type !== "object") return null;
+	const { shape } = def;
+	return isObject(shape) ? shape : null;
+}
+/**
+* Walk an eval's `manualInput` configuration and produce the wire-format
+* descriptor consumed by the web UI. The schema must resolve to a top-level
+* `z.object(...)`; nested objects, arrays, unions, and other unsupported
+* shapes inside fields fall back to the JSON textarea widget.
+*
+* Returns a `Result` so the caller (eval discovery) can surface a discovery
+* issue without throwing when the schema is incompatible.
+*
+* @param config The eval's `manualInput` configuration (`schema`, optional
+* `fields` overrides, `title`, `description`, `submitLabel`).
+* @returns `Result.ok(descriptor)` with one field per schema key in shape
+* order, or the first `Result.err` produced while building a field.
+*/
+function buildManualInputDescriptor(config) {
+	const shape = getObjectShape(config.schema);
+	if (!shape) return Result.err(/* @__PURE__ */ new Error("manualInput.schema must be a top-level z.object(...). Wrap nested types in an object schema."));
+	// A Map keeps lookups from resolving inherited `Object.prototype` members
+	// for schema keys such as `constructor` or `toString` that have no override.
+	const overrides = /* @__PURE__ */ new Map();
+	for (const [key, override] of Object.entries(config.fields ?? {})) if (override) overrides.set(key, override);
+	const fields = [];
+	for (const [key, fieldSchema] of Object.entries(shape)) {
+		const fieldResult = buildField(key, fieldSchema, overrides.get(key));
+		if (fieldResult.error) return fieldResult.errorResult();
+		fields.push(fieldResult.value);
+	}
+	return Result.ok({
+		title: config.title,
+		description: config.description,
+		submitLabel: config.submitLabel,
+		fields
+	});
+}
+/**
+* Validate a raw manual-input submission with the eval's `manualInput.schema`.
+*
+* @param config The eval's `manualInput` configuration; only `schema` is used.
+* @param raw Unvalidated submission.
+* @returns `Result.ok` with the parsed data, or `Result.err` with a
+* `ManualInputValidationError` carrying the formatted Zod issues.
+*/
+function parseManualInputValues(config, raw) {
+	const outcome = config.schema.safeParse(raw);
+	if (outcome.success) return Result.ok(outcome.data);
+	const issues = outcome.error.issues.map(formatIssue);
+	return Result.err(new ManualInputValidationError(issues));
+}
+/**
+* Error describing manual-input values that failed validation against the
+* eval's `manualInput.schema`. The formatted issues are kept on `issues` so
+* callers can report them per field; the message joins them with `"; "`,
+* prefixing each with its path when one is present.
+*/
+var ManualInputValidationError = class extends Error {
+	issues;
+	constructor(issues) {
+		const details = issues.map(({ path, message }) => path ? `${path}: ${message}` : message).join("; ");
+		super(issues.length === 0 ? "manualInput validation failed" : `manualInput validation failed: ${details}`);
+		this.name = "ManualInputValidationError";
+		this.issues = issues;
+	}
+};
+/**
+* Reduce a Zod issue to `{ path, message }`, where `path` joins the string
+* and number path segments with `.`. Other segment types and empty segments
+* are dropped.
+*/
+function formatIssue(issue) {
+	const segments = [];
+	for (const segment of issue.path) {
+		if (typeof segment !== "string" && typeof segment !== "number") continue;
+		const text = String(segment);
+		if (text !== "") segments.push(text);
+	}
+	return {
+		path: segments.join("."),
+		message: issue.message
+	};
+}
+//#endregion
 //#region ../runner/src/outputArtifacts.ts
 const mimeTypeExtensionMap = {
 	"application/json": ".json",
@@ -6347,8 +6951,24 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
 				await runWithModuleIsolation(moduleIsolation, async () => {
 					await runInEvalRuntimeScope("cases", async () => {
 						await entry.use(async (evalDef) => {
-							const runnableCases = resolveRunnableEvalCases({
-								cases: await runWithEvalClock(evalDef.startTime, async () => typeof evalDef.cases === "function" ? await evalDef.cases() : evalDef.cases ?? [], { freezeTime: evalDef.freezeTime }),
+							if (evalDef.manualInput && evalDef.cases !== void 0) throw new Error(`Eval "${evalMeta.id}" cannot declare both "cases" and "manualInput". Remove one of them.`);
+							let manualInputCase = null;
+							if (evalDef.manualInput) {
+								const rawValue = request.manualInputs?.[evalMeta.key];
+								if (rawValue === void 0) throw new Error(`Eval "${evalMeta.id}" requires manual input. Provide it via the run modal in the web UI or "--input" / "--input-file" on the CLI.`);
+								const parsed = parseManualInputValues(evalDef.manualInput, rawValue);
+								if (parsed.error) {
+									const formatted = parsed.error.issues.map((issue) => issue.path ? `${issue.path}: ${issue.message}` : issue.message).join("; ");
+									throw new Error(`Invalid manual input for eval "${evalMeta.id}": ${formatted}`);
+								}
+								manualInputCase = {
+									id: `${evalMeta.id}-manual`,
+									input: parsed.value
+								};
+							}
+							const evalCases = manualInputCase ? [manualInputCase] : await runWithEvalClock(evalDef.startTime, async () => typeof evalDef.cases === "function" ? await evalDef.cases() : evalDef.cases ?? [], { freezeTime: evalDef.freezeTime });
+							const runnableCases = manualInputCase ? evalCases : resolveRunnableEvalCases({
+								cases: evalCases,
 								evalId: evalMeta.id
 							});
 							const duplicateCaseIds = findDuplicateCaseIds(runnableCases);
@@ -6567,4 +7187,4 @@ function toLastRunStatus(status) {
 	return status === "pending" ? null : status;
 }
 //#endregion
-export { llmCallMetricFormatSchema as $, traceAttributeDisplayPlacementSchema as $t, extractCacheHits as A, advanceEvalTime as An, evalChartBuiltinMetricSchema as At, runManifestSchema as B, mergeEvalOutput as Bn, cacheEntryWithDebugKeySchema as Bt, normalizeScoreDef as C, deserializeCacheRecording as Cn, runLogEntrySchema as Ct, updateManualScoreRequestSchema as D, repoFile as Dn, scoreTraceSchema as Dt, createRunRequestSchema as E, serializeCacheValue as En, runLogPhaseSchema as Et, getEvalTitle as F, getCurrentScope as Fn, evalChartTypeSchema as Ft, apiCallMetricFormatSchema as G, setEvalOutput as Gn, cacheRecordingOpSchema as Gt, DEFAULT_API_CALLS_CONFIG as H, runInEvalRuntimeScope as Hn, cacheListItemSchema as Ht, getEvalDisplayStatus as I, getEvalCaseInput as In, evalChartsConfigSchema as It, apiCallsConfigSchema as J, defineEval as Jn, serializedCacheSpanSchema as Jt, apiCallMetricPlacementSchema as K, setScopeCacheContext as Kn, cacheRecordingSchema as Kt, deriveScopedSummaryFromCases as L, getEvalStartTime as Ln, cacheDebugKeyEntrySchema as Lt, extractLlmCalls as M, configureEvalRunLogs as Mn, evalChartConfigSchema as Mt, applyDerivedCallAttributes as N, evalAssert as Nn, evalChartMetricSchema as Nt, sseEnvelopeSchema as O, evalExpect as On, evalChartAggregateSchema as Ot, getNestedAttribute as P, evalLog as Pn, evalChartTooltipExtraSchema as Pt, evalDeriveConfigSchema as Q, traceAttributeDisplayInputSchema as Qt, deriveStatusFromCaseRows as R, incrementEvalOutput as Rn, cacheDebugKeyFileSchema as Rt, buildDeclaredColumnDefs as S, hashCacheKeySync as Sn, evalSummarySchema as St, createFsCacheStore as T, serializeCacheRecording as Tn, runLogLocationSchema as Tt, DEFAULT_LLM_CALLS_CONFIG as U, runInEvalScope as Un, cacheModeSchema as Ut, runSummarySchema as V, nextEvalId as Vn, cacheFileSchema as Vt, agentEvalsConfigSchema as W, runInExistingEvalScope as Wn, cacheOperationTypeSchema as Wt, evalColumnOverrideSchema as X, traceCacheRefSchema as Xt, 
defaultConfigKeySchema as Y, getEvalRegistry as Yn, spanCacheOptionsSchema as Yt, evalColumnsSchema as Z, traceAttributeDisplayFormatSchema as Zt, deriveEvalFreshness as _, buildTraceTree as _n, discoveryIssueSchema as _t, getLastRunStatuses as a, traceSpanSchema as an, removeDefaultConfigSchema as at, resolveEvalDefaultConfig as b, evalTracer as bn, evalStatItemSchema as bt, loadPersistedRunSnapshots as c, columnDefSchema as cn, runLogsConfigSchema as ct, persistRunState as d, fileRefSchema as dn, buildEvalKey as dt, traceAttributeDisplaySchema as en, llmCallMetricPlacementSchema as et, recomputeEvalStatusesInRuns as f, jsonCellSchema as fn, getCaseRowCaseKey as ft, resolveArtifactPath as g, z$1 as gn, caseRowSchema as gt, resolveTracePresentation as h, runArtifactRefSchema as hn, caseDetailSchema as ht, generateRunId as i, traceSpanKindSchema as in, llmCallsConfigSchema as it, extractApiCalls as j, appendToEvalOutput as jn, evalChartColorSchema as jt, extractCacheEntries as k, EvalAssertionError as kn, evalChartAxisSchema as kt, nextShortIdFromSnapshots as l, columnFormatSchema as ln, trialSelectionModeSchema as lt, runTouchesEval as m, repoFileRefSchema as mn, assertionFailureSchema as mt, getTargetEvalKeys as n, traceDisplayInputConfigSchema as nn, llmCallPricingRateSchema as nt, getLatestRunInfos as o, traceSpanWarningSchema as on, resolveApiCallsConfig as ot, recomputePersistedCaseStatus as p, numberDisplayOptionsSchema as pn, getCaseRowEvalKey as pt, apiCallMetricSchema as q, startEvalBackgroundJob as qn, cacheStatusSchema as qt, getTargetEvals as r, traceSpanErrorSchema as rn, llmCallPricingSchema as rt, loadPersistedRunSnapshot as s, cellValueSchema as sn, resolveLlmCallsConfig as st, executeRun as t, traceDisplayConfigSchema as tn, llmCallMetricSchema as tt, persistCaseDetail as u, columnKindSchema as un, buildCaseKey as ut, loadEvalModule as v, captureEvalSpanError as vn, evalFreshnessStatusSchema as vt, validateCharts as w, deserializeCacheValue as wn, 
runLogLevelSchema as wt, loadConfig as x, hashCacheKey as xn, evalStatsConfigSchema as xt, parseEvalDiscovery as y, evalSpan as yn, evalStatAggregateSchema as yt, deriveStatusFromChildStatuses as z, isInEvalScope as zn, cacheEntrySchema as zt };
+export { defaultConfigKeySchema as $, incrementEvalOutput as $n, cacheEntryWithDebugKeySchema as $t, createRunRequestSchema as A, buildTraceTree as An, runLogPhaseSchema as At, getEvalDisplayStatus as B, repoFile as Bn, manualInputTextFieldSchema as Bt, loadConfig as C, columnKindSchema as Cn, evalStatAggregateSchema as Ct, createFsCacheStore as D, repoFileRefSchema as Dn, runLogEntrySchema as Dt, validateCharts as E, numberDisplayOptionsSchema as En, evalSummarySchema as Et, extractApiCalls as F, hashCacheKeySync as Fn, manualInputJsonFieldSchema as Ft, runSummarySchema as G, advanceEvalTime as Gn, evalChartConfigSchema as Gt, deriveStatusFromCaseRows as H, readManualInputFile as Hn, evalChartAxisSchema as Ht, extractLlmCalls as I, deserializeCacheRecording as In, manualInputMultilineFieldSchema as It, agentEvalsConfigSchema as J, evalAssert as Jn, evalChartTypeSchema as Jt, DEFAULT_API_CALLS_CONFIG as K, appendToEvalOutput as Kn, evalChartMetricSchema as Kt, applyDerivedCallAttributes as L, deserializeCacheValue as Ln, manualInputNumberFieldSchema as Lt, sseEnvelopeSchema as M, evalSpan as Mn, manualInputBooleanFieldSchema as Mt, extractCacheEntries as N, evalTracer as Nn, manualInputDescriptorSchema as Nt, configReloadStateSchema as O, runArtifactRefSchema as On, runLogLevelSchema as Ot, extractCacheHits as P, hashCacheKey as Pn, manualInputFieldDescriptorSchema as Pt, apiCallsConfigSchema as Q, getEvalStartTime as Qn, cacheEntrySchema as Qt, getNestedAttribute as R, serializeCacheRecording as Rn, manualInputSelectFieldSchema as Rt, resolveEvalDefaultConfig as S, columnFormatSchema as Sn, evalFreshnessStatusSchema as St, normalizeScoreDef as T, jsonCellSchema as Tn, evalStatsConfigSchema as Tt, deriveStatusFromChildStatuses as U, evalExpect as Un, evalChartBuiltinMetricSchema as Ut, deriveScopedSummaryFromCases as V, manualInputFileValueSchema as Vn, evalChartAggregateSchema as Vt, runManifestSchema as W, EvalAssertionError as Wn, evalChartColorSchema as Wt, 
apiCallMetricPlacementSchema as X, getCurrentScope as Xn, cacheDebugKeyEntrySchema as Xt, apiCallMetricFormatSchema as Y, evalLog as Yn, evalChartsConfigSchema as Yt, apiCallMetricSchema as Z, getEvalCaseInput as Zn, cacheDebugKeyFileSchema as Zt, buildManualInputDescriptor as _, traceSpanKindSchema as _n, getCaseRowEvalKey as _t, getLastRunStatuses as a, cacheRecordingSchema as an, runInExistingEvalScope as ar, llmCallMetricSchema as at, loadEvalModule as b, cellValueSchema as bn, caseRowSchema as bt, loadPersistedRunSnapshots as c, spanCacheOptionsSchema as cn, startEvalBackgroundJob as cr, llmCallsConfigSchema as ct, persistRunState as d, traceAttributeDisplayInputSchema as dn, resolveLlmCallsConfig as dt, cacheFileSchema as en, isInEvalScope as er, evalColumnOverrideSchema as et, recomputeEvalStatusesInRuns as f, traceAttributeDisplayPlacementSchema as fn, runLogsConfigSchema as ft, resolveArtifactPath as g, traceSpanErrorSchema as gn, getCaseRowCaseKey as gt, resolveTracePresentation as h, traceDisplayInputConfigSchema as hn, buildEvalKey as ht, generateRunId as i, cacheRecordingOpSchema as in, runInEvalScope as ir, llmCallMetricPlacementSchema as it, updateManualScoreRequestSchema as j, captureEvalSpanError as jn, scoreTraceSchema as jt, configReloadStatusSchema as k, z$1 as kn, runLogLocationSchema as kt, nextShortIdFromSnapshots as l, traceCacheRefSchema as ln, defineEval as lr, removeDefaultConfigSchema as lt, runTouchesEval as m, traceDisplayConfigSchema as mn, buildCaseKey as mt, getTargetEvalKeys as n, cacheModeSchema as nn, nextEvalId as nr, evalDeriveConfigSchema as nt, getLatestRunInfos as o, cacheStatusSchema as on, setEvalOutput as or, llmCallPricingRateSchema as ot, recomputePersistedCaseStatus as p, traceAttributeDisplaySchema as pn, trialSelectionModeSchema as pt, DEFAULT_LLM_CALLS_CONFIG as q, configureEvalRunLogs as qn, evalChartTooltipExtraSchema as qt, getTargetEvals as r, cacheOperationTypeSchema as rn, runInEvalRuntimeScope as rr, 
llmCallMetricFormatSchema as rt, loadPersistedRunSnapshot as s, serializedCacheSpanSchema as sn, setScopeCacheContext as sr, llmCallPricingSchema as st, executeRun as t, cacheListItemSchema as tn, mergeEvalOutput as tr, evalColumnsSchema as tt, persistCaseDetail as u, traceAttributeDisplayFormatSchema as un, getEvalRegistry as ur, resolveApiCallsConfig as ut, parseManualInputValues as v, traceSpanSchema as vn, assertionFailureSchema as vt, buildDeclaredColumnDefs as w, fileRefSchema as wn, evalStatItemSchema as wt, parseEvalDiscovery as x, columnDefSchema as xn, discoveryIssueSchema as xt, deriveEvalFreshness as y, traceSpanWarningSchema as yn, caseDetailSchema as yt, getEvalTitle as z, serializeCacheValue as zn, manualInputSelectOptionSchema as zt };