npm - vieval - Versions diffs - 0.0.5 → 0.0.7 - Mend

vieval 0.0.5 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

package/README.md +219 -109
package/dist/bin/vieval.mjs +1 -1
package/dist/cli/index.mjs +1 -1
package/dist/{cli-DayPXzHX.mjs → cli-ImxGpoYQ.mjs} +1447 -195
package/dist/cli-ImxGpoYQ.mjs.map +1 -0
package/dist/config.d.mts +2 -2
package/dist/config.mjs +1 -1
package/dist/core/assertions/index.d.mts +1 -1
package/dist/core/inference-executors/index.d.mts +1 -1
package/dist/core/inference-executors/index.mjs +1 -1
package/dist/core/processors/results/index.d.mts +1 -1
package/dist/core/runner/index.d.mts +3 -2
package/dist/core/runner/index.mjs +3 -2
package/dist/core/runner/index.mjs.map +1 -1
package/dist/core/scheduler/index.d.mts +2 -0
package/dist/core/scheduler/index.mjs +188 -0
package/dist/core/scheduler/index.mjs.map +1 -0
package/dist/{env-BFSjny07.mjs → env--94B0UtW.mjs} +1 -1
package/dist/{env-BFSjny07.mjs.map → env--94B0UtW.mjs.map} +1 -1
package/dist/{env-BTq3dV7C.d.mts → env-BeHv_5mo.d.mts} +1 -1
package/dist/{expect-extensions-QLXESWjn.mjs → expect-extensions-DCSqlneN.mjs} +1 -1
package/dist/{expect-extensions-QLXESWjn.mjs.map → expect-extensions-DCSqlneN.mjs.map} +1 -1
package/dist/expect.mjs +1 -1
package/dist/{index-OEdqjQSe.d.mts → index-5R1_k2nv.d.mts} +195 -3
package/dist/index-fakXoZEe.d.mts +147 -0
package/dist/index.d.mts +120 -13
package/dist/index.mjs +286 -54
package/dist/index.mjs.map +1 -1
package/dist/{models-D_MsBtYw.mjs → models-DIGdOUpJ.mjs} +1 -1
package/dist/models-DIGdOUpJ.mjs.map +1 -0
package/dist/plugins/chat-models/index.d.mts +27 -1
package/dist/plugins/chat-models/index.mjs +29 -1
package/dist/plugins/chat-models/index.mjs.map +1 -1
package/dist/queue-DsZQkZO_.mjs +21 -0
package/dist/queue-DsZQkZO_.mjs.map +1 -0
package/dist/{registry-CwcMMjnZ.mjs → registry-BHGMxjpA.mjs} +164 -6
package/dist/registry-BHGMxjpA.mjs.map +1 -0
package/dist/testing/expect-extensions.mjs +1 -1
package/package.json +8 -1
package/dist/cli-DayPXzHX.mjs.map +0 -1
package/dist/models-D_MsBtYw.mjs.map +0 -1
package/dist/registry-CwcMMjnZ.mjs.map +0 -1

package/dist/index.d.mts CHANGED Viewed

@@ -1,5 +1,5 @@
-import { H as RunResult, M as ModelDefinition, S as TaskRunOutput, W as RunScoreKind, Z as ScheduledTask, k as TaskExecutionContext, l as MatrixDefinition, q as InferenceExecutor, t as ConfigHookPlugin, u as MatrixLayer, x as TaskRunContext } from "./index-OEdqjQSe.mjs";
-import { a as requiredEnvFrom } from "./env-BTq3dV7C.mjs";
+import { D as TaskRunContext, I as TaskExecutionContext, O as TaskRunOutput, S as TaskConcurrencyConfig, Y as RunResult, Z as RunScoreKind, et as InferenceExecutor, f as MatrixDefinition, it as ScheduledTask, k as TelemetryAttributeValue, o as CliReportingConfig, p as MatrixLayer, t as ConfigHookPlugin, w as TaskExecutionPolicy, z as ModelDefinition } from "./index-5R1_k2nv.mjs";
+import { a as requiredEnvFrom } from "./env-BeHv_5mo.mjs";
 import { expect } from "./expect.mjs";
 import * as _$c12 from "c12";
@@ -66,6 +66,41 @@ type VievalVitestCompatReporterReference = VievalVitestCompatReporterValue | rea
  * CLI plugin shape bound to the full CLI config object.
  */
 type CliConfigPlugin = ConfigHookPlugin<CliConfig>;
+/**
+ * Concurrency limits that can be declared in CLI-facing config.
+ *
+ * Use when:
+ * - the CLI needs independent caps for workspace, project, task, attempt, or case scheduling scopes
+ * - config authors want to define concurrency without wiring runtime execution yet
+ *
+ * Expects:
+ * - each provided value to be a positive integer chosen by the caller
+ *
+ * Returns:
+ * - one partial concurrency descriptor keyed by scheduling scope
+ */
+interface CliConcurrencyConfig {
+  /**
+   * Workspace-level concurrency cap.
+   */
+  workspace?: number;
+  /**
+   * Project-level concurrency cap.
+   */
+  project?: number;
+  /**
+   * Task-level concurrency cap.
+   */
+  task?: number;
+  /**
+   * Attempt-level concurrency cap.
+   */
+  attempt?: number;
+  /**
+   * Case-level concurrency cap.
+   */
+  case?: number;
+}
 /**
  * Defines one project block for `vieval run`.
  */
@@ -115,6 +150,12 @@ interface CliProjectConfig {
    * Optional eval-time matrix dimensions.
    */
   evalMatrix?: MatrixDefinition | MatrixLayer;
+  /**
+   * Optional project-scoped concurrency overrides.
+   *
+   * @default inherited from top-level or CLI execution settings
+   */
+  concurrency?: Omit<CliConcurrencyConfig, 'workspace'>;
   /**
    * Optional task executor.
    *
@@ -218,9 +259,13 @@ interface CliComparisonConfig {
  * Expects:
  * - `model` resolves configured models for the current task
  * - `reporterHooks` follows `TaskRunContext['reporterHooks']`
+ * - `telemetry` follows `TaskRunContext['telemetry']`
+ * - `runtimeConcurrency` follows `TaskRunContext['runtimeConcurrency']`
  */
 interface CliProjectExecutorContext extends TaskExecutionContext {
   reporterHooks?: TaskRunContext['reporterHooks'];
+  telemetry?: TaskRunContext['telemetry'];
+  runtimeConcurrency?: TaskRunContext['runtimeConcurrency'];
 }
 /**
  * Top-level CLI config loaded from `vieval.config.*`.
@@ -232,6 +277,19 @@ interface CliConfigBase {
    * @default []
    */
   models?: ModelDefinition[];
+  /**
+   * Global concurrency defaults inherited by projects and tasks.
+   *
+   * Use when:
+   * - config authors want one shared concurrency policy across workspace, project, task, attempt, and case scopes
+   * - project-local overrides should start from a top-level baseline
+   *
+   * Expects:
+   * - each provided value to be a positive integer chosen by the caller
+   *
+   * @default undefined
+   */
+  concurrency?: CliConcurrencyConfig;
   /**
    * Global config plugins.
    *
@@ -254,6 +312,12 @@ interface CliConfigBase {
    * @default {}
    */
   env?: NodeJS.ProcessEnv;
+  /**
+   * Optional reporting integrations shared by CLI run orchestration.
+   *
+   * @default undefined
+   */
+  reporting?: CliReportingConfig;
 }
 /**
  * Project mode config for `vieval run`.
@@ -344,12 +408,44 @@ interface CaseRunContext<TInput> extends TaskRunContext {
    * - `name` to be a stable metric identifier
    * - `value` to be JSON-serializable
    */
-  metric: (name: string, value: boolean | number | string | null) => void;
+  metric: (name: string, value: TelemetryAttributeValue) => void;
+  /**
+   * Cooperative abort signal for the current case execution.
+   */
+  signal: AbortSignal;
 }
 /**
  * Callback for one task case.
  */
-type CaseRunner<TInput> = (context: CaseRunContext<TInput>) => Promise<void> | void;
+type CaseRunner<TInput> = (context: CaseRunContext<TInput>) => Promise<unknown> | unknown;
+/**
+ * Per-group options for `casesFromInputs`.
+ *
+ * Use when:
+ * - one generated case group should run with a lower case concurrency than the task default
+ * - a task should keep a broader task-level cap while one expensive case family stays bounded
+ *
+ * Expects:
+ * - `concurrency` to be a positive integer when provided
+ *
+ * Returns:
+ * - one partial case-group execution descriptor
+ */
+interface CasesFromInputsOptions extends TaskExecutionPolicy {
+  /**
+   * Case-level concurrency cap for cases registered by one `casesFromInputs(...)` call.
+   */
+  concurrency?: number;
+}
+/**
+ * Per-case registration options for `caseOf`.
+ */
+interface CaseRegistrationOptions<TInput> extends TaskExecutionPolicy {
+  /**
+   * Optional case input payload.
+   */
+  input: TInput;
+}
 /**
  * Builder callbacks passed into `describeTask`.
  */
@@ -359,35 +455,44 @@ interface DescribeTaskBuilder {
    */
   caseOf: {
     (name: string, run: CaseRunner<undefined>): void;
-    <TInput>(name: string, run: CaseRunner<TInput>, options: {
-      input: TInput;
-    }): void;
+    <TInput>(name: string, run: CaseRunner<TInput>, options: CaseRegistrationOptions<TInput>): void;
   };
   /**
    * Registers multiple cases from input list.
    */
-  casesFromInputs: <TInput>(namePrefix: string, inputs: readonly TInput[], run: CaseRunner<TInput>) => void;
+  casesFromInputs: <TInput>(namePrefix: string, inputs: readonly TInput[], run: CaseRunner<TInput>, options?: CasesFromInputsOptions) => void;
 }
 /**
  * Options for `describeTask`.
  */
-interface DescribeTaskOptions {
+interface DescribeTaskOptions extends TaskExecutionPolicy {
   /**
    * Optional description override.
    */
   description?: string;
+  /**
+   * Optional task-local concurrency overrides.
+   *
+   * Use when:
+   * - one task should cap attempt fan-out independently from the surrounding project
+   * - one task should cap case fan-out without changing global scheduling defaults
+   *
+   * Expects:
+   * - each provided value to be a positive integer
+   *
+   * @default inherited from project or CLI concurrency settings
+   */
+  concurrency?: TaskConcurrencyConfig;
 }
 /**
  * Registers one case in the currently active task scope.
  */
 declare function caseOf(name: string, run: CaseRunner<undefined>): void;
-declare function caseOf<TInput>(name: string, run: CaseRunner<TInput>, options: {
-  input: TInput;
-}): void;
+declare function caseOf<TInput>(name: string, run: CaseRunner<TInput>, options: CaseRegistrationOptions<TInput>): void;
 /**
  * Registers multiple cases in the currently active task scope.
  */
-declare function casesFromInputs<TInput>(namePrefix: string, inputs: readonly TInput[], run: CaseRunner<TInput>): void;
+declare function casesFromInputs<TInput>(namePrefix: string, inputs: readonly TInput[], run: CaseRunner<TInput>, options?: CasesFromInputsOptions): void;
 /**
  * Defines one eval task with task/case semantics similar to Vitest.
  *
@@ -399,6 +504,8 @@ declare function describeTask(name: string, build: ((builder: DescribeTaskBuilde
   readonly description: string;
   readonly name: string;
   readonly task: {
+    readonly concurrency: TaskConcurrencyConfig | undefined;
+    readonly executionPolicy: TaskExecutionPolicy | undefined;
     readonly id: string;
     readonly run: (context: TaskRunContext) => Promise<TaskRunOutput>;
   };

package/dist/index.mjs CHANGED Viewed

@@ -1,8 +1,9 @@
-import { a as defineConfig, i as registerEvalDefinition, s as loadEnv } from "./registry-CwcMMjnZ.mjs";
-import { n as requiredEnvFrom } from "./env-BFSjny07.mjs";
+import { i as registerEvalDefinition, l as loadEnv, o as createNoopTelemetryRuntime, s as defineConfig } from "./registry-BHGMxjpA.mjs";
+import { t as createSchedulerQueue } from "./queue-DsZQkZO_.mjs";
+import { n as requiredEnvFrom } from "./env--94B0UtW.mjs";
 import { defineEval, defineTask } from "./config.mjs";
 import { expect } from "./expect.mjs";
-import { errorMessageFrom } from "@moeru/std";
+import { errorMessageFrom, sleep } from "@moeru/std";
 //#region src/dsl/task.ts
 function cloneCaseMatrix(matrix) {
 	return {
@@ -14,9 +15,36 @@ function cloneCaseMatrix(matrix) {
 function createTaskCaseReporterId(index, name) {
 	return `${index}:${encodeURIComponent(name)}`;
 }
+function isTelemetryAttributeScalar(value) {
+	return typeof value === "boolean" || typeof value === "number" || typeof value === "string";
+}
+function isTelemetryAttributeArray(value) {
+	return value.every(isTelemetryAttributeScalar);
+}
+function canAttachMetricAsAttribute(value) {
+	if (isTelemetryAttributeScalar(value)) return true;
+	return Array.isArray(value) && isTelemetryAttributeArray(value);
+}
 function assertValidScore(score) {
 	if (!Number.isFinite(score) || score < 0 || score > 1) throw new Error(`Case score must be a finite number in range 0..1, got "${score}".`);
 }
+function assertNonNegativeInteger(value, label) {
+	if (!Number.isFinite(value) || !Number.isInteger(value) || value < 0) throw new Error(`Invalid ${label}: ${String(value)}`);
+}
+function assertNonNegativeNumber(value, label) {
+	if (!Number.isFinite(value) || value < 0) throw new Error(`Invalid ${label}: ${String(value)}`);
+}
+function assertPositiveInteger(value, label) {
+	if (!Number.isFinite(value) || !Number.isInteger(value) || value <= 0) throw new Error(`Invalid ${label}: ${String(value)}`);
+}
+function autoRetryDelayMs(retryIndex) {
+	return 500 * 2 ** (retryIndex - 1);
+}
+function resolveAutoRetryDelay(policy, retryIndex) {
+	const delay = policy.autoRetryDelay;
+	if (delay == null) return autoRetryDelayMs(retryIndex);
+	return typeof delay === "number" ? delay : delay(retryIndex);
+}
 function emitCaseStart(hooks, payload) {
 	try {
 		hooks?.onCaseStart?.(payload);
@@ -27,9 +55,174 @@ function emitCaseEnd(hooks, payload) {
 		hooks?.onCaseEnd?.(payload);
 	} catch {}
 }
+function emitReporterEvent(hooks, payload) {
+	try {
+		hooks?.onEvent?.(payload);
+	} catch {}
+}
+function createCaseTimeoutError(timeout) {
+	const error = /* @__PURE__ */ new Error(`Case timed out after ${timeout}ms.`);
+	error.name = "TimeoutError";
+	return error;
+}
+function normalizeExecutionPolicy(policy, label) {
+	if (policy == null) return;
+	if (policy.autoAttempt != null) assertNonNegativeInteger(policy.autoAttempt, `${label} autoAttempt`);
+	if (policy.autoRetry != null) assertNonNegativeInteger(policy.autoRetry, `${label} autoRetry`);
+	if (typeof policy.autoRetryDelay === "number") assertNonNegativeNumber(policy.autoRetryDelay, `${label} autoRetryDelay`);
+	if (policy.timeout != null) assertPositiveInteger(policy.timeout, `${label} timeout`);
+	const normalized = {
+		autoAttempt: policy.autoAttempt,
+		autoRetry: policy.autoRetry,
+		autoRetryDelay: policy.autoRetryDelay,
+		timeout: policy.timeout
+	};
+	return Object.values(normalized).some((value) => value != null) ? normalized : void 0;
+}
+function resolveCaseExecutionPolicy(taskCase, taskExecutionPolicy) {
+	return {
+		autoAttempt: taskCase.executionPolicy?.autoAttempt ?? taskExecutionPolicy?.autoAttempt ?? 0,
+		autoRetry: taskCase.executionPolicy?.autoRetry ?? taskExecutionPolicy?.autoRetry ?? 0,
+		autoRetryDelay: taskCase.executionPolicy?.autoRetryDelay ?? taskExecutionPolicy?.autoRetryDelay,
+		timeout: taskCase.executionPolicy?.timeout ?? taskExecutionPolicy?.timeout
+	};
+}
+async function runCaseOnce(context, taskCase, index, timeout) {
+	const customScoresByKind = /* @__PURE__ */ new Map();
+	const abortController = new AbortController();
+	const telemetry = context.telemetry ?? createNoopTelemetryRuntime();
+	const caseId = createTaskCaseReporterId(index, taskCase.name);
+	let timeoutHandle;
+	let timedOut = false;
+	let settled = false;
+	try {
+		return await telemetry.withSpan("vieval.case", {
+			"vieval.case.id": caseId,
+			"vieval.case.name": taskCase.name,
+			"vieval.task.id": context.task.id,
+			"vieval.task.name": context.task.entry.name
+		}, async () => {
+			const runPromise = Promise.resolve(taskCase.run({
+				...context,
+				matrix: {
+					...cloneCaseMatrix(context.task.matrix),
+					inputs: taskCase.input
+				},
+				metric(name, value) {
+					if (abortController.signal.aborted || settled) return;
+					emitReporterEvent(context.reporterHooks, {
+						caseId,
+						data: {
+							name,
+							value
+						},
+						event: "task.case.metric"
+					});
+					telemetry.addEvent("vieval.case.metric", {
+						name,
+						value
+					});
+					if (canAttachMetricAsAttribute(value)) telemetry.setAttributes({ [name]: value });
+				},
+				score(score, kind = "exact") {
+					if (abortController.signal.aborted || settled) return;
+					assertValidScore(score);
+					customScoresByKind.set(kind, score);
+					telemetry.addEvent("vieval.case.score", {
+						"vieval.score.kind": kind,
+						"vieval.score.value": score
+					});
+					emitReporterEvent(context.reporterHooks, {
+						caseId,
+						data: {
+							kind,
+							score
+						},
+						event: "task.case.score"
+					});
+				},
+				signal: abortController.signal
+			}));
+			if (timeout != null) {
+				const timeoutPromise = new Promise((_, reject) => {
+					timeoutHandle = setTimeout(() => {
+						timedOut = true;
+						abortController.abort(createCaseTimeoutError(timeout));
+						reject(createCaseTimeoutError(timeout));
+					}, timeout);
+				});
+				const output = await Promise.race([runPromise, timeoutPromise]);
+				settled = true;
+				return {
+					output,
+					scoresByKind: customScoresByKind,
+					state: "passed"
+				};
+			}
+			const output = await runPromise;
+			settled = true;
+			return {
+				output,
+				scoresByKind: customScoresByKind,
+				state: "passed"
+			};
+		});
+	} catch (error) {
+		settled = true;
+		return {
+			errorMessage: errorMessageFrom(error) ?? (timedOut && timeout != null ? `Case timed out after ${timeout}ms.` : "Unknown case failure."),
+			scoresByKind: customScoresByKind,
+			state: timedOut ? "timeout" : "failed"
+		};
+	} finally {
+		if (timeoutHandle != null) clearTimeout(timeoutHandle);
+	}
+}
+async function executeRegisteredCase(context, taskCase, index, totalCases, taskExecutionPolicy) {
+	const resolvedPolicy = resolveCaseExecutionPolicy(taskCase, taskExecutionPolicy);
+	let lastOutcome;
+	for (let retryIndex = 0; retryIndex <= resolvedPolicy.autoRetry; retryIndex += 1) {
+		if (retryIndex > 0) {
+			const retryDelayMs = resolveAutoRetryDelay(resolvedPolicy, retryIndex);
+			assertNonNegativeNumber(retryDelayMs, "autoRetryDelay result");
+			if (retryDelayMs > 0) await sleep(retryDelayMs);
+		}
+		emitCaseStart(context.reporterHooks, {
+			...resolvedPolicy.autoRetry > 0 ? {
+				autoRetry: resolvedPolicy.autoRetry,
+				retryIndex
+			} : {},
+			index,
+			...taskCase.input === void 0 ? {} : { input: taskCase.input },
+			name: taskCase.name,
+			total: totalCases
+		});
+		lastOutcome = await runCaseOnce(context, taskCase, index, resolvedPolicy.timeout);
+		if (lastOutcome.state === "passed") return lastOutcome;
+	}
+	return lastOutcome ?? {
+		errorMessage: "Unknown case failure.",
+		scoresByKind: /* @__PURE__ */ new Map(),
+		state: "failed"
+	};
+}
+function collectCaseOutcomeScores(outcome, scoreBucketsByKind) {
+	if (outcome.state !== "passed") {
+		scoreBucketsByKind.exact.push(0);
+		return;
+	}
+	if (outcome.scoresByKind.size === 0) {
+		scoreBucketsByKind.exact.push(1);
+		return;
+	}
+	scoreBucketsByKind.exact.push(outcome.scoresByKind.get("exact") ?? 1);
+	const judgeScore = outcome.scoresByKind.get("judge");
+	if (judgeScore != null) scoreBucketsByKind.judge.push(judgeScore);
+}
 function createCaseBuilder(registeredCases) {
 	function registerCase(name, run, options) {
 		registeredCases.push({
+			executionPolicy: normalizeExecutionPolicy(options, "task case"),
 			input: options?.input,
 			name,
 			run
@@ -37,11 +230,15 @@ function createCaseBuilder(registeredCases) {
 	}
 	return {
 		caseOf: registerCase,
-		casesFromInputs(namePrefix, inputs, run) {
+		casesFromInputs(namePrefix, inputs, run, options) {
+			const queueKey = options?.concurrency == null ? void 0 : {};
 			inputs.forEach((input, index) => {
 				registeredCases.push({
+					concurrency: options?.concurrency,
+					executionPolicy: normalizeExecutionPolicy(options, "casesFromInputs"),
 					input,
 					name: `${namePrefix} #${index + 1}`,
+					queueKey,
 					run
 				});
 			});
@@ -64,6 +261,7 @@ function getActiveCases() {
 }
 function caseOf(name, run, options) {
 	getActiveCases().push({
+		executionPolicy: normalizeExecutionPolicy(options, "task case"),
 		input: options?.input,
 		name,
 		run
@@ -72,16 +270,40 @@ function caseOf(name, run, options) {
 /**
 * Registers multiple cases in the currently active task scope.
 */
-function casesFromInputs(namePrefix, inputs, run) {
+function casesFromInputs(namePrefix, inputs, run, options) {
+	const queueKey = options?.concurrency == null ? void 0 : {};
 	inputs.forEach((input, index) => {
 		getActiveCases().push({
+			concurrency: options?.concurrency,
+			executionPolicy: normalizeExecutionPolicy(options, "casesFromInputs"),
 			input,
 			name: `${namePrefix} #${index + 1}`,
+			queueKey,
 			run
 		});
 	});
 }
 /**
+* Resolves the effective case concurrency for one registered task case.
+*
+* Before:
+* - registered case override `2`, task default `4`
+* - registered case override `undefined`, task default `3`
+*
+* After:
+* - `2`
+* - `3`
+*/
+function resolveCaseConcurrency(taskCase, taskConcurrency, runtimeConcurrency) {
+	const concurrency = runtimeConcurrency?.case ?? taskCase.concurrency ?? taskConcurrency?.case;
+	if (concurrency == null) return;
+	if (!Number.isFinite(concurrency) || !Number.isInteger(concurrency) || concurrency <= 0) throw new Error(`Invalid task case concurrency: ${String(concurrency)}`);
+	return concurrency;
+}
+function resolveCaseQueueKey(taskCase, defaultQueueKey) {
+	return taskCase.queueKey ?? defaultQueueKey;
+}
+/**
 * Defines one eval task with task/case semantics similar to Vitest.
 *
 * Use when:
@@ -98,10 +320,14 @@ function describeTask(name, build, options = {}) {
 		}
 		build();
 	});
+	const description = options.description ?? name;
+	const taskExecutionPolicy = normalizeExecutionPolicy(options, "describeTask");
 	const definition = defineEval({
-		description: options.description ?? name,
+		description,
 		name,
 		task: defineTask({
+			concurrency: options.concurrency,
+			executionPolicy: taskExecutionPolicy,
 			id: name,
 			async run(context) {
 				if (registeredCases.length === 0) return { scores: [{
@@ -113,62 +339,68 @@ function describeTask(name, build, options = {}) {
 					exact: [],
 					judge: []
 				};
-				await Promise.all(registeredCases.map(async (taskCase, index) => {
-					emitCaseStart(context.reporterHooks, {
-						index,
-						name: taskCase.name,
-						total: totalCases
-					});
-					let state = "passed";
-					let errorMessage;
-					const caseId = createTaskCaseReporterId(index, taskCase.name);
-					const customScoresByKind = /* @__PURE__ */ new Map();
-					try {
-						await taskCase.run({
-							...context,
-							matrix: {
-								...cloneCaseMatrix(context.task.matrix),
-								inputs: taskCase.input
-							},
-							metric(name, value) {
-								context.reporterHooks?.onEvent?.({
-									caseId,
-									data: {
-										name,
-										value
-									},
-									event: "task.case.metric"
-								});
-							},
-							score(score, kind = "exact") {
-								assertValidScore(score);
-								customScoresByKind.set(kind, score);
-							}
-						});
-					} catch (error) {
-						state = "failed";
-						errorMessage = errorMessageFrom(error) ?? "Unknown case failure.";
-					} finally {
+				const defaultCaseQueueKey = {};
+				const caseQueues = /* @__PURE__ */ new Map();
+				const hasAutoAttempt = registeredCases.some((taskCase) => resolveCaseExecutionPolicy(taskCase, taskExecutionPolicy).autoAttempt > 0);
+				const runtimeTaskConcurrency = context.task.entry.task?.concurrency ?? options.concurrency;
+				if (!hasAutoAttempt) await Promise.all(registeredCases.map(async (taskCase, index) => {
+					const executeCase = async () => {
+						const outcome = await executeRegisteredCase(context, taskCase, index, totalCases, taskExecutionPolicy);
 						emitCaseEnd(context.reporterHooks, {
-							...errorMessage == null ? {} : { errorMessage },
+							...outcome.errorMessage == null ? {} : { errorMessage: outcome.errorMessage },
 							index,
-							state,
+							...outcome.output === void 0 ? {} : { output: outcome.output },
+							state: outcome.state,
 							name: taskCase.name,
 							total: totalCases
 						});
-					}
-					if (state === "failed") {
-						scoreBucketsByKind.exact.push(0);
-						return;
-					}
-					if (customScoresByKind.size === 0) {
-						scoreBucketsByKind.exact.push(1);
+						collectCaseOutcomeScores(outcome, scoreBucketsByKind);
+					};
+					const concurrency = resolveCaseConcurrency(taskCase, runtimeTaskConcurrency, context.runtimeConcurrency);
+					if (concurrency == null) {
+						await executeCase();
 						return;
 					}
-					scoreBucketsByKind.exact.push(customScoresByKind.get("exact") ?? 1);
-					const judgeScore = customScoresByKind.get("judge");
-					if (judgeScore != null) scoreBucketsByKind.judge.push(judgeScore);
+					const queueKey = resolveCaseQueueKey(taskCase, defaultCaseQueueKey);
+					const queue = caseQueues.get(queueKey) ?? createSchedulerQueue(concurrency);
+					caseQueues.set(queueKey, queue);
+					await queue.run(executeCase);
 				}));
+				else {
+					let finalOutcomes = [];
+					let attemptIndex = 0;
+					for (;;) {
+						finalOutcomes = await Promise.all(registeredCases.map(async (taskCase, index) => {
+							const executeCase = async () => await executeRegisteredCase(context, taskCase, index, totalCases, taskExecutionPolicy);
+							const concurrency = resolveCaseConcurrency(taskCase, runtimeTaskConcurrency, context.runtimeConcurrency);
+							if (concurrency == null) return await executeCase();
+							const queueKey = resolveCaseQueueKey(taskCase, defaultCaseQueueKey);
+							const queue = caseQueues.get(queueKey) ?? createSchedulerQueue(concurrency);
+							caseQueues.set(queueKey, queue);
+							return await queue.run(executeCase);
+						}));
+						if (!finalOutcomes.some((outcome, index) => {
+							if (outcome.state === "passed") return false;
+							const taskCase = registeredCases[index];
+							if (taskCase == null) return false;
+							return attemptIndex < resolveCaseExecutionPolicy(taskCase, taskExecutionPolicy).autoAttempt;
+						})) break;
+						attemptIndex += 1;
+					}
+					finalOutcomes.forEach((outcome, index) => {
+						const taskCase = registeredCases[index];
+						if (taskCase == null) return;
+						emitCaseEnd(context.reporterHooks, {
+							...outcome.errorMessage == null ? {} : { errorMessage: outcome.errorMessage },
+							index,
+							...outcome.output === void 0 ? {} : { output: outcome.output },
+							state: outcome.state,
+							name: taskCase.name,
+							total: totalCases
+						});
+						collectCaseOutcomeScores(outcome, scoreBucketsByKind);
+					});
+				}
 				return { scores: Object.keys(scoreBucketsByKind).filter((kind) => scoreBucketsByKind[kind].length > 0).map((kind) => {
 					const values = scoreBucketsByKind[kind];
 					return {