npm - vieval - Versions diffs - 0.0.6 → 0.0.8 - Mend

vieval 0.0.6 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

package/README.md +219 -109
package/dist/bin/vieval.mjs +1 -1
package/dist/cli/index.mjs +1 -1
package/dist/{cli-sanbKtQq.mjs → cli-Dao25VxV.mjs} +1186 -162
package/dist/cli-Dao25VxV.mjs.map +1 -0
package/dist/config.d.mts +2 -2
package/dist/config.mjs +1 -1
package/dist/core/assertions/index.d.mts +1 -1
package/dist/core/inference-executors/index.mjs +1 -1
package/dist/core/processors/results/index.d.mts +1 -1
package/dist/core/runner/index.d.mts +2 -2
package/dist/core/runner/index.mjs +6 -40
package/dist/core/runner/index.mjs.map +1 -1
package/dist/{env--94B0UtW.mjs → env-BFSjny07.mjs} +1 -1
package/dist/{env--94B0UtW.mjs.map → env-BFSjny07.mjs.map} +1 -1
package/dist/{index-DBZKkpBe.d.mts → index-BkjyCInx.d.mts} +102 -37
package/dist/index.d.mts +14 -6
package/dist/index.mjs +110 -39
package/dist/index.mjs.map +1 -1
package/dist/{models-DIGdOUpJ.mjs → models-pBSRUZhY.mjs} +1 -1
package/dist/{models-DIGdOUpJ.mjs.map → models-pBSRUZhY.mjs.map} +1 -1
package/dist/plugins/chat-models/index.d.mts +69 -6
package/dist/plugins/chat-models/index.mjs +62 -6
package/dist/plugins/chat-models/index.mjs.map +1 -1
package/dist/{registry-CcKZqDJY.mjs → registry-BHGMxjpA.mjs} +140 -4
package/dist/registry-BHGMxjpA.mjs.map +1 -0
package/package.json +2 -1
package/dist/cli-sanbKtQq.mjs.map +0 -1
package/dist/registry-CcKZqDJY.mjs.map +0 -1

package/dist/{index-DBZKkpBe.d.mts → index-BkjyCInx.d.mts} RENAMED Viewed

@@ -495,15 +495,6 @@ interface ModelDefinition {
 declare function resolveModelByName(models: readonly ModelDefinition[], name: string): ModelDefinition | undefined;
 //#endregion
 //#region src/core/runner/task-context.d.ts
-/**
- * Options for selecting a model from the execution context.
- */
-interface TaskModelSelectionOptions {
-  /**
-   * Model id or alias name.
-   */
-  name: string;
-}
 /**
  * Task-scoped execution context exposed to runner executors.
  */
@@ -513,13 +504,9 @@ interface TaskExecutionContext {
    */
   cache: TaskCacheRuntime;
   /**
-   * Resolves model configuration for the current task.
-   *
-   * Use when:
-   * - no arguments are provided to use the model selected by run matrix/inferenceExecutor
-   * - `name` is provided to resolve a specific model id or alias
+   * Configured model registrations available to model plugins.
    */
-  model: (selection?: string | TaskModelSelectionOptions) => ModelDefinition;
+  models: readonly ModelDefinition[];
 }
 /**
  * Inputs used to build task execution context.
@@ -530,14 +517,13 @@ interface CreateTaskExecutionContextOptions {
   task: ScheduledTask;
 }
 /**
- * Creates task-scoped model resolver context for runner execution.
+ * Creates task-scoped context data for runner execution.
  *
  * Call stack:
  *
  * {@link runScheduledTasks}
  *   -> {@link createTaskExecutionContext}
- *     -> {@link resolveModelByName}
- *       -> `task.model()` / `task.model({ name })`
+ *     -> `TaskExecutionContext`
  */
 declare function createTaskExecutionContext(options: CreateTaskExecutionContextOptions): TaskExecutionContext;
 //#endregion
@@ -581,7 +567,7 @@ interface RunScheduledTasksOptions {
    * Creates per-task execution context.
    *
    * Use when:
-   * - executor code needs per-task model resolution or other task-scoped data
+   * - executor code needs per-task models, cache, or other task-scoped data
    */
   createExecutionContext?: (task: ScheduledTask) => TaskExecutionContext;
   /**
@@ -646,7 +632,39 @@ declare class RunnerExecutionError extends Error {
  */
 declare function runScheduledTasks(tasks: readonly ScheduledTask[], executor: ScheduledTaskExecutor, options?: RunScheduledTasksOptions): Promise<AggregatedRunResults>;
 //#endregion
+//#region src/core/telemetry/types.d.ts
+/** JSON-compatible scalar values accepted as telemetry attributes. */
+type TelemetryAttributeValue = boolean | number | string | null | readonly TelemetryAttributeValue[];
+/** Attribute map shared by local report projection and OpenTelemetry span calls. */
+type TelemetryAttributes = Record<string, TelemetryAttributeValue | undefined>;
+/**
+ * Internal Vieval telemetry runtime.
+ *
+ * Use when:
+ * - runner code needs one execution path for disabled and enabled telemetry
+ * - case code should run inside an active OpenTelemetry span when configured
+ *
+ * Expects:
+ * - attributes are JSON-compatible and stable enough for report filtering
+ * - callbacks are awaited by the caller
+ *
+ * Returns:
+ * - callback result, preserving thrown errors after telemetry records them
+ */
+interface TelemetryRuntime {
+  withSpan: <T>(name: string, attributes: TelemetryAttributes, callback: () => Promise<T>) => Promise<T>;
+  addEvent: (name: string, attributes?: TelemetryAttributes) => void;
+  setAttributes: (attributes: TelemetryAttributes) => void;
+  recordException: (error: unknown) => void;
+}
+//#endregion
 //#region src/config/types.d.ts
+/**
+ * Value that can be returned directly or through a promise.
+ *
+ * @param T - Resolved value type.
+ */
+type Awaitable<T> = Promise<T> | T;
 /**
  * Primitive value allowed in one matrix cell.
  *
@@ -862,6 +880,12 @@ interface TaskRunOutput {
    */
   scores: readonly RunScore[];
 }
+/**
+ * Delay policy for retries within one task case attempt.
+ *
+ * @param retryIndex Retry number where `1` is the first retry after the initial failure.
+ */
+type TaskAutoRetryDelay = number | ((retryIndex: number) => number);
 /**
  * Execution policy applied to task and case callbacks.
  *
@@ -883,6 +907,15 @@ interface TaskExecutionPolicy {
    * @default 0
    */
   autoRetry?: number;
+  /**
+   * Delay in milliseconds before a case auto retry starts.
+   *
+   * A number applies the same delay to every retry. A function receives the
+   * retry index where `1` is the first retry after the initial failure.
+   *
+   * @default retryIndex => 500 * 2 ** (retryIndex - 1)
+   */
+  autoRetryDelay?: TaskAutoRetryDelay;
   /**
    * Additional full task attempts allowed after the current attempt settles.
    *
@@ -917,6 +950,30 @@ interface TaskConcurrencyConfig {
    */
   case?: number;
 }
+/**
+ * Reporting configuration for local artifacts and optional OpenTelemetry integration.
+ */
+interface CliReportingConfig {
+  /**
+   * Optional OpenTelemetry API integration.
+   */
+  openTelemetry?: CliOpenTelemetryReportingConfig;
+}
+/**
+ * OpenTelemetry reporting configuration managed by user config setup.
+ */
+interface CliOpenTelemetryReportingConfig {
+  /**
+   * Enables Vieval active span wrapping through `@opentelemetry/api`.
+   *
+   * @default false
+   */
+  enabled?: boolean;
+  /**
+   * Called after all telemetry events and local report artifacts have been emitted.
+   */
+  onRunEnd?: () => Awaitable<void>;
+}
 /**
  * Runtime context passed into eval task `run`.
  */
@@ -964,24 +1021,13 @@ interface TaskRunContext {
    */
   task: ScheduledTask;
   /**
-   * Matrix-scoped model resolver.
-   *
-   * Runtime impact:
-   * - `context.model()` uses `context.task.matrix.run.model` first when present
-   * - then falls back to inferenceExecutor-id match
-   * - then falls back to first configured model
+   * Configured model registrations available to model plugins.
    *
-   * @example
-   * ```ts
-   * // matrix.run.model = 'gpt-4.1-mini'
-   * const defaultModel = context.model()
-   * // resolves the configured model whose id/model/alias matches 'gpt-4.1-mini'
-   *
-   * const judgeModel = context.model({ name: 'judge-large' })
-   * // explicit lookup bypasses matrix default
-   * ```
+   * Use when:
+   * - a plugin owns model selection semantics and needs access to registered models
+   * - eval code resolves matrix-selected model axes through plugin helpers
    */
-  model: TaskExecutionContext['model'];
+  models: TaskExecutionContext['models'];
   /**
    * Optional reporter lifecycle hooks for task-local case events.
    *
@@ -992,6 +1038,17 @@ interface TaskRunContext {
    * - hooks are best-effort observers and should not affect task scoring
    */
   reporterHooks?: TaskReporterHooks;
+  /**
+   * Optional telemetry runtime shared by runner, DSL, and reporter integrations.
+   *
+   * Use when:
+   * - task execution should emit events to the currently active telemetry runtime
+   * - enabled and disabled telemetry should keep the same execution path
+   *
+   * Expects:
+   * - callers inject a no-op runtime when telemetry is disabled
+   */
+  telemetry?: TelemetryRuntime;
   /**
    * Optional runtime scheduling overrides supplied by CLI or host execution.
    *
@@ -1036,6 +1093,10 @@ interface TaskCaseReporterPayload {
    * Maximum retry count configured for this case.
    */
   autoRetry?: number;
+  /**
+   * Optional case input payload registered by the task DSL.
+   */
+  input?: unknown;
   /**
    * Declared case label.
    */
@@ -1066,6 +1127,10 @@ interface TaskCaseReporterPayload {
  * - `state` describes the final case result
  */
 interface TaskCaseReporterEndPayload extends TaskCaseReporterPayload {
+  /**
+   * Optional case output returned by the task case callback.
+   */
+  output?: unknown;
   /**
    * Final case state.
    */
@@ -1288,5 +1353,5 @@ interface ConfigHookPlugin<TConfig> {
   configVievalResolved?: (config: TConfig) => void | Promise<void>;
 }
 //#endregion
-export { ScheduledTask as $, CreateTaskExecutionContextOptions as A, createRunnerRuntimeContext as B, TaskRunContext as C, RunnerTaskState as D, RunnerExecutionError as E, resolveModelByName as F, RunScore as G, AggregatedRunResults as H, asProjectRelativePath as I, CreateRunnerScheduleOptions as J, RunScoreKind as K, collectEvalEntries as L, TaskModelSelectionOptions as M, createTaskExecutionContext as N, ScheduledTaskExecutor as O, ModelDefinition as P, RunnerMatrixSelection as Q, CreateVievalRunnerRuntimeContextOptions as R, TaskReporterHooks as S, RunScheduledTasksOptions as T, AggregatedRunSummary as U, AggregatedProviderSummary as V, RunResult as W, RunnerMatrixDefinition as X, InferenceExecutor as Y, RunnerMatrixInput as Z, TaskCaseState as _, EvalDefinition as a, normalizeCacheFilePathSegments as at, TaskExecutionPolicy as b, MatrixAxisValues as c, CacheNamespace as ct, MatrixPrimitive as d, ScheduledTaskMatrix as et, MatrixRow as f, TaskCaseReporterPayload as g, TaskCaseReporterEndPayload as h, CollectedEvalEntry as i, createFilesystemTaskCacheRuntime as it, TaskExecutionContext as j, runScheduledTasks as k, MatrixDefinition as l, TaskCacheRuntime as lt, ScopedMatrices as m, defineEval as n, createRunnerSchedule as nt, EvalModule as o, CacheFileHandle as ot, MatrixValue as p, aggregateRunResults as q, defineTask as r, CreateFilesystemTaskCacheRuntimeOptions as rt, EvalModuleMap as s, CacheFileOptions as st, ConfigHookPlugin as t, ScheduledTaskMatrixMeta as tt, MatrixLayer as u, TaskConcurrencyConfig as v, TaskRunOutput as w, TaskReporterEventPayload as x, TaskDefinition as y, RunnerRuntimeContext as z };
-//# sourceMappingURL=index-DBZKkpBe.d.mts.map
+export { InferenceExecutor as $, RunScheduledTasksOptions as A, asProjectRelativePath as B, TaskDefinition as C, TaskRunContext as D, TaskReporterHooks as E, CreateTaskExecutionContextOptions as F, AggregatedProviderSummary as G, CreateVievalRunnerRuntimeContextOptions as H, TaskExecutionContext as I, RunResult as J, AggregatedRunResults as K, createTaskExecutionContext as L, RunnerTaskState as M, ScheduledTaskExecutor as N, TaskRunOutput as O, runScheduledTasks as P, CreateRunnerScheduleOptions as Q, ModelDefinition as R, TaskConcurrencyConfig as S, TaskReporterEventPayload as T, RunnerRuntimeContext as U, collectEvalEntries as V, createRunnerRuntimeContext as W, RunScoreKind as X, RunScore as Y, aggregateRunResults as Z, ScopedMatrices as _, CliOpenTelemetryReportingConfig as a, ScheduledTaskMatrixMeta as at, TaskCaseReporterPayload as b, EvalDefinition as c, createFilesystemTaskCacheRuntime as ct, MatrixAxisValues as d, CacheFileOptions as dt, RunnerMatrixDefinition as et, MatrixDefinition as f, CacheNamespace as ft, MatrixValue as g, MatrixRow as h, Awaitable as i, ScheduledTaskMatrix as it, RunnerExecutionError as j, TelemetryAttributeValue as k, EvalModule as l, normalizeCacheFilePathSegments as lt, MatrixPrimitive as m, defineEval as n, RunnerMatrixSelection as nt, CliReportingConfig as o, createRunnerSchedule as ot, MatrixLayer as p, TaskCacheRuntime as pt, AggregatedRunSummary as q, defineTask as r, ScheduledTask as rt, CollectedEvalEntry as s, CreateFilesystemTaskCacheRuntimeOptions as st, ConfigHookPlugin as t, RunnerMatrixInput as tt, EvalModuleMap as u, CacheFileHandle as ut, TaskAutoRetryDelay as v, TaskExecutionPolicy as w, TaskCaseState as x, TaskCaseReporterEndPayload as y, resolveModelByName as z };
+//# sourceMappingURL=index-BkjyCInx.d.mts.map

package/dist/index.d.mts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { $ as ScheduledTask, C as TaskRunContext, K as RunScoreKind, P as ModelDefinition, W as RunResult, Y as InferenceExecutor, b as TaskExecutionPolicy, j as TaskExecutionContext, l as MatrixDefinition, t as ConfigHookPlugin, u as MatrixLayer, v as TaskConcurrencyConfig, w as TaskRunOutput } from "./index-DBZKkpBe.mjs";
+import { $ as InferenceExecutor, D as TaskRunContext, I as TaskExecutionContext, J as RunResult, O as TaskRunOutput, R as ModelDefinition, S as TaskConcurrencyConfig, X as RunScoreKind, f as MatrixDefinition, k as TelemetryAttributeValue, o as CliReportingConfig, p as MatrixLayer, rt as ScheduledTask, t as ConfigHookPlugin, w as TaskExecutionPolicy } from "./index-BkjyCInx.mjs";
 import { a as requiredEnvFrom } from "./env-BeHv_5mo.mjs";
 import { expect } from "./expect.mjs";
 import * as _$c12 from "c12";
@@ -137,7 +137,7 @@ interface CliProjectConfig {
    * Model definitions available to project runtime execution.
    *
    * Inference executors control schedule fan-out, while models provide
-   * runtime lookup metadata for `context.model(...)` during task execution.
+   * runtime lookup metadata for model plugin helpers during task execution.
    *
    * @default inherited from top-level config models
    */
@@ -253,16 +253,18 @@ interface CliComparisonConfig {
  * Execution context exposed to project-level `executor` implementations.
  *
  * Use when:
- * - a project executor needs the task-scoped model resolver plus case reporter hooks
+ * - a project executor needs task-scoped models plus case reporter hooks
  * - custom scheduling logic wants the same hook shape as `TaskRunContext`
  *
  * Expects:
- * - `model` resolves configured models for the current task
+ * - `models` exposes configured model registrations for plugin helpers
  * - `reporterHooks` follows `TaskRunContext['reporterHooks']`
+ * - `telemetry` follows `TaskRunContext['telemetry']`
  * - `runtimeConcurrency` follows `TaskRunContext['runtimeConcurrency']`
  */
 interface CliProjectExecutorContext extends TaskExecutionContext {
   reporterHooks?: TaskRunContext['reporterHooks'];
+  telemetry?: TaskRunContext['telemetry'];
   runtimeConcurrency?: TaskRunContext['runtimeConcurrency'];
 }
 /**
@@ -310,6 +312,12 @@ interface CliConfigBase {
    * @default {}
    */
   env?: NodeJS.ProcessEnv;
+  /**
+   * Optional reporting integrations shared by CLI run orchestration.
+   *
+   * @default undefined
+   */
+  reporting?: CliReportingConfig;
 }
 /**
  * Project mode config for `vieval run`.
@@ -400,7 +408,7 @@ interface CaseRunContext<TInput> extends TaskRunContext {
    * - `name` to be a stable metric identifier
    * - `value` to be JSON-serializable
    */
-  metric: (name: string, value: boolean | number | string | null) => void;
+  metric: (name: string, value: TelemetryAttributeValue) => void;
   /**
    * Cooperative abort signal for the current case execution.
    */
@@ -409,7 +417,7 @@ interface CaseRunContext<TInput> extends TaskRunContext {
 /**
  * Callback for one task case.
  */
-type CaseRunner<TInput> = (context: CaseRunContext<TInput>) => Promise<void> | void;
+type CaseRunner<TInput> = (context: CaseRunContext<TInput>) => Promise<unknown> | unknown;
 /**
  * Per-group options for `casesFromInputs`.
  *

package/dist/index.mjs CHANGED Viewed

@@ -1,9 +1,9 @@
-import { a as defineConfig, i as registerEvalDefinition, s as loadEnv } from "./registry-CcKZqDJY.mjs";
+import { i as registerEvalDefinition, l as loadEnv, o as createNoopTelemetryRuntime, s as defineConfig } from "./registry-BHGMxjpA.mjs";
 import { t as createSchedulerQueue } from "./queue-DsZQkZO_.mjs";
-import { n as requiredEnvFrom } from "./env--94B0UtW.mjs";
+import { n as requiredEnvFrom } from "./env-BFSjny07.mjs";
 import { defineEval, defineTask } from "./config.mjs";
 import { expect } from "./expect.mjs";
-import { errorMessageFrom } from "@moeru/std";
+import { errorMessageFrom, sleep } from "@moeru/std";
 //#region src/dsl/task.ts
 function cloneCaseMatrix(matrix) {
 	return {
@@ -15,15 +15,36 @@ function cloneCaseMatrix(matrix) {
 function createTaskCaseReporterId(index, name) {
 	return `${index}:${encodeURIComponent(name)}`;
 }
+function isTelemetryAttributeScalar(value) {
+	return typeof value === "boolean" || typeof value === "number" || typeof value === "string";
+}
+function isTelemetryAttributeArray(value) {
+	return value.every(isTelemetryAttributeScalar);
+}
+function canAttachMetricAsAttribute(value) {
+	if (isTelemetryAttributeScalar(value)) return true;
+	return Array.isArray(value) && isTelemetryAttributeArray(value);
+}
 function assertValidScore(score) {
 	if (!Number.isFinite(score) || score < 0 || score > 1) throw new Error(`Case score must be a finite number in range 0..1, got "${score}".`);
 }
 function assertNonNegativeInteger(value, label) {
 	if (!Number.isFinite(value) || !Number.isInteger(value) || value < 0) throw new Error(`Invalid ${label}: ${String(value)}`);
 }
+function assertNonNegativeNumber(value, label) {
+	if (!Number.isFinite(value) || value < 0) throw new Error(`Invalid ${label}: ${String(value)}`);
+}
 function assertPositiveInteger(value, label) {
 	if (!Number.isFinite(value) || !Number.isInteger(value) || value <= 0) throw new Error(`Invalid ${label}: ${String(value)}`);
 }
+function autoRetryDelayMs(retryIndex) {
+	return 500 * 2 ** (retryIndex - 1);
+}
+function resolveAutoRetryDelay(policy, retryIndex) {
+	const delay = policy.autoRetryDelay;
+	if (delay == null) return autoRetryDelayMs(retryIndex);
+	return typeof delay === "number" ? delay : delay(retryIndex);
+}
 function emitCaseStart(hooks, payload) {
 	try {
 		hooks?.onCaseStart?.(payload);
@@ -34,6 +55,11 @@ function emitCaseEnd(hooks, payload) {
 		hooks?.onCaseEnd?.(payload);
 	} catch {}
 }
+function emitReporterEvent(hooks, payload) {
+	try {
+		hooks?.onEvent?.(payload);
+	} catch {}
+}
 function createCaseTimeoutError(timeout) {
 	const error = /* @__PURE__ */ new Error(`Case timed out after ${timeout}ms.`);
 	error.name = "TimeoutError";
@@ -43,10 +69,12 @@ function normalizeExecutionPolicy(policy, label) {
 	if (policy == null) return;
 	if (policy.autoAttempt != null) assertNonNegativeInteger(policy.autoAttempt, `${label} autoAttempt`);
 	if (policy.autoRetry != null) assertNonNegativeInteger(policy.autoRetry, `${label} autoRetry`);
+	if (typeof policy.autoRetryDelay === "number") assertNonNegativeNumber(policy.autoRetryDelay, `${label} autoRetryDelay`);
 	if (policy.timeout != null) assertPositiveInteger(policy.timeout, `${label} timeout`);
 	const normalized = {
 		autoAttempt: policy.autoAttempt,
 		autoRetry: policy.autoRetry,
+		autoRetryDelay: policy.autoRetryDelay,
 		timeout: policy.timeout
 	};
 	return Object.values(normalized).some((value) => value != null) ? normalized : void 0;
@@ -55,55 +83,90 @@ function resolveCaseExecutionPolicy(taskCase, taskExecutionPolicy) {
 	return {
 		autoAttempt: taskCase.executionPolicy?.autoAttempt ?? taskExecutionPolicy?.autoAttempt ?? 0,
 		autoRetry: taskCase.executionPolicy?.autoRetry ?? taskExecutionPolicy?.autoRetry ?? 0,
+		autoRetryDelay: taskCase.executionPolicy?.autoRetryDelay ?? taskExecutionPolicy?.autoRetryDelay,
 		timeout: taskCase.executionPolicy?.timeout ?? taskExecutionPolicy?.timeout
 	};
 }
 async function runCaseOnce(context, taskCase, index, timeout) {
 	const customScoresByKind = /* @__PURE__ */ new Map();
 	const abortController = new AbortController();
+	const telemetry = context.telemetry ?? createNoopTelemetryRuntime();
+	const caseId = createTaskCaseReporterId(index, taskCase.name);
 	let timeoutHandle;
 	let timedOut = false;
 	let settled = false;
 	try {
-		const runPromise = Promise.resolve(taskCase.run({
-			...context,
-			matrix: {
-				...cloneCaseMatrix(context.task.matrix),
-				inputs: taskCase.input
-			},
-			metric(name, value) {
-				if (abortController.signal.aborted || settled) return;
-				context.reporterHooks?.onEvent?.({
-					caseId: createTaskCaseReporterId(index, taskCase.name),
-					data: {
+		return await telemetry.withSpan("vieval.case", {
+			"vieval.case.id": caseId,
+			"vieval.case.name": taskCase.name,
+			"vieval.task.id": context.task.id,
+			"vieval.task.name": context.task.entry.name
+		}, async () => {
+			const runPromise = Promise.resolve(taskCase.run({
+				...context,
+				matrix: {
+					...cloneCaseMatrix(context.task.matrix),
+					inputs: taskCase.input
+				},
+				metric(name, value) {
+					if (abortController.signal.aborted || settled) return;
+					emitReporterEvent(context.reporterHooks, {
+						caseId,
+						data: {
+							name,
+							value
+						},
+						event: "task.case.metric"
+					});
+					telemetry.addEvent("vieval.case.metric", {
 						name,
 						value
-					},
-					event: "task.case.metric"
+					});
+					if (canAttachMetricAsAttribute(value)) telemetry.setAttributes({ [name]: value });
+				},
+				score(score, kind = "exact") {
+					if (abortController.signal.aborted || settled) return;
+					assertValidScore(score);
+					customScoresByKind.set(kind, score);
+					telemetry.addEvent("vieval.case.score", {
+						"vieval.score.kind": kind,
+						"vieval.score.value": score
+					});
+					emitReporterEvent(context.reporterHooks, {
+						caseId,
+						data: {
+							kind,
+							score
+						},
+						event: "task.case.score"
+					});
+				},
+				signal: abortController.signal
+			}));
+			if (timeout != null) {
+				const timeoutPromise = new Promise((_, reject) => {
+					timeoutHandle = setTimeout(() => {
+						timedOut = true;
+						abortController.abort(createCaseTimeoutError(timeout));
+						reject(createCaseTimeoutError(timeout));
+					}, timeout);
 				});
-			},
-			score(score, kind = "exact") {
-				if (abortController.signal.aborted || settled) return;
-				assertValidScore(score);
-				customScoresByKind.set(kind, score);
-			},
-			signal: abortController.signal
-		}));
-		if (timeout != null) {
-			const timeoutPromise = new Promise((_, reject) => {
-				timeoutHandle = setTimeout(() => {
-					timedOut = true;
-					abortController.abort(createCaseTimeoutError(timeout));
-					reject(createCaseTimeoutError(timeout));
-				}, timeout);
-			});
-			await Promise.race([runPromise, timeoutPromise]);
-		} else await runPromise;
-		settled = true;
-		return {
-			scoresByKind: customScoresByKind,
-			state: "passed"
-		};
+				const output = await Promise.race([runPromise, timeoutPromise]);
+				settled = true;
+				return {
+					output,
+					scoresByKind: customScoresByKind,
+					state: "passed"
+				};
+			}
+			const output = await runPromise;
+			settled = true;
+			return {
+				output,
+				scoresByKind: customScoresByKind,
+				state: "passed"
+			};
+		});
 	} catch (error) {
 		settled = true;
 		return {
@@ -119,12 +182,18 @@ async function executeRegisteredCase(context, taskCase, index, totalCases, taskE
 	const resolvedPolicy = resolveCaseExecutionPolicy(taskCase, taskExecutionPolicy);
 	let lastOutcome;
 	for (let retryIndex = 0; retryIndex <= resolvedPolicy.autoRetry; retryIndex += 1) {
+		if (retryIndex > 0) {
+			const retryDelayMs = resolveAutoRetryDelay(resolvedPolicy, retryIndex);
+			assertNonNegativeNumber(retryDelayMs, "autoRetryDelay result");
+			if (retryDelayMs > 0) await sleep(retryDelayMs);
+		}
 		emitCaseStart(context.reporterHooks, {
 			...resolvedPolicy.autoRetry > 0 ? {
 				autoRetry: resolvedPolicy.autoRetry,
 				retryIndex
 			} : {},
 			index,
+			...taskCase.input === void 0 ? {} : { input: taskCase.input },
 			name: taskCase.name,
 			total: totalCases
 		});
@@ -280,6 +349,7 @@ function describeTask(name, build, options = {}) {
 						emitCaseEnd(context.reporterHooks, {
 							...outcome.errorMessage == null ? {} : { errorMessage: outcome.errorMessage },
 							index,
+							...outcome.output === void 0 ? {} : { output: outcome.output },
 							state: outcome.state,
 							name: taskCase.name,
 							total: totalCases
@@ -323,6 +393,7 @@ function describeTask(name, build, options = {}) {
 						emitCaseEnd(context.reporterHooks, {
 							...outcome.errorMessage == null ? {} : { errorMessage: outcome.errorMessage },
 							index,
+							...outcome.output === void 0 ? {} : { output: outcome.output },
 							state: outcome.state,
 							name: taskCase.name,
 							total: totalCases