npm - vieval - Versions diffs - 0.0.10 → 0.0.12 - Mend

vieval 0.0.10 → 0.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (53)

package/README.md +31 -31
package/dist/bin/vieval.mjs +1 -1
package/dist/bin/vieval.mjs.map +1 -1
package/dist/cli/index.d.mts +1 -1
package/dist/cli/index.mjs +1 -1
package/dist/{cli-DTDgaqeI.mjs → cli-uzS81IPd.mjs} +1483 -1483
package/dist/cli-uzS81IPd.mjs.map +1 -0
package/dist/config.d.mts +1 -1
package/dist/config.mjs +1 -1
package/dist/config.mjs.map +1 -1
package/dist/core/assertions/index.d.mts +156 -156
package/dist/core/assertions/index.mjs +82 -82
package/dist/core/assertions/index.mjs.map +1 -1
package/dist/core/inference-executors/index.d.mts +37 -37
package/dist/core/inference-executors/index.mjs +54 -53
package/dist/core/inference-executors/index.mjs.map +1 -1
package/dist/core/processors/results/index.d.mts +18 -18
package/dist/core/processors/results/index.mjs.map +1 -1
package/dist/core/runner/index.d.mts +2 -2
package/dist/core/runner/index.mjs +259 -259
package/dist/core/runner/index.mjs.map +1 -1
package/dist/core/scheduler/index.d.mts +1 -1
package/dist/core/scheduler/index.mjs +65 -65
package/dist/core/scheduler/index.mjs.map +1 -1
package/dist/{env-DfWZy_n4.d.mts → env-Br6jaWGL.d.mts} +9 -9
package/dist/{env-nV5rVErX.mjs → env-egxaJtNn.mjs} +8 -8
package/dist/env-egxaJtNn.mjs.map +1 -0
package/dist/{expect-extensions-DCSqlneN.mjs → expect-extensions-BKdEPt3h.mjs} +46 -46
package/dist/expect-extensions-BKdEPt3h.mjs.map +1 -0
package/dist/expect.d.mts +1 -3
package/dist/expect.mjs +1 -1
package/dist/expect.mjs.map +1 -1
package/dist/{index-D_aMeWqO.d.mts → index-BLIlhiWT.d.mts} +565 -565
package/dist/{index-Bg0atWBF.d.mts → index-CIaJClcC.d.mts} +48 -48
package/dist/index.d.mts +208 -197
package/dist/index.mjs +148 -148
package/dist/index.mjs.map +1 -1
package/dist/{models-pBSRUZhY.mjs → models-CaCOUPZw.mjs} +1 -1
package/dist/{models-pBSRUZhY.mjs.map → models-CaCOUPZw.mjs.map} +1 -1
package/dist/plugins/chat-models/index.d.mts +279 -279
package/dist/plugins/chat-models/index.mjs +360 -360
package/dist/plugins/chat-models/index.mjs.map +1 -1
package/dist/{queue-DsZQkZO_.mjs → queue-BL86z2W_.mjs} +1 -1
package/dist/{queue-DsZQkZO_.mjs.map → queue-BL86z2W_.mjs.map} +1 -1
package/dist/{registry-DMnwE_mY.mjs → registry-BK7k6X81.mjs} +294 -294
package/dist/registry-BK7k6X81.mjs.map +1 -0
package/dist/testing/expect-extensions.d.mts +27 -27
package/dist/testing/expect-extensions.mjs +1 -1
package/package.json +12 -12
package/dist/cli-DTDgaqeI.mjs.map +0 -1
package/dist/env-nV5rVErX.mjs.map +0 -1
package/dist/expect-extensions-DCSqlneN.mjs.map +0 -1
package/dist/registry-DMnwE_mY.mjs.map +0 -1

package/dist/{index-D_aMeWqO.d.mts → index-BLIlhiWT.d.mts} RENAMED

@@ -2,23 +2,6 @@ import { ReadStream, WriteStream } from "node:fs";
 import { Buffer } from "node:buffer";
 //#region src/core/cache/types.d.ts
-/**
- * Cache entry options used to derive one deterministic cache file path.
- */
-interface CacheFileOptions {
-  /**
-   * Optional file extension for the cache artifact (for example: `json`, `txt`, `wav`).
-   */
-  ext?: string;
-  /**
-   * Deterministic key segments used to build the relative cache path.
-   */
-  key: readonly string[];
-  /**
-   * Optional media type hint used by adapters when extension is omitted.
-   */
-  mediaType?: string;
-}
 /**
  * One cache file handle exposed to task code.
  *
@@ -34,18 +17,35 @@ interface CacheFileOptions {
  * - read/write helpers over one deterministic cache artifact path
  */
 interface CacheFileHandle {
-  path: string;
   exists: () => Promise<boolean>;
+  loadAsCasesInput: <T>() => Promise<T[]>;
+  loadAsExpectFixture: <T>() => Promise<T>;
   openReadStream: () => ReadStream;
   openWriteStream: () => Promise<WriteStream>;
+  path: string;
   readBuffer: () => Promise<Buffer>;
-  writeBuffer: (value: Buffer) => Promise<void>;
-  readText: (encoding?: BufferEncoding) => Promise<string>;
-  writeText: (value: string, encoding?: BufferEncoding) => Promise<void>;
   readJson: <T>() => Promise<T>;
+  readText: (encoding?: BufferEncoding) => Promise<string>;
+  writeBuffer: (value: Buffer) => Promise<void>;
   writeJson: (value: unknown) => Promise<void>;
-  loadAsCasesInput: <T>() => Promise<T[]>;
-  loadAsExpectFixture: <T>() => Promise<T>;
+  writeText: (value: string, encoding?: BufferEncoding) => Promise<void>;
+}
+/**
+ * Cache entry options used to derive one deterministic cache file path.
+ */
+interface CacheFileOptions {
+  /**
+   * Optional file extension for the cache artifact (for example: `json`, `txt`, `wav`).
+   */
+  ext?: string;
+  /**
+   * Deterministic key segments used to build the relative cache path.
+   */
+  key: readonly string[];
+  /**
+   * Optional media type hint used by adapters when extension is omitted.
+   */
+  mediaType?: string;
 }
 /**
  * Namespaced cache accessor for deterministic cache artifacts.
@@ -78,16 +78,6 @@ interface CreateFilesystemTaskCacheRuntimeOptions {
    */
   workspaceId: string;
 }
-/**
- * Normalizes cache file options into deterministic relative path segments.
- *
- * Before:
- * - `{ key: ['cases', 'dataset hash', 'v1'], ext: 'json' }`
- *
- * After:
- * - `['cases', 'dataset-hash', 'v1.json']`
- */
-declare function normalizeCacheFilePathSegments(options: CacheFileOptions): string[];
 /**
  * Creates a deterministic filesystem-backed task cache runtime.
  *
@@ -104,6 +94,16 @@ declare function normalizeCacheFilePathSegments(options: CacheFileOptions): stri
  *   `<cacheRootDirectory>/<workspaceId>/<projectName>/<namespace>/...`
  */
 declare function createFilesystemTaskCacheRuntime(options: CreateFilesystemTaskCacheRuntimeOptions): TaskCacheRuntime;
+/**
+ * Normalizes cache file options into deterministic relative path segments.
+ *
+ * Before:
+ * - `{ key: ['cases', 'dataset hash', 'v1'], ext: 'json' }`
+ *
+ * After:
+ * - `['cases', 'dataset-hash', 'v1.json']`
+ */
+declare function normalizeCacheFilePathSegments(options: CacheFileOptions): string[];
 //#endregion
 //#region src/core/runner/schedule.d.ts
 /**
@@ -116,30 +116,21 @@ interface InferenceExecutor {
   id: string;
 }
 /**
- * Stores the selected value for each matrix axis.
+ * Maps matrix axis names to the values that should be expanded.
  */
-type RunnerMatrixSelection = Record<string, string>;
+type RunnerMatrixDefinition = MatrixDefinition;
 /**
- * Stores stable row ids for one resolved scheduled task matrix.
+ * Accepts either flat axis definitions or one layered matrix object.
  */
-interface ScheduledTaskMatrixMeta {
-  /**
-   * Stable row id for the resolved run matrix selection.
-   */
-  runRowId: string;
-  /**
-   * Stable row id for the resolved eval matrix selection.
-   */
-  evalRowId: string;
-}
+type RunnerMatrixInput = MatrixLayer | RunnerMatrixDefinition;
+/**
+ * Stores the selected value for each matrix axis.
+ */
+type RunnerMatrixSelection = Record<string, string>;
 /**
  * Stores the structured matrix payload for one scheduled task.
  */
 interface ScheduledTaskMatrix {
-  /**
-   * Runtime matrix selection visible to task code.
-   */
-  run: RunnerMatrixSelection;
   /**
    * Eval-time matrix selection visible to task code.
    */
@@ -148,35 +139,23 @@ interface ScheduledTaskMatrix {
    * Stable row ids for both scopes.
    */
   meta: ScheduledTaskMatrixMeta;
+  /**
+   * Runtime matrix selection visible to task code.
+   */
+  run: RunnerMatrixSelection;
 }
 /**
- * Maps matrix axis names to the values that should be expanded.
- */
-type RunnerMatrixDefinition = MatrixDefinition;
-/**
- * Accepts either flat axis definitions or one layered matrix object.
- */
-type RunnerMatrixInput = RunnerMatrixDefinition | MatrixLayer;
-/**
- * Represents one fully expanded runner task.
+ * Stores stable row ids for one resolved scheduled task matrix.
  */
-interface ScheduledTask {
-  /**
-   * Stable task id derived from the entry, inferenceExecutor, and matrix selection.
-   */
-  id: string;
-  /**
-   * The collected eval entry to execute.
-   */
-  entry: CollectedEvalEntry;
+interface ScheduledTaskMatrixMeta {
   /**
-   * The inferenceExecutor selected for this task.
+   * Stable row id for the resolved eval matrix selection.
    */
-  inferenceExecutor: InferenceExecutor;
+  evalRowId: string;
   /**
-   * The concrete scoped matrix selection for this task.
+   * Stable row id for the resolved run matrix selection.
    */
-  matrix: ScheduledTaskMatrix;
+  runRowId: string;
 }
 /**
  * Configures how the runner should expand its execution matrix.
@@ -186,6 +165,10 @@ interface CreateRunnerScheduleOptions {
    * Collected eval entries that should be scheduled.
    */
   entries: readonly CollectedEvalEntry[];
+  /**
+   * Optional eval-time matrix axes expanded as a cartesian product.
+   */
+  evalMatrix?: RunnerMatrixInput;
   /**
    * Providers that should run each entry.
    */
@@ -194,10 +177,27 @@ interface CreateRunnerScheduleOptions {
    * Optional run-time matrix axes expanded as a cartesian product.
    */
   runMatrix?: RunnerMatrixInput;
+}
+/**
+ * Represents one fully expanded runner task.
+ */
+interface ScheduledTask {
   /**
-   * Optional eval-time matrix axes expanded as a cartesian product.
+   * The collected eval entry to execute.
    */
-  evalMatrix?: RunnerMatrixInput;
+  entry: CollectedEvalEntry;
+  /**
+   * Stable task id derived from the entry, inferenceExecutor, and matrix selection.
+   */
+  id: string;
+  /**
+   * The inferenceExecutor selected for this task.
+   */
+  inferenceExecutor: InferenceExecutor;
+  /**
+   * The concrete scoped matrix selection for this task.
+   */
+  matrix: ScheduledTaskMatrix;
 }
 /**
  * Expands collected entries into a stable runner schedule.
@@ -221,128 +221,128 @@ declare function createRunnerSchedule(options: CreateRunnerScheduleOptions): Sch
 //#endregion
 //#region src/core/runner/aggregate.d.ts
 /**
- * Identifies the scoring family for a single eval score.
- */
-type RunScoreKind = 'exact' | 'judge';
-/**
- * Represents one normalized score emitted by a completed eval run.
+ * Stores inferenceExecutor-level score aggregates across multiple runs.
  */
-interface RunScore {
+interface AggregatedProviderSummary {
   /**
-   * Score family used for aggregation.
+   * Mean of all exact-match scores or `null` when absent.
    */
-  kind: RunScoreKind;
+  exactAverage: null | number;
   /**
-   * Normalized score in the `0..1` range.
+   * Hybrid average derived from the inferenceExecutor exact and judge means.
    */
-  score: number;
-}
-/**
- * Captures the output of one scheduled runner task.
- */
-interface RunResult {
+  hybridAverage: null | number;
   /**
-   * Stable run id, usually copied from the scheduled task id.
+   * Stable inferenceExecutor id.
    */
-  id: string;
+  inferenceExecutorId: string;
   /**
-   * Collected eval entry id.
+   * Mean of all judge-based scores or `null` when absent.
    */
-  entryId: string;
+  judgeAverage: null | number;
   /**
-   * Stable inferenceExecutor id.
+   * Number of runs included in this inferenceExecutor bucket.
    */
-  inferenceExecutorId: string;
+  runCount: number;
+}
+/**
+ * Stores the final aggregation output for a batch of runner results.
+ */
+interface AggregatedRunResults {
   /**
-   * Concrete matrix selection used by the run.
+   * Provider-level summaries sorted by inferenceExecutor id.
    */
-  matrix: ScheduledTaskMatrix;
+  inferenceExecutors: AggregatedProviderSummary[];
   /**
-   * Raw scores emitted by the eval.
+   * Overall summary across every run.
    */
-  scores: readonly RunScore[];
+  overall: {
+    exactAverage: null | number;
+    hybridAverage: null | number;
+    judgeAverage: null | number;
+    runCount: number;
+  };
+  /**
+   * Per-run normalized score summaries.
+   */
+  runs: AggregatedRunSummary[];
 }
 /**
  * Stores the per-run score averages after normalization.
  */
 interface AggregatedRunSummary {
-  /**
-   * Stable run id.
-   */
-  id: string;
   /**
    * Collected eval entry id.
    */
   entryId: string;
   /**
-   * Stable inferenceExecutor id.
+   * Mean of exact-match scores or `null` when absent.
    */
-  inferenceExecutorId: string;
+  exactAverage: null | number;
   /**
-   * Concrete matrix selection used by the run.
+   * Hybrid average. Uses both families when present, otherwise falls back to the
+   * single available family.
    */
-  matrix: ScheduledTaskMatrix;
+  hybridAverage: null | number;
   /**
-   * Mean of exact-match scores or `null` when absent.
+   * Stable run id.
    */
-  exactAverage: number | null;
+  id: string;
+  /**
+   * Stable inferenceExecutor id.
+   */
+  inferenceExecutorId: string;
   /**
    * Mean of judge-based scores or `null` when absent.
    */
-  judgeAverage: number | null;
+  judgeAverage: null | number;
   /**
-   * Hybrid average. Uses both families when present, otherwise falls back to the
-   * single available family.
+   * Concrete matrix selection used by the run.
    */
-  hybridAverage: number | null;
+  matrix: ScheduledTaskMatrix;
 }
 /**
- * Stores inferenceExecutor-level score aggregates across multiple runs.
+ * Captures the output of one scheduled runner task.
  */
-interface AggregatedProviderSummary {
+interface RunResult {
   /**
-   * Stable inferenceExecutor id.
+   * Collected eval entry id.
    */
-  inferenceExecutorId: string;
+  entryId: string;
   /**
-   * Number of runs included in this inferenceExecutor bucket.
+   * Stable run id, usually copied from the scheduled task id.
    */
-  runCount: number;
+  id: string;
   /**
-   * Mean of all exact-match scores or `null` when absent.
+   * Stable inferenceExecutor id.
    */
-  exactAverage: number | null;
+  inferenceExecutorId: string;
   /**
-   * Mean of all judge-based scores or `null` when absent.
+   * Concrete matrix selection used by the run.
    */
-  judgeAverage: number | null;
+  matrix: ScheduledTaskMatrix;
   /**
-   * Hybrid average derived from the inferenceExecutor exact and judge means.
+   * Raw scores emitted by the eval.
    */
-  hybridAverage: number | null;
+  scores: readonly RunScore[];
 }
 /**
- * Stores the final aggregation output for a batch of runner results.
+ * Represents one normalized score emitted by a completed eval run.
  */
-interface AggregatedRunResults {
-  /**
-   * Per-run normalized score summaries.
-   */
-  runs: AggregatedRunSummary[];
+interface RunScore {
   /**
-   * Provider-level summaries sorted by inferenceExecutor id.
+   * Score family used for aggregation.
    */
-  inferenceExecutors: AggregatedProviderSummary[];
+  kind: RunScoreKind;
   /**
-   * Overall summary across every run.
+   * Normalized score in the `0..1` range.
    */
-  overall: {
-    exactAverage: number | null;
-    judgeAverage: number | null;
-    hybridAverage: number | null;
-    runCount: number;
-  };
+  score: number;
 }
+/**
+ * Identifies the scoring family for a single eval score.
+ */
+type RunScoreKind = 'exact' | 'judge';
 /**
  * Aggregates exact-match and judge-based scores into hybrid runner summaries.
  *
@@ -365,19 +365,6 @@ interface AggregatedRunResults {
 declare function aggregateRunResults(results: readonly RunResult[]): AggregatedRunResults;
 //#endregion
 //#region src/core/runner/runtime-context.d.ts
-/**
- * Shared runtime context used by the vieval runner.
- *
- * Use when:
- * - runner services need stable path resolution without module-level side effects
- * - call sites want deterministic control over workspace root detection
- */
-interface RunnerRuntimeContext {
-  /**
-   * Absolute project root directory used for path normalization.
-   */
-  projectRootDirectory: string;
-}
 /**
  * Options used to construct the runner runtime context.
  */
@@ -395,6 +382,19 @@ interface CreateVievalRunnerRuntimeContextOptions {
    */
   fallbackProjectRootDirectory?: string;
 }
+/**
+ * Shared runtime context used by the vieval runner.
+ *
+ * Use when:
+ * - runner services need stable path resolution without module-level side effects
+ * - call sites want deterministic control over workspace root detection
+ */
+interface RunnerRuntimeContext {
+  /**
+   * Absolute project root directory used for path normalization.
+   */
+  projectRootDirectory: string;
+}
 /**
  * Creates a side-effect-free runtime context for runner path normalization.
  *
@@ -455,13 +455,17 @@ declare function collectEvalEntries(modules: EvalModuleMap, context: RunnerRunti
  */
 interface ModelDefinition {
   /**
-   * Stable model id.
+   * Alias names that can resolve this model.
    */
-  id: string;
+  aliases: string[];
   /**
-   * Inference-executor id used for matching and reporting.
+   * Optional execution policy hints attached to this model.
    */
-  inferenceExecutorId: string;
+  executionPolicy?: TaskExecutionPolicy;
+  /**
+   * Stable model id.
+   */
+  id: string;
   /**
    * Executor reference passed through config.
    *
@@ -470,17 +474,13 @@ interface ModelDefinition {
    */
   inferenceExecutor: unknown;
   /**
-   * Concrete model name passed to the inference executor.
-   */
-  model: string;
-  /**
-   * Alias names that can resolve this model.
+   * Inference-executor id used for matching and reporting.
    */
-  aliases: string[];
+  inferenceExecutorId: string;
   /**
-   * Optional execution policy hints attached to this model.
+   * Concrete model name passed to the inference executor.
    */
-  executionPolicy?: TaskExecutionPolicy;
+  model: string;
   /**
    * Optional model-level call parameters.
    */
@@ -495,6 +495,14 @@ interface ModelDefinition {
 declare function resolveModelByName(models: readonly ModelDefinition[], name: string): ModelDefinition | undefined;
 //#endregion
 //#region src/core/runner/task-context.d.ts
+/**
+ * Inputs used to build task execution context.
+ */
+interface CreateTaskExecutionContextOptions {
+  cache?: TaskCacheRuntime;
+  models: readonly ModelDefinition[];
+  task: ScheduledTask;
+}
 /**
  * Task-scoped execution context exposed to runner executors.
  */
@@ -508,14 +516,6 @@ interface TaskExecutionContext {
    */
   models: readonly ModelDefinition[];
 }
-/**
- * Inputs used to build task execution context.
- */
-interface CreateTaskExecutionContextOptions {
-  cache?: TaskCacheRuntime;
-  models: readonly ModelDefinition[];
-  task: ScheduledTask;
-}
 /**
  * Creates task-scoped context data for runner execution.
  *
@@ -528,20 +528,6 @@ interface CreateTaskExecutionContextOptions {
 declare function createTaskExecutionContext(options: CreateTaskExecutionContextOptions): TaskExecutionContext;
 //#endregion
 //#region src/core/runner/run.d.ts
-/**
- * Executes one scheduled runner task and returns a normalized run result.
- *
- * Use when:
- * - a scheduler already selected the task and execution context
- * - the caller wants a typed executor contract for runner workers
- *
- * Expects:
- * - the task context to be ready for model resolution and task-scoped work
- *
- * Returns:
- * - a normalized run result with score entries ready for aggregation
- */
-type ScheduledTaskExecutor = (task: ScheduledTask, context: TaskExecutionContext) => Promise<RunResult>;
 /**
  * Terminal task state reported by runner lifecycle hooks.
  *
@@ -551,7 +537,7 @@ type ScheduledTaskExecutor = (task: ScheduledTask, context: TaskExecutionContext
  * Expects:
  * - hooks treat the value as final for the completed task
  */
-type RunnerTaskState = 'passed' | 'failed';
+type RunnerTaskState = 'failed' | 'passed';
 /**
  * Optional runner execution hooks used while processing scheduled tasks.
  *
@@ -571,15 +557,11 @@ interface RunScheduledTasksOptions {
    */
   createExecutionContext?: (task: ScheduledTask) => TaskExecutionContext;
   /**
-   * Runs before the executor starts handling a task.
-   *
-   * Use when:
-   * - callers want to observe task activation before execution begins
+   * Maximum number of tasks to execute concurrently.
    *
-   * Expects:
-   * - thrown errors abort the task before executor work starts
+   * @default 1
    */
-  onTaskStart?: (task: ScheduledTask) => void;
+  maxConcurrency?: number;
   /**
    * Runs after the executor settles for a task.
    *
@@ -592,12 +574,30 @@ interface RunScheduledTasksOptions {
    */
   onTaskEnd?: (task: ScheduledTask, state: RunnerTaskState) => void;
   /**
-   * Maximum number of tasks to execute concurrently.
+   * Runs before the executor starts handling a task.
    *
-   * @default 1
+   * Use when:
+   * - callers want to observe task activation before execution begins
+   *
+   * Expects:
+   * - thrown errors abort the task before executor work starts
    */
-  maxConcurrency?: number;
+  onTaskStart?: (task: ScheduledTask) => void;
 }
+/**
+ * Executes one scheduled runner task and returns a normalized run result.
+ *
+ * Use when:
+ * - a scheduler already selected the task and execution context
+ * - the caller wants a typed executor contract for runner workers
+ *
+ * Expects:
+ * - the task context to be ready for model resolution and task-scoped work
+ *
+ * Returns:
+ * - a normalized run result with score entries ready for aggregation
+ */
+type ScheduledTaskExecutor = (task: ScheduledTask, context: TaskExecutionContext) => Promise<RunResult>;
 /**
  * Error thrown when a scheduled run fails before producing a normalized result.
  */
@@ -633,10 +633,10 @@ declare class RunnerExecutionError extends Error {
 declare function runScheduledTasks(tasks: readonly ScheduledTask[], executor: ScheduledTaskExecutor, options?: RunScheduledTasksOptions): Promise<AggregatedRunResults>;
 //#endregion
 //#region src/core/telemetry/types.d.ts
-/** JSON-compatible scalar values accepted as telemetry attributes. */
-type TelemetryAttributeValue = boolean | number | string | null | readonly TelemetryAttributeValue[];
 /** Attribute map shared by local report projection and OpenTelemetry span calls. */
 type TelemetryAttributes = Record<string, TelemetryAttributeValue | undefined>;
+/** JSON-compatible scalar values accepted as telemetry attributes. */
+type TelemetryAttributeValue = boolean | null | number | readonly TelemetryAttributeValue[] | string;
 /**
  * Internal Vieval telemetry runtime.
  *
@@ -652,10 +652,10 @@ type TelemetryAttributes = Record<string, TelemetryAttributeValue | undefined>;
  * - callback result, preserving thrown errors after telemetry records them
  */
 interface TelemetryRuntime {
-  withSpan: <T>(name: string, attributes: TelemetryAttributes, callback: () => Promise<T>) => Promise<T>;
   addEvent: (name: string, attributes?: TelemetryAttributes) => void;
-  setAttributes: (attributes: TelemetryAttributes) => void;
   recordException: (error: unknown) => void;
+  setAttributes: (attributes: TelemetryAttributes) => void;
+  withSpan: <T>(name: string, attributes: TelemetryAttributes, callback: () => Promise<T>) => Promise<T>;
 }
 //#endregion
 //#region src/config/types.d.ts
@@ -666,46 +666,94 @@ interface TelemetryRuntime {
  */
 type Awaitable<T> = Promise<T> | T;
 /**
- * Primitive value allowed in one matrix cell.
- *
- * Use when:
- * - defining axis values for canonical layered matrix config
- * - preserving JSON-safe primitive values through config normalization
- *
- * Expects:
- * - values remain serializable and comparable with stringified task ids
- *
- * Returns:
- * - one JSON-friendly primitive matrix value
+ * OpenTelemetry reporting configuration managed by user config setup.
  */
-type MatrixPrimitive = string | number | boolean;
+interface CliOpenTelemetryReportingConfig {
+  /**
+   * Enables Vieval active span wrapping through `@opentelemetry/api`.
+   *
+   * @default false
+   */
+  enabled?: boolean;
+  /**
+   * Called after all telemetry events and local report artifacts have been emitted.
+   */
+  onRunEnd?: () => Awaitable<void>;
+}
 /**
- * Canonical matrix value type.
- *
- * Use when:
- * - declaring matrix axis values at the config boundary
- *
- * Expects:
- * - values are normalized from config input without extra wrapping
- *
- * Returns:
- * - a primitive cell value suitable for matrix expansion
+ * Reporting configuration for local artifacts and optional OpenTelemetry integration.
  */
-type MatrixValue = MatrixPrimitive;
+interface CliReportingConfig {
+  /**
+   * Optional OpenTelemetry API integration.
+   */
+  openTelemetry?: CliOpenTelemetryReportingConfig;
+}
 /**
- * Canonical row payload for one matrix combination.
- *
- * Use when:
- * - storing the selected values for a resolved matrix row
- * - passing task-level matrix context between layers
- *
- * Expects:
- * - keys are axis names and values are resolved axis selections
- *
- * Returns:
- * - one resolved row object
+ * Represents a normalized evaluation entry collected by the runner.
  */
-type MatrixRow = Record<string, MatrixValue>;
+type CollectedEvalEntry<TDefinition extends EvalDefinition = EvalDefinition> = TDefinition & {
+  directory: string;
+  filePath: string;
+  id: string;
+};
+/**
+ * Declares the metadata required for a single vieval evaluation module.
+ */
+interface EvalDefinition {
+  description: string;
+  /**
+   * Optional matrix layering for this eval definition.
+   *
+   * Use when:
+   * - one eval file needs control-group variants that differ from project defaults
+   *
+   * @example
+   * ```ts
+   * matrix: {
+   *   runMatrix: {
+   *     extend: {
+   *       promptStyle: ['concise'],
+   *     },
+   *     override: {
+   *       scenario: ['eval-scenario'],
+   *     },
+   *   },
+   *   evalMatrix: {
+   *     override: {
+   *       rubric: ['strict'],
+   *     },
+   *   },
+   * }
+   * ```
+   *
+   * Context impact:
+   *
+   * ```txt
+   * project.runMatrix + eval.matrix.runMatrix + task.matrix.runMatrix
+   *   => context.task.matrix.run
+   *
+   * project.evalMatrix + eval.matrix.evalMatrix + task.matrix.evalMatrix
+   *   => context.task.matrix.eval
+   * ```
+   */
+  matrix?: ScopedMatrices;
+  name: string;
+  /**
+   * Optional task implementation executed by runner.
+   */
+  task?: TaskDefinition;
+}
+/**
+ * Describes the shape of an imported vieval evaluation module.
+ */
+interface EvalModule<TDefinition extends EvalDefinition = EvalDefinition> {
+  default: TDefinition;
+}
+/**
+ * Maps module URLs to their loaded vieval evaluation modules.
+ */
+type EvalModuleMap = Record<string, EvalModule>;
 /**
  * Canonical axis value list for one matrix definition.
  *
@@ -767,6 +815,15 @@ type MatrixDefinition = Record<string, MatrixAxisValues>;
  * ```
  */
 interface MatrixLayer {
+  /**
+   * Matrix axes disabled at this layer.
+   *
+   * @example
+   * ```ts
+   * disable: ['temperatureProfile']
+   * ```
+   */
+  disable?: readonly string[];
   /**
    * Matrix axes inherited or appended at this layer.
    *
@@ -790,16 +847,48 @@ interface MatrixLayer {
    * ```
    */
   override?: MatrixDefinition;
-  /**
-   * Matrix axes disabled at this layer.
-   *
-   * @example
-   * ```ts
-   * disable: ['temperatureProfile']
-   * ```
-   */
-  disable?: readonly string[];
 }
+/**
+ * Primitive value allowed in one matrix cell.
+ *
+ * Use when:
+ * - defining axis values for canonical layered matrix config
+ * - preserving JSON-safe primitive values through config normalization
+ *
+ * Expects:
+ * - values remain serializable and comparable with stringified task ids
+ *
+ * Returns:
+ * - one JSON-friendly primitive matrix value
+ */
+type MatrixPrimitive = boolean | number | string;
+/**
+ * Canonical row payload for one matrix combination.
+ *
+ * Use when:
+ * - storing the selected values for a resolved matrix row
+ * - passing task-level matrix context between layers
+ *
+ * Expects:
+ * - keys are axis names and values are resolved axis selections
+ *
+ * Returns:
+ * - one resolved row object
+ */
+type MatrixRow = Record<string, MatrixValue>;
+/**
+ * Canonical matrix value type.
+ *
+ * Use when:
+ * - declaring matrix axis values at the config boundary
+ *
+ * Expects:
+ * - values are normalized from config input without extra wrapping
+ *
+ * Returns:
+ * - a primitive cell value suitable for matrix expansion
+ */
+type MatrixValue = MatrixPrimitive;
 /**
  * Canonical run/eval matrix grouping.
  *
@@ -845,88 +934,111 @@ interface MatrixLayer {
  */
 interface ScopedMatrices {
   /**
-   * Runtime matrix scope.
+   * Eval-time matrix scope.
    *
    * @example
    * ```ts
-   * runMatrix: {
-   *   extend: {
-   *     promptLanguage: ['en', 'zh'],
+   * evalMatrix: {
+   *   override: {
+   *     rubric: ['strict'],
    *   },
    * }
    * ```
    */
-  runMatrix?: MatrixLayer;
+  evalMatrix?: MatrixLayer;
   /**
-   * Eval-time matrix scope.
+   * Runtime matrix scope.
    *
    * @example
    * ```ts
-   * evalMatrix: {
-   *   override: {
-   *     rubric: ['strict'],
+   * runMatrix: {
+   *   extend: {
+   *     promptLanguage: ['en', 'zh'],
    *   },
    * }
    * ```
    */
-  evalMatrix?: MatrixLayer;
-}
-/**
- * Output of one eval task execution.
- */
-interface TaskRunOutput {
-  /**
-   * Scores emitted by this task run.
-   */
-  scores: readonly RunScore[];
+  runMatrix?: MatrixLayer;
 }
 /**
  * Delay policy for retries within one task case attempt.
  *
  * @param retryIndex Retry number where `1` is the first retry after the initial failure.
  */
-type TaskAutoRetryDelay = number | ((retryIndex: number) => number);
+type TaskAutoRetryDelay = ((retryIndex: number) => number) | number;
 /**
- * Execution policy applied to task and case callbacks.
+ * Payload emitted when a task case ends.
  *
  * Use when:
- * - one task or case should time out after a bounded duration
- * - failures should retry within the current attempt or trigger a later full task attempt
+ * - reporter hooks need the case position plus terminal state
  *
  * Expects:
- * - `timeout` to be a positive integer when provided
- * - `autoRetry` and `autoAttempt` to be non-negative integers when provided
+ * - `name` is the declared DSL case label
+ * - `index` is the zero-based case position within the task
+ * - `total` is the total number of registered cases
+ * - `state` describes the final case result
+ */
+interface TaskCaseReporterEndPayload extends TaskCaseReporterPayload {
+  /**
+   * Optional failure message when `state` is `failed`.
+   */
+  errorMessage?: string;
+  /**
+   * Optional case output returned by the task case callback.
+   */
+  output?: unknown;
+  /**
+   * Final case state.
+   */
+  state: TaskCaseState;
+}
+/**
+ * Payload emitted when a task case starts.
  *
- * Returns:
- * - one partial execution policy descriptor
+ * Use when:
+ * - reporter hooks need a stable position for one case within the task
+ *
+ * Expects:
+ * - `name` is the declared DSL case label
+ * - `index` is the zero-based case position within the task
+ * - `total` is the total number of registered cases
  */
-interface TaskExecutionPolicy {
+interface TaskCaseReporterPayload {
   /**
-   * Additional retries allowed within the current attempt.
-   *
-   * @default 0
+   * Maximum retry count configured for this case.
    */
   autoRetry?: number;
   /**
-   * Delay in milliseconds before a case auto retry starts.
-   *
-   * A number applies the same delay to every retry. A function receives the
-   * retry index where `1` is the first retry after the initial failure.
-   *
-   * @default retryIndex => 500 * 2 ** (retryIndex - 1)
+   * Zero-based case position within the task.
    */
-  autoRetryDelay?: TaskAutoRetryDelay;
+  index: number;
   /**
-   * Additional full task attempts allowed after the current attempt settles.
-   *
-   * @default 0
+   * Optional case input payload registered by the task DSL.
    */
-  autoAttempt?: number;
+  input?: unknown;
   /**
-   * Timeout in milliseconds for one case execution.
+   * Declared case label.
    */
-  timeout?: number;
+  name: string;
+  /**
+   * Current retry attempt index, where `0` is the first try.
+   */
+  retryIndex?: number;
+  /**
+   * Total number of registered cases.
+   */
+  total: number;
 }
+/**
+ * Allowed terminal outcomes for one task case.
+ *
+ * Use when:
+ * - emitting case lifecycle events from the task DSL
+ *
+ * Expects:
+ * - consumers treat the value as the final state for the case
+ */
+type TaskCaseState = 'failed' | 'passed' | 'timeout';
 /**
  * Task-local concurrency metadata.
  *
@@ -951,194 +1063,124 @@ interface TaskConcurrencyConfig {
   case?: number;
 }
 /**
- * Reporting configuration for local artifacts and optional OpenTelemetry integration.
+ * Eval task definition used by `defineTask`.
  */
-interface CliReportingConfig {
+interface TaskDefinition {
   /**
-   * Optional OpenTelemetry API integration.
+   * Optional task-local concurrency metadata.
+   *
+   * Use when:
+   * - task declarations need to preserve task-scoped attempt/case caps for later scheduler wiring
+   * - higher-level orchestration wants to inspect task-local concurrency without executing the task
+   *
+   * Expects:
+   * - each provided value to be a positive integer chosen by the caller
+   *
+   * Returns:
+   * - one partial task-local concurrency descriptor
    */
-  openTelemetry?: CliOpenTelemetryReportingConfig;
-}
-/**
- * OpenTelemetry reporting configuration managed by user config setup.
- */
-interface CliOpenTelemetryReportingConfig {
+  concurrency?: TaskConcurrencyConfig;
   /**
-   * Enables Vieval active span wrapping through `@opentelemetry/api`.
+   * Optional task-local execution policy.
+   */
+  executionPolicy?: TaskExecutionPolicy;
+  /**
+   * Stable task id for diagnostics.
+   */
+  id: string;
+  /**
+   * Optional matrix layering for this task definition.
    *
-   * @default false
+   * Use when:
+   * - task-local experiments should refine project/eval defaults
+   *
+   * @example
+   * ```ts
+   * matrix: {
+   *   runMatrix: {
+   *     override: {
+   *       model: ['gpt-4.1-mini'],
+   *     },
+   *   },
+   *   evalMatrix: {
+   *     extend: {
+   *       evaluator: ['default-judge'],
+   *     },
+   *   },
+   * }
+   * ```
    */
-  enabled?: boolean;
+  matrix?: ScopedMatrices;
   /**
-   * Called after all telemetry events and local report artifacts have been emitted.
+   * Executes one scheduled eval task.
    */
-  onRunEnd?: () => Awaitable<void>;
+  run: (context: TaskRunContext) => Promise<TaskRunOutput> | TaskRunOutput;
 }
 /**
- * Runtime context passed into eval task `run`.
+ * Execution policy applied to task and case callbacks.
+ *
+ * Use when:
+ * - one task or case should time out after a bounded duration
+ * - failures should retry within the current attempt or trigger a later full task attempt
+ *
+ * Expects:
+ * - `timeout` to be a positive integer when provided
+ * - `autoRetry` and `autoAttempt` to be non-negative integers when provided
+ *
+ * Returns:
+ * - one partial execution policy descriptor
  */
-interface TaskRunContext {
+interface TaskExecutionPolicy {
   /**
-   * Task-scoped cache runtime.
+   * Additional full task attempts allowed after the current attempt settles.
    *
-   * Use when:
-   * - benchmark setup needs deterministic artifact reuse across attempts
-   * - case-level logic needs typed text/json/binary cache loaders
+   * @default 0
    */
-  cache: TaskExecutionContext['cache'];
+  autoAttempt?: number;
   /**
-   * Scheduled runner task metadata.
+   * Additional retries allowed within the current attempt.
    *
-   * Matrix impact on runtime context:
+   * @default 0
+   */
+  autoRetry?: number;
+  /**
+   * Delay in milliseconds before a case auto retry starts.
    *
-   * ```txt
-   * project/eval/task matrix definitions
-   *   -> scheduler expands run rows x eval rows
-   *   -> one scheduled task per row pair
-   *   -> context.task.matrix = {
-   *        run:  selected run-axis values,
-   *        eval: selected eval-axis values,
-   *        meta: { runRowId, evalRowId }
-   *      }
-   * ```
+   * A number applies the same delay to every retry. A function receives the
+   * retry index where `1` is the first retry after the initial failure.
    *
-   * Practical impact:
-   * - `runMatrix` axes appear under `context.task.matrix.run.*`
-   * - `evalMatrix` axes appear under `context.task.matrix.eval.*`
-   * - row ids are stable labels for grouping/aggregation under `context.task.matrix.meta.*`
-   *
-   * @example
-   * ```ts
-   * // If final selected rows are:
-   * // run:  { model: 'gpt-4.1-mini', scenario: 'stress', promptLanguage: 'zh' }
-   * // eval: { rubric: 'strict', rubricModel: 'judge-large' }
-   *
-   * context.task.matrix.run.model // 'gpt-4.1-mini'
-   * context.task.matrix.run.scenario // 'stress'
-   * context.task.matrix.eval.rubric // 'strict'
-   * context.task.matrix.meta.runRowId // stable encoded row id
-   * ```
-   */
-  task: ScheduledTask;
-  /**
-   * Configured model registrations available to model plugins.
-   *
-   * Use when:
-   * - a plugin owns model selection semantics and needs access to registered models
-   * - eval code resolves matrix-selected model axes through plugin helpers
-   */
-  models: TaskExecutionContext['models'];
-  /**
-   * Optional reporter lifecycle hooks for task-local case events.
-   *
-   * Use when:
-   * - a caller wants visibility into each case without coupling to the CLI reporter layer
-   *
-   * Expects:
-   * - hooks are best-effort observers and should not affect task scoring
-   */
-  reporterHooks?: TaskReporterHooks;
-  /**
-   * Optional telemetry runtime shared by runner, DSL, and reporter integrations.
-   *
-   * Use when:
-   * - task execution should emit events to the currently active telemetry runtime
-   * - enabled and disabled telemetry should keep the same execution path
-   *
-   * Expects:
-   * - callers inject a no-op runtime when telemetry is disabled
-   */
-  telemetry?: TelemetryRuntime;
-  /**
-   * Optional runtime scheduling overrides supplied by CLI or host execution.
-   *
-   * Use when:
-   * - run operators need to override task/case concurrency without editing eval code
-   * - DSL task runners need to distinguish runtime flags from code defaults
-   *
-   * Expects:
-   * - values are positive integers when provided
-   *
-   * @default undefined
-   */
-  runtimeConcurrency?: TaskConcurrencyConfig;
-  /**
-   * Cooperative abort signal for the current execution.
-   */
-  signal?: AbortSignal;
-}
-/**
- * Allowed terminal outcomes for one task case.
- *
- * Use when:
- * - emitting case lifecycle events from the task DSL
- *
- * Expects:
- * - consumers treat the value as the final state for the case
- */
-type TaskCaseState = 'passed' | 'failed' | 'timeout';
-/**
- * Payload emitted when a task case starts.
- *
- * Use when:
- * - reporter hooks need a stable position for one case within the task
- *
- * Expects:
- * - `name` is the declared DSL case label
- * - `index` is the zero-based case position within the task
- * - `total` is the total number of registered cases
- */
-interface TaskCaseReporterPayload {
-  /**
-   * Maximum retry count configured for this case.
-   */
-  autoRetry?: number;
-  /**
-   * Optional case input payload registered by the task DSL.
-   */
-  input?: unknown;
-  /**
-   * Declared case label.
-   */
-  name: string;
-  /**
-   * Current retry attempt index, where `0` is the first try.
-   */
-  retryIndex?: number;
-  /**
-   * Zero-based case position within the task.
+   * @default retryIndex => 500 * 2 ** (retryIndex - 1)
    */
-  index: number;
+  autoRetryDelay?: TaskAutoRetryDelay;
   /**
-   * Total number of registered cases.
+   * Timeout in milliseconds for one case execution.
    */
-  total: number;
+  timeout?: number;
 }
 /**
- * Payload emitted when a task case ends.
+ * Payload emitted by task code for custom report events.
  *
  * Use when:
- * - reporter hooks need the case position plus terminal state
+ * - reporting runtime telemetry such as inference requests, responses, or tool calls
+ * - attaching modality-specific metrics without coupling task logic to CLI internals
  *
  * Expects:
- * - `name` is the declared DSL case label
- * - `index` is the zero-based case position within the task
- * - `total` is the total number of registered cases
- * - `state` describes the final case result
+ * - `event` to be a stable event name
+ * - `data` to be JSON-serializable for report artifact persistence
  */
-interface TaskCaseReporterEndPayload extends TaskCaseReporterPayload {
+interface TaskReporterEventPayload {
   /**
-   * Optional case output returned by the task case callback.
+   * Optional stable case id when the event maps to one case lifecycle.
    */
-  output?: unknown;
+  caseId?: string;
   /**
-   * Final case state.
+   * Optional custom payload persisted under event `data`.
    */
-  state: TaskCaseState;
+  data?: unknown;
   /**
-   * Optional failure message when `state` is `failed`.
+   * Event name written into report event envelopes.
    */
-  errorMessage?: string;
+  event: string;
 }
 /**
  * Reporter hooks invoked around each task case execution.
@@ -1151,14 +1193,14 @@ interface TaskCaseReporterEndPayload extends TaskCaseReporterPayload {
  * - hooks observe case start/end events but do not influence scoring
  */
 interface TaskReporterHooks {
-  /**
-   * Runs when a case is about to execute.
-   */
-  onCaseStart?: (payload: TaskCaseReporterPayload) => void;
   /**
    * Runs after a case settles.
    */
   onCaseEnd?: (payload: TaskCaseReporterEndPayload) => void;
+  /**
+   * Runs when a case is about to execute.
+   */
+  onCaseStart?: (payload: TaskCaseReporterPayload) => void;
   /**
    * Runs when task code emits a custom telemetry/reporting event.
    *
@@ -1169,149 +1211,107 @@ interface TaskReporterHooks {
   onEvent?: (payload: TaskReporterEventPayload) => void;
 }
 /**
- * Payload emitted by task code for custom report events.
- *
- * Use when:
- * - reporting runtime telemetry such as inference requests, responses, or tool calls
- * - attaching modality-specific metrics without coupling task logic to CLI internals
- *
- * Expects:
- * - `event` to be a stable event name
- * - `data` to be JSON-serializable for report artifact persistence
+ * Runtime context passed into eval task `run`.
  */
-interface TaskReporterEventPayload {
-  /**
-   * Event name written into report event envelopes.
-   */
-  event: string;
-  /**
-   * Optional custom payload persisted under event `data`.
-   */
-  data?: unknown;
+interface TaskRunContext {
   /**
-   * Optional stable case id when the event maps to one case lifecycle.
+   * Task-scoped cache runtime.
+   *
+   * Use when:
+   * - benchmark setup needs deterministic artifact reuse across attempts
+   * - case-level logic needs typed text/json/binary cache loaders
    */
-  caseId?: string;
-}
-/**
- * Eval task definition used by `defineTask`.
- */
-interface TaskDefinition {
+  cache: TaskExecutionContext['cache'];
   /**
-   * Stable task id for diagnostics.
+   * Configured model registrations available to model plugins.
+   *
+   * Use when:
+   * - a plugin owns model selection semantics and needs access to registered models
+   * - eval code resolves matrix-selected model axes through plugin helpers
    */
-  id: string;
+  models: TaskExecutionContext['models'];
   /**
-   * Optional task-local concurrency metadata.
+   * Optional reporter lifecycle hooks for task-local case events.
    *
    * Use when:
-   * - task declarations need to preserve task-scoped attempt/case caps for later scheduler wiring
-   * - higher-level orchestration wants to inspect task-local concurrency without executing the task
+   * - a caller wants visibility into each case without coupling to the CLI reporter layer
    *
    * Expects:
-   * - each provided value to be a positive integer chosen by the caller
-   *
-   * Returns:
-   * - one partial task-local concurrency descriptor
-   */
-  concurrency?: TaskConcurrencyConfig;
-  /**
-   * Optional task-local execution policy.
+   * - hooks are best-effort observers and should not affect task scoring
    */
-  executionPolicy?: TaskExecutionPolicy;
+  reporterHooks?: TaskReporterHooks;
   /**
-   * Optional matrix layering for this task definition.
+   * Optional runtime scheduling overrides supplied by CLI or host execution.
    *
    * Use when:
-   * - task-local experiments should refine project/eval defaults
+   * - run operators need to override task/case concurrency without editing eval code
+   * - DSL task runners need to distinguish runtime flags from code defaults
    *
-   * @example
-   * ```ts
-   * matrix: {
-   *   runMatrix: {
-   *     override: {
-   *       model: ['gpt-4.1-mini'],
-   *     },
-   *   },
-   *   evalMatrix: {
-   *     extend: {
-   *       evaluator: ['default-judge'],
-   *     },
-   *   },
-   * }
-   * ```
+   * Expects:
+   * - values are positive integers when provided
+   *
+   * @default undefined
    */
-  matrix?: ScopedMatrices;
+  runtimeConcurrency?: TaskConcurrencyConfig;
   /**
-   * Executes one scheduled eval task.
+   * Cooperative abort signal for the current execution.
    */
-  run: (context: TaskRunContext) => Promise<TaskRunOutput> | TaskRunOutput;
-}
-/**
- * Declares the metadata required for a single vieval evaluation module.
- */
-interface EvalDefinition {
-  description: string;
-  name: string;
+  signal?: AbortSignal;
   /**
-   * Optional matrix layering for this eval definition.
+   * Scheduled runner task metadata.
    *
-   * Use when:
-   * - one eval file needs control-group variants that differ from project defaults
+   * Matrix impact on runtime context:
    *
-   * @example
-   * ```ts
-   * matrix: {
-   *   runMatrix: {
-   *     extend: {
-   *       promptStyle: ['concise'],
-   *     },
-   *     override: {
-   *       scenario: ['eval-scenario'],
-   *     },
-   *   },
-   *   evalMatrix: {
-   *     override: {
-   *       rubric: ['strict'],
-   *     },
-   *   },
-   * }
+   * ```txt
+   * project/eval/task matrix definitions
+   *   -> scheduler expands run rows x eval rows
+   *   -> one scheduled task per row pair
+   *   -> context.task.matrix = {
+   *        run:  selected run-axis values,
+   *        eval: selected eval-axis values,
+   *        meta: { runRowId, evalRowId }
+   *      }
    * ```
    *
-   * Context impact:
+   * Practical impact:
+   * - `runMatrix` axes appear under `context.task.matrix.run.*`
+   * - `evalMatrix` axes appear under `context.task.matrix.eval.*`
+   * - row ids are stable labels for grouping/aggregation under `context.task.matrix.meta.*`
    *
-   * ```txt
-   * project.runMatrix + eval.matrix.runMatrix + task.matrix.runMatrix
-   *   => context.task.matrix.run
+   * @example
+   * ```ts
+   * // If final selected rows are:
+   * // run:  { model: 'gpt-4.1-mini', scenario: 'stress', promptLanguage: 'zh' }
+   * // eval: { rubric: 'strict', rubricModel: 'judge-large' }
    *
-   * project.evalMatrix + eval.matrix.evalMatrix + task.matrix.evalMatrix
-   *   => context.task.matrix.eval
+   * context.task.matrix.run.model // 'gpt-4.1-mini'
+   * context.task.matrix.run.scenario // 'stress'
+   * context.task.matrix.eval.rubric // 'strict'
+   * context.task.matrix.meta.runRowId // stable encoded row id
    * ```
    */
-  matrix?: ScopedMatrices;
+  task: ScheduledTask;
   /**
-   * Optional task implementation executed by runner.
+   * Optional telemetry runtime shared by runner, DSL, and reporter integrations.
+   *
+   * Use when:
+   * - task execution should emit events to the currently active telemetry runtime
+   * - enabled and disabled telemetry should keep the same execution path
+   *
+   * Expects:
+   * - callers inject a no-op runtime when telemetry is disabled
    */
-  task?: TaskDefinition;
+  telemetry?: TelemetryRuntime;
 }
 /**
- * Describes the shape of an imported vieval evaluation module.
+ * Output of one eval task execution.
  */
-interface EvalModule<TDefinition extends EvalDefinition = EvalDefinition> {
-  default: TDefinition;
+interface TaskRunOutput {
+  /**
+   * Scores emitted by this task run.
+   */
+  scores: readonly RunScore[];
 }
-/**
- * Maps module URLs to their loaded vieval evaluation modules.
- */
-type EvalModuleMap = Record<string, EvalModule>;
-/**
- * Represents a normalized evaluation entry collected by the runner.
- */
-type CollectedEvalEntry<TDefinition extends EvalDefinition = EvalDefinition> = TDefinition & {
-  directory: string;
-  filePath: string;
-  id: string;
-};
 //#endregion
 //#region src/config/define.d.ts
 /**
@@ -1339,19 +1339,19 @@ declare function defineTask<const TDefinition extends TaskDefinition>(definition
  * - a typed plugin shape bound to one config object
  */
 interface ConfigHookPlugin<TConfig> {
-  /**
-   * Stable plugin name for diagnostics.
-   */
-  name: string;
   /**
    * Optional config transform hook.
    */
-  configVieval?: (config: TConfig) => TConfig | void | Promise<TConfig | void>;
+  configVieval?: (config: TConfig) => Promise<TConfig | void> | TConfig | void;
   /**
    * Optional hook after config is finalized.
    */
-  configVievalResolved?: (config: TConfig) => void | Promise<void>;
+  configVievalResolved?: (config: TConfig) => Promise<void> | void;
+  /**
+   * Stable plugin name for diagnostics.
+   */
+  name: string;
 }
 //#endregion
 export { InferenceExecutor as $, RunScheduledTasksOptions as A, asProjectRelativePath as B, TaskDefinition as C, TaskRunContext as D, TaskReporterHooks as E, CreateTaskExecutionContextOptions as F, AggregatedProviderSummary as G, CreateVievalRunnerRuntimeContextOptions as H, TaskExecutionContext as I, RunResult as J, AggregatedRunResults as K, createTaskExecutionContext as L, RunnerTaskState as M, ScheduledTaskExecutor as N, TaskRunOutput as O, runScheduledTasks as P, CreateRunnerScheduleOptions as Q, ModelDefinition as R, TaskConcurrencyConfig as S, TaskReporterEventPayload as T, RunnerRuntimeContext as U, collectEvalEntries as V, createRunnerRuntimeContext as W, RunScoreKind as X, RunScore as Y, aggregateRunResults as Z, ScopedMatrices as _, CliOpenTelemetryReportingConfig as a, ScheduledTaskMatrixMeta as at, TaskCaseReporterPayload as b, EvalDefinition as c, createFilesystemTaskCacheRuntime as ct, MatrixAxisValues as d, CacheFileOptions as dt, RunnerMatrixDefinition as et, MatrixDefinition as f, CacheNamespace as ft, MatrixValue as g, MatrixRow as h, Awaitable as i, ScheduledTaskMatrix as it, RunnerExecutionError as j, TelemetryAttributeValue as k, EvalModule as l, normalizeCacheFilePathSegments as lt, MatrixPrimitive as m, defineEval as n, RunnerMatrixSelection as nt, CliReportingConfig as o, createRunnerSchedule as ot, MatrixLayer as p, TaskCacheRuntime as pt, AggregatedRunSummary as q, defineTask as r, ScheduledTask as rt, CollectedEvalEntry as s, CreateFilesystemTaskCacheRuntimeOptions as st, ConfigHookPlugin as t, RunnerMatrixInput as tt, EvalModuleMap as u, CacheFileHandle as ut, TaskAutoRetryDelay as v, TaskExecutionPolicy as w, TaskCaseState as x, TaskCaseReporterEndPayload as y, resolveModelByName as z };
-//# sourceMappingURL=index-D_aMeWqO.d.mts.map
+//# sourceMappingURL=index-BLIlhiWT.d.mts.map