npm - vieval - Versions diffs - 0.0.1 - Mend

vieval 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

package/README.md +290 -0
package/dist/assertions-DcAjfVDA.mjs +183 -0
package/dist/assertions-DcAjfVDA.mjs.map +1 -0
package/dist/cli/index.d.mts +11 -0
package/dist/cli/index.mjs +1434 -0
package/dist/cli/index.mjs.map +1 -0
package/dist/config-D2fe1SnT.mjs +17 -0
package/dist/config-D2fe1SnT.mjs.map +1 -0
package/dist/config.d.mts +3 -0
package/dist/config.mjs +3 -0
package/dist/core/assertions/index.d.mts +2 -0
package/dist/core/assertions/index.mjs +2 -0
package/dist/core/inference-executors/index.d.mts +273 -0
package/dist/core/inference-executors/index.mjs +225 -0
package/dist/core/inference-executors/index.mjs.map +1 -0
package/dist/core/processors/results/index.d.mts +96 -0
package/dist/core/processors/results/index.mjs +64 -0
package/dist/core/processors/results/index.mjs.map +1 -0
package/dist/core/runner/index.d.mts +2 -0
package/dist/core/runner/index.mjs +2 -0
package/dist/expect-0jPJ7Zio.d.mts +2318 -0
package/dist/expect-extensions-CwPtgTz8.mjs +13471 -0
package/dist/expect-extensions-CwPtgTz8.mjs.map +1 -0
package/dist/expect-i9WZWGrA.mjs +17 -0
package/dist/expect-i9WZWGrA.mjs.map +1 -0
package/dist/expect.d.mts +2 -0
package/dist/expect.mjs +2 -0
package/dist/index-DP7jsORl.d.mts +947 -0
package/dist/index-oSXhM1zx.d.mts +314 -0
package/dist/index.d.mts +92 -0
package/dist/index.mjs +150 -0
package/dist/index.mjs.map +1 -0
package/dist/magic-string.es-CH1jwzMg.mjs +1013 -0
package/dist/magic-string.es-CH1jwzMg.mjs.map +1 -0
package/dist/models-D_MsBtYw.mjs +14 -0
package/dist/models-D_MsBtYw.mjs.map +1 -0
package/dist/plugin-DVaRZY2x.d.mts +84 -0
package/dist/plugins/chat-models/index.d.mts +90 -0
package/dist/plugins/chat-models/index.mjs +48 -0
package/dist/plugins/chat-models/index.mjs.map +1 -0
package/dist/registry-ChOjjdEC.mjs +245 -0
package/dist/registry-ChOjjdEC.mjs.map +1 -0
package/dist/runner-4ZsOveoY.mjs +480 -0
package/dist/runner-4ZsOveoY.mjs.map +1 -0
package/dist/testing/expect-extensions.d.mts +86 -0
package/dist/testing/expect-extensions.mjs +2 -0
package/package.json +88 -0

package/dist/index-DP7jsORl.d.mts ADDED Viewed

@@ -0,0 +1,947 @@
+import { n as ModelDefinition } from "./plugin-DVaRZY2x.mjs";
+//#region src/core/runner/schedule.d.ts
+/**
+ * Describes the inferenceExecutor target for a scheduled eval run.
+ */
+interface InferenceExecutor {
+  /**
+   * Stable inferenceExecutor identifier such as `openai:gpt-4.1-mini`.
+   */
+  id: string;
+}
+/**
+ * Stores the selected value for each matrix axis, keyed by axis name (values are strings).
+ */
+type RunnerMatrixSelection = Record<string, string>;
+/**
+ * Stores stable row ids for one resolved scheduled task matrix.
+ */
+interface ScheduledTaskMatrixMeta {
+  /**
+   * Stable row id for the resolved run matrix selection.
+   */
+  runRowId: string;
+  /**
+   * Stable row id for the resolved eval matrix selection.
+   */
+  evalRowId: string;
+}
+/**
+ * Stores the structured matrix payload for one scheduled task.
+ */
+interface ScheduledTaskMatrix {
+  /**
+   * Runtime matrix selection visible to task code.
+   */
+  run: RunnerMatrixSelection;
+  /**
+   * Eval-time matrix selection visible to task code.
+   */
+  eval: RunnerMatrixSelection;
+  /**
+   * Stable row ids for both scopes.
+   */
+  meta: ScheduledTaskMatrixMeta;
+}
+/**
+ * Maps matrix axis names to the values that should be expanded.
+ */
+type RunnerMatrixDefinition = MatrixDefinition;
+/**
+ * Accepts either flat axis definitions or one layered matrix object.
+ */
+type RunnerMatrixInput = RunnerMatrixDefinition | MatrixLayer;
+/**
+ * Represents one fully expanded runner task.
+ */
+interface ScheduledTask {
+  /**
+   * Stable task id derived from the entry, inferenceExecutor, and matrix selection.
+   */
+  id: string;
+  /**
+   * The collected eval entry to execute.
+   */
+  entry: CollectedEvalEntry;
+  /**
+   * The inferenceExecutor selected for this task.
+   */
+  inferenceExecutor: InferenceExecutor;
+  /**
+   * The concrete scoped matrix selection for this task.
+   */
+  matrix: ScheduledTaskMatrix;
+}
+/**
+ * Configures how the runner should expand its execution matrix.
+ */
+interface CreateRunnerScheduleOptions {
+  /**
+   * Collected eval entries that should be scheduled.
+   */
+  entries: readonly CollectedEvalEntry[];
+  /**
+   * Inference executors that should run each entry.
+   */
+  inferenceExecutors: readonly InferenceExecutor[];
+  /**
+   * Optional run-time matrix (flat axes or one layered object) expanded as a cartesian product.
+   */
+  runMatrix?: RunnerMatrixInput;
+  /**
+   * Optional eval-time matrix (flat axes or one layered object) expanded as a cartesian product.
+   */
+  evalMatrix?: RunnerMatrixInput;
+}
+/**
+ * Expands collected entries into a stable runner schedule.
+ *
+ * Call stack:
+ *
+ * {@link collectEvalEntries} (`../runner`)
+ *   -> {@link createRunnerSchedule}
+ *     -> {@link expandAxesToRows}
+ *       -> {@link ScheduledTask}[]
+ *
+ * Use when:
+ * - the runner already knows which eval entries are available
+ * - each entry must run against multiple inferenceExecutors or matrix variants
+ *
+ * Expects:
+ * - `entries` and `inferenceExecutors` to be provided in the desired execution order
+ * - matrix axes to use insertion order when generating combinations
+ */
+declare function createRunnerSchedule(options: CreateRunnerScheduleOptions): ScheduledTask[];
+//#endregion
+//#region src/core/runner/aggregate.d.ts
+/**
+ * Identifies the scoring family for a single eval score.
+ */
+type RunScoreKind = 'exact' | 'judge';
+/**
+ * Represents one normalized score emitted by a completed eval run.
+ */
+interface RunScore {
+  /**
+   * Score family used for aggregation.
+   */
+  kind: RunScoreKind;
+  /**
+   * Normalized score in the `0..1` range.
+   */
+  score: number;
+}
+/**
+ * Captures the output of one scheduled runner task.
+ */
+interface RunResult {
+  /**
+   * Stable run id, usually copied from the scheduled task id.
+   */
+  id: string;
+  /**
+   * Collected eval entry id.
+   */
+  entryId: string;
+  /**
+   * Stable inferenceExecutor id.
+   */
+  inferenceExecutorId: string;
+  /**
+   * Concrete matrix selection used by the run.
+   */
+  matrix: ScheduledTaskMatrix;
+  /**
+   * Raw scores emitted by the eval.
+   */
+  scores: readonly RunScore[];
+}
+/**
+ * Stores the per-run score averages after normalization.
+ */
+interface AggregatedRunSummary {
+  /**
+   * Stable run id.
+   */
+  id: string;
+  /**
+   * Collected eval entry id.
+   */
+  entryId: string;
+  /**
+   * Stable inferenceExecutor id.
+   */
+  inferenceExecutorId: string;
+  /**
+   * Concrete matrix selection used by the run.
+   */
+  matrix: ScheduledTaskMatrix;
+  /**
+   * Mean of exact-match scores or `null` when absent.
+   */
+  exactAverage: number | null;
+  /**
+   * Mean of judge-based scores or `null` when absent.
+   */
+  judgeAverage: number | null;
+  /**
+   * Hybrid average. Uses both families when present, otherwise falls back to the
+   * single available family.
+   */
+  hybridAverage: number | null;
+}
+/**
+ * Stores inferenceExecutor-level score aggregates across multiple runs.
+ */
+interface AggregatedProviderSummary {
+  /**
+   * Stable inferenceExecutor id.
+   */
+  inferenceExecutorId: string;
+  /**
+   * Number of runs included in this inferenceExecutor bucket.
+   */
+  runCount: number;
+  /**
+   * Mean of all exact-match scores or `null` when absent.
+   */
+  exactAverage: number | null;
+  /**
+   * Mean of all judge-based scores or `null` when absent.
+   */
+  judgeAverage: number | null;
+  /**
+   * Hybrid average derived from the inferenceExecutor exact and judge means.
+   */
+  hybridAverage: number | null;
+}
+/**
+ * Stores the final aggregation output for a batch of runner results.
+ */
+interface AggregatedRunResults {
+  /**
+   * Per-run normalized score summaries.
+   */
+  runs: AggregatedRunSummary[];
+  /**
+   * Per-inferenceExecutor summaries sorted by inferenceExecutor id.
+   */
+  inferenceExecutors: AggregatedProviderSummary[];
+  /**
+   * Overall summary across every run.
+   */
+  overall: {
+    exactAverage: number | null;
+    judgeAverage: number | null;
+    hybridAverage: number | null;
+    runCount: number;
+  };
+}
+/**
+ * Aggregates exact-match and judge-based scores into hybrid runner summaries.
+ *
+ * Call stack:
+ *
+ * {@link runScheduledTasks}
+ *   -> {@link aggregateRunResults}
+ *     -> {@link createRunSummary}
+ *     -> {@link createProviderSummary}
+ *       -> `report output`
+ *
+ * Use when:
+ * - a runner batch mixes deterministic exact checks with judge-based grading
+ * - inferenceExecutor comparison should preserve both score families and one hybrid view
+ *
+ * Expects:
+ * - each score to be normalized to the `0..1` range before aggregation
+ * - every entry in `scores` to have a `kind` of either `'exact'` or `'judge'`
+ */
+declare function aggregateRunResults(results: readonly RunResult[]): AggregatedRunResults;
+//#endregion
+//#region src/core/runner/runtime-context.d.ts
+/**
+ * Shared runtime context used by the vieval runner.
+ *
+ * Use when:
+ * - runner services need stable path resolution without module-level side effects
+ * - call sites want deterministic control over workspace root detection
+ */
+interface RunnerRuntimeContext {
+  /**
+   * Absolute project root directory used for path normalization.
+   */
+  projectRootDirectory: string;
+}
+/**
+ * Options used to construct the runner runtime context.
+ */
+interface CreateVievalRunnerRuntimeContextOptions {
+  /**
+   * Directory used to search for the nearest pnpm workspace.
+   *
+   * @default directory of this module file
+   */
+  cwd?: string;
+  /**
+   * Absolute fallback directory when a pnpm workspace root is not found.
+   *
+   * @default package root directory (`packages/vieval`)
+   */
+  fallbackProjectRootDirectory?: string;
+}
+/**
+ * Creates a side-effect-free runtime context for runner path normalization.
+ *
+ * Call stack:
+ *
+ * {@link createRunnerRuntimeContext}
+ *   -> `findWorkspaceDir(cwd)`
+ *     -> `resolve projectRootDirectory`
+ *       -> `{ projectRootDirectory }`
+ *
+ * Use when:
+ * - initializing runner infrastructure before collecting eval modules
+ * - tests need deterministic root resolution behavior
+ */
+declare function createRunnerRuntimeContext(options?: CreateVievalRunnerRuntimeContextOptions): Promise<RunnerRuntimeContext>;
+//#endregion
+//#region src/core/runner/collect.d.ts
+/**
+ * Converts a file path into a project-relative path when possible.
+ *
+ * Before: `/repo/plugins/airi-plugin-game-chess/src/agent/evals/chess-commentary.eval.ts`
+ * After: `plugins/airi-plugin-game-chess/src/agent/evals/chess-commentary.eval.ts`
+ *
+ * Before: `D:/repo/plugins/airi-plugin-game-chess/src/agent/evals/chess-commentary.eval.ts`
+ * After: `D:/repo/plugins/airi-plugin-game-chess/src/agent/evals/chess-commentary.eval.ts`
+ */
+declare function asProjectRelativePath(filePath: string, context: RunnerRuntimeContext): string;
+/**
+ * Collects loaded vieval modules into sorted runner entries with stable ids.
+ *
+ * Call stack:
+ *
+ * `import.meta.glob(...)`
+ *   -> {@link collectEvalEntries}
+ *     -> {@link createCollectedEvalEntry}
+ *       -> {@link CollectedEvalEntry}[]
+ *
+ * Use when:
+ * - the runner has already loaded candidate eval modules
+ * - downstream scheduling needs stable entry ids and directory metadata
+ */
+declare function collectEvalEntries(modules: EvalModuleMap, context: RunnerRuntimeContext): CollectedEvalEntry[];
+//#endregion
+//#region src/core/runner/task-context.d.ts
+/**
+ * Options for selecting a model from the execution context.
+ */
+interface TaskModelSelectionOptions {
+  /**
+   * Model id or alias name.
+   */
+  name: string;
+}
+/**
+ * Task-scoped execution context exposed to runner executors.
+ */
+interface TaskExecutionContext {
+  /**
+   * Resolves model configuration for the current task.
+   *
+   * Use when:
+   * - called with no argument: uses the model selected by run matrix/inferenceExecutor
+   * - called with `{ name }` (or a bare string, presumably equivalent — confirm): resolves a specific model id or alias
+   */
+  model: (selection?: string | TaskModelSelectionOptions) => ModelDefinition;
+}
+/**
+ * Inputs used to build task execution context.
+ */
+interface CreateTaskExecutionContextOptions {
+  models: readonly ModelDefinition[];
+  task: ScheduledTask;
+}
+/**
+ * Creates task-scoped model resolver context for runner execution.
+ *
+ * Call stack:
+ *
+ * {@link runScheduledTasks}
+ *   -> {@link createTaskExecutionContext}
+ *     -> {@link resolveModelByName}
+ *       -> `context.model()` / `context.model({ name })`
+ */
+declare function createTaskExecutionContext(options: CreateTaskExecutionContextOptions): TaskExecutionContext;
+//#endregion
+//#region src/core/runner/run.d.ts
+/**
+ * Executes one scheduled runner task and resolves to a normalized run result.
+ *
+ * Use when:
+ * - a scheduler already selected the task and execution context
+ * - the caller wants a typed executor contract for runner workers
+ *
+ * Expects:
+ * - the task context to be ready for model resolution and task-scoped work
+ *
+ * Returns:
+ * - a promise resolving to a normalized run result with score entries ready for aggregation
+ */
+type ScheduledTaskExecutor = (task: ScheduledTask, context: TaskExecutionContext) => Promise<RunResult>;
+/**
+ * Terminal task state reported by runner lifecycle hooks.
+ *
+ * Use when:
+ * - reporting the outcome of one scheduled task to lifecycle observers
+ *
+ * Expects:
+ * - hooks treat the value as final for the completed task
+ */
+type RunnerTaskState = 'passed' | 'failed';
+/**
+ * Optional runner execution hooks used while processing scheduled tasks.
+ *
+ * Use when:
+ * - callers want lifecycle visibility around sequential task execution
+ * - task execution should remain deterministic while still observable
+ *
+ * Expects:
+ * - hook functions are synchronous lifecycle observers
+ */
+interface RunScheduledTasksOptions {
+  /**
+   * Creates per-task execution context.
+   *
+   * Use when:
+   * - executor code needs per-task model resolution or other task-scoped data
+   */
+  createExecutionContext?: (task: ScheduledTask) => TaskExecutionContext;
+  /**
+   * Runs before the executor starts handling a task.
+   *
+   * Use when:
+   * - callers want to observe task activation before execution begins
+   *
+   * Expects:
+   * - thrown errors abort the task before executor work starts
+   */
+  onTaskStart?: (task: ScheduledTask) => void;
+  /**
+   * Runs after the executor settles for a task.
+   *
+   * Use when:
+   * - callers want to observe successful and failed task completion
+   *
+   * Expects:
+   * - thrown errors abort successful runs
+   * - failed-task observers do not override the executor error for the task
+   */
+  onTaskEnd?: (task: ScheduledTask, state: RunnerTaskState) => void;
+}
+/**
+ * Error thrown when a scheduled run fails before producing a normalized result.
+ */
+declare class RunnerExecutionError extends Error {
+  /**
+   * Stable task id that failed.
+   */
+  taskId: string;
+  constructor(taskId: string, cause: unknown);
+}
+/**
+ * Executes runner tasks sequentially and aggregates the normalized results.
+ *
+ * Call stack:
+ *
+ * {@link createRunnerSchedule}
+ *   -> {@link runScheduledTasks}
+ *     -> `executor(task, context)`
+ *       -> {@link aggregateRunResults}
+ *
+ * Use when:
+ * - the caller already expanded the runner matrix
+ * - task execution should stay deterministic and easy to debug
+ *
+ * Expects:
+ * - `executor` to return normalized `0..1` scores
+ * - callers to handle concurrency outside this helper when needed
+ * - `onTaskStart` / `onTaskEnd` hooks to be synchronous lifecycle observers
+ *
+ * Throws:
+ * - `RunnerExecutionError` when task setup, hooks, or the executor throws
+ */
+declare function runScheduledTasks(tasks: readonly ScheduledTask[], executor: ScheduledTaskExecutor, options?: RunScheduledTasksOptions): Promise<AggregatedRunResults>;
+//#endregion
+//#region src/config/types.d.ts
+/**
+ * Primitive value allowed in one matrix cell.
+ *
+ * Use when:
+ * - defining axis values for canonical layered matrix config
+ * - preserving JSON-safe primitive values through config normalization
+ *
+ * Expects:
+ * - values remain serializable and comparable with stringified task ids
+ *
+ * Returns:
+ * - one JSON-friendly primitive matrix value
+ */
+type MatrixPrimitive = string | number | boolean;
+/**
+ * Canonical matrix value type.
+ *
+ * Use when:
+ * - declaring matrix axis values at the config boundary
+ *
+ * Expects:
+ * - values are normalized from config input without extra wrapping
+ *
+ * Returns:
+ * - a primitive cell value suitable for matrix expansion
+ */
+type MatrixValue = MatrixPrimitive;
+/**
+ * Canonical row payload for one matrix combination.
+ *
+ * Use when:
+ * - storing the selected values for a resolved matrix row
+ * - passing task-level matrix context between layers
+ *
+ * Expects:
+ * - keys are axis names and values are resolved axis selections
+ *
+ * Returns:
+ * - one resolved row object
+ */
+type MatrixRow = Record<string, MatrixValue>;
+/**
+ * Canonical axis value list for one matrix definition.
+ *
+ * Use when:
+ * - describing the values that one axis can expand into
+ *
+ * Expects:
+ * - values are ordered and deterministic
+ *
+ * Returns:
+ * - one axis value list
+ */
+type MatrixAxisValues = readonly MatrixValue[];
+/**
+ * Canonical flat matrix definition.
+ *
+ * Use when:
+ * - declaring which values each named matrix axis expands into
+ *
+ * Expects:
+ * - keys are axis names
+ * - each value is the list of values for that axis
+ * - layering (`extend` / `override` / `disable`) is expressed with `MatrixLayer`, not here
+ *
+ * Returns:
+ * - one map from axis name to axis values
+ */
+type MatrixDefinition = Record<string, MatrixAxisValues>;
+/**
+ * Canonical matrix layer payload.
+ *
+ * Use when:
+ * - a project, eval, or task needs scoped matrix layering
+ *
+ * Expects:
+ * - absent sections are treated as empty
+ *
+ * Resolution order:
+ *
+ * ```txt
+ * current-layer:
+ *   disable -> extend -> override
+ * ```
+ *
+ * Returns:
+ * - a layer object with optional extend, override, and disable sections
+ *
+ * @example
+ * ```ts
+ * const layer: MatrixLayer = {
+ *   disable: ['temperatureProfile'],
+ *   extend: {
+ *     scenario: ['baseline', 'stress'],
+ *   },
+ *   override: {
+ *     model: ['gpt-4.1-mini'],
+ *   },
+ * }
+ * ```
+ */
+interface MatrixLayer {
+  /**
+   * Matrix axes inherited or appended at this layer.
+   *
+   * @example
+   * ```ts
+   * extend: {
+   *   promptLanguage: ['en', 'zh'],
+   *   scenario: ['baseline'],
+   * }
+   * ```
+   */
+  extend?: MatrixDefinition;
+  /**
+   * Matrix axes replaced at this layer.
+   *
+   * @example
+   * ```ts
+   * override: {
+   *   rubric: ['strict'],
+   * }
+   * ```
+   */
+  override?: MatrixDefinition;
+  /**
+   * Matrix axes disabled at this layer.
+   *
+   * @example
+   * ```ts
+   * disable: ['temperatureProfile']
+   * ```
+   */
+  disable?: readonly string[];
+}
+/**
+ * Canonical run/eval matrix grouping.
+ *
+ * Use when:
+ * - a task or eval definition needs separate run and eval matrix scopes
+ *
+ * Expects:
+ * - each scope is optional and independently normalized
+ *
+ * Orchestration model:
+ *
+ * ```txt
+ * run scope:
+ *   project.runMatrix -> eval.matrix.runMatrix -> task.matrix.runMatrix
+ *
+ * eval scope:
+ *   project.evalMatrix -> eval.matrix.evalMatrix -> task.matrix.evalMatrix
+ *
+ * expanded tasks:
+ *   run rows x eval rows
+ * ```
+ *
+ * Returns:
+ * - a grouped matrix object with optional run and eval layers
+ *
+ * @example
+ * ```ts
+ * const scoped: ScopedMatrices = {
+ *   runMatrix: {
+ *     extend: {
+ *       model: ['gpt-4.1-mini', 'gpt-4.1'],
+ *       scenario: ['baseline', 'stress'],
+ *     },
+ *   },
+ *   evalMatrix: {
+ *     extend: {
+ *       rubric: ['strict', 'lenient'],
+ *       rubricModel: ['judge-mini', 'judge-large'],
+ *     },
+ *   },
+ * }
+ * ```
+ */
+interface ScopedMatrices {
+  /**
+   * Runtime matrix scope.
+   *
+   * @example
+   * ```ts
+   * runMatrix: {
+   *   extend: {
+   *     promptLanguage: ['en', 'zh'],
+   *   },
+   * }
+   * ```
+   */
+  runMatrix?: MatrixLayer;
+  /**
+   * Eval-time matrix scope.
+   *
+   * @example
+   * ```ts
+   * evalMatrix: {
+   *   override: {
+   *     rubric: ['strict'],
+   *   },
+   * }
+   * ```
+   */
+  evalMatrix?: MatrixLayer;
+}
+/**
+ * Output of one eval task execution.
+ */
+interface TaskRunOutput {
+  /**
+   * Scores emitted by this task run.
+   */
+  scores: readonly RunScore[];
+}
+/**
+ * Runtime context passed into eval task `run`.
+ */
+interface TaskRunContext {
+  /**
+   * Scheduled runner task metadata.
+   *
+   * Matrix impact on runtime context:
+   *
+   * ```txt
+   * project/eval/task matrix definitions
+   *   -> scheduler expands run rows x eval rows
+   *   -> one scheduled task per row pair
+   *   -> context.task.matrix = {
+   *        run:  selected run-axis values,
+   *        eval: selected eval-axis values,
+   *        meta: { runRowId, evalRowId }
+   *      }
+   * ```
+   *
+   * Practical impact:
+   * - `runMatrix` axes appear under `context.task.matrix.run.*`
+   * - `evalMatrix` axes appear under `context.task.matrix.eval.*`
+   * - row ids are stable labels for grouping/aggregation under `context.task.matrix.meta.*`
+   *
+   * @example
+   * ```ts
+   * // If final selected rows are:
+   * // run:  { model: 'gpt-4.1-mini', scenario: 'stress', promptLanguage: 'zh' }
+   * // eval: { rubric: 'strict', rubricModel: 'judge-large' }
+   *
+   * context.task.matrix.run.model // 'gpt-4.1-mini'
+   * context.task.matrix.run.scenario // 'stress'
+   * context.task.matrix.eval.rubric // 'strict'
+   * context.task.matrix.meta.runRowId // stable encoded row id
+   * ```
+   */
+  task: ScheduledTask;
+  /**
+   * Matrix-scoped model resolver.
+   *
+   * Runtime impact:
+   * - `context.model()` uses `context.task.matrix.run.model` first when present
+   * - then falls back to inferenceExecutor-id match
+   * - then falls back to first configured model
+   *
+   * @example
+   * ```ts
+   * // matrix.run.model = 'gpt-4.1-mini'
+   * const defaultModel = context.model()
+   * // resolves the configured model whose id/model/alias matches 'gpt-4.1-mini'
+   *
+   * const judgeModel = context.model({ name: 'judge-large' })
+   * // explicit lookup bypasses matrix default
+   * ```
+   */
+  model: TaskExecutionContext['model'];
+  /**
+   * Optional reporter lifecycle hooks for task-local case events.
+   *
+   * Use when:
+   * - a caller wants visibility into each case without coupling to the CLI reporter layer
+   *
+   * Expects:
+   * - hooks are best-effort observers and should not affect task scoring
+   */
+  reporterHooks?: TaskReporterHooks;
+}
+/**
+ * Allowed terminal outcomes for one task case.
+ *
+ * Use when:
+ * - emitting case lifecycle events from the task DSL
+ *
+ * Expects:
+ * - consumers treat the value as the final state for the case
+ */
+type TaskCaseState = 'passed' | 'failed';
+/**
+ * Payload emitted when a task case starts.
+ *
+ * Use when:
+ * - reporter hooks need a stable position for one case within the task
+ *
+ * Expects:
+ * - `name` is the declared DSL case label
+ * - `index` is the zero-based case position within the task
+ * - `total` is the total number of registered cases
+ */
+interface TaskCaseReporterPayload {
+  /**
+   * Declared case label.
+   */
+  name: string;
+  /**
+   * Zero-based case position within the task.
+   */
+  index: number;
+  /**
+   * Total number of registered cases.
+   */
+  total: number;
+}
+/**
+ * Payload emitted when a task case ends.
+ *
+ * Use when:
+ * - reporter hooks need the case position plus terminal state
+ *
+ * Expects:
+ * - `name` is the declared DSL case label
+ * - `index` is the zero-based case position within the task
+ * - `total` is the total number of registered cases
+ * - `state` describes the final case result
+ */
+interface TaskCaseReporterEndPayload extends TaskCaseReporterPayload {
+  /**
+   * Final case state.
+   */
+  state: TaskCaseState;
+}
+/**
+ * Reporter hooks invoked around each task case execution.
+ *
+ * Use when:
+ * - a caller needs case-level lifecycle visibility from the DSL runner
+ * - downstream reporters should stay decoupled from the task execution path
+ *
+ * Expects:
+ * - hooks observe case start/end events but do not influence scoring
+ */
+interface TaskReporterHooks {
+  /**
+   * Runs when a case is about to execute.
+   */
+  onCaseStart?: (payload: TaskCaseReporterPayload) => void;
+  /**
+   * Runs after a case settles.
+   */
+  onCaseEnd?: (payload: TaskCaseReporterEndPayload) => void;
+}
+/**
+ * Eval task definition used by `defineTask`.
+ */
+interface TaskDefinition {
+  /**
+   * Stable task id for diagnostics.
+   */
+  id: string;
+  /**
+   * Optional matrix layering for this task definition.
+   *
+   * Use when:
+   * - task-local experiments should refine project/eval defaults
+   *
+   * @example
+   * ```ts
+   * matrix: {
+   *   runMatrix: {
+   *     override: {
+   *       model: ['gpt-4.1-mini'],
+   *     },
+   *   },
+   *   evalMatrix: {
+   *     extend: {
+   *       evaluator: ['default-judge'],
+   *     },
+   *   },
+   * }
+   * ```
+   */
+  matrix?: ScopedMatrices;
+  /**
+   * Executes one scheduled eval task.
+   */
+  run: (context: TaskRunContext) => Promise<TaskRunOutput> | TaskRunOutput;
+}
+/**
+ * Declares the metadata required for a single vieval evaluation module.
+ */
+interface EvalDefinition {
+  description: string;
+  name: string;
+  /**
+   * Optional matrix layering for this eval definition.
+   *
+   * Use when:
+   * - one eval file needs control-group variants that differ from project defaults
+   *
+   * @example
+   * ```ts
+   * matrix: {
+   *   runMatrix: {
+   *     extend: {
+   *       promptStyle: ['concise'],
+   *     },
+   *     override: {
+   *       scenario: ['eval-scenario'],
+   *     },
+   *   },
+   *   evalMatrix: {
+   *     override: {
+   *       rubric: ['strict'],
+   *     },
+   *   },
+   * }
+   * ```
+   *
+   * Context impact:
+   *
+   * ```txt
+   * project.runMatrix + eval.matrix.runMatrix + task.matrix.runMatrix
+   *   => context.task.matrix.run
+   *
+   * project.evalMatrix + eval.matrix.evalMatrix + task.matrix.evalMatrix
+   *   => context.task.matrix.eval
+   * ```
+   */
+  matrix?: ScopedMatrices;
+  /**
+   * Optional task implementation executed by runner.
+   */
+  task?: TaskDefinition;
+}
+/**
+ * Describes the shape of an imported vieval evaluation module.
+ */
+interface EvalModule<TDefinition extends EvalDefinition = EvalDefinition> {
+  default: TDefinition;
+}
+/**
+ * Maps module URLs to their loaded vieval evaluation modules.
+ */
+type EvalModuleMap = Record<string, EvalModule>;
+/**
+ * Represents a normalized evaluation entry collected by the runner.
+ */
+type CollectedEvalEntry<TDefinition extends EvalDefinition = EvalDefinition> = TDefinition & {
+  directory: string;
+  filePath: string;
+  id: string;
+};
+//#endregion
+//#region src/config/define.d.ts
+/**
+ * Returns the provided vieval definition while preserving literal field types.
+ */
+declare function defineEval<const TDefinition extends EvalDefinition>(definition: TDefinition): TDefinition;
+/**
+ * Returns the provided task definition while preserving literal field types.
+ */
+declare function defineTask<const TDefinition extends TaskDefinition>(definition: TDefinition): TDefinition;
+//#endregion
+export { asProjectRelativePath as A, RunScoreKind as B, RunnerTaskState as C, TaskExecutionContext as D, CreateTaskExecutionContextOptions as E, AggregatedProviderSummary as F, RunnerMatrixInput as G, CreateRunnerScheduleOptions as H, AggregatedRunResults as I, ScheduledTaskMatrix as J, RunnerMatrixSelection as K, AggregatedRunSummary as L, CreateVievalRunnerRuntimeContextOptions as M, RunnerRuntimeContext as N, TaskModelSelectionOptions as O, createRunnerRuntimeContext as P, RunResult as R, RunnerExecutionError as S, runScheduledTasks as T, InferenceExecutor as U, aggregateRunResults as V, RunnerMatrixDefinition as W, createRunnerSchedule as X, ScheduledTaskMatrixMeta as Y, TaskDefinition as _, EvalModule as a, TaskRunOutput as b, MatrixDefinition as c, MatrixRow as d, MatrixValue as f, TaskCaseState as g, TaskCaseReporterPayload as h, EvalDefinition as i, collectEvalEntries as j, createTaskExecutionContext as k, MatrixLayer as l, TaskCaseReporterEndPayload as m, defineTask as n, EvalModuleMap as o, ScopedMatrices as p, ScheduledTask as q, CollectedEvalEntry as r, MatrixAxisValues as s, defineEval as t, MatrixPrimitive as u, TaskReporterHooks as v, ScheduledTaskExecutor as w, RunScheduledTasksOptions as x, TaskRunContext as y, RunScore as z };
+//# sourceMappingURL=index-DP7jsORl.d.mts.map