vieval 0.0.10 → 0.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/README.md +31 -31
  2. package/dist/bin/vieval.mjs +1 -1
  3. package/dist/bin/vieval.mjs.map +1 -1
  4. package/dist/cli/index.d.mts +1 -1
  5. package/dist/cli/index.mjs +1 -1
  6. package/dist/{cli-DTDgaqeI.mjs → cli-uzS81IPd.mjs} +1483 -1483
  7. package/dist/cli-uzS81IPd.mjs.map +1 -0
  8. package/dist/config.d.mts +1 -1
  9. package/dist/config.mjs +1 -1
  10. package/dist/config.mjs.map +1 -1
  11. package/dist/core/assertions/index.d.mts +156 -156
  12. package/dist/core/assertions/index.mjs +82 -82
  13. package/dist/core/assertions/index.mjs.map +1 -1
  14. package/dist/core/inference-executors/index.d.mts +37 -37
  15. package/dist/core/inference-executors/index.mjs +54 -53
  16. package/dist/core/inference-executors/index.mjs.map +1 -1
  17. package/dist/core/processors/results/index.d.mts +18 -18
  18. package/dist/core/processors/results/index.mjs.map +1 -1
  19. package/dist/core/runner/index.d.mts +2 -2
  20. package/dist/core/runner/index.mjs +259 -259
  21. package/dist/core/runner/index.mjs.map +1 -1
  22. package/dist/core/scheduler/index.d.mts +1 -1
  23. package/dist/core/scheduler/index.mjs +65 -65
  24. package/dist/core/scheduler/index.mjs.map +1 -1
  25. package/dist/{env-DfWZy_n4.d.mts → env-Br6jaWGL.d.mts} +9 -9
  26. package/dist/{env-nV5rVErX.mjs → env-egxaJtNn.mjs} +8 -8
  27. package/dist/env-egxaJtNn.mjs.map +1 -0
  28. package/dist/{expect-extensions-DCSqlneN.mjs → expect-extensions-BKdEPt3h.mjs} +46 -46
  29. package/dist/expect-extensions-BKdEPt3h.mjs.map +1 -0
  30. package/dist/expect.d.mts +1 -3
  31. package/dist/expect.mjs +1 -1
  32. package/dist/expect.mjs.map +1 -1
  33. package/dist/{index-D_aMeWqO.d.mts → index-BLIlhiWT.d.mts} +565 -565
  34. package/dist/{index-Bg0atWBF.d.mts → index-CIaJClcC.d.mts} +48 -48
  35. package/dist/index.d.mts +208 -197
  36. package/dist/index.mjs +148 -148
  37. package/dist/index.mjs.map +1 -1
  38. package/dist/{models-pBSRUZhY.mjs → models-CaCOUPZw.mjs} +1 -1
  39. package/dist/{models-pBSRUZhY.mjs.map → models-CaCOUPZw.mjs.map} +1 -1
  40. package/dist/plugins/chat-models/index.d.mts +279 -279
  41. package/dist/plugins/chat-models/index.mjs +360 -360
  42. package/dist/plugins/chat-models/index.mjs.map +1 -1
  43. package/dist/{queue-DsZQkZO_.mjs → queue-BL86z2W_.mjs} +1 -1
  44. package/dist/{queue-DsZQkZO_.mjs.map → queue-BL86z2W_.mjs.map} +1 -1
  45. package/dist/{registry-DMnwE_mY.mjs → registry-BK7k6X81.mjs} +294 -294
  46. package/dist/registry-BK7k6X81.mjs.map +1 -0
  47. package/dist/testing/expect-extensions.d.mts +27 -27
  48. package/dist/testing/expect-extensions.mjs +1 -1
  49. package/package.json +12 -12
  50. package/dist/cli-DTDgaqeI.mjs.map +0 -1
  51. package/dist/env-nV5rVErX.mjs.map +0 -1
  52. package/dist/expect-extensions-DCSqlneN.mjs.map +0 -1
  53. package/dist/registry-DMnwE_mY.mjs.map +0 -1
@@ -2,23 +2,6 @@ import { ReadStream, WriteStream } from "node:fs";
2
2
  import { Buffer } from "node:buffer";
3
3
 
4
4
  //#region src/core/cache/types.d.ts
5
- /**
6
- * Cache entry options used to derive one deterministic cache file path.
7
- */
8
- interface CacheFileOptions {
9
- /**
10
- * Optional file extension for the cache artifact (for example: `json`, `txt`, `wav`).
11
- */
12
- ext?: string;
13
- /**
14
- * Deterministic key segments used to build the relative cache path.
15
- */
16
- key: readonly string[];
17
- /**
18
- * Optional media type hint used by adapters when extension is omitted.
19
- */
20
- mediaType?: string;
21
- }
22
5
  /**
23
6
  * One cache file handle exposed to task code.
24
7
  *
@@ -34,18 +17,35 @@ interface CacheFileOptions {
34
17
  * - read/write helpers over one deterministic cache artifact path
35
18
  */
36
19
  interface CacheFileHandle {
37
- path: string;
38
20
  exists: () => Promise<boolean>;
21
+ loadAsCasesInput: <T>() => Promise<T[]>;
22
+ loadAsExpectFixture: <T>() => Promise<T>;
39
23
  openReadStream: () => ReadStream;
40
24
  openWriteStream: () => Promise<WriteStream>;
25
+ path: string;
41
26
  readBuffer: () => Promise<Buffer>;
42
- writeBuffer: (value: Buffer) => Promise<void>;
43
- readText: (encoding?: BufferEncoding) => Promise<string>;
44
- writeText: (value: string, encoding?: BufferEncoding) => Promise<void>;
45
27
  readJson: <T>() => Promise<T>;
28
+ readText: (encoding?: BufferEncoding) => Promise<string>;
29
+ writeBuffer: (value: Buffer) => Promise<void>;
46
30
  writeJson: (value: unknown) => Promise<void>;
47
- loadAsCasesInput: <T>() => Promise<T[]>;
48
- loadAsExpectFixture: <T>() => Promise<T>;
31
+ writeText: (value: string, encoding?: BufferEncoding) => Promise<void>;
32
+ }
33
+ /**
34
+ * Cache entry options used to derive one deterministic cache file path.
35
+ */
36
+ interface CacheFileOptions {
37
+ /**
38
+ * Optional file extension for the cache artifact (for example: `json`, `txt`, `wav`).
39
+ */
40
+ ext?: string;
41
+ /**
42
+ * Deterministic key segments used to build the relative cache path.
43
+ */
44
+ key: readonly string[];
45
+ /**
46
+ * Optional media type hint used by adapters when extension is omitted.
47
+ */
48
+ mediaType?: string;
49
49
  }
50
50
  /**
51
51
  * Namespaced cache accessor for deterministic cache artifacts.
@@ -78,16 +78,6 @@ interface CreateFilesystemTaskCacheRuntimeOptions {
78
78
  */
79
79
  workspaceId: string;
80
80
  }
81
- /**
82
- * Normalizes cache file options into deterministic relative path segments.
83
- *
84
- * Before:
85
- * - `{ key: ['cases', 'dataset hash', 'v1'], ext: 'json' }`
86
- *
87
- * After:
88
- * - `['cases', 'dataset-hash', 'v1.json']`
89
- */
90
- declare function normalizeCacheFilePathSegments(options: CacheFileOptions): string[];
91
81
  /**
92
82
  * Creates a deterministic filesystem-backed task cache runtime.
93
83
  *
@@ -104,6 +94,16 @@ declare function normalizeCacheFilePathSegments(options: CacheFileOptions): stri
104
94
  * `<cacheRootDirectory>/<workspaceId>/<projectName>/<namespace>/...`
105
95
  */
106
96
  declare function createFilesystemTaskCacheRuntime(options: CreateFilesystemTaskCacheRuntimeOptions): TaskCacheRuntime;
97
+ /**
98
+ * Normalizes cache file options into deterministic relative path segments.
99
+ *
100
+ * Before:
101
+ * - `{ key: ['cases', 'dataset hash', 'v1'], ext: 'json' }`
102
+ *
103
+ * After:
104
+ * - `['cases', 'dataset-hash', 'v1.json']`
105
+ */
106
+ declare function normalizeCacheFilePathSegments(options: CacheFileOptions): string[];
107
107
  //#endregion
108
108
  //#region src/core/runner/schedule.d.ts
109
109
  /**
@@ -116,30 +116,21 @@ interface InferenceExecutor {
116
116
  id: string;
117
117
  }
118
118
  /**
119
- * Stores the selected value for each matrix axis.
119
+ * Maps matrix axis names to the values that should be expanded.
120
120
  */
121
- type RunnerMatrixSelection = Record<string, string>;
121
+ type RunnerMatrixDefinition = MatrixDefinition;
122
122
  /**
123
- * Stores stable row ids for one resolved scheduled task matrix.
123
+ * Accepts either flat axis definitions or one layered matrix object.
124
124
  */
125
- interface ScheduledTaskMatrixMeta {
126
- /**
127
- * Stable row id for the resolved run matrix selection.
128
- */
129
- runRowId: string;
130
- /**
131
- * Stable row id for the resolved eval matrix selection.
132
- */
133
- evalRowId: string;
134
- }
125
+ type RunnerMatrixInput = MatrixLayer | RunnerMatrixDefinition;
126
+ /**
127
+ * Stores the selected value for each matrix axis.
128
+ */
129
+ type RunnerMatrixSelection = Record<string, string>;
135
130
  /**
136
131
  * Stores the structured matrix payload for one scheduled task.
137
132
  */
138
133
  interface ScheduledTaskMatrix {
139
- /**
140
- * Runtime matrix selection visible to task code.
141
- */
142
- run: RunnerMatrixSelection;
143
134
  /**
144
135
  * Eval-time matrix selection visible to task code.
145
136
  */
@@ -148,35 +139,23 @@ interface ScheduledTaskMatrix {
148
139
  * Stable row ids for both scopes.
149
140
  */
150
141
  meta: ScheduledTaskMatrixMeta;
142
+ /**
143
+ * Runtime matrix selection visible to task code.
144
+ */
145
+ run: RunnerMatrixSelection;
151
146
  }
152
147
  /**
153
- * Maps matrix axis names to the values that should be expanded.
154
- */
155
- type RunnerMatrixDefinition = MatrixDefinition;
156
- /**
157
- * Accepts either flat axis definitions or one layered matrix object.
158
- */
159
- type RunnerMatrixInput = RunnerMatrixDefinition | MatrixLayer;
160
- /**
161
- * Represents one fully expanded runner task.
148
+ * Stores stable row ids for one resolved scheduled task matrix.
162
149
  */
163
- interface ScheduledTask {
164
- /**
165
- * Stable task id derived from the entry, inferenceExecutor, and matrix selection.
166
- */
167
- id: string;
168
- /**
169
- * The collected eval entry to execute.
170
- */
171
- entry: CollectedEvalEntry;
150
+ interface ScheduledTaskMatrixMeta {
172
151
  /**
173
- * The inferenceExecutor selected for this task.
152
+ * Stable row id for the resolved eval matrix selection.
174
153
  */
175
- inferenceExecutor: InferenceExecutor;
154
+ evalRowId: string;
176
155
  /**
177
- * The concrete scoped matrix selection for this task.
156
+ * Stable row id for the resolved run matrix selection.
178
157
  */
179
- matrix: ScheduledTaskMatrix;
158
+ runRowId: string;
180
159
  }
181
160
  /**
182
161
  * Configures how the runner should expand its execution matrix.
@@ -186,6 +165,10 @@ interface CreateRunnerScheduleOptions {
186
165
  * Collected eval entries that should be scheduled.
187
166
  */
188
167
  entries: readonly CollectedEvalEntry[];
168
+ /**
169
+ * Optional eval-time matrix axes expanded as a cartesian product.
170
+ */
171
+ evalMatrix?: RunnerMatrixInput;
189
172
  /**
190
173
  * Providers that should run each entry.
191
174
  */
@@ -194,10 +177,27 @@ interface CreateRunnerScheduleOptions {
194
177
  * Optional run-time matrix axes expanded as a cartesian product.
195
178
  */
196
179
  runMatrix?: RunnerMatrixInput;
180
+ }
181
+ /**
182
+ * Represents one fully expanded runner task.
183
+ */
184
+ interface ScheduledTask {
197
185
  /**
198
- * Optional eval-time matrix axes expanded as a cartesian product.
186
+ * The collected eval entry to execute.
199
187
  */
200
- evalMatrix?: RunnerMatrixInput;
188
+ entry: CollectedEvalEntry;
189
+ /**
190
+ * Stable task id derived from the entry, inferenceExecutor, and matrix selection.
191
+ */
192
+ id: string;
193
+ /**
194
+ * The inferenceExecutor selected for this task.
195
+ */
196
+ inferenceExecutor: InferenceExecutor;
197
+ /**
198
+ * The concrete scoped matrix selection for this task.
199
+ */
200
+ matrix: ScheduledTaskMatrix;
201
201
  }
202
202
  /**
203
203
  * Expands collected entries into a stable runner schedule.
@@ -221,128 +221,128 @@ declare function createRunnerSchedule(options: CreateRunnerScheduleOptions): Sch
221
221
  //#endregion
222
222
  //#region src/core/runner/aggregate.d.ts
223
223
  /**
224
- * Identifies the scoring family for a single eval score.
225
- */
226
- type RunScoreKind = 'exact' | 'judge';
227
- /**
228
- * Represents one normalized score emitted by a completed eval run.
224
+ * Stores inferenceExecutor-level score aggregates across multiple runs.
229
225
  */
230
- interface RunScore {
226
+ interface AggregatedProviderSummary {
231
227
  /**
232
- * Score family used for aggregation.
228
+ * Mean of all exact-match scores or `null` when absent.
233
229
  */
234
- kind: RunScoreKind;
230
+ exactAverage: null | number;
235
231
  /**
236
- * Normalized score in the `0..1` range.
232
+ * Hybrid average derived from the inferenceExecutor exact and judge means.
237
233
  */
238
- score: number;
239
- }
240
- /**
241
- * Captures the output of one scheduled runner task.
242
- */
243
- interface RunResult {
234
+ hybridAverage: null | number;
244
235
  /**
245
- * Stable run id, usually copied from the scheduled task id.
236
+ * Stable inferenceExecutor id.
246
237
  */
247
- id: string;
238
+ inferenceExecutorId: string;
248
239
  /**
249
- * Collected eval entry id.
240
+ * Mean of all judge-based scores or `null` when absent.
250
241
  */
251
- entryId: string;
242
+ judgeAverage: null | number;
252
243
  /**
253
- * Stable inferenceExecutor id.
244
+ * Number of runs included in this inferenceExecutor bucket.
254
245
  */
255
- inferenceExecutorId: string;
246
+ runCount: number;
247
+ }
248
+ /**
249
+ * Stores the final aggregation output for a batch of runner results.
250
+ */
251
+ interface AggregatedRunResults {
256
252
  /**
257
- * Concrete matrix selection used by the run.
253
+ * Provider-level summaries sorted by inferenceExecutor id.
258
254
  */
259
- matrix: ScheduledTaskMatrix;
255
+ inferenceExecutors: AggregatedProviderSummary[];
260
256
  /**
261
- * Raw scores emitted by the eval.
257
+ * Overall summary across every run.
262
258
  */
263
- scores: readonly RunScore[];
259
+ overall: {
260
+ exactAverage: null | number;
261
+ hybridAverage: null | number;
262
+ judgeAverage: null | number;
263
+ runCount: number;
264
+ };
265
+ /**
266
+ * Per-run normalized score summaries.
267
+ */
268
+ runs: AggregatedRunSummary[];
264
269
  }
265
270
  /**
266
271
  * Stores the per-run score averages after normalization.
267
272
  */
268
273
  interface AggregatedRunSummary {
269
- /**
270
- * Stable run id.
271
- */
272
- id: string;
273
274
  /**
274
275
  * Collected eval entry id.
275
276
  */
276
277
  entryId: string;
277
278
  /**
278
- * Stable inferenceExecutor id.
279
+ * Mean of exact-match scores or `null` when absent.
279
280
  */
280
- inferenceExecutorId: string;
281
+ exactAverage: null | number;
281
282
  /**
282
- * Concrete matrix selection used by the run.
283
+ * Hybrid average. Uses both families when present, otherwise falls back to the
284
+ * single available family.
283
285
  */
284
- matrix: ScheduledTaskMatrix;
286
+ hybridAverage: null | number;
285
287
  /**
286
- * Mean of exact-match scores or `null` when absent.
288
+ * Stable run id.
287
289
  */
288
- exactAverage: number | null;
290
+ id: string;
291
+ /**
292
+ * Stable inferenceExecutor id.
293
+ */
294
+ inferenceExecutorId: string;
289
295
  /**
290
296
  * Mean of judge-based scores or `null` when absent.
291
297
  */
292
- judgeAverage: number | null;
298
+ judgeAverage: null | number;
293
299
  /**
294
- * Hybrid average. Uses both families when present, otherwise falls back to the
295
- * single available family.
300
+ * Concrete matrix selection used by the run.
296
301
  */
297
- hybridAverage: number | null;
302
+ matrix: ScheduledTaskMatrix;
298
303
  }
299
304
  /**
300
- * Stores inferenceExecutor-level score aggregates across multiple runs.
305
+ * Captures the output of one scheduled runner task.
301
306
  */
302
- interface AggregatedProviderSummary {
307
+ interface RunResult {
303
308
  /**
304
- * Stable inferenceExecutor id.
309
+ * Collected eval entry id.
305
310
  */
306
- inferenceExecutorId: string;
311
+ entryId: string;
307
312
  /**
308
- * Number of runs included in this inferenceExecutor bucket.
313
+ * Stable run id, usually copied from the scheduled task id.
309
314
  */
310
- runCount: number;
315
+ id: string;
311
316
  /**
312
- * Mean of all exact-match scores or `null` when absent.
317
+ * Stable inferenceExecutor id.
313
318
  */
314
- exactAverage: number | null;
319
+ inferenceExecutorId: string;
315
320
  /**
316
- * Mean of all judge-based scores or `null` when absent.
321
+ * Concrete matrix selection used by the run.
317
322
  */
318
- judgeAverage: number | null;
323
+ matrix: ScheduledTaskMatrix;
319
324
  /**
320
- * Hybrid average derived from the inferenceExecutor exact and judge means.
325
+ * Raw scores emitted by the eval.
321
326
  */
322
- hybridAverage: number | null;
327
+ scores: readonly RunScore[];
323
328
  }
324
329
  /**
325
- * Stores the final aggregation output for a batch of runner results.
330
+ * Represents one normalized score emitted by a completed eval run.
326
331
  */
327
- interface AggregatedRunResults {
328
- /**
329
- * Per-run normalized score summaries.
330
- */
331
- runs: AggregatedRunSummary[];
332
+ interface RunScore {
332
333
  /**
333
- * Provider-level summaries sorted by inferenceExecutor id.
334
+ * Score family used for aggregation.
334
335
  */
335
- inferenceExecutors: AggregatedProviderSummary[];
336
+ kind: RunScoreKind;
336
337
  /**
337
- * Overall summary across every run.
338
+ * Normalized score in the `0..1` range.
338
339
  */
339
- overall: {
340
- exactAverage: number | null;
341
- judgeAverage: number | null;
342
- hybridAverage: number | null;
343
- runCount: number;
344
- };
340
+ score: number;
345
341
  }
342
+ /**
343
+ * Identifies the scoring family for a single eval score.
344
+ */
345
+ type RunScoreKind = 'exact' | 'judge';
346
346
  /**
347
347
  * Aggregates exact-match and judge-based scores into hybrid runner summaries.
348
348
  *
@@ -365,19 +365,6 @@ interface AggregatedRunResults {
365
365
  declare function aggregateRunResults(results: readonly RunResult[]): AggregatedRunResults;
366
366
  //#endregion
367
367
  //#region src/core/runner/runtime-context.d.ts
368
- /**
369
- * Shared runtime context used by the vieval runner.
370
- *
371
- * Use when:
372
- * - runner services need stable path resolution without module-level side effects
373
- * - call sites want deterministic control over workspace root detection
374
- */
375
- interface RunnerRuntimeContext {
376
- /**
377
- * Absolute project root directory used for path normalization.
378
- */
379
- projectRootDirectory: string;
380
- }
381
368
  /**
382
369
  * Options used to construct the runner runtime context.
383
370
  */
@@ -395,6 +382,19 @@ interface CreateVievalRunnerRuntimeContextOptions {
395
382
  */
396
383
  fallbackProjectRootDirectory?: string;
397
384
  }
385
+ /**
386
+ * Shared runtime context used by the vieval runner.
387
+ *
388
+ * Use when:
389
+ * - runner services need stable path resolution without module-level side effects
390
+ * - call sites want deterministic control over workspace root detection
391
+ */
392
+ interface RunnerRuntimeContext {
393
+ /**
394
+ * Absolute project root directory used for path normalization.
395
+ */
396
+ projectRootDirectory: string;
397
+ }
398
398
  /**
399
399
  * Creates a side-effect-free runtime context for runner path normalization.
400
400
  *
@@ -455,13 +455,17 @@ declare function collectEvalEntries(modules: EvalModuleMap, context: RunnerRunti
455
455
  */
456
456
  interface ModelDefinition {
457
457
  /**
458
- * Stable model id.
458
+ * Alias names that can resolve this model.
459
459
  */
460
- id: string;
460
+ aliases: string[];
461
461
  /**
462
- * Inference-executor id used for matching and reporting.
462
+ * Optional execution policy hints attached to this model.
463
463
  */
464
- inferenceExecutorId: string;
464
+ executionPolicy?: TaskExecutionPolicy;
465
+ /**
466
+ * Stable model id.
467
+ */
468
+ id: string;
465
469
  /**
466
470
  * Executor reference passed through config.
467
471
  *
@@ -470,17 +474,13 @@ interface ModelDefinition {
470
474
  */
471
475
  inferenceExecutor: unknown;
472
476
  /**
473
- * Concrete model name passed to the inference executor.
474
- */
475
- model: string;
476
- /**
477
- * Alias names that can resolve this model.
477
+ * Inference-executor id used for matching and reporting.
478
478
  */
479
- aliases: string[];
479
+ inferenceExecutorId: string;
480
480
  /**
481
- * Optional execution policy hints attached to this model.
481
+ * Concrete model name passed to the inference executor.
482
482
  */
483
- executionPolicy?: TaskExecutionPolicy;
483
+ model: string;
484
484
  /**
485
485
  * Optional model-level call parameters.
486
486
  */
@@ -495,6 +495,14 @@ interface ModelDefinition {
495
495
  declare function resolveModelByName(models: readonly ModelDefinition[], name: string): ModelDefinition | undefined;
496
496
  //#endregion
497
497
  //#region src/core/runner/task-context.d.ts
498
+ /**
499
+ * Inputs used to build task execution context.
500
+ */
501
+ interface CreateTaskExecutionContextOptions {
502
+ cache?: TaskCacheRuntime;
503
+ models: readonly ModelDefinition[];
504
+ task: ScheduledTask;
505
+ }
498
506
  /**
499
507
  * Task-scoped execution context exposed to runner executors.
500
508
  */
@@ -508,14 +516,6 @@ interface TaskExecutionContext {
508
516
  */
509
517
  models: readonly ModelDefinition[];
510
518
  }
511
- /**
512
- * Inputs used to build task execution context.
513
- */
514
- interface CreateTaskExecutionContextOptions {
515
- cache?: TaskCacheRuntime;
516
- models: readonly ModelDefinition[];
517
- task: ScheduledTask;
518
- }
519
519
  /**
520
520
  * Creates task-scoped context data for runner execution.
521
521
  *
@@ -528,20 +528,6 @@ interface CreateTaskExecutionContextOptions {
528
528
  declare function createTaskExecutionContext(options: CreateTaskExecutionContextOptions): TaskExecutionContext;
529
529
  //#endregion
530
530
  //#region src/core/runner/run.d.ts
531
- /**
532
- * Executes one scheduled runner task and returns a normalized run result.
533
- *
534
- * Use when:
535
- * - a scheduler already selected the task and execution context
536
- * - the caller wants a typed executor contract for runner workers
537
- *
538
- * Expects:
539
- * - the task context to be ready for model resolution and task-scoped work
540
- *
541
- * Returns:
542
- * - a normalized run result with score entries ready for aggregation
543
- */
544
- type ScheduledTaskExecutor = (task: ScheduledTask, context: TaskExecutionContext) => Promise<RunResult>;
545
531
  /**
546
532
  * Terminal task state reported by runner lifecycle hooks.
547
533
  *
@@ -551,7 +537,7 @@ type ScheduledTaskExecutor = (task: ScheduledTask, context: TaskExecutionContext
551
537
  * Expects:
552
538
  * - hooks treat the value as final for the completed task
553
539
  */
554
- type RunnerTaskState = 'passed' | 'failed';
540
+ type RunnerTaskState = 'failed' | 'passed';
555
541
  /**
556
542
  * Optional runner execution hooks used while processing scheduled tasks.
557
543
  *
@@ -571,15 +557,11 @@ interface RunScheduledTasksOptions {
571
557
  */
572
558
  createExecutionContext?: (task: ScheduledTask) => TaskExecutionContext;
573
559
  /**
574
- * Runs before the executor starts handling a task.
575
- *
576
- * Use when:
577
- * - callers want to observe task activation before execution begins
560
+ * Maximum number of tasks to execute concurrently.
578
561
  *
579
- * Expects:
580
- * - thrown errors abort the task before executor work starts
562
+ * @default 1
581
563
  */
582
- onTaskStart?: (task: ScheduledTask) => void;
564
+ maxConcurrency?: number;
583
565
  /**
584
566
  * Runs after the executor settles for a task.
585
567
  *
@@ -592,12 +574,30 @@ interface RunScheduledTasksOptions {
592
574
  */
593
575
  onTaskEnd?: (task: ScheduledTask, state: RunnerTaskState) => void;
594
576
  /**
595
- * Maximum number of tasks to execute concurrently.
577
+ * Runs before the executor starts handling a task.
596
578
  *
597
- * @default 1
579
+ * Use when:
580
+ * - callers want to observe task activation before execution begins
581
+ *
582
+ * Expects:
583
+ * - thrown errors abort the task before executor work starts
598
584
  */
599
- maxConcurrency?: number;
585
+ onTaskStart?: (task: ScheduledTask) => void;
600
586
  }
587
+ /**
588
+ * Executes one scheduled runner task and returns a normalized run result.
589
+ *
590
+ * Use when:
591
+ * - a scheduler already selected the task and execution context
592
+ * - the caller wants a typed executor contract for runner workers
593
+ *
594
+ * Expects:
595
+ * - the task context to be ready for model resolution and task-scoped work
596
+ *
597
+ * Returns:
598
+ * - a normalized run result with score entries ready for aggregation
599
+ */
600
+ type ScheduledTaskExecutor = (task: ScheduledTask, context: TaskExecutionContext) => Promise<RunResult>;
601
601
  /**
602
602
  * Error thrown when a scheduled run fails before producing a normalized result.
603
603
  */
@@ -633,10 +633,10 @@ declare class RunnerExecutionError extends Error {
633
633
  declare function runScheduledTasks(tasks: readonly ScheduledTask[], executor: ScheduledTaskExecutor, options?: RunScheduledTasksOptions): Promise<AggregatedRunResults>;
634
634
  //#endregion
635
635
  //#region src/core/telemetry/types.d.ts
636
- /** JSON-compatible scalar values accepted as telemetry attributes. */
637
- type TelemetryAttributeValue = boolean | number | string | null | readonly TelemetryAttributeValue[];
638
636
  /** Attribute map shared by local report projection and OpenTelemetry span calls. */
639
637
  type TelemetryAttributes = Record<string, TelemetryAttributeValue | undefined>;
638
+ /** JSON-compatible scalar values accepted as telemetry attributes. */
639
+ type TelemetryAttributeValue = boolean | null | number | readonly TelemetryAttributeValue[] | string;
640
640
  /**
641
641
  * Internal Vieval telemetry runtime.
642
642
  *
@@ -652,10 +652,10 @@ type TelemetryAttributes = Record<string, TelemetryAttributeValue | undefined>;
652
652
  * - callback result, preserving thrown errors after telemetry records them
653
653
  */
654
654
  interface TelemetryRuntime {
655
- withSpan: <T>(name: string, attributes: TelemetryAttributes, callback: () => Promise<T>) => Promise<T>;
656
655
  addEvent: (name: string, attributes?: TelemetryAttributes) => void;
657
- setAttributes: (attributes: TelemetryAttributes) => void;
658
656
  recordException: (error: unknown) => void;
657
+ setAttributes: (attributes: TelemetryAttributes) => void;
658
+ withSpan: <T>(name: string, attributes: TelemetryAttributes, callback: () => Promise<T>) => Promise<T>;
659
659
  }
660
660
  //#endregion
661
661
  //#region src/config/types.d.ts
@@ -666,46 +666,94 @@ interface TelemetryRuntime {
666
666
  */
667
667
  type Awaitable<T> = Promise<T> | T;
668
668
  /**
669
- * Primitive value allowed in one matrix cell.
670
- *
671
- * Use when:
672
- * - defining axis values for canonical layered matrix config
673
- * - preserving JSON-safe primitive values through config normalization
674
- *
675
- * Expects:
676
- * - values remain serializable and comparable with stringified task ids
677
- *
678
- * Returns:
679
- * - one JSON-friendly primitive matrix value
669
+ * OpenTelemetry reporting configuration managed by user config setup.
680
670
  */
681
- type MatrixPrimitive = string | number | boolean;
671
+ interface CliOpenTelemetryReportingConfig {
672
+ /**
673
+ * Enables Vieval active span wrapping through `@opentelemetry/api`.
674
+ *
675
+ * @default false
676
+ */
677
+ enabled?: boolean;
678
+ /**
679
+ * Called after all telemetry events and local report artifacts have been emitted.
680
+ */
681
+ onRunEnd?: () => Awaitable<void>;
682
+ }
682
683
  /**
683
- * Canonical matrix value type.
684
- *
685
- * Use when:
686
- * - declaring matrix axis values at the config boundary
687
- *
688
- * Expects:
689
- * - values are normalized from config input without extra wrapping
690
- *
691
- * Returns:
692
- * - a primitive cell value suitable for matrix expansion
684
+ * Reporting configuration for local artifacts and optional OpenTelemetry integration.
693
685
  */
694
- type MatrixValue = MatrixPrimitive;
686
+ interface CliReportingConfig {
687
+ /**
688
+ * Optional OpenTelemetry API integration.
689
+ */
690
+ openTelemetry?: CliOpenTelemetryReportingConfig;
691
+ }
695
692
  /**
696
- * Canonical row payload for one matrix combination.
697
- *
698
- * Use when:
699
- * - storing the selected values for a resolved matrix row
700
- * - passing task-level matrix context between layers
701
- *
702
- * Expects:
703
- * - keys are axis names and values are resolved axis selections
704
- *
705
- * Returns:
706
- * - one resolved row object
693
+ * Represents a normalized evaluation entry collected by the runner.
707
694
  */
708
- type MatrixRow = Record<string, MatrixValue>;
695
+ type CollectedEvalEntry<TDefinition extends EvalDefinition = EvalDefinition> = TDefinition & {
696
+ directory: string;
697
+ filePath: string;
698
+ id: string;
699
+ };
700
+ /**
701
+ * Declares the metadata required for a single vieval evaluation module.
702
+ */
703
+ interface EvalDefinition {
704
+ description: string;
705
+ /**
706
+ * Optional matrix layering for this eval definition.
707
+ *
708
+ * Use when:
709
+ * - one eval file needs control-group variants that differ from project defaults
710
+ *
711
+ * @example
712
+ * ```ts
713
+ * matrix: {
714
+ * runMatrix: {
715
+ * extend: {
716
+ * promptStyle: ['concise'],
717
+ * },
718
+ * override: {
719
+ * scenario: ['eval-scenario'],
720
+ * },
721
+ * },
722
+ * evalMatrix: {
723
+ * override: {
724
+ * rubric: ['strict'],
725
+ * },
726
+ * },
727
+ * }
728
+ * ```
729
+ *
730
+ * Context impact:
731
+ *
732
+ * ```txt
733
+ * project.runMatrix + eval.matrix.runMatrix + task.matrix.runMatrix
734
+ * => context.task.matrix.run
735
+ *
736
+ * project.evalMatrix + eval.matrix.evalMatrix + task.matrix.evalMatrix
737
+ * => context.task.matrix.eval
738
+ * ```
739
+ */
740
+ matrix?: ScopedMatrices;
741
+ name: string;
742
+ /**
743
+ * Optional task implementation executed by runner.
744
+ */
745
+ task?: TaskDefinition;
746
+ }
747
+ /**
748
+ * Describes the shape of an imported vieval evaluation module.
749
+ */
750
+ interface EvalModule<TDefinition extends EvalDefinition = EvalDefinition> {
751
+ default: TDefinition;
752
+ }
753
+ /**
754
+ * Maps module URLs to their loaded vieval evaluation modules.
755
+ */
756
+ type EvalModuleMap = Record<string, EvalModule>;
709
757
  /**
710
758
  * Canonical axis value list for one matrix definition.
711
759
  *
@@ -767,6 +815,15 @@ type MatrixDefinition = Record<string, MatrixAxisValues>;
767
815
  * ```
768
816
  */
769
817
  interface MatrixLayer {
818
+ /**
819
+ * Matrix axes disabled at this layer.
820
+ *
821
+ * @example
822
+ * ```ts
823
+ * disable: ['temperatureProfile']
824
+ * ```
825
+ */
826
+ disable?: readonly string[];
770
827
  /**
771
828
  * Matrix axes inherited or appended at this layer.
772
829
  *
@@ -790,16 +847,48 @@ interface MatrixLayer {
790
847
  * ```
791
848
  */
792
849
  override?: MatrixDefinition;
793
- /**
794
- * Matrix axes disabled at this layer.
795
- *
796
- * @example
797
- * ```ts
798
- * disable: ['temperatureProfile']
799
- * ```
800
- */
801
- disable?: readonly string[];
802
850
  }
851
+ /**
852
+ * Primitive value allowed in one matrix cell.
853
+ *
854
+ * Use when:
855
+ * - defining axis values for canonical layered matrix config
856
+ * - preserving JSON-safe primitive values through config normalization
857
+ *
858
+ * Expects:
859
+ * - values remain serializable and comparable with stringified task ids
860
+ *
861
+ * Returns:
862
+ * - one JSON-friendly primitive matrix value
863
+ */
864
+ type MatrixPrimitive = boolean | number | string;
865
+ /**
866
+ * Canonical row payload for one matrix combination.
867
+ *
868
+ * Use when:
869
+ * - storing the selected values for a resolved matrix row
870
+ * - passing task-level matrix context between layers
871
+ *
872
+ * Expects:
873
+ * - keys are axis names and values are resolved axis selections
874
+ *
875
+ * Returns:
876
+ * - one resolved row object
877
+ */
878
+ type MatrixRow = Record<string, MatrixValue>;
879
+ /**
880
+ * Canonical matrix value type.
881
+ *
882
+ * Use when:
883
+ * - declaring matrix axis values at the config boundary
884
+ *
885
+ * Expects:
886
+ * - values are normalized from config input without extra wrapping
887
+ *
888
+ * Returns:
889
+ * - a primitive cell value suitable for matrix expansion
890
+ */
891
+ type MatrixValue = MatrixPrimitive;
803
892
  /**
804
893
  * Canonical run/eval matrix grouping.
805
894
  *
@@ -845,88 +934,111 @@ interface MatrixLayer {
845
934
  */
846
935
  interface ScopedMatrices {
847
936
  /**
848
- * Runtime matrix scope.
937
+ * Eval-time matrix scope.
849
938
  *
850
939
  * @example
851
940
  * ```ts
852
- * runMatrix: {
853
- * extend: {
854
- * promptLanguage: ['en', 'zh'],
941
+ * evalMatrix: {
942
+ * override: {
943
+ * rubric: ['strict'],
855
944
  * },
856
945
  * }
857
946
  * ```
858
947
  */
859
- runMatrix?: MatrixLayer;
948
+ evalMatrix?: MatrixLayer;
860
949
  /**
861
- * Eval-time matrix scope.
950
+ * Runtime matrix scope.
862
951
  *
863
952
  * @example
864
953
  * ```ts
865
- * evalMatrix: {
866
- * override: {
867
- * rubric: ['strict'],
954
+ * runMatrix: {
955
+ * extend: {
956
+ * promptLanguage: ['en', 'zh'],
868
957
  * },
869
958
  * }
870
959
  * ```
871
960
  */
872
- evalMatrix?: MatrixLayer;
873
- }
874
- /**
875
- * Output of one eval task execution.
876
- */
877
- interface TaskRunOutput {
878
- /**
879
- * Scores emitted by this task run.
880
- */
881
- scores: readonly RunScore[];
961
+ runMatrix?: MatrixLayer;
882
962
  }
883
963
  /**
884
964
  * Delay policy for retries within one task case attempt.
885
965
  *
886
966
  * @param retryIndex Retry number where `1` is the first retry after the initial failure.
887
967
  */
888
- type TaskAutoRetryDelay = number | ((retryIndex: number) => number);
968
+ type TaskAutoRetryDelay = ((retryIndex: number) => number) | number;
889
969
  /**
890
- * Execution policy applied to task and case callbacks.
970
+ * Payload emitted when a task case ends.
891
971
  *
892
972
  * Use when:
893
- * - one task or case should time out after a bounded duration
894
- * - failures should retry within the current attempt or trigger a later full task attempt
973
+ * - reporter hooks need the case position plus terminal state
895
974
  *
896
975
  * Expects:
897
- * - `timeout` to be a positive integer when provided
898
- * - `autoRetry` and `autoAttempt` to be non-negative integers when provided
976
+ * - `name` is the declared DSL case label
977
+ * - `index` is the zero-based case position within the task
978
+ * - `total` is the total number of registered cases
979
+ * - `state` describes the final case result
980
+ */
981
+ interface TaskCaseReporterEndPayload extends TaskCaseReporterPayload {
982
+ /**
983
+ * Optional failure message when `state` is `failed`.
984
+ */
985
+ errorMessage?: string;
986
+ /**
987
+ * Optional case output returned by the task case callback.
988
+ */
989
+ output?: unknown;
990
+ /**
991
+ * Final case state.
992
+ */
993
+ state: TaskCaseState;
994
+ }
995
+ /**
996
+ * Payload emitted when a task case starts.
899
997
  *
900
- * Returns:
901
- * - one partial execution policy descriptor
998
+ * Use when:
999
+ * - reporter hooks need a stable position for one case within the task
1000
+ *
1001
+ * Expects:
1002
+ * - `name` is the declared DSL case label
1003
+ * - `index` is the zero-based case position within the task
1004
+ * - `total` is the total number of registered cases
902
1005
  */
903
- interface TaskExecutionPolicy {
1006
+ interface TaskCaseReporterPayload {
904
1007
  /**
905
- * Additional retries allowed within the current attempt.
906
- *
907
- * @default 0
1008
+ * Maximum retry count configured for this case.
908
1009
  */
909
1010
  autoRetry?: number;
910
1011
  /**
911
- * Delay in milliseconds before a case auto retry starts.
912
- *
913
- * A number applies the same delay to every retry. A function receives the
914
- * retry index where `1` is the first retry after the initial failure.
915
- *
916
- * @default retryIndex => 500 * 2 ** (retryIndex - 1)
1012
+ * Zero-based case position within the task.
917
1013
  */
918
- autoRetryDelay?: TaskAutoRetryDelay;
1014
+ index: number;
919
1015
  /**
920
- * Additional full task attempts allowed after the current attempt settles.
921
- *
922
- * @default 0
1016
+ * Optional case input payload registered by the task DSL.
923
1017
  */
924
- autoAttempt?: number;
1018
+ input?: unknown;
925
1019
  /**
926
- * Timeout in milliseconds for one case execution.
1020
+ * Declared case label.
927
1021
  */
928
- timeout?: number;
1022
+ name: string;
1023
+ /**
1024
+ * Current retry attempt index, where `0` is the first try.
1025
+ */
1026
+ retryIndex?: number;
1027
+ /**
1028
+ * Total number of registered cases.
1029
+ */
1030
+ total: number;
929
1031
  }
1032
+ /**
1033
+ * Allowed terminal outcomes for one task case.
1034
+ *
1035
+ * Use when:
1036
+ * - emitting case lifecycle events from the task DSL
1037
+ *
1038
+ * Expects:
1039
+ * - consumers treat the value as the final state for the case
1040
+ */
1041
+ type TaskCaseState = 'failed' | 'passed' | 'timeout';
930
1042
  /**
931
1043
  * Task-local concurrency metadata.
932
1044
  *
@@ -951,194 +1063,124 @@ interface TaskConcurrencyConfig {
951
1063
  case?: number;
952
1064
  }
953
1065
  /**
954
- * Reporting configuration for local artifacts and optional OpenTelemetry integration.
1066
+ * Eval task definition used by `defineTask`.
955
1067
  */
956
- interface CliReportingConfig {
1068
+ interface TaskDefinition {
957
1069
  /**
958
- * Optional OpenTelemetry API integration.
1070
+ * Optional task-local concurrency metadata.
1071
+ *
1072
+ * Use when:
1073
+ * - task declarations need to preserve task-scoped attempt/case caps for later scheduler wiring
1074
+ * - higher-level orchestration wants to inspect task-local concurrency without executing the task
1075
+ *
1076
+ * Expects:
1077
+ * - each provided value to be a positive integer chosen by the caller
1078
+ *
1079
+ * Returns:
1080
+ * - one partial task-local concurrency descriptor
959
1081
  */
960
- openTelemetry?: CliOpenTelemetryReportingConfig;
961
- }
962
- /**
963
- * OpenTelemetry reporting configuration managed by user config setup.
964
- */
965
- interface CliOpenTelemetryReportingConfig {
1082
+ concurrency?: TaskConcurrencyConfig;
966
1083
  /**
967
- * Enables Vieval active span wrapping through `@opentelemetry/api`.
1084
+ * Optional task-local execution policy.
1085
+ */
1086
+ executionPolicy?: TaskExecutionPolicy;
1087
+ /**
1088
+ * Stable task id for diagnostics.
1089
+ */
1090
+ id: string;
1091
+ /**
1092
+ * Optional matrix layering for this task definition.
968
1093
  *
969
- * @default false
1094
+ * Use when:
1095
+ * - task-local experiments should refine project/eval defaults
1096
+ *
1097
+ * @example
1098
+ * ```ts
1099
+ * matrix: {
1100
+ * runMatrix: {
1101
+ * override: {
1102
+ * model: ['gpt-4.1-mini'],
1103
+ * },
1104
+ * },
1105
+ * evalMatrix: {
1106
+ * extend: {
1107
+ * evaluator: ['default-judge'],
1108
+ * },
1109
+ * },
1110
+ * }
1111
+ * ```
970
1112
  */
971
- enabled?: boolean;
1113
+ matrix?: ScopedMatrices;
972
1114
  /**
973
- * Called after all telemetry events and local report artifacts have been emitted.
1115
+ * Executes one scheduled eval task.
974
1116
  */
975
- onRunEnd?: () => Awaitable<void>;
1117
+ run: (context: TaskRunContext) => Promise<TaskRunOutput> | TaskRunOutput;
976
1118
  }
977
1119
  /**
978
- * Runtime context passed into eval task `run`.
1120
+ * Execution policy applied to task and case callbacks.
1121
+ *
1122
+ * Use when:
1123
+ * - one task or case should time out after a bounded duration
1124
+ * - failures should retry within the current attempt or trigger a later full task attempt
1125
+ *
1126
+ * Expects:
1127
+ * - `timeout` to be a positive integer when provided
1128
+ * - `autoRetry` and `autoAttempt` to be non-negative integers when provided
1129
+ *
1130
+ * Returns:
1131
+ * - one partial execution policy descriptor
979
1132
  */
980
- interface TaskRunContext {
1133
+ interface TaskExecutionPolicy {
981
1134
  /**
982
- * Task-scoped cache runtime.
1135
+ * Additional full task attempts allowed after the current attempt settles.
983
1136
  *
984
- * Use when:
985
- * - benchmark setup needs deterministic artifact reuse across attempts
986
- * - case-level logic needs typed text/json/binary cache loaders
1137
+ * @default 0
987
1138
  */
988
- cache: TaskExecutionContext['cache'];
1139
+ autoAttempt?: number;
989
1140
  /**
990
- * Scheduled runner task metadata.
1141
+ * Additional retries allowed within the current attempt.
991
1142
  *
992
- * Matrix impact on runtime context:
1143
+ * @default 0
1144
+ */
1145
+ autoRetry?: number;
1146
+ /**
1147
+ * Delay in milliseconds before a case auto retry starts.
993
1148
  *
994
- * ```txt
995
- * project/eval/task matrix definitions
996
- * -> scheduler expands run rows x eval rows
997
- * -> one scheduled task per row pair
998
- * -> context.task.matrix = {
999
- * run: selected run-axis values,
1000
- * eval: selected eval-axis values,
1001
- * meta: { runRowId, evalRowId }
1002
- * }
1003
- * ```
1149
+ * A number applies the same delay to every retry. A function receives the
1150
+ * retry index where `1` is the first retry after the initial failure.
1004
1151
  *
1005
- * Practical impact:
1006
- * - `runMatrix` axes appear under `context.task.matrix.run.*`
1007
- * - `evalMatrix` axes appear under `context.task.matrix.eval.*`
1008
- * - row ids are stable labels for grouping/aggregation under `context.task.matrix.meta.*`
1009
- *
1010
- * @example
1011
- * ```ts
1012
- * // If final selected rows are:
1013
- * // run: { model: 'gpt-4.1-mini', scenario: 'stress', promptLanguage: 'zh' }
1014
- * // eval: { rubric: 'strict', rubricModel: 'judge-large' }
1015
- *
1016
- * context.task.matrix.run.model // 'gpt-4.1-mini'
1017
- * context.task.matrix.run.scenario // 'stress'
1018
- * context.task.matrix.eval.rubric // 'strict'
1019
- * context.task.matrix.meta.runRowId // stable encoded row id
1020
- * ```
1021
- */
1022
- task: ScheduledTask;
1023
- /**
1024
- * Configured model registrations available to model plugins.
1025
- *
1026
- * Use when:
1027
- * - a plugin owns model selection semantics and needs access to registered models
1028
- * - eval code resolves matrix-selected model axes through plugin helpers
1029
- */
1030
- models: TaskExecutionContext['models'];
1031
- /**
1032
- * Optional reporter lifecycle hooks for task-local case events.
1033
- *
1034
- * Use when:
1035
- * - a caller wants visibility into each case without coupling to the CLI reporter layer
1036
- *
1037
- * Expects:
1038
- * - hooks are best-effort observers and should not affect task scoring
1039
- */
1040
- reporterHooks?: TaskReporterHooks;
1041
- /**
1042
- * Optional telemetry runtime shared by runner, DSL, and reporter integrations.
1043
- *
1044
- * Use when:
1045
- * - task execution should emit events to the currently active telemetry runtime
1046
- * - enabled and disabled telemetry should keep the same execution path
1047
- *
1048
- * Expects:
1049
- * - callers inject a no-op runtime when telemetry is disabled
1050
- */
1051
- telemetry?: TelemetryRuntime;
1052
- /**
1053
- * Optional runtime scheduling overrides supplied by CLI or host execution.
1054
- *
1055
- * Use when:
1056
- * - run operators need to override task/case concurrency without editing eval code
1057
- * - DSL task runners need to distinguish runtime flags from code defaults
1058
- *
1059
- * Expects:
1060
- * - values are positive integers when provided
1061
- *
1062
- * @default undefined
1063
- */
1064
- runtimeConcurrency?: TaskConcurrencyConfig;
1065
- /**
1066
- * Cooperative abort signal for the current execution.
1067
- */
1068
- signal?: AbortSignal;
1069
- }
1070
- /**
1071
- * Allowed terminal outcomes for one task case.
1072
- *
1073
- * Use when:
1074
- * - emitting case lifecycle events from the task DSL
1075
- *
1076
- * Expects:
1077
- * - consumers treat the value as the final state for the case
1078
- */
1079
- type TaskCaseState = 'passed' | 'failed' | 'timeout';
1080
- /**
1081
- * Payload emitted when a task case starts.
1082
- *
1083
- * Use when:
1084
- * - reporter hooks need a stable position for one case within the task
1085
- *
1086
- * Expects:
1087
- * - `name` is the declared DSL case label
1088
- * - `index` is the zero-based case position within the task
1089
- * - `total` is the total number of registered cases
1090
- */
1091
- interface TaskCaseReporterPayload {
1092
- /**
1093
- * Maximum retry count configured for this case.
1094
- */
1095
- autoRetry?: number;
1096
- /**
1097
- * Optional case input payload registered by the task DSL.
1098
- */
1099
- input?: unknown;
1100
- /**
1101
- * Declared case label.
1102
- */
1103
- name: string;
1104
- /**
1105
- * Current retry attempt index, where `0` is the first try.
1106
- */
1107
- retryIndex?: number;
1108
- /**
1109
- * Zero-based case position within the task.
1152
+ * @default retryIndex => 500 * 2 ** (retryIndex - 1)
1110
1153
  */
1111
- index: number;
1154
+ autoRetryDelay?: TaskAutoRetryDelay;
1112
1155
  /**
1113
- * Total number of registered cases.
1156
+ * Timeout in milliseconds for one case execution.
1114
1157
  */
1115
- total: number;
1158
+ timeout?: number;
1116
1159
  }
1117
1160
  /**
1118
- * Payload emitted when a task case ends.
1161
+ * Payload emitted by task code for custom report events.
1119
1162
  *
1120
1163
  * Use when:
1121
- * - reporter hooks need the case position plus terminal state
1164
+ * - reporting runtime telemetry such as inference requests, responses, or tool calls
1165
+ * - attaching modality-specific metrics without coupling task logic to CLI internals
1122
1166
  *
1123
1167
  * Expects:
1124
- * - `name` is the declared DSL case label
1125
- * - `index` is the zero-based case position within the task
1126
- * - `total` is the total number of registered cases
1127
- * - `state` describes the final case result
1168
+ * - `event` to be a stable event name
1169
+ * - `data` to be JSON-serializable for report artifact persistence
1128
1170
  */
1129
- interface TaskCaseReporterEndPayload extends TaskCaseReporterPayload {
1171
+ interface TaskReporterEventPayload {
1130
1172
  /**
1131
- * Optional case output returned by the task case callback.
1173
+ * Optional stable case id when the event maps to one case lifecycle.
1132
1174
  */
1133
- output?: unknown;
1175
+ caseId?: string;
1134
1176
  /**
1135
- * Final case state.
1177
+ * Optional custom payload persisted under event `data`.
1136
1178
  */
1137
- state: TaskCaseState;
1179
+ data?: unknown;
1138
1180
  /**
1139
- * Optional failure message when `state` is `failed`.
1181
+ * Event name written into report event envelopes.
1140
1182
  */
1141
- errorMessage?: string;
1183
+ event: string;
1142
1184
  }
1143
1185
  /**
1144
1186
  * Reporter hooks invoked around each task case execution.
@@ -1151,14 +1193,14 @@ interface TaskCaseReporterEndPayload extends TaskCaseReporterPayload {
1151
1193
  * - hooks observe case start/end events but do not influence scoring
1152
1194
  */
1153
1195
  interface TaskReporterHooks {
1154
- /**
1155
- * Runs when a case is about to execute.
1156
- */
1157
- onCaseStart?: (payload: TaskCaseReporterPayload) => void;
1158
1196
  /**
1159
1197
  * Runs after a case settles.
1160
1198
  */
1161
1199
  onCaseEnd?: (payload: TaskCaseReporterEndPayload) => void;
1200
+ /**
1201
+ * Runs when a case is about to execute.
1202
+ */
1203
+ onCaseStart?: (payload: TaskCaseReporterPayload) => void;
1162
1204
  /**
1163
1205
  * Runs when task code emits a custom telemetry/reporting event.
1164
1206
  *
@@ -1169,149 +1211,107 @@ interface TaskReporterHooks {
1169
1211
  onEvent?: (payload: TaskReporterEventPayload) => void;
1170
1212
  }
1171
1213
  /**
1172
- * Payload emitted by task code for custom report events.
1173
- *
1174
- * Use when:
1175
- * - reporting runtime telemetry such as inference requests, responses, or tool calls
1176
- * - attaching modality-specific metrics without coupling task logic to CLI internals
1177
- *
1178
- * Expects:
1179
- * - `event` to be a stable event name
1180
- * - `data` to be JSON-serializable for report artifact persistence
1214
+ * Runtime context passed into eval task `run`.
1181
1215
  */
1182
- interface TaskReporterEventPayload {
1183
- /**
1184
- * Event name written into report event envelopes.
1185
- */
1186
- event: string;
1187
- /**
1188
- * Optional custom payload persisted under event `data`.
1189
- */
1190
- data?: unknown;
1216
+ interface TaskRunContext {
1191
1217
  /**
1192
- * Optional stable case id when the event maps to one case lifecycle.
1218
+ * Task-scoped cache runtime.
1219
+ *
1220
+ * Use when:
1221
+ * - benchmark setup needs deterministic artifact reuse across attempts
1222
+ * - case-level logic needs typed text/json/binary cache loaders
1193
1223
  */
1194
- caseId?: string;
1195
- }
1196
- /**
1197
- * Eval task definition used by `defineTask`.
1198
- */
1199
- interface TaskDefinition {
1224
+ cache: TaskExecutionContext['cache'];
1200
1225
  /**
1201
- * Stable task id for diagnostics.
1226
+ * Configured model registrations available to model plugins.
1227
+ *
1228
+ * Use when:
1229
+ * - a plugin owns model selection semantics and needs access to registered models
1230
+ * - eval code resolves matrix-selected model axes through plugin helpers
1202
1231
  */
1203
- id: string;
1232
+ models: TaskExecutionContext['models'];
1204
1233
  /**
1205
- * Optional task-local concurrency metadata.
1234
+ * Optional reporter lifecycle hooks for task-local case events.
1206
1235
  *
1207
1236
  * Use when:
1208
- * - task declarations need to preserve task-scoped attempt/case caps for later scheduler wiring
1209
- * - higher-level orchestration wants to inspect task-local concurrency without executing the task
1237
+ * - a caller wants visibility into each case without coupling to the CLI reporter layer
1210
1238
  *
1211
1239
  * Expects:
1212
- * - each provided value to be a positive integer chosen by the caller
1213
- *
1214
- * Returns:
1215
- * - one partial task-local concurrency descriptor
1216
- */
1217
- concurrency?: TaskConcurrencyConfig;
1218
- /**
1219
- * Optional task-local execution policy.
1240
+ * - hooks are best-effort observers and should not affect task scoring
1220
1241
  */
1221
- executionPolicy?: TaskExecutionPolicy;
1242
+ reporterHooks?: TaskReporterHooks;
1222
1243
  /**
1223
- * Optional matrix layering for this task definition.
1244
+ * Optional runtime scheduling overrides supplied by CLI or host execution.
1224
1245
  *
1225
1246
  * Use when:
1226
- * - task-local experiments should refine project/eval defaults
1247
+ * - run operators need to override task/case concurrency without editing eval code
1248
+ * - DSL task runners need to distinguish runtime flags from code defaults
1227
1249
  *
1228
- * @example
1229
- * ```ts
1230
- * matrix: {
1231
- * runMatrix: {
1232
- * override: {
1233
- * model: ['gpt-4.1-mini'],
1234
- * },
1235
- * },
1236
- * evalMatrix: {
1237
- * extend: {
1238
- * evaluator: ['default-judge'],
1239
- * },
1240
- * },
1241
- * }
1242
- * ```
1250
+ * Expects:
1251
+ * - values are positive integers when provided
1252
+ *
1253
+ * @default undefined
1243
1254
  */
1244
- matrix?: ScopedMatrices;
1255
+ runtimeConcurrency?: TaskConcurrencyConfig;
1245
1256
  /**
1246
- * Executes one scheduled eval task.
1257
+ * Cooperative abort signal for the current execution.
1247
1258
  */
1248
- run: (context: TaskRunContext) => Promise<TaskRunOutput> | TaskRunOutput;
1249
- }
1250
- /**
1251
- * Declares the metadata required for a single vieval evaluation module.
1252
- */
1253
- interface EvalDefinition {
1254
- description: string;
1255
- name: string;
1259
+ signal?: AbortSignal;
1256
1260
  /**
1257
- * Optional matrix layering for this eval definition.
1261
+ * Scheduled runner task metadata.
1258
1262
  *
1259
- * Use when:
1260
- * - one eval file needs control-group variants that differ from project defaults
1263
+ * Matrix impact on runtime context:
1261
1264
  *
1262
- * @example
1263
- * ```ts
1264
- * matrix: {
1265
- * runMatrix: {
1266
- * extend: {
1267
- * promptStyle: ['concise'],
1268
- * },
1269
- * override: {
1270
- * scenario: ['eval-scenario'],
1271
- * },
1272
- * },
1273
- * evalMatrix: {
1274
- * override: {
1275
- * rubric: ['strict'],
1276
- * },
1277
- * },
1278
- * }
1265
+ * ```txt
1266
+ * project/eval/task matrix definitions
1267
+ * -> scheduler expands run rows x eval rows
1268
+ * -> one scheduled task per row pair
1269
+ * -> context.task.matrix = {
1270
+ * run: selected run-axis values,
1271
+ * eval: selected eval-axis values,
1272
+ * meta: { runRowId, evalRowId }
1273
+ * }
1279
1274
  * ```
1280
1275
  *
1281
- * Context impact:
1276
+ * Practical impact:
1277
+ * - `runMatrix` axes appear under `context.task.matrix.run.*`
1278
+ * - `evalMatrix` axes appear under `context.task.matrix.eval.*`
1279
+ * - row ids are stable labels for grouping/aggregation under `context.task.matrix.meta.*`
1282
1280
  *
1283
- * ```txt
1284
- * project.runMatrix + eval.matrix.runMatrix + task.matrix.runMatrix
1285
- * => context.task.matrix.run
1281
+ * @example
1282
+ * ```ts
1283
+ * // If final selected rows are:
1284
+ * // run: { model: 'gpt-4.1-mini', scenario: 'stress', promptLanguage: 'zh' }
1285
+ * // eval: { rubric: 'strict', rubricModel: 'judge-large' }
1286
1286
  *
1287
- * project.evalMatrix + eval.matrix.evalMatrix + task.matrix.evalMatrix
1288
- * => context.task.matrix.eval
1287
+ * context.task.matrix.run.model // 'gpt-4.1-mini'
1288
+ * context.task.matrix.run.scenario // 'stress'
1289
+ * context.task.matrix.eval.rubric // 'strict'
1290
+ * context.task.matrix.meta.runRowId // stable encoded row id
1289
1291
  * ```
1290
1292
  */
1291
- matrix?: ScopedMatrices;
1293
+ task: ScheduledTask;
1292
1294
  /**
1293
- * Optional task implementation executed by runner.
1295
+ * Optional telemetry runtime shared by runner, DSL, and reporter integrations.
1296
+ *
1297
+ * Use when:
1298
+ * - task execution should emit events to the currently active telemetry runtime
1299
+ * - enabled and disabled telemetry should keep the same execution path
1300
+ *
1301
+ * Expects:
1302
+ * - callers inject a no-op runtime when telemetry is disabled
1294
1303
  */
1295
- task?: TaskDefinition;
1304
+ telemetry?: TelemetryRuntime;
1296
1305
  }
1297
1306
  /**
1298
- * Describes the shape of an imported vieval evaluation module.
1307
+ * Output of one eval task execution.
1299
1308
  */
1300
- interface EvalModule<TDefinition extends EvalDefinition = EvalDefinition> {
1301
- default: TDefinition;
1309
+ interface TaskRunOutput {
1310
+ /**
1311
+ * Scores emitted by this task run.
1312
+ */
1313
+ scores: readonly RunScore[];
1302
1314
  }
1303
- /**
1304
- * Maps module URLs to their loaded vieval evaluation modules.
1305
- */
1306
- type EvalModuleMap = Record<string, EvalModule>;
1307
- /**
1308
- * Represents a normalized evaluation entry collected by the runner.
1309
- */
1310
- type CollectedEvalEntry<TDefinition extends EvalDefinition = EvalDefinition> = TDefinition & {
1311
- directory: string;
1312
- filePath: string;
1313
- id: string;
1314
- };
1315
1315
  //#endregion
1316
1316
  //#region src/config/define.d.ts
1317
1317
  /**
@@ -1339,19 +1339,19 @@ declare function defineTask<const TDefinition extends TaskDefinition>(definition
1339
1339
  * - a typed plugin shape bound to one config object
1340
1340
  */
1341
1341
  interface ConfigHookPlugin<TConfig> {
1342
- /**
1343
- * Stable plugin name for diagnostics.
1344
- */
1345
- name: string;
1346
1342
  /**
1347
1343
  * Optional config transform hook.
1348
1344
  */
1349
- configVieval?: (config: TConfig) => TConfig | void | Promise<TConfig | void>;
1345
+ configVieval?: (config: TConfig) => Promise<TConfig | void> | TConfig | void;
1350
1346
  /**
1351
1347
  * Optional hook after config is finalized.
1352
1348
  */
1353
- configVievalResolved?: (config: TConfig) => void | Promise<void>;
1349
+ configVievalResolved?: (config: TConfig) => Promise<void> | void;
1350
+ /**
1351
+ * Stable plugin name for diagnostics.
1352
+ */
1353
+ name: string;
1354
1354
  }
1355
1355
  //#endregion
1356
1356
  export { InferenceExecutor as $, RunScheduledTasksOptions as A, asProjectRelativePath as B, TaskDefinition as C, TaskRunContext as D, TaskReporterHooks as E, CreateTaskExecutionContextOptions as F, AggregatedProviderSummary as G, CreateVievalRunnerRuntimeContextOptions as H, TaskExecutionContext as I, RunResult as J, AggregatedRunResults as K, createTaskExecutionContext as L, RunnerTaskState as M, ScheduledTaskExecutor as N, TaskRunOutput as O, runScheduledTasks as P, CreateRunnerScheduleOptions as Q, ModelDefinition as R, TaskConcurrencyConfig as S, TaskReporterEventPayload as T, RunnerRuntimeContext as U, collectEvalEntries as V, createRunnerRuntimeContext as W, RunScoreKind as X, RunScore as Y, aggregateRunResults as Z, ScopedMatrices as _, CliOpenTelemetryReportingConfig as a, ScheduledTaskMatrixMeta as at, TaskCaseReporterPayload as b, EvalDefinition as c, createFilesystemTaskCacheRuntime as ct, MatrixAxisValues as d, CacheFileOptions as dt, RunnerMatrixDefinition as et, MatrixDefinition as f, CacheNamespace as ft, MatrixValue as g, MatrixRow as h, Awaitable as i, ScheduledTaskMatrix as it, RunnerExecutionError as j, TelemetryAttributeValue as k, EvalModule as l, normalizeCacheFilePathSegments as lt, MatrixPrimitive as m, defineEval as n, RunnerMatrixSelection as nt, CliReportingConfig as o, createRunnerSchedule as ot, MatrixLayer as p, TaskCacheRuntime as pt, AggregatedRunSummary as q, defineTask as r, ScheduledTask as rt, CollectedEvalEntry as s, CreateFilesystemTaskCacheRuntimeOptions as st, ConfigHookPlugin as t, RunnerMatrixInput as tt, EvalModuleMap as u, CacheFileHandle as ut, TaskAutoRetryDelay as v, TaskExecutionPolicy as w, TaskCaseState as x, TaskCaseReporterEndPayload as y, resolveModelByName as z };
1357
- //# sourceMappingURL=index-D_aMeWqO.d.mts.map
1357
+ //# sourceMappingURL=index-BLIlhiWT.d.mts.map