vieval 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/README.md +290 -0
  2. package/dist/assertions-DcAjfVDA.mjs +183 -0
  3. package/dist/assertions-DcAjfVDA.mjs.map +1 -0
  4. package/dist/cli/index.d.mts +11 -0
  5. package/dist/cli/index.mjs +1434 -0
  6. package/dist/cli/index.mjs.map +1 -0
  7. package/dist/config-D2fe1SnT.mjs +17 -0
  8. package/dist/config-D2fe1SnT.mjs.map +1 -0
  9. package/dist/config.d.mts +3 -0
  10. package/dist/config.mjs +3 -0
  11. package/dist/core/assertions/index.d.mts +2 -0
  12. package/dist/core/assertions/index.mjs +2 -0
  13. package/dist/core/inference-executors/index.d.mts +273 -0
  14. package/dist/core/inference-executors/index.mjs +225 -0
  15. package/dist/core/inference-executors/index.mjs.map +1 -0
  16. package/dist/core/processors/results/index.d.mts +96 -0
  17. package/dist/core/processors/results/index.mjs +64 -0
  18. package/dist/core/processors/results/index.mjs.map +1 -0
  19. package/dist/core/runner/index.d.mts +2 -0
  20. package/dist/core/runner/index.mjs +2 -0
  21. package/dist/expect-0jPJ7Zio.d.mts +2318 -0
  22. package/dist/expect-extensions-CwPtgTz8.mjs +13471 -0
  23. package/dist/expect-extensions-CwPtgTz8.mjs.map +1 -0
  24. package/dist/expect-i9WZWGrA.mjs +17 -0
  25. package/dist/expect-i9WZWGrA.mjs.map +1 -0
  26. package/dist/expect.d.mts +2 -0
  27. package/dist/expect.mjs +2 -0
  28. package/dist/index-DP7jsORl.d.mts +947 -0
  29. package/dist/index-oSXhM1zx.d.mts +314 -0
  30. package/dist/index.d.mts +92 -0
  31. package/dist/index.mjs +150 -0
  32. package/dist/index.mjs.map +1 -0
  33. package/dist/magic-string.es-CH1jwzMg.mjs +1013 -0
  34. package/dist/magic-string.es-CH1jwzMg.mjs.map +1 -0
  35. package/dist/models-D_MsBtYw.mjs +14 -0
  36. package/dist/models-D_MsBtYw.mjs.map +1 -0
  37. package/dist/plugin-DVaRZY2x.d.mts +84 -0
  38. package/dist/plugins/chat-models/index.d.mts +90 -0
  39. package/dist/plugins/chat-models/index.mjs +48 -0
  40. package/dist/plugins/chat-models/index.mjs.map +1 -0
  41. package/dist/registry-ChOjjdEC.mjs +245 -0
  42. package/dist/registry-ChOjjdEC.mjs.map +1 -0
  43. package/dist/runner-4ZsOveoY.mjs +480 -0
  44. package/dist/runner-4ZsOveoY.mjs.map +1 -0
  45. package/dist/testing/expect-extensions.d.mts +86 -0
  46. package/dist/testing/expect-extensions.mjs +2 -0
  47. package/package.json +88 -0
@@ -0,0 +1,947 @@
1
+ import { n as ModelDefinition } from "./plugin-DVaRZY2x.mjs";
2
+
3
+ //#region src/core/runner/schedule.d.ts
4
+ /**
5
+ * Describes the inferenceExecutor target for a scheduled eval run.
6
+ */
7
+ interface InferenceExecutor {
8
+ /**
9
+ * Stable inferenceExecutor identifier such as `openai:gpt-4.1-mini`.
10
+ */
11
+ id: string;
12
+ }
13
+ /**
14
+ * Stores the selected value for each matrix axis.
15
+ */
16
+ type RunnerMatrixSelection = Record<string, string>;
17
+ /**
18
+ * Stores stable row ids for one resolved scheduled task matrix.
19
+ */
20
+ interface ScheduledTaskMatrixMeta {
21
+ /**
22
+ * Stable row id for the resolved run matrix selection.
23
+ */
24
+ runRowId: string;
25
+ /**
26
+ * Stable row id for the resolved eval matrix selection.
27
+ */
28
+ evalRowId: string;
29
+ }
30
+ /**
31
+ * Stores the structured matrix payload for one scheduled task.
32
+ */
33
+ interface ScheduledTaskMatrix {
34
+ /**
35
+ * Runtime matrix selection visible to task code.
36
+ */
37
+ run: RunnerMatrixSelection;
38
+ /**
39
+ * Eval-time matrix selection visible to task code.
40
+ */
41
+ eval: RunnerMatrixSelection;
42
+ /**
43
+ * Stable row ids for both scopes.
44
+ */
45
+ meta: ScheduledTaskMatrixMeta;
46
+ }
47
+ /**
48
+ * Maps matrix axis names to the values that should be expanded.
49
+ */
50
+ type RunnerMatrixDefinition = MatrixDefinition;
51
+ /**
52
+ * Accepts either flat axis definitions or one layered matrix object.
53
+ */
54
+ type RunnerMatrixInput = RunnerMatrixDefinition | MatrixLayer;
55
+ /**
56
+ * Represents one fully expanded runner task.
57
+ */
58
+ interface ScheduledTask {
59
+ /**
60
+ * Stable task id derived from the entry, inferenceExecutor, and matrix selection.
61
+ */
62
+ id: string;
63
+ /**
64
+ * The collected eval entry to execute.
65
+ */
66
+ entry: CollectedEvalEntry;
67
+ /**
68
+ * The inferenceExecutor selected for this task.
69
+ */
70
+ inferenceExecutor: InferenceExecutor;
71
+ /**
72
+ * The concrete scoped matrix selection for this task.
73
+ */
74
+ matrix: ScheduledTaskMatrix;
75
+ }
76
+ /**
77
+ * Configures how the runner should expand its execution matrix.
78
+ */
79
+ interface CreateRunnerScheduleOptions {
80
+ /**
81
+ * Collected eval entries that should be scheduled.
82
+ */
83
+ entries: readonly CollectedEvalEntry[];
84
+ /**
85
+ * Inference executors that should run each entry.
86
+ */
87
+ inferenceExecutors: readonly InferenceExecutor[];
88
+ /**
89
+ * Optional run-time matrix axes expanded as a cartesian product.
90
+ */
91
+ runMatrix?: RunnerMatrixInput;
92
+ /**
93
+ * Optional eval-time matrix axes expanded as a cartesian product.
94
+ */
95
+ evalMatrix?: RunnerMatrixInput;
96
+ }
97
+ /**
98
+ * Expands collected entries into a stable runner schedule.
99
+ *
100
+ * Call stack:
101
+ *
102
+ * {@link collectEvalEntries} (`../runner`)
103
+ * -> {@link createRunnerSchedule}
104
+ * -> {@link expandAxesToRows}
105
+ * -> {@link ScheduledTask}[]
106
+ *
107
+ * Use when:
108
+ * - the runner already knows which eval entries are available
109
+ * - each entry must run against multiple inferenceExecutors or matrix variants
110
+ *
111
+ * Expects:
112
+ * - `entries` and `inferenceExecutors` to be provided in the desired execution order
113
+ * - matrix axes to use insertion order when generating combinations
114
+ */
115
+ declare function createRunnerSchedule(options: CreateRunnerScheduleOptions): ScheduledTask[];
116
+ //#endregion
117
+ //#region src/core/runner/aggregate.d.ts
118
+ /**
119
+ * Identifies the scoring family for a single eval score.
120
+ */
121
+ type RunScoreKind = 'exact' | 'judge';
122
+ /**
123
+ * Represents one normalized score emitted by a completed eval run.
124
+ */
125
+ interface RunScore {
126
+ /**
127
+ * Score family used for aggregation.
128
+ */
129
+ kind: RunScoreKind;
130
+ /**
131
+ * Normalized score in the `0..1` range.
132
+ */
133
+ score: number;
134
+ }
135
+ /**
136
+ * Captures the output of one scheduled runner task.
137
+ */
138
+ interface RunResult {
139
+ /**
140
+ * Stable run id, usually copied from the scheduled task id.
141
+ */
142
+ id: string;
143
+ /**
144
+ * Collected eval entry id.
145
+ */
146
+ entryId: string;
147
+ /**
148
+ * Stable inferenceExecutor id.
149
+ */
150
+ inferenceExecutorId: string;
151
+ /**
152
+ * Concrete matrix selection used by the run.
153
+ */
154
+ matrix: ScheduledTaskMatrix;
155
+ /**
156
+ * Raw scores emitted by the eval.
157
+ */
158
+ scores: readonly RunScore[];
159
+ }
160
+ /**
161
+ * Stores the per-run score averages after normalization.
162
+ */
163
+ interface AggregatedRunSummary {
164
+ /**
165
+ * Stable run id.
166
+ */
167
+ id: string;
168
+ /**
169
+ * Collected eval entry id.
170
+ */
171
+ entryId: string;
172
+ /**
173
+ * Stable inferenceExecutor id.
174
+ */
175
+ inferenceExecutorId: string;
176
+ /**
177
+ * Concrete matrix selection used by the run.
178
+ */
179
+ matrix: ScheduledTaskMatrix;
180
+ /**
181
+ * Mean of exact-match scores or `null` when absent.
182
+ */
183
+ exactAverage: number | null;
184
+ /**
185
+ * Mean of judge-based scores or `null` when absent.
186
+ */
187
+ judgeAverage: number | null;
188
+ /**
189
+ * Hybrid average. Uses both families when present, otherwise falls back to the
190
+ * single available family.
191
+ */
192
+ hybridAverage: number | null;
193
+ }
194
+ /**
195
+ * Stores inferenceExecutor-level score aggregates across multiple runs.
196
+ */
197
+ interface AggregatedProviderSummary {
198
+ /**
199
+ * Stable inferenceExecutor id.
200
+ */
201
+ inferenceExecutorId: string;
202
+ /**
203
+ * Number of runs included in this inferenceExecutor bucket.
204
+ */
205
+ runCount: number;
206
+ /**
207
+ * Mean of all exact-match scores or `null` when absent.
208
+ */
209
+ exactAverage: number | null;
210
+ /**
211
+ * Mean of all judge-based scores or `null` when absent.
212
+ */
213
+ judgeAverage: number | null;
214
+ /**
215
+ * Hybrid average derived from the inferenceExecutor exact and judge means.
216
+ */
217
+ hybridAverage: number | null;
218
+ }
219
+ /**
220
+ * Stores the final aggregation output for a batch of runner results.
221
+ */
222
+ interface AggregatedRunResults {
223
+ /**
224
+ * Per-run normalized score summaries.
225
+ */
226
+ runs: AggregatedRunSummary[];
227
+ /**
228
+ * InferenceExecutor-level summaries sorted by inferenceExecutor id.
229
+ */
230
+ inferenceExecutors: AggregatedProviderSummary[];
231
+ /**
232
+ * Overall summary across every run.
233
+ */
234
+ overall: {
235
+ exactAverage: number | null;
236
+ judgeAverage: number | null;
237
+ hybridAverage: number | null;
238
+ runCount: number;
239
+ };
240
+ }
241
+ /**
242
+ * Aggregates exact-match and judge-based scores into hybrid runner summaries.
243
+ *
244
+ * Call stack:
245
+ *
246
+ * {@link runScheduledTasks}
247
+ * -> {@link aggregateRunResults}
248
+ * -> {@link createRunSummary}
249
+ * -> {@link createProviderSummary}
250
+ * -> `report output`
251
+ *
252
+ * Use when:
253
+ * - a runner batch mixes deterministic exact checks with judge-based grading
254
+ * - inferenceExecutor comparison should preserve both score families and one hybrid view
255
+ *
256
+ * Expects:
257
+ * - each score to be normalized to the `0..1` range before aggregation
258
+ * - `scores.kind` to use only `'exact'` or `'judge'`
259
+ */
260
+ declare function aggregateRunResults(results: readonly RunResult[]): AggregatedRunResults;
261
+ //#endregion
262
+ //#region src/core/runner/runtime-context.d.ts
263
+ /**
264
+ * Shared runtime context used by the vieval runner.
265
+ *
266
+ * Use when:
267
+ * - runner services need stable path resolution without module-level side effects
268
+ * - call sites want deterministic control over workspace root detection
269
+ */
270
+ interface RunnerRuntimeContext {
271
+ /**
272
+ * Absolute project root directory used for path normalization.
273
+ */
274
+ projectRootDirectory: string;
275
+ }
276
+ /**
277
+ * Options used to construct the runner runtime context.
278
+ */
279
+ interface CreateVievalRunnerRuntimeContextOptions {
280
+ /**
281
+ * Directory used to search for the nearest pnpm workspace.
282
+ *
283
+ * @default directory of this module file
284
+ */
285
+ cwd?: string;
286
+ /**
287
+ * Absolute fallback directory when a pnpm workspace root is not found.
288
+ *
289
+ * @default package root directory (`packages/vieval`)
290
+ */
291
+ fallbackProjectRootDirectory?: string;
292
+ }
293
+ /**
294
+ * Creates a side-effect-free runtime context for runner path normalization.
295
+ *
296
+ * Call stack:
297
+ *
298
+ * {@link createRunnerRuntimeContext}
299
+ * -> `findWorkspaceDir(cwd)`
300
+ * -> `resolve projectRootDirectory`
301
+ * -> `{ projectRootDirectory }`
302
+ *
303
+ * Use when:
304
+ * - initializing runner infrastructure before collecting eval modules
305
+ * - tests need deterministic root resolution behavior
306
+ */
307
+ declare function createRunnerRuntimeContext(options?: CreateVievalRunnerRuntimeContextOptions): Promise<RunnerRuntimeContext>;
308
+ //#endregion
309
+ //#region src/core/runner/collect.d.ts
310
+ /**
311
+ * Converts a file path into a project-relative path when possible.
312
+ *
313
+ * Before: `/repo/plugins/airi-plugin-game-chess/src/agent/evals/chess-commentary.eval.ts`
314
+ * After: `plugins/airi-plugin-game-chess/src/agent/evals/chess-commentary.eval.ts`
315
+ *
316
+ * Before: `D:/repo/plugins/airi-plugin-game-chess/src/agent/evals/chess-commentary.eval.ts`
317
+ * After: `D:/repo/plugins/airi-plugin-game-chess/src/agent/evals/chess-commentary.eval.ts`
318
+ */
319
+ declare function asProjectRelativePath(filePath: string, context: RunnerRuntimeContext): string;
320
+ /**
321
+ * Collects loaded vieval modules into sorted runner entries with stable ids.
322
+ *
323
+ * Call stack:
324
+ *
325
+ * `import.meta.glob(...)`
326
+ * -> {@link collectEvalEntries}
327
+ * -> {@link createCollectedEvalEntry}
328
+ * -> {@link CollectedEvalEntry}[]
329
+ *
330
+ * Use when:
331
+ * - the runner has already loaded candidate eval modules
332
+ * - downstream scheduling needs stable entry ids and directory metadata
333
+ */
334
+ declare function collectEvalEntries(modules: EvalModuleMap, context: RunnerRuntimeContext): CollectedEvalEntry[];
335
+ //#endregion
336
+ //#region src/core/runner/task-context.d.ts
337
+ /**
338
+ * Options for selecting a model from the execution context.
339
+ */
340
+ interface TaskModelSelectionOptions {
341
+ /**
342
+ * Model id or alias name.
343
+ */
344
+ name: string;
345
+ }
346
+ /**
347
+ * Task-scoped execution context exposed to runner executors.
348
+ */
349
+ interface TaskExecutionContext {
350
+ /**
351
+ * Resolves model configuration for the current task.
352
+ *
353
+ * Use when:
354
+ * - no arguments are passed, to use the model selected by the run matrix/inferenceExecutor
355
+ * - `name` is provided to resolve a specific model id or alias
356
+ */
357
+ model: (selection?: string | TaskModelSelectionOptions) => ModelDefinition;
358
+ }
359
+ /**
360
+ * Inputs used to build task execution context.
361
+ */
362
+ interface CreateTaskExecutionContextOptions {
363
+ models: readonly ModelDefinition[];
364
+ task: ScheduledTask;
365
+ }
366
+ /**
367
+ * Creates task-scoped model resolver context for runner execution.
368
+ *
369
+ * Call stack:
370
+ *
371
+ * {@link runScheduledTasks}
372
+ * -> {@link createTaskExecutionContext}
373
+ * -> {@link resolveModelByName}
374
+ * -> `task.model()` / `task.model({ name })`
375
+ */
376
+ declare function createTaskExecutionContext(options: CreateTaskExecutionContextOptions): TaskExecutionContext;
377
+ //#endregion
378
+ //#region src/core/runner/run.d.ts
379
+ /**
380
+ * Executes one scheduled runner task and returns a normalized run result.
381
+ *
382
+ * Use when:
383
+ * - a scheduler already selected the task and execution context
384
+ * - the caller wants a typed executor contract for runner workers
385
+ *
386
+ * Expects:
387
+ * - the task context to be ready for model resolution and task-scoped work
388
+ *
389
+ * Returns:
390
+ * - a normalized run result with score entries ready for aggregation
391
+ */
392
+ type ScheduledTaskExecutor = (task: ScheduledTask, context: TaskExecutionContext) => Promise<RunResult>;
393
+ /**
394
+ * Terminal task state reported by runner lifecycle hooks.
395
+ *
396
+ * Use when:
397
+ * - reporting the outcome of one scheduled task to lifecycle observers
398
+ *
399
+ * Expects:
400
+ * - hooks treat the value as final for the completed task
401
+ */
402
+ type RunnerTaskState = 'passed' | 'failed';
403
+ /**
404
+ * Optional runner execution hooks used while processing scheduled tasks.
405
+ *
406
+ * Use when:
407
+ * - callers want lifecycle visibility around sequential task execution
408
+ * - task execution should remain deterministic while still observable
409
+ *
410
+ * Expects:
411
+ * - hook functions are synchronous lifecycle observers
412
+ */
413
+ interface RunScheduledTasksOptions {
414
+ /**
415
+ * Creates per-task execution context.
416
+ *
417
+ * Use when:
418
+ * - executor code needs per-task model resolution or other task-scoped data
419
+ */
420
+ createExecutionContext?: (task: ScheduledTask) => TaskExecutionContext;
421
+ /**
422
+ * Runs before the executor starts handling a task.
423
+ *
424
+ * Use when:
425
+ * - callers want to observe task activation before execution begins
426
+ *
427
+ * Expects:
428
+ * - thrown errors abort the task before executor work starts
429
+ */
430
+ onTaskStart?: (task: ScheduledTask) => void;
431
+ /**
432
+ * Runs after the executor settles for a task.
433
+ *
434
+ * Use when:
435
+ * - callers want to observe successful and failed task completion
436
+ *
437
+ * Expects:
438
+ * - thrown errors abort successful runs
439
+ * - failed-task observers do not override the executor error for the task
440
+ */
441
+ onTaskEnd?: (task: ScheduledTask, state: RunnerTaskState) => void;
442
+ }
443
+ /**
444
+ * Error thrown when a scheduled run fails before producing a normalized result.
445
+ */
446
+ declare class RunnerExecutionError extends Error {
447
+ /**
448
+ * Stable task id that failed.
449
+ */
450
+ taskId: string;
451
+ constructor(taskId: string, cause: unknown);
452
+ }
453
+ /**
454
+ * Executes runner tasks sequentially and aggregates the normalized results.
455
+ *
456
+ * Call stack:
457
+ *
458
+ * {@link createRunnerSchedule}
459
+ * -> {@link runScheduledTasks}
460
+ * -> `executor(task)`
461
+ * -> {@link aggregateRunResults}
462
+ *
463
+ * Use when:
464
+ * - the caller already expanded the runner matrix
465
+ * - task execution should stay deterministic and easy to debug
466
+ *
467
+ * Expects:
468
+ * - `executor` to return normalized `0..1` scores
469
+ * - callers to handle concurrency outside this helper when needed
470
+ * - `onTaskStart` / `onTaskEnd` hooks to be synchronous lifecycle observers
471
+ *
472
+ * Throws:
473
+ * - `RunnerExecutionError` when task setup, hooks, or the executor throws
474
+ */
475
+ declare function runScheduledTasks(tasks: readonly ScheduledTask[], executor: ScheduledTaskExecutor, options?: RunScheduledTasksOptions): Promise<AggregatedRunResults>;
476
+ //#endregion
477
+ //#region src/config/types.d.ts
478
+ /**
479
+ * Primitive value allowed in one matrix cell.
480
+ *
481
+ * Use when:
482
+ * - defining axis values for canonical layered matrix config
483
+ * - preserving JSON-safe primitive values through config normalization
484
+ *
485
+ * Expects:
486
+ * - values remain serializable and comparable with stringified task ids
487
+ *
488
+ * Returns:
489
+ * - one JSON-friendly primitive matrix value
490
+ */
491
+ type MatrixPrimitive = string | number | boolean;
492
+ /**
493
+ * Canonical matrix value type.
494
+ *
495
+ * Use when:
496
+ * - declaring matrix axis values at the config boundary
497
+ *
498
+ * Expects:
499
+ * - values are normalized from config input without extra wrapping
500
+ *
501
+ * Returns:
502
+ * - a primitive cell value suitable for matrix expansion
503
+ */
504
+ type MatrixValue = MatrixPrimitive;
505
+ /**
506
+ * Canonical row payload for one matrix combination.
507
+ *
508
+ * Use when:
509
+ * - storing the selected values for a resolved matrix row
510
+ * - passing task-level matrix context between layers
511
+ *
512
+ * Expects:
513
+ * - keys are axis names and values are resolved axis selections
514
+ *
515
+ * Returns:
516
+ * - one resolved row object
517
+ */
518
+ type MatrixRow = Record<string, MatrixValue>;
519
+ /**
520
+ * Canonical axis value list for one matrix definition.
521
+ *
522
+ * Use when:
523
+ * - describing the values that one axis can expand into
524
+ *
525
+ * Expects:
526
+ * - values are ordered and deterministic
527
+ *
528
+ * Returns:
529
+ * - one axis value list
530
+ */
531
+ type MatrixAxisValues = readonly MatrixValue[];
532
+ /**
533
+ * Canonical flat matrix definition: maps each axis name to its value list.
534
+ *
535
+ * Use when:
536
+ * - declaring the axes for the `extend` or `override` section of a `MatrixLayer`
537
+ *
538
+ * Expects:
539
+ * - keys are axis names
540
+ * - values are `MatrixAxisValues` lists
541
+ * - layering (`extend` / `override` / `disable`) is expressed by `MatrixLayer`, not by this type
542
+ *
543
+ * Returns:
544
+ * - one axis-name-to-values mapping
545
+ */
546
+ type MatrixDefinition = Record<string, MatrixAxisValues>;
547
+ /**
548
+ * Canonical matrix layer payload.
549
+ *
550
+ * Use when:
551
+ * - a project, eval, or task needs scoped matrix layering
552
+ *
553
+ * Expects:
554
+ * - absent sections are treated as empty
555
+ *
556
+ * Resolution order:
557
+ *
558
+ * ```txt
559
+ * current-layer:
560
+ * disable -> extend -> override
561
+ * ```
562
+ *
563
+ * Returns:
564
+ * - a layer object with optional extend, override, and disable sections
565
+ *
566
+ * @example
567
+ * ```ts
568
+ * const layer: MatrixLayer = {
569
+ * disable: ['temperatureProfile'],
570
+ * extend: {
571
+ * scenario: ['baseline', 'stress'],
572
+ * },
573
+ * override: {
574
+ * model: ['gpt-4.1-mini'],
575
+ * },
576
+ * }
577
+ * ```
578
+ */
579
+ interface MatrixLayer {
580
+ /**
581
+ * Matrix axes inherited or appended at this layer.
582
+ *
583
+ * @example
584
+ * ```ts
585
+ * extend: {
586
+ * promptLanguage: ['en', 'zh'],
587
+ * scenario: ['baseline'],
588
+ * }
589
+ * ```
590
+ */
591
+ extend?: MatrixDefinition;
592
+ /**
593
+ * Matrix axes replaced at this layer.
594
+ *
595
+ * @example
596
+ * ```ts
597
+ * override: {
598
+ * rubric: ['strict'],
599
+ * }
600
+ * ```
601
+ */
602
+ override?: MatrixDefinition;
603
+ /**
604
+ * Matrix axes disabled at this layer.
605
+ *
606
+ * @example
607
+ * ```ts
608
+ * disable: ['temperatureProfile']
609
+ * ```
610
+ */
611
+ disable?: readonly string[];
612
+ }
613
+ /**
614
+ * Canonical run/eval matrix grouping.
615
+ *
616
+ * Use when:
617
+ * - a task or eval definition needs separate run and eval matrix scopes
618
+ *
619
+ * Expects:
620
+ * - each scope is optional and independently normalized
621
+ *
622
+ * Orchestration model:
623
+ *
624
+ * ```txt
625
+ * run scope:
626
+ * project.runMatrix -> eval.matrix.runMatrix -> task.matrix.runMatrix
627
+ *
628
+ * eval scope:
629
+ * project.evalMatrix -> eval.matrix.evalMatrix -> task.matrix.evalMatrix
630
+ *
631
+ * expanded tasks:
632
+ * run rows x eval rows
633
+ * ```
634
+ *
635
+ * Returns:
636
+ * - a grouped matrix object with optional run and eval layers
637
+ *
638
+ * @example
639
+ * ```ts
640
+ * const scoped: ScopedMatrices = {
641
+ * runMatrix: {
642
+ * extend: {
643
+ * model: ['gpt-4.1-mini', 'gpt-4.1'],
644
+ * scenario: ['baseline', 'stress'],
645
+ * },
646
+ * },
647
+ * evalMatrix: {
648
+ * extend: {
649
+ * rubric: ['strict', 'lenient'],
650
+ * rubricModel: ['judge-mini', 'judge-large'],
651
+ * },
652
+ * },
653
+ * }
654
+ * ```
655
+ */
656
+ interface ScopedMatrices {
657
+ /**
658
+ * Runtime matrix scope.
659
+ *
660
+ * @example
661
+ * ```ts
662
+ * runMatrix: {
663
+ * extend: {
664
+ * promptLanguage: ['en', 'zh'],
665
+ * },
666
+ * }
667
+ * ```
668
+ */
669
+ runMatrix?: MatrixLayer;
670
+ /**
671
+ * Eval-time matrix scope.
672
+ *
673
+ * @example
674
+ * ```ts
675
+ * evalMatrix: {
676
+ * override: {
677
+ * rubric: ['strict'],
678
+ * },
679
+ * }
680
+ * ```
681
+ */
682
+ evalMatrix?: MatrixLayer;
683
+ }
684
+ /**
685
+ * Output of one eval task execution.
686
+ */
687
+ interface TaskRunOutput {
688
+ /**
689
+ * Scores emitted by this task run.
690
+ */
691
+ scores: readonly RunScore[];
692
+ }
693
+ /**
694
+ * Runtime context passed into eval task `run`.
695
+ */
696
+ interface TaskRunContext {
697
+ /**
698
+ * Scheduled runner task metadata.
699
+ *
700
+ * Matrix impact on runtime context:
701
+ *
702
+ * ```txt
703
+ * project/eval/task matrix definitions
704
+ * -> scheduler expands run rows x eval rows
705
+ * -> one scheduled task per row pair
706
+ * -> context.task.matrix = {
707
+ * run: selected run-axis values,
708
+ * eval: selected eval-axis values,
709
+ * meta: { runRowId, evalRowId }
710
+ * }
711
+ * ```
712
+ *
713
+ * Practical impact:
714
+ * - `runMatrix` axes appear under `context.task.matrix.run.*`
715
+ * - `evalMatrix` axes appear under `context.task.matrix.eval.*`
716
+ * - row ids are stable labels for grouping/aggregation under `context.task.matrix.meta.*`
717
+ *
718
+ * @example
719
+ * ```ts
720
+ * // If final selected rows are:
721
+ * // run: { model: 'gpt-4.1-mini', scenario: 'stress', promptLanguage: 'zh' }
722
+ * // eval: { rubric: 'strict', rubricModel: 'judge-large' }
723
+ *
724
+ * context.task.matrix.run.model // 'gpt-4.1-mini'
725
+ * context.task.matrix.run.scenario // 'stress'
726
+ * context.task.matrix.eval.rubric // 'strict'
727
+ * context.task.matrix.meta.runRowId // stable encoded row id
728
+ * ```
729
+ */
730
+ task: ScheduledTask;
731
+ /**
732
+ * Matrix-scoped model resolver.
733
+ *
734
+ * Runtime impact:
735
+ * - `context.model()` uses `context.task.matrix.run.model` first when present
736
+ * - then falls back to inferenceExecutor-id match
737
+ * - then falls back to first configured model
738
+ *
739
+ * @example
740
+ * ```ts
741
+ * // matrix.run.model = 'gpt-4.1-mini'
742
+ * const defaultModel = context.model()
743
+ * // resolves the configured model whose id/model/alias matches 'gpt-4.1-mini'
744
+ *
745
+ * const judgeModel = context.model({ name: 'judge-large' })
746
+ * // explicit lookup bypasses matrix default
747
+ * ```
748
+ */
749
+ model: TaskExecutionContext['model'];
750
+ /**
751
+ * Optional reporter lifecycle hooks for task-local case events.
752
+ *
753
+ * Use when:
754
+ * - a caller wants visibility into each case without coupling to the CLI reporter layer
755
+ *
756
+ * Expects:
757
+ * - hooks are best-effort observers and should not affect task scoring
758
+ */
759
+ reporterHooks?: TaskReporterHooks;
760
+ }
761
+ /**
762
+ * Allowed terminal outcomes for one task case.
763
+ *
764
+ * Use when:
765
+ * - emitting case lifecycle events from the task DSL
766
+ *
767
+ * Expects:
768
+ * - consumers treat the value as the final state for the case
769
+ */
770
+ type TaskCaseState = 'passed' | 'failed';
771
+ /**
772
+ * Payload emitted when a task case starts.
773
+ *
774
+ * Use when:
775
+ * - reporter hooks need a stable position for one case within the task
776
+ *
777
+ * Expects:
778
+ * - `name` is the declared DSL case label
779
+ * - `index` is the zero-based case position within the task
780
+ * - `total` is the total number of registered cases
781
+ */
782
+ interface TaskCaseReporterPayload {
783
+ /**
784
+ * Declared case label.
785
+ */
786
+ name: string;
787
+ /**
788
+ * Zero-based case position within the task.
789
+ */
790
+ index: number;
791
+ /**
792
+ * Total number of registered cases.
793
+ */
794
+ total: number;
795
+ }
796
+ /**
797
+ * Payload emitted when a task case ends.
798
+ *
799
+ * Use when:
800
+ * - reporter hooks need the case position plus terminal state
801
+ *
802
+ * Expects:
803
+ * - `name` is the declared DSL case label
804
+ * - `index` is the zero-based case position within the task
805
+ * - `total` is the total number of registered cases
806
+ * - `state` describes the final case result
807
+ */
808
+ interface TaskCaseReporterEndPayload extends TaskCaseReporterPayload {
809
+ /**
810
+ * Final case state.
811
+ */
812
+ state: TaskCaseState;
813
+ }
814
+ /**
815
+ * Reporter hooks invoked around each task case execution.
816
+ *
817
+ * Use when:
818
+ * - a caller needs case-level lifecycle visibility from the DSL runner
819
+ * - downstream reporters should stay decoupled from the task execution path
820
+ *
821
+ * Expects:
822
+ * - hooks observe case start/end events but do not influence scoring
823
+ */
824
+ interface TaskReporterHooks {
825
+ /**
826
+ * Runs when a case is about to execute.
827
+ */
828
+ onCaseStart?: (payload: TaskCaseReporterPayload) => void;
829
+ /**
830
+ * Runs after a case settles.
831
+ */
832
+ onCaseEnd?: (payload: TaskCaseReporterEndPayload) => void;
833
+ }
834
+ /**
835
+ * Eval task definition used by `defineTask`.
836
+ */
837
+ interface TaskDefinition {
838
+ /**
839
+ * Stable task id for diagnostics.
840
+ */
841
+ id: string;
842
+ /**
843
+ * Optional matrix layering for this task definition.
844
+ *
845
+ * Use when:
846
+ * - task-local experiments should refine project/eval defaults
847
+ *
848
+ * @example
849
+ * ```ts
850
+ * matrix: {
851
+ * runMatrix: {
852
+ * override: {
853
+ * model: ['gpt-4.1-mini'],
854
+ * },
855
+ * },
856
+ * evalMatrix: {
857
+ * extend: {
858
+ * evaluator: ['default-judge'],
859
+ * },
860
+ * },
861
+ * }
862
+ * ```
863
+ */
864
+ matrix?: ScopedMatrices;
865
+ /**
866
+ * Executes one scheduled eval task.
867
+ */
868
+ run: (context: TaskRunContext) => Promise<TaskRunOutput> | TaskRunOutput;
869
+ }
870
+ /**
871
+ * Declares the metadata required for a single vieval evaluation module.
872
+ */
873
+ interface EvalDefinition {
874
+ description: string;
875
+ name: string;
876
+ /**
877
+ * Optional matrix layering for this eval definition.
878
+ *
879
+ * Use when:
880
+ * - one eval file needs control-group variants that differ from project defaults
881
+ *
882
+ * @example
883
+ * ```ts
884
+ * matrix: {
885
+ * runMatrix: {
886
+ * extend: {
887
+ * promptStyle: ['concise'],
888
+ * },
889
+ * override: {
890
+ * scenario: ['eval-scenario'],
891
+ * },
892
+ * },
893
+ * evalMatrix: {
894
+ * override: {
895
+ * rubric: ['strict'],
896
+ * },
897
+ * },
898
+ * }
899
+ * ```
900
+ *
901
+ * Context impact:
902
+ *
903
+ * ```txt
904
+ * project.runMatrix + eval.matrix.runMatrix + task.matrix.runMatrix
905
+ * => context.task.matrix.run
906
+ *
907
+ * project.evalMatrix + eval.matrix.evalMatrix + task.matrix.evalMatrix
908
+ * => context.task.matrix.eval
909
+ * ```
910
+ */
911
+ matrix?: ScopedMatrices;
912
+ /**
913
+ * Optional task implementation executed by runner.
914
+ */
915
+ task?: TaskDefinition;
916
+ }
917
+ /**
918
+ * Describes the shape of an imported vieval evaluation module.
919
+ */
920
+ interface EvalModule<TDefinition extends EvalDefinition = EvalDefinition> {
921
+ default: TDefinition;
922
+ }
923
+ /**
924
+ * Maps module URLs to their loaded vieval evaluation modules.
925
+ */
926
+ type EvalModuleMap = Record<string, EvalModule>;
927
+ /**
928
+ * Represents a normalized evaluation entry collected by the runner.
929
+ */
930
+ type CollectedEvalEntry<TDefinition extends EvalDefinition = EvalDefinition> = TDefinition & {
931
+ directory: string;
932
+ filePath: string;
933
+ id: string;
934
+ };
935
+ //#endregion
936
+ //#region src/config/define.d.ts
937
+ /**
938
+ * Returns the provided vieval definition while preserving literal field types.
939
+ */
940
+ declare function defineEval<const TDefinition extends EvalDefinition>(definition: TDefinition): TDefinition;
941
+ /**
942
+ * Returns the provided task definition while preserving literal field types.
943
+ */
944
+ declare function defineTask<const TDefinition extends TaskDefinition>(definition: TDefinition): TDefinition;
945
+ //#endregion
946
+ export { asProjectRelativePath as A, RunScoreKind as B, RunnerTaskState as C, TaskExecutionContext as D, CreateTaskExecutionContextOptions as E, AggregatedProviderSummary as F, RunnerMatrixInput as G, CreateRunnerScheduleOptions as H, AggregatedRunResults as I, ScheduledTaskMatrix as J, RunnerMatrixSelection as K, AggregatedRunSummary as L, CreateVievalRunnerRuntimeContextOptions as M, RunnerRuntimeContext as N, TaskModelSelectionOptions as O, createRunnerRuntimeContext as P, RunResult as R, RunnerExecutionError as S, runScheduledTasks as T, InferenceExecutor as U, aggregateRunResults as V, RunnerMatrixDefinition as W, createRunnerSchedule as X, ScheduledTaskMatrixMeta as Y, TaskDefinition as _, EvalModule as a, TaskRunOutput as b, MatrixDefinition as c, MatrixRow as d, MatrixValue as f, TaskCaseState as g, TaskCaseReporterPayload as h, EvalDefinition as i, collectEvalEntries as j, createTaskExecutionContext as k, MatrixLayer as l, TaskCaseReporterEndPayload as m, defineTask as n, EvalModuleMap as o, ScopedMatrices as p, ScheduledTask as q, CollectedEvalEntry as r, MatrixAxisValues as s, defineEval as t, MatrixPrimitive as u, TaskReporterHooks as v, ScheduledTaskExecutor as w, RunScheduledTasksOptions as x, TaskRunContext as y, RunScore as z };
947
+ //# sourceMappingURL=index-DP7jsORl.d.mts.map