vieval 0.0.1 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. package/README.md +8 -5
  2. package/dist/cli/index.d.mts +1 -1
  3. package/dist/cli/index.mjs +1232 -83
  4. package/dist/cli/index.mjs.map +1 -1
  5. package/dist/{config-D2fe1SnT.mjs → config-CHN24egi.mjs} +1 -1
  6. package/dist/{config-D2fe1SnT.mjs.map → config-CHN24egi.mjs.map} +1 -1
  7. package/dist/config.d.mts +2 -3
  8. package/dist/config.mjs +2 -2
  9. package/dist/core/assertions/index.d.mts +1 -1
  10. package/dist/core/inference-executors/index.d.mts +1 -45
  11. package/dist/core/inference-executors/index.mjs +1 -38
  12. package/dist/core/inference-executors/index.mjs.map +1 -1
  13. package/dist/core/processors/results/index.d.mts +1 -1
  14. package/dist/core/runner/index.d.mts +2 -2
  15. package/dist/core/runner/index.mjs +2 -2
  16. package/dist/env-C7X81PWa.mjs +41 -0
  17. package/dist/env-C7X81PWa.mjs.map +1 -0
  18. package/dist/env-DtpjACOW.d.mts +47 -0
  19. package/dist/expect-B2vaoRVZ.d.mts +10 -0
  20. package/dist/{expect-i9WZWGrA.mjs → expect-CaXiUkwY.mjs} +3 -3
  21. package/dist/expect-CaXiUkwY.mjs.map +1 -0
  22. package/dist/expect-extensions-BOzwV5EJ.mjs +197 -0
  23. package/dist/expect-extensions-BOzwV5EJ.mjs.map +1 -0
  24. package/dist/expect.d.mts +1 -1
  25. package/dist/expect.mjs +1 -1
  26. package/dist/{index-DP7jsORl.d.mts → index-BDMEAmf2.d.mts} +246 -3
  27. package/dist/{index-oSXhM1zx.d.mts → index-C3gPFmcR.d.mts} +2 -2
  28. package/dist/index.d.mts +326 -6
  29. package/dist/index.mjs +65 -23
  30. package/dist/index.mjs.map +1 -1
  31. package/dist/{models-D_MsBtYw.mjs → models-DIGdOUpJ.mjs} +1 -1
  32. package/dist/{models-D_MsBtYw.mjs.map → models-DIGdOUpJ.mjs.map} +1 -1
  33. package/dist/plugins/chat-models/index.d.mts +465 -6
  34. package/dist/plugins/chat-models/index.mjs +469 -6
  35. package/dist/plugins/chat-models/index.mjs.map +1 -1
  36. package/dist/{registry-ChOjjdEC.mjs → registry-CHJcTN2W.mjs} +75 -16
  37. package/dist/registry-CHJcTN2W.mjs.map +1 -0
  38. package/dist/{runner-4ZsOveoY.mjs → runner-Dpy-eivM.mjs} +177 -21
  39. package/dist/runner-Dpy-eivM.mjs.map +1 -0
  40. package/dist/testing/expect-extensions.d.mts +44 -38
  41. package/dist/testing/expect-extensions.mjs +1 -1
  42. package/package.json +11 -4
  43. package/dist/expect-0jPJ7Zio.d.mts +0 -2318
  44. package/dist/expect-extensions-CwPtgTz8.mjs +0 -13471
  45. package/dist/expect-extensions-CwPtgTz8.mjs.map +0 -1
  46. package/dist/expect-i9WZWGrA.mjs.map +0 -1
  47. package/dist/magic-string.es-CH1jwzMg.mjs +0 -1013
  48. package/dist/magic-string.es-CH1jwzMg.mjs.map +0 -1
  49. package/dist/plugin-DVaRZY2x.d.mts +0 -84
  50. package/dist/registry-ChOjjdEC.mjs.map +0 -1
  51. package/dist/runner-4ZsOveoY.mjs.map +0 -1
@@ -1,5 +1,110 @@
1
- import { n as ModelDefinition } from "./plugin-DVaRZY2x.mjs";
1
+ import { ReadStream, WriteStream } from "node:fs";
2
+ import { Buffer } from "node:buffer";
2
3
 
4
+ //#region src/core/cache/types.d.ts
5
+ /**
6
+ * Cache entry options used to derive one deterministic cache file path.
7
+ */
8
+ interface CacheFileOptions {
9
+ /**
10
+ * Optional file extension for the cache artifact (for example: `json`, `txt`, `wav`).
11
+ */
12
+ ext?: string;
13
+ /**
14
+ * Deterministic key segments used to build the relative cache path.
15
+ */
16
+ key: readonly string[];
17
+ /**
18
+ * Optional media type hint used by adapters when extension is omitted.
19
+ */
20
+ mediaType?: string;
21
+ }
22
+ /**
23
+ * One cache file handle exposed to task code.
24
+ *
25
+ * Use when:
26
+ * - benchmark setup needs deterministic artifact storage
27
+ * - task runtime needs typed file helpers for text/json/binary payloads
28
+ *
29
+ * Expects:
30
+ * - `path` to be stable for the same namespace + key
31
+ * - read helpers to throw when the file does not exist or payload is invalid
32
+ *
33
+ * Returns:
34
+ * - read/write helpers over one deterministic cache artifact path
35
+ */
36
+ interface CacheFileHandle {
37
+ path: string;
38
+ exists: () => Promise<boolean>;
39
+ openReadStream: () => ReadStream;
40
+ openWriteStream: () => Promise<WriteStream>;
41
+ readBuffer: () => Promise<Buffer>;
42
+ writeBuffer: (value: Buffer) => Promise<void>;
43
+ readText: (encoding?: BufferEncoding) => Promise<string>;
44
+ writeText: (value: string, encoding?: BufferEncoding) => Promise<void>;
45
+ readJson: <T>() => Promise<T>;
46
+ writeJson: (value: unknown) => Promise<void>;
47
+ loadAsCasesInput: <T>() => Promise<T[]>;
48
+ loadAsExpectFixture: <T>() => Promise<T>;
49
+ }
50
+ /**
51
+ * Namespaced cache accessor for deterministic cache artifacts.
52
+ */
53
+ interface CacheNamespace {
54
+ file: (options: CacheFileOptions) => CacheFileHandle;
55
+ }
56
+ /**
57
+ * Task-scoped cache runtime injected into `TaskRunContext`.
58
+ */
59
+ interface TaskCacheRuntime {
60
+ namespace: (name: string) => CacheNamespace;
61
+ }
62
+ //#endregion
63
+ //#region src/core/cache/filesystem.d.ts
64
+ /**
65
+ * Options for creating the filesystem-backed task cache runtime.
66
+ */
67
+ interface CreateFilesystemTaskCacheRuntimeOptions {
68
+ /**
69
+ * Absolute cache root directory.
70
+ */
71
+ cacheRootDirectory: string;
72
+ /**
73
+ * Project identifier under one workspace cache scope.
74
+ */
75
+ projectName: string;
76
+ /**
77
+ * Workspace identifier used to share cache roots across projects.
78
+ */
79
+ workspaceId: string;
80
+ }
81
+ /**
82
+ * Normalizes cache file options into deterministic relative path segments.
83
+ *
84
+ * Before:
85
+ * - `{ key: ['cases', 'dataset hash', 'v1'], ext: 'json' }`
86
+ *
87
+ * After:
88
+ * - `['cases', 'dataset-hash', 'v1.json']`
89
+ */
90
+ declare function normalizeCacheFilePathSegments(options: CacheFileOptions): string[];
91
+ /**
92
+ * Creates a deterministic filesystem-backed task cache runtime.
93
+ *
94
+ * Use when:
95
+ * - eval tasks need reproducible cache paths for expensive pre-processing outputs
96
+ * - benchmark adapters need one artifact-oriented API for text/json/binary reads and writes
97
+ *
98
+ * Expects:
99
+ * - `cacheRootDirectory` to be writable by the running process
100
+ * - `workspaceId` + `projectName` to stay stable for reproducible paths
101
+ *
102
+ * Returns:
103
+ * - task cache runtime that resolves namespaced file handles under:
104
+ * `<cacheRootDirectory>/<workspaceId>/<projectName>/<namespace>/...`
105
+ */
106
+ declare function createFilesystemTaskCacheRuntime(options: CreateFilesystemTaskCacheRuntimeOptions): TaskCacheRuntime;
107
+ //#endregion
3
108
  //#region src/core/runner/schedule.d.ts
4
109
  /**
5
110
  * Describes the inferenceExecutor target for a scheduled eval run.
@@ -333,6 +438,58 @@ declare function asProjectRelativePath(filePath: string, context: RunnerRuntimeC
333
438
  */
334
439
  declare function collectEvalEntries(modules: EvalModuleMap, context: RunnerRuntimeContext): CollectedEvalEntry[];
335
440
  //#endregion
441
+ //#region src/config/models.d.ts
442
+ /**
443
+ * Canonical model definition consumed by vieval runtime and config.
444
+ *
445
+ * Use when:
446
+ * - declaring models in `vieval.config.*`
447
+ * - resolving task runtime models by id, alias, or concrete model name
448
+ *
449
+ * Expects:
450
+ * - `id` to be stable and unique within one config
451
+ * - `inferenceExecutorId` to match scheduler/executor identifiers
452
+ *
453
+ * Returns:
454
+ * - one normalized model registration record
455
+ */
456
+ interface ModelDefinition {
457
+ /**
458
+ * Stable model id.
459
+ */
460
+ id: string;
461
+ /**
462
+ * Inference-executor id used for matching and reporting.
463
+ */
464
+ inferenceExecutorId: string;
465
+ /**
466
+ * Executor reference passed through config.
467
+ *
468
+ * `vieval` core treats this as opaque runtime metadata. Builder plugins can
469
+ * narrow this field with plugin-specific executor input types.
470
+ */
471
+ inferenceExecutor: unknown;
472
+ /**
473
+ * Concrete model name passed to the inference executor.
474
+ */
475
+ model: string;
476
+ /**
477
+ * Alias names that can resolve this model.
478
+ */
479
+ aliases: string[];
480
+ /**
481
+ * Optional model-level call parameters.
482
+ */
483
+ parameters?: Record<string, unknown>;
484
+ }
485
+ /**
486
+ * Resolves one model by id, model name, or alias in registration order.
487
+ *
488
+ * Returns:
489
+ * - the first matching model, or `undefined` when no match exists
490
+ */
491
+ declare function resolveModelByName(models: readonly ModelDefinition[], name: string): ModelDefinition | undefined;
492
+ //#endregion
336
493
  //#region src/core/runner/task-context.d.ts
337
494
  /**
338
495
  * Options for selecting a model from the execution context.
@@ -347,6 +504,10 @@ interface TaskModelSelectionOptions {
347
504
  * Task-scoped execution context exposed to runner executors.
348
505
  */
349
506
  interface TaskExecutionContext {
507
+ /**
508
+ * Deterministic cache runtime scoped to the current task project.
509
+ */
510
+ cache: TaskCacheRuntime;
350
511
  /**
351
512
  * Resolves model configuration for the current task.
352
513
  *
@@ -360,6 +521,7 @@ interface TaskExecutionContext {
360
521
  * Inputs used to build task execution context.
361
522
  */
362
523
  interface CreateTaskExecutionContextOptions {
524
+ cache?: TaskCacheRuntime;
363
525
  models: readonly ModelDefinition[];
364
526
  task: ScheduledTask;
365
527
  }
@@ -439,6 +601,12 @@ interface RunScheduledTasksOptions {
439
601
  * - failed-task observers do not override the executor error for the task
440
602
  */
441
603
  onTaskEnd?: (task: ScheduledTask, state: RunnerTaskState) => void;
604
+ /**
605
+ * Maximum number of tasks to execute concurrently.
606
+ *
607
+ * @default 1
608
+ */
609
+ maxConcurrency?: number;
442
610
  }
443
611
  /**
444
612
  * Error thrown when a scheduled run fails before producing a normalized result.
@@ -694,6 +862,14 @@ interface TaskRunOutput {
694
862
  * Runtime context passed into eval task `run`.
695
863
  */
696
864
  interface TaskRunContext {
865
+ /**
866
+ * Task-scoped cache runtime.
867
+ *
868
+ * Use when:
869
+ * - benchmark setup needs deterministic artifact reuse across attempts
870
+ * - case-level logic needs typed text/json/binary cache loaders
871
+ */
872
+ cache: TaskExecutionContext['cache'];
697
873
  /**
698
874
  * Scheduled runner task metadata.
699
875
  *
@@ -810,6 +986,10 @@ interface TaskCaseReporterEndPayload extends TaskCaseReporterPayload {
810
986
  * Final case state.
811
987
  */
812
988
  state: TaskCaseState;
989
+ /**
990
+ * Optional failure message when `state` is `failed`.
991
+ */
992
+ errorMessage?: string;
813
993
  }
814
994
  /**
815
995
  * Reporter hooks invoked around each task case execution.
@@ -830,6 +1010,39 @@ interface TaskReporterHooks {
830
1010
  * Runs after a case settles.
831
1011
  */
832
1012
  onCaseEnd?: (payload: TaskCaseReporterEndPayload) => void;
1013
+ /**
1014
+ * Runs when task code emits a custom telemetry/reporting event.
1015
+ *
1016
+ * Use when:
1017
+ * - eval implementations need report artifacts beyond case lifecycle counters
1018
+ * - model/runtime integrations emit inference, metering, or tool-call events
1019
+ */
1020
+ onEvent?: (payload: TaskReporterEventPayload) => void;
1021
+ }
1022
+ /**
1023
+ * Payload emitted by task code for custom report events.
1024
+ *
1025
+ * Use when:
1026
+ * - reporting runtime telemetry such as inference requests, responses, or tool calls
1027
+ * - attaching modality-specific metrics without coupling task logic to CLI internals
1028
+ *
1029
+ * Expects:
1030
+ * - `event` to be a stable event name
1031
+ * - `data` to be JSON-serializable for report artifact persistence
1032
+ */
1033
+ interface TaskReporterEventPayload {
1034
+ /**
1035
+ * Event name written into report event envelopes.
1036
+ */
1037
+ event: string;
1038
+ /**
1039
+ * Optional custom payload persisted under event `data`.
1040
+ */
1041
+ data?: unknown;
1042
+ /**
1043
+ * Optional stable case id when the event maps to one case lifecycle.
1044
+ */
1045
+ caseId?: string;
833
1046
  }
834
1047
  /**
835
1048
  * Eval task definition used by `defineTask`.
@@ -943,5 +1156,35 @@ declare function defineEval<const TDefinition extends EvalDefinition>(definition
943
1156
  */
944
1157
  declare function defineTask<const TDefinition extends TaskDefinition>(definition: TDefinition): TDefinition;
945
1158
  //#endregion
946
- export { asProjectRelativePath as A, RunScoreKind as B, RunnerTaskState as C, TaskExecutionContext as D, CreateTaskExecutionContextOptions as E, AggregatedProviderSummary as F, RunnerMatrixInput as G, CreateRunnerScheduleOptions as H, AggregatedRunResults as I, ScheduledTaskMatrix as J, RunnerMatrixSelection as K, AggregatedRunSummary as L, CreateVievalRunnerRuntimeContextOptions as M, RunnerRuntimeContext as N, TaskModelSelectionOptions as O, createRunnerRuntimeContext as P, RunResult as R, RunnerExecutionError as S, runScheduledTasks as T, InferenceExecutor as U, aggregateRunResults as V, RunnerMatrixDefinition as W, createRunnerSchedule as X, ScheduledTaskMatrixMeta as Y, TaskDefinition as _, EvalModule as a, TaskRunOutput as b, MatrixDefinition as c, MatrixRow as d, MatrixValue as f, TaskCaseState as g, TaskCaseReporterPayload as h, EvalDefinition as i, collectEvalEntries as j, createTaskExecutionContext as k, MatrixLayer as l, TaskCaseReporterEndPayload as m, defineTask as n, EvalModuleMap as o, ScopedMatrices as p, ScheduledTask as q, CollectedEvalEntry as r, MatrixAxisValues as s, defineEval as t, MatrixPrimitive as u, TaskReporterHooks as v, ScheduledTaskExecutor as w, RunScheduledTasksOptions as x, TaskRunContext as y, RunScore as z };
947
- //# sourceMappingURL=index-DP7jsORl.d.mts.map
1159
+ //#region src/config/plugin.d.ts
1160
+ /**
1161
+ * Generic plugin contract for vieval config lifecycle hooks.
1162
+ *
1163
+ * Use when:
1164
+ * - a plugin needs to transform config before CLI normalization
1165
+ * - a plugin needs a final resolved-config callback
1166
+ *
1167
+ * Expects:
1168
+ * - `name` to be stable for diagnostics
1169
+ * - hooks to return either a full config object or `void`
1170
+ *
1171
+ * Returns:
1172
+ * - a typed plugin shape bound to one config object
1173
+ */
1174
+ interface ConfigHookPlugin<TConfig> {
1175
+ /**
1176
+ * Stable plugin name for diagnostics.
1177
+ */
1178
+ name: string;
1179
+ /**
1180
+ * Optional config transform hook.
1181
+ */
1182
+ configVieval?: (config: TConfig) => TConfig | void | Promise<TConfig | void>;
1183
+ /**
1184
+ * Optional hook after config is finalized.
1185
+ */
1186
+ configVievalResolved?: (config: TConfig) => void | Promise<void>;
1187
+ }
1188
+ //#endregion
1189
+ export { ScheduledTaskMatrixMeta as $, TaskModelSelectionOptions as A, AggregatedRunResults as B, RunScheduledTasksOptions as C, runScheduledTasks as D, ScheduledTaskExecutor as E, collectEvalEntries as F, aggregateRunResults as G, RunResult as H, CreateVievalRunnerRuntimeContextOptions as I, RunnerMatrixDefinition as J, CreateRunnerScheduleOptions as K, RunnerRuntimeContext as L, ModelDefinition as M, resolveModelByName as N, CreateTaskExecutionContextOptions as O, asProjectRelativePath as P, ScheduledTaskMatrix as Q, createRunnerRuntimeContext as R, TaskRunOutput as S, RunnerTaskState as T, RunScore as U, AggregatedRunSummary as V, RunScoreKind as W, RunnerMatrixSelection as X, RunnerMatrixInput as Y, ScheduledTask as Z, TaskCaseState as _, EvalDefinition as a, CacheFileOptions as at, TaskReporterHooks as b, MatrixAxisValues as c, MatrixPrimitive as d, createRunnerSchedule as et, MatrixRow as f, TaskCaseReporterPayload as g, TaskCaseReporterEndPayload as h, CollectedEvalEntry as i, CacheFileHandle as it, createTaskExecutionContext as j, TaskExecutionContext as k, MatrixDefinition as l, ScopedMatrices as m, defineEval as n, createFilesystemTaskCacheRuntime as nt, EvalModule as o, CacheNamespace as ot, MatrixValue as p, InferenceExecutor as q, defineTask as r, normalizeCacheFilePathSegments as rt, EvalModuleMap as s, TaskCacheRuntime as st, ConfigHookPlugin as t, CreateFilesystemTaskCacheRuntimeOptions as tt, MatrixLayer as u, TaskDefinition as v, RunnerExecutionError as w, TaskRunContext as x, TaskReporterEventPayload as y, AggregatedProviderSummary as z };
1190
+ //# sourceMappingURL=index-BDMEAmf2.d.mts.map
@@ -1,4 +1,4 @@
1
- import { B as RunScoreKind, z as RunScore } from "./index-DP7jsORl.mjs";
1
+ import { U as RunScore, W as RunScoreKind } from "./index-BDMEAmf2.mjs";
2
2
 
3
3
  //#region src/core/assertions/index.d.ts
4
4
  /**
@@ -311,4 +311,4 @@ declare function toRunScores(outcomes: readonly AssertionOutcome[]): RunScore[];
311
311
  declare function collectFailedAssertions(outcomes: readonly AssertionOutcome[]): AssertionOutcome[];
312
312
  //#endregion
313
313
  export { expectToolCallArgs as C, expectStructuredOutput as S, toRunScores as T, expectMustExclude as _, CustomAssertionOptions as a, expectRegex as b, RegexAssertionOptions as c, StructuredOutputAssertionOptions as d, ToolCall as f, expectCustom as g, evaluateAssertions as h, AssertionState as i, RubricAssertionOptions as l, collectFailedAssertions as m, AssertionContext as n, MustExcludeAssertionOptions as o, ToolCallArgsAssertionOptions as p, AssertionOutcome as r, MustIncludeAssertionOptions as s, Assertion as t, RubricJudgeResult as u, expectMustInclude as v, normalizeMatchText as w, expectRubric as x, expectNot as y };
314
- //# sourceMappingURL=index-oSXhM1zx.d.mts.map
314
+ //# sourceMappingURL=index-C3gPFmcR.d.mts.map
package/dist/index.d.mts CHANGED
@@ -1,11 +1,302 @@
1
- import { b as TaskRunOutput, y as TaskRunContext } from "./index-DP7jsORl.mjs";
2
- import { t as expect } from "./expect-0jPJ7Zio.mjs";
1
+ import { H as RunResult, M as ModelDefinition, S as TaskRunOutput, W as RunScoreKind, Z as ScheduledTask, k as TaskExecutionContext, l as MatrixDefinition, q as InferenceExecutor, t as ConfigHookPlugin, u as MatrixLayer, x as TaskRunContext } from "./index-BDMEAmf2.mjs";
2
+ import { a as requiredEnvFrom } from "./env-DtpjACOW.mjs";
3
+ import { t as expect } from "./expect-B2vaoRVZ.mjs";
4
+ import * as _$c12 from "c12";
3
5
 
6
+ //#region src/cli/reporters/vitest-compat-reporter.d.ts
7
+ type Awaitable<T> = T | Promise<T>;
8
+ /**
9
+ * Normalized module-like entity delivered to vitest-compatible reporter hooks.
10
+ */
11
+ interface VievalVitestCompatModule {
12
+ id: string;
13
+ name: string;
14
+ projectName: string;
15
+ }
16
+ /**
17
+ * Normalized test-case-like entity delivered to vitest-compatible reporter hooks.
18
+ */
19
+ interface VievalVitestCompatCase {
20
+ id: string;
21
+ name: string;
22
+ module: VievalVitestCompatModule;
23
+ state: 'failed' | 'passed' | 'pending' | 'skipped';
24
+ }
25
+ /**
26
+ * Supported vitest-style reporter lifecycle hooks.
27
+ *
28
+ * Use when:
29
+ * - external reporter modules should observe vieval task/case lifecycle events
30
+ * - the project wants a familiar Vitest reporter callback model
31
+ *
32
+ * Expects:
33
+ * - hook handlers to be best-effort observers only
34
+ * - thrown errors are ignored to avoid interrupting eval execution
35
+ */
36
+ interface VievalVitestCompatReporter {
37
+ onTestCaseReady?: (testCase: VievalVitestCompatCase) => Awaitable<void>;
38
+ onTestCaseResult?: (testCase: VievalVitestCompatCase) => Awaitable<void>;
39
+ onTestModuleCollected?: (module: VievalVitestCompatModule) => Awaitable<void>;
40
+ onTestModuleEnd?: (module: VievalVitestCompatModule) => Awaitable<void>;
41
+ onTestModuleQueued?: (module: VievalVitestCompatModule) => Awaitable<void>;
42
+ onTestModuleStart?: (module: VievalVitestCompatModule) => Awaitable<void>;
43
+ onTestRunEnd?: (modules: readonly VievalVitestCompatModule[], errors: readonly {
44
+ message: string;
45
+ }[], state: 'failed' | 'passed') => Awaitable<void>;
46
+ onTestRunStart?: (specifications: readonly {
47
+ moduleId: string;
48
+ projectName: string;
49
+ }[]) => Awaitable<void>;
50
+ }
51
+ /**
52
+ * Supported project reporter references.
53
+ *
54
+ * - String: module path or package name, default export used.
55
+ * - Reporter object: inline hook object (Vitest-style inline reporter).
56
+ * - Tuple: [string or reporter object, constructor options].
57
+ *
58
+ * Source permalink:
59
+ * `https://github.com/vitest-dev/vitest/blob/b865b4d83d1e7874607ba1b2d84b9e2d135ecd33/packages/vitest/src/node/config/resolveConfig.ts#L674-L713`
60
+ */
61
+ type VievalVitestCompatReporterValue = string | VievalVitestCompatReporter;
62
+ type VievalVitestCompatReporterReference = VievalVitestCompatReporterValue | readonly [VievalVitestCompatReporterValue, unknown?];
63
+ //#endregion
4
64
  //#region src/cli/config.d.ts
65
+ /**
66
+ * CLI plugin shape bound to the full CLI config object.
67
+ */
68
+ type CliConfigPlugin = ConfigHookPlugin<CliConfig>;
69
+ /**
70
+ * Defines one project block for `vieval run`.
71
+ */
72
+ interface CliProjectConfig {
73
+ /**
74
+ * Project label used in summary output.
75
+ */
76
+ name: string;
77
+ /**
78
+ * Project root used for include/exclude glob matching.
79
+ *
80
+ * @default process cwd
81
+ */
82
+ root?: string;
83
+ /**
84
+ * Glob patterns for eval file discovery.
85
+ *
86
+ * @default Common eval file globs for TypeScript and JavaScript module formats.
87
+ */
88
+ include?: string[];
89
+ /**
90
+ * Glob patterns excluded from discovery.
91
+ *
92
+ * @default Common exclusion globs for dependencies, build output, and VCS directories.
93
+ */
94
+ exclude?: string[];
95
+ /**
96
+ * Providers expanded by scheduler.
97
+ *
98
+ * @default [{ id: 'default' }]
99
+ */
100
+ inferenceExecutors?: InferenceExecutor[];
101
+ /**
102
+ * Model definitions available to project runtime execution.
103
+ *
104
+ * Inference executors control schedule fan-out, while models provide
105
+ * runtime lookup metadata for `context.model(...)` during task execution.
106
+ *
107
+ * @default inherited from top-level config models
108
+ */
109
+ models?: ModelDefinition[];
110
+ /**
111
+ * Optional run-time matrix dimensions.
112
+ */
113
+ runMatrix?: MatrixDefinition | MatrixLayer;
114
+ /**
115
+ * Optional eval-time matrix dimensions.
116
+ */
117
+ evalMatrix?: MatrixDefinition | MatrixLayer;
118
+ /**
119
+ * Optional task executor.
120
+ *
121
+ * Use when this project should execute live inferenceExecutor requests.
122
+ * If omitted, `vieval run` performs collection + scheduling only.
123
+ */
124
+ executor?: (task: ScheduledTask, context: CliProjectExecutorContext) => Promise<RunResult>;
125
+ /**
126
+ * Optional project-local plugins.
127
+ */
128
+ plugins?: CliConfigPlugin[];
129
+ /**
130
+ * Optional vitest-compatible reporter modules.
131
+ *
132
+ * Use when:
133
+ * - project runs should emit additional reporter callbacks using Vitest-style lifecycle names
134
+ *
135
+ * @default []
136
+ */
137
+ reporters?: VievalVitestCompatReporterReference[];
138
+ }
139
+ /**
140
+ * One workspace descriptor for workspace-mode configs.
141
+ */
142
+ interface CliWorkspaceConfig {
143
+ /**
144
+ * Workspace identifier.
145
+ */
146
+ id: string;
147
+ /**
148
+ * Workspace root path.
149
+ */
150
+ root: string;
151
+ }
152
+ /**
153
+ * One explicit comparison method descriptor.
154
+ */
155
+ interface CliComparisonMethodConfig {
156
+ /**
157
+ * Method identifier shown in compare reports.
158
+ */
159
+ id: string;
160
+ /**
161
+ * Workspace path containing this method's `vieval.config.*`.
162
+ */
163
+ workspace: string;
164
+ /**
165
+ * Project name to execute inside workspace config.
166
+ */
167
+ project: string;
168
+ /**
169
+ * Optional explicit config file path for this workspace.
170
+ */
171
+ configFilePath?: string;
172
+ }
173
+ /**
174
+ * Benchmark identity and shared cache namespace.
175
+ */
176
+ interface CliComparisonBenchmarkConfig {
177
+ /**
178
+ * Benchmark identifier used in report artifacts.
179
+ */
180
+ id: string;
181
+ /**
182
+ * Shared cache namespace reused across method runs.
183
+ */
184
+ sharedCaseNamespace: string;
185
+ }
186
+ /**
187
+ * One comparison entry loaded by `vieval compare`.
188
+ */
189
+ interface CliComparisonConfig {
190
+ /**
191
+ * Comparison id selected by `--comparison`.
192
+ */
193
+ id: string;
194
+ /**
195
+ * Benchmark metadata for reporting and shared cache coordination.
196
+ */
197
+ benchmark: CliComparisonBenchmarkConfig;
198
+ /**
199
+ * Optional explicit method list.
200
+ */
201
+ methods?: CliComparisonMethodConfig[];
202
+ /**
203
+ * Optional workspace glob(s) discovered relative to config directory.
204
+ */
205
+ includesWorkspaces?: string | string[];
206
+ /**
207
+ * Optional workspace exclude glob(s), also relative to config directory.
208
+ */
209
+ excludesWorkspaces?: string | string[];
210
+ }
211
+ /**
212
+ * Execution context exposed to project-level `executor` implementations.
213
+ *
214
+ * Use when:
215
+ * - a project executor needs the task-scoped model resolver plus case reporter hooks
216
+ * - custom scheduling logic wants the same hook shape as `TaskRunContext`
217
+ *
218
+ * Expects:
219
+ * - `model` resolves configured models for the current task
220
+ * - `reporterHooks` follows `TaskRunContext['reporterHooks']`
221
+ */
222
+ interface CliProjectExecutorContext extends TaskExecutionContext {
223
+ reporterHooks?: TaskRunContext['reporterHooks'];
224
+ }
225
+ /**
226
+ * Top-level CLI config loaded from `vieval.config.*`.
227
+ */
228
+ interface CliConfigBase {
229
+ /**
230
+ * Global model definitions inherited by projects.
231
+ *
232
+ * @default []
233
+ */
234
+ models?: ModelDefinition[];
235
+ /**
236
+ * Global config plugins.
237
+ *
238
+ * @default []
239
+ */
240
+ plugins?: CliConfigPlugin[];
241
+ /**
242
+ * Global vitest-compatible reporter modules inherited by projects.
243
+ *
244
+ * @default []
245
+ */
246
+ reporters?: VievalVitestCompatReporterReference[];
247
+ /**
248
+ * Environment variables injected into `process.env` during `vieval run`.
249
+ *
250
+ * Use when:
251
+ * - eval tasks depend on runtime env values (for example inferenceExecutor API keys)
252
+ * - config wants deterministic env values without shell-level exports
253
+ *
254
+ * @default {}
255
+ */
256
+ env?: NodeJS.ProcessEnv;
257
+ }
258
+ /**
259
+ * Project mode config for `vieval run`.
260
+ */
261
+ interface CliProjectModeConfig extends CliConfigBase {
262
+ /**
263
+ * Project list expanded by `vieval run`.
264
+ *
265
+ * @default [{ name: 'default' }]
266
+ */
267
+ projects?: CliProjectConfig[];
268
+ comparisons?: never;
269
+ workspaces?: never;
270
+ }
271
+ /**
272
+ * Workspace mode config placeholder for future workspace orchestration.
273
+ */
274
+ interface CliWorkspaceModeConfig extends CliConfigBase {
275
+ workspaces: CliWorkspaceConfig[];
276
+ projects?: never;
277
+ comparisons?: never;
278
+ }
279
+ /**
280
+ * Comparison mode config for `vieval compare`.
281
+ */
282
+ interface CliComparisonModeConfig extends CliConfigBase {
283
+ comparisons: CliComparisonConfig[];
284
+ projects?: never;
285
+ workspaces?: never;
286
+ }
287
+ /**
288
+ * Top-level CLI config loaded from `vieval.config.*`.
289
+ *
290
+ * Exactly one top-level mode is allowed:
291
+ * - `projects`
292
+ * - `workspaces`
293
+ * - `comparisons`
294
+ */
295
+ type CliConfig = CliProjectModeConfig | CliWorkspaceModeConfig | CliComparisonModeConfig;
5
296
  /**
6
297
  * Helper used by `vieval.config.*` for better type inference.
7
298
  */
8
- declare const defineConfig: any;
299
+ declare const defineConfig: _$c12.DefineConfig<CliConfig, _$c12.ConfigLayerMeta>;
9
300
  /**
10
301
  * Loads `.env*` files using Vite's env resolution behavior.
11
302
  *
@@ -33,6 +324,27 @@ interface CaseRunContext<TInput> extends TaskRunContext {
33
324
  matrix: TaskRunContext['task']['matrix'] & {
34
325
  inputs: TInput;
35
326
  };
327
+ /**
328
+ * Overrides one case score family with a custom normalized value.
329
+ *
330
+ * Use when:
331
+ * - one case computes a benchmark-native score that should flow into run aggregation
332
+ *
333
+ * Expects:
334
+ * - `score` to stay in the `0..1` range
335
+ */
336
+ score: (score: number, kind?: RunScoreKind) => void;
337
+ /**
338
+ * Emits one custom case metric into report events.
339
+ *
340
+ * Use when:
341
+ * - tasks need structured benchmark metadata beyond exact/judge score families
342
+ *
343
+ * Expects:
344
+ * - `name` to be a stable metric identifier
345
+ * - `value` to be JSON-serializable
346
+ */
347
+ metric: (name: string, value: boolean | number | string | null) => void;
36
348
  }
37
349
  /**
38
350
  * Callback for one task case.
@@ -45,7 +357,12 @@ interface DescribeTaskBuilder {
45
357
  /**
46
358
  * Registers one explicit case.
47
359
  */
48
- caseOf: <TInput>(name: string, run: CaseRunner<TInput>, input: TInput) => void;
360
+ caseOf: {
361
+ (name: string, run: CaseRunner<undefined>): void;
362
+ <TInput>(name: string, run: CaseRunner<TInput>, options: {
363
+ input: TInput;
364
+ }): void;
365
+ };
49
366
  /**
50
367
  * Registers multiple cases from input list.
51
368
  */
@@ -63,7 +380,10 @@ interface DescribeTaskOptions {
63
380
  /**
64
381
  * Registers one case in the currently active task scope.
65
382
  */
66
- declare function caseOf<TInput>(name: string, run: CaseRunner<TInput>, input: TInput): void;
383
+ declare function caseOf(name: string, run: CaseRunner<undefined>): void;
384
+ declare function caseOf<TInput>(name: string, run: CaseRunner<TInput>, options: {
385
+ input: TInput;
386
+ }): void;
67
387
  /**
68
388
  * Registers multiple cases in the currently active task scope.
69
389
  */
@@ -88,5 +408,5 @@ declare function describeTask(name: string, build: ((builder: DescribeTaskBuilde
88
408
  */
89
409
  declare const describeEval: typeof describeTask;
90
410
  //#endregion
91
- export { caseOf, casesFromInputs, defineConfig, describeEval, describeTask, expect, loadEnv };
411
+ export { caseOf, casesFromInputs, defineConfig, describeEval, describeTask, expect, loadEnv, requiredEnvFrom };
92
412
  //# sourceMappingURL=index.d.mts.map