vieval 0.0.1 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -5
- package/dist/cli/index.d.mts +1 -1
- package/dist/cli/index.mjs +1204 -61
- package/dist/cli/index.mjs.map +1 -1
- package/dist/{config-D2fe1SnT.mjs → config-CHN24egi.mjs} +1 -1
- package/dist/{config-D2fe1SnT.mjs.map → config-CHN24egi.mjs.map} +1 -1
- package/dist/config.d.mts +2 -3
- package/dist/config.mjs +2 -2
- package/dist/core/assertions/index.d.mts +1 -1
- package/dist/core/inference-executors/index.d.mts +1 -45
- package/dist/core/inference-executors/index.mjs +1 -38
- package/dist/core/inference-executors/index.mjs.map +1 -1
- package/dist/core/processors/results/index.d.mts +1 -1
- package/dist/core/runner/index.d.mts +2 -2
- package/dist/core/runner/index.mjs +2 -2
- package/dist/env-C7X81PWa.mjs +41 -0
- package/dist/env-C7X81PWa.mjs.map +1 -0
- package/dist/env-DtpjACOW.d.mts +47 -0
- package/dist/expect-B2vaoRVZ.d.mts +10 -0
- package/dist/{expect-i9WZWGrA.mjs → expect-CaXiUkwY.mjs} +3 -3
- package/dist/expect-CaXiUkwY.mjs.map +1 -0
- package/dist/expect-extensions-BOzwV5EJ.mjs +197 -0
- package/dist/expect-extensions-BOzwV5EJ.mjs.map +1 -0
- package/dist/expect.d.mts +1 -1
- package/dist/expect.mjs +1 -1
- package/dist/{index-DP7jsORl.d.mts → index-BDMEAmf2.d.mts} +246 -3
- package/dist/{index-oSXhM1zx.d.mts → index-C3gPFmcR.d.mts} +2 -2
- package/dist/index.d.mts +326 -6
- package/dist/index.mjs +65 -23
- package/dist/index.mjs.map +1 -1
- package/dist/{models-D_MsBtYw.mjs → models-DIGdOUpJ.mjs} +1 -1
- package/dist/{models-D_MsBtYw.mjs.map → models-DIGdOUpJ.mjs.map} +1 -1
- package/dist/plugins/chat-models/index.d.mts +465 -6
- package/dist/plugins/chat-models/index.mjs +469 -6
- package/dist/plugins/chat-models/index.mjs.map +1 -1
- package/dist/{registry-ChOjjdEC.mjs → registry-CHJcTN2W.mjs} +75 -16
- package/dist/registry-CHJcTN2W.mjs.map +1 -0
- package/dist/{runner-4ZsOveoY.mjs → runner-Dpy-eivM.mjs} +177 -21
- package/dist/runner-Dpy-eivM.mjs.map +1 -0
- package/dist/testing/expect-extensions.d.mts +44 -38
- package/dist/testing/expect-extensions.mjs +1 -1
- package/package.json +11 -4
- package/dist/expect-0jPJ7Zio.d.mts +0 -2318
- package/dist/expect-extensions-CwPtgTz8.mjs +0 -13471
- package/dist/expect-extensions-CwPtgTz8.mjs.map +0 -1
- package/dist/expect-i9WZWGrA.mjs.map +0 -1
- package/dist/magic-string.es-CH1jwzMg.mjs +0 -1013
- package/dist/magic-string.es-CH1jwzMg.mjs.map +0 -1
- package/dist/plugin-DVaRZY2x.d.mts +0 -84
- package/dist/registry-ChOjjdEC.mjs.map +0 -1
- package/dist/runner-4ZsOveoY.mjs.map +0 -1
|
@@ -1,5 +1,110 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { ReadStream, WriteStream } from "node:fs";
|
|
2
|
+
import { Buffer } from "node:buffer";
|
|
2
3
|
|
|
4
|
+
//#region src/core/cache/types.d.ts
|
|
5
|
+
/**
|
|
6
|
+
* Cache entry options used to derive one deterministic cache file path.
|
|
7
|
+
*/
|
|
8
|
+
interface CacheFileOptions {
|
|
9
|
+
/**
|
|
10
|
+
* Optional file extension for the cache artifact (for example: `json`, `txt`, `wav`).
|
|
11
|
+
*/
|
|
12
|
+
ext?: string;
|
|
13
|
+
/**
|
|
14
|
+
* Deterministic key segments used to build the relative cache path.
|
|
15
|
+
*/
|
|
16
|
+
key: readonly string[];
|
|
17
|
+
/**
|
|
18
|
+
* Optional media type hint used by adapters when extension is omitted.
|
|
19
|
+
*/
|
|
20
|
+
mediaType?: string;
|
|
21
|
+
}
|
|
22
|
+
/**
|
|
23
|
+
* One cache file handle exposed to task code.
|
|
24
|
+
*
|
|
25
|
+
* Use when:
|
|
26
|
+
* - benchmark setup needs deterministic artifact storage
|
|
27
|
+
* - task runtime needs typed file helpers for text/json/binary payloads
|
|
28
|
+
*
|
|
29
|
+
* Expects:
|
|
30
|
+
* - `path` to be stable for the same namespace + key
|
|
31
|
+
* - read helpers to throw when the file does not exist or payload is invalid
|
|
32
|
+
*
|
|
33
|
+
* Returns:
|
|
34
|
+
* - read/write helpers over one deterministic cache artifact path
|
|
35
|
+
*/
|
|
36
|
+
interface CacheFileHandle {
|
|
37
|
+
path: string;
|
|
38
|
+
exists: () => Promise<boolean>;
|
|
39
|
+
openReadStream: () => ReadStream;
|
|
40
|
+
openWriteStream: () => Promise<WriteStream>;
|
|
41
|
+
readBuffer: () => Promise<Buffer>;
|
|
42
|
+
writeBuffer: (value: Buffer) => Promise<void>;
|
|
43
|
+
readText: (encoding?: BufferEncoding) => Promise<string>;
|
|
44
|
+
writeText: (value: string, encoding?: BufferEncoding) => Promise<void>;
|
|
45
|
+
readJson: <T>() => Promise<T>;
|
|
46
|
+
writeJson: (value: unknown) => Promise<void>;
|
|
47
|
+
loadAsCasesInput: <T>() => Promise<T[]>;
|
|
48
|
+
loadAsExpectFixture: <T>() => Promise<T>;
|
|
49
|
+
}
|
|
50
|
+
/**
|
|
51
|
+
* Namespaced cache accessor for deterministic cache artifacts.
|
|
52
|
+
*/
|
|
53
|
+
interface CacheNamespace {
|
|
54
|
+
file: (options: CacheFileOptions) => CacheFileHandle;
|
|
55
|
+
}
|
|
56
|
+
/**
|
|
57
|
+
* Task-scoped cache runtime injected into `TaskRunContext`.
|
|
58
|
+
*/
|
|
59
|
+
interface TaskCacheRuntime {
|
|
60
|
+
namespace: (name: string) => CacheNamespace;
|
|
61
|
+
}
|
|
62
|
+
//#endregion
|
|
63
|
+
//#region src/core/cache/filesystem.d.ts
|
|
64
|
+
/**
|
|
65
|
+
* Options for creating the filesystem-backed task cache runtime.
|
|
66
|
+
*/
|
|
67
|
+
interface CreateFilesystemTaskCacheRuntimeOptions {
|
|
68
|
+
/**
|
|
69
|
+
* Absolute cache root directory.
|
|
70
|
+
*/
|
|
71
|
+
cacheRootDirectory: string;
|
|
72
|
+
/**
|
|
73
|
+
* Project identifier under one workspace cache scope.
|
|
74
|
+
*/
|
|
75
|
+
projectName: string;
|
|
76
|
+
/**
|
|
77
|
+
* Workspace identifier used to share cache roots across projects.
|
|
78
|
+
*/
|
|
79
|
+
workspaceId: string;
|
|
80
|
+
}
|
|
81
|
+
/**
|
|
82
|
+
* Normalizes cache file options into deterministic relative path segments.
|
|
83
|
+
*
|
|
84
|
+
* Before:
|
|
85
|
+
* - `{ key: ['cases', 'dataset hash', 'v1'], ext: 'json' }`
|
|
86
|
+
*
|
|
87
|
+
* After:
|
|
88
|
+
* - `['cases', 'dataset-hash', 'v1.json']`
|
|
89
|
+
*/
|
|
90
|
+
declare function normalizeCacheFilePathSegments(options: CacheFileOptions): string[];
|
|
91
|
+
/**
|
|
92
|
+
* Creates a deterministic filesystem-backed task cache runtime.
|
|
93
|
+
*
|
|
94
|
+
* Use when:
|
|
95
|
+
* - eval tasks need reproducible cache paths for expensive pre-processing outputs
|
|
96
|
+
* - benchmark adapters need one artifact-oriented API for text/json/binary reads and writes
|
|
97
|
+
*
|
|
98
|
+
* Expects:
|
|
99
|
+
* - `cacheRootDirectory` to be writable by the running process
|
|
100
|
+
* - `workspaceId` + `projectName` to stay stable for reproducible paths
|
|
101
|
+
*
|
|
102
|
+
* Returns:
|
|
103
|
+
* - task cache runtime that resolves namespaced file handles under:
|
|
104
|
+
* `<cacheRootDirectory>/<workspaceId>/<projectName>/<namespace>/...`
|
|
105
|
+
*/
|
|
106
|
+
declare function createFilesystemTaskCacheRuntime(options: CreateFilesystemTaskCacheRuntimeOptions): TaskCacheRuntime;
|
|
107
|
+
//#endregion
|
|
3
108
|
//#region src/core/runner/schedule.d.ts
|
|
4
109
|
/**
|
|
5
110
|
* Describes the inferenceExecutor target for a scheduled eval run.
|
|
@@ -333,6 +438,58 @@ declare function asProjectRelativePath(filePath: string, context: RunnerRuntimeC
|
|
|
333
438
|
*/
|
|
334
439
|
declare function collectEvalEntries(modules: EvalModuleMap, context: RunnerRuntimeContext): CollectedEvalEntry[];
|
|
335
440
|
//#endregion
|
|
441
|
+
//#region src/config/models.d.ts
|
|
442
|
+
/**
|
|
443
|
+
* Canonical model definition consumed by vieval runtime and config.
|
|
444
|
+
*
|
|
445
|
+
* Use when:
|
|
446
|
+
* - declaring models in `vieval.config.*`
|
|
447
|
+
* - resolving task runtime models by id, alias, or concrete model name
|
|
448
|
+
*
|
|
449
|
+
* Expects:
|
|
450
|
+
* - `id` to be stable and unique within one config
|
|
451
|
+
* - `inferenceExecutorId` to match scheduler/executor identifiers
|
|
452
|
+
*
|
|
453
|
+
* Returns:
|
|
454
|
+
* - one normalized model registration record
|
|
455
|
+
*/
|
|
456
|
+
interface ModelDefinition {
|
|
457
|
+
/**
|
|
458
|
+
* Stable model id.
|
|
459
|
+
*/
|
|
460
|
+
id: string;
|
|
461
|
+
/**
|
|
462
|
+
* Inference-executor id used for matching and reporting.
|
|
463
|
+
*/
|
|
464
|
+
inferenceExecutorId: string;
|
|
465
|
+
/**
|
|
466
|
+
* Executor reference passed through config.
|
|
467
|
+
*
|
|
468
|
+
* `vieval` core treats this as opaque runtime metadata. Builder plugins can
|
|
469
|
+
* narrow this field with plugin-specific executor input types.
|
|
470
|
+
*/
|
|
471
|
+
inferenceExecutor: unknown;
|
|
472
|
+
/**
|
|
473
|
+
* Concrete model name passed to the inference executor.
|
|
474
|
+
*/
|
|
475
|
+
model: string;
|
|
476
|
+
/**
|
|
477
|
+
* Alias names that can resolve this model.
|
|
478
|
+
*/
|
|
479
|
+
aliases: string[];
|
|
480
|
+
/**
|
|
481
|
+
* Optional model-level call parameters.
|
|
482
|
+
*/
|
|
483
|
+
parameters?: Record<string, unknown>;
|
|
484
|
+
}
|
|
485
|
+
/**
|
|
486
|
+
* Resolves one model by id, model name, or alias in registration order.
|
|
487
|
+
*
|
|
488
|
+
* Returns:
|
|
489
|
+
* - the first matching model, or `undefined` when no match exists
|
|
490
|
+
*/
|
|
491
|
+
declare function resolveModelByName(models: readonly ModelDefinition[], name: string): ModelDefinition | undefined;
|
|
492
|
+
//#endregion
|
|
336
493
|
//#region src/core/runner/task-context.d.ts
|
|
337
494
|
/**
|
|
338
495
|
* Options for selecting a model from the execution context.
|
|
@@ -347,6 +504,10 @@ interface TaskModelSelectionOptions {
|
|
|
347
504
|
* Task-scoped execution context exposed to runner executors.
|
|
348
505
|
*/
|
|
349
506
|
interface TaskExecutionContext {
|
|
507
|
+
/**
|
|
508
|
+
* Deterministic cache runtime scoped to the current task project.
|
|
509
|
+
*/
|
|
510
|
+
cache: TaskCacheRuntime;
|
|
350
511
|
/**
|
|
351
512
|
* Resolves model configuration for the current task.
|
|
352
513
|
*
|
|
@@ -360,6 +521,7 @@ interface TaskExecutionContext {
|
|
|
360
521
|
* Inputs used to build task execution context.
|
|
361
522
|
*/
|
|
362
523
|
interface CreateTaskExecutionContextOptions {
|
|
524
|
+
cache?: TaskCacheRuntime;
|
|
363
525
|
models: readonly ModelDefinition[];
|
|
364
526
|
task: ScheduledTask;
|
|
365
527
|
}
|
|
@@ -439,6 +601,12 @@ interface RunScheduledTasksOptions {
|
|
|
439
601
|
* - failed-task observers do not override the executor error for the task
|
|
440
602
|
*/
|
|
441
603
|
onTaskEnd?: (task: ScheduledTask, state: RunnerTaskState) => void;
|
|
604
|
+
/**
|
|
605
|
+
* Maximum number of tasks to execute concurrently.
|
|
606
|
+
*
|
|
607
|
+
* @default 1
|
|
608
|
+
*/
|
|
609
|
+
maxConcurrency?: number;
|
|
442
610
|
}
|
|
443
611
|
/**
|
|
444
612
|
* Error thrown when a scheduled run fails before producing a normalized result.
|
|
@@ -694,6 +862,14 @@ interface TaskRunOutput {
|
|
|
694
862
|
* Runtime context passed into eval task `run`.
|
|
695
863
|
*/
|
|
696
864
|
interface TaskRunContext {
|
|
865
|
+
/**
|
|
866
|
+
* Task-scoped cache runtime.
|
|
867
|
+
*
|
|
868
|
+
* Use when:
|
|
869
|
+
* - benchmark setup needs deterministic artifact reuse across attempts
|
|
870
|
+
* - case-level logic needs typed text/json/binary cache loaders
|
|
871
|
+
*/
|
|
872
|
+
cache: TaskExecutionContext['cache'];
|
|
697
873
|
/**
|
|
698
874
|
* Scheduled runner task metadata.
|
|
699
875
|
*
|
|
@@ -810,6 +986,10 @@ interface TaskCaseReporterEndPayload extends TaskCaseReporterPayload {
|
|
|
810
986
|
* Final case state.
|
|
811
987
|
*/
|
|
812
988
|
state: TaskCaseState;
|
|
989
|
+
/**
|
|
990
|
+
* Optional failure message when `state` is `failed`.
|
|
991
|
+
*/
|
|
992
|
+
errorMessage?: string;
|
|
813
993
|
}
|
|
814
994
|
/**
|
|
815
995
|
* Reporter hooks invoked around each task case execution.
|
|
@@ -830,6 +1010,39 @@ interface TaskReporterHooks {
|
|
|
830
1010
|
* Runs after a case settles.
|
|
831
1011
|
*/
|
|
832
1012
|
onCaseEnd?: (payload: TaskCaseReporterEndPayload) => void;
|
|
1013
|
+
/**
|
|
1014
|
+
* Runs when task code emits a custom telemetry/reporting event.
|
|
1015
|
+
*
|
|
1016
|
+
* Use when:
|
|
1017
|
+
* - eval implementations need report artifacts beyond case lifecycle counters
|
|
1018
|
+
* - model/runtime integrations emit inference, metering, or tool-call events
|
|
1019
|
+
*/
|
|
1020
|
+
onEvent?: (payload: TaskReporterEventPayload) => void;
|
|
1021
|
+
}
|
|
1022
|
+
/**
|
|
1023
|
+
* Payload emitted by task code for custom report events.
|
|
1024
|
+
*
|
|
1025
|
+
* Use when:
|
|
1026
|
+
* - reporting runtime telemetry such as inference requests, responses, or tool calls
|
|
1027
|
+
* - attaching modality-specific metrics without coupling task logic to CLI internals
|
|
1028
|
+
*
|
|
1029
|
+
* Expects:
|
|
1030
|
+
* - `event` to be a stable event name
|
|
1031
|
+
* - `data` to be JSON-serializable for report artifact persistence
|
|
1032
|
+
*/
|
|
1033
|
+
interface TaskReporterEventPayload {
|
|
1034
|
+
/**
|
|
1035
|
+
* Event name written into report event envelopes.
|
|
1036
|
+
*/
|
|
1037
|
+
event: string;
|
|
1038
|
+
/**
|
|
1039
|
+
* Optional custom payload persisted under event `data`.
|
|
1040
|
+
*/
|
|
1041
|
+
data?: unknown;
|
|
1042
|
+
/**
|
|
1043
|
+
* Optional stable case id when the event maps to one case lifecycle.
|
|
1044
|
+
*/
|
|
1045
|
+
caseId?: string;
|
|
833
1046
|
}
|
|
834
1047
|
/**
|
|
835
1048
|
* Eval task definition used by `defineTask`.
|
|
@@ -943,5 +1156,35 @@ declare function defineEval<const TDefinition extends EvalDefinition>(definition
|
|
|
943
1156
|
*/
|
|
944
1157
|
declare function defineTask<const TDefinition extends TaskDefinition>(definition: TDefinition): TDefinition;
|
|
945
1158
|
//#endregion
|
|
946
|
-
|
|
947
|
-
|
|
1159
|
+
//#region src/config/plugin.d.ts
|
|
1160
|
+
/**
|
|
1161
|
+
* Generic plugin contract for vieval config lifecycle hooks.
|
|
1162
|
+
*
|
|
1163
|
+
* Use when:
|
|
1164
|
+
* - a plugin needs to transform config before CLI normalization
|
|
1165
|
+
* - a plugin needs a final resolved-config callback
|
|
1166
|
+
*
|
|
1167
|
+
* Expects:
|
|
1168
|
+
* - `name` to be stable for diagnostics
|
|
1169
|
+
* - hooks to return either a full config object or `void`
|
|
1170
|
+
*
|
|
1171
|
+
* Returns:
|
|
1172
|
+
* - a typed plugin shape bound to one config object
|
|
1173
|
+
*/
|
|
1174
|
+
interface ConfigHookPlugin<TConfig> {
|
|
1175
|
+
/**
|
|
1176
|
+
* Stable plugin name for diagnostics.
|
|
1177
|
+
*/
|
|
1178
|
+
name: string;
|
|
1179
|
+
/**
|
|
1180
|
+
* Optional config transform hook.
|
|
1181
|
+
*/
|
|
1182
|
+
configVieval?: (config: TConfig) => TConfig | void | Promise<TConfig | void>;
|
|
1183
|
+
/**
|
|
1184
|
+
* Optional hook after config is finalized.
|
|
1185
|
+
*/
|
|
1186
|
+
configVievalResolved?: (config: TConfig) => void | Promise<void>;
|
|
1187
|
+
}
|
|
1188
|
+
//#endregion
|
|
1189
|
+
export { ScheduledTaskMatrixMeta as $, TaskModelSelectionOptions as A, AggregatedRunResults as B, RunScheduledTasksOptions as C, runScheduledTasks as D, ScheduledTaskExecutor as E, collectEvalEntries as F, aggregateRunResults as G, RunResult as H, CreateVievalRunnerRuntimeContextOptions as I, RunnerMatrixDefinition as J, CreateRunnerScheduleOptions as K, RunnerRuntimeContext as L, ModelDefinition as M, resolveModelByName as N, CreateTaskExecutionContextOptions as O, asProjectRelativePath as P, ScheduledTaskMatrix as Q, createRunnerRuntimeContext as R, TaskRunOutput as S, RunnerTaskState as T, RunScore as U, AggregatedRunSummary as V, RunScoreKind as W, RunnerMatrixSelection as X, RunnerMatrixInput as Y, ScheduledTask as Z, TaskCaseState as _, EvalDefinition as a, CacheFileOptions as at, TaskReporterHooks as b, MatrixAxisValues as c, MatrixPrimitive as d, createRunnerSchedule as et, MatrixRow as f, TaskCaseReporterPayload as g, TaskCaseReporterEndPayload as h, CollectedEvalEntry as i, CacheFileHandle as it, createTaskExecutionContext as j, TaskExecutionContext as k, MatrixDefinition as l, ScopedMatrices as m, defineEval as n, createFilesystemTaskCacheRuntime as nt, EvalModule as o, CacheNamespace as ot, MatrixValue as p, InferenceExecutor as q, defineTask as r, normalizeCacheFilePathSegments as rt, EvalModuleMap as s, TaskCacheRuntime as st, ConfigHookPlugin as t, CreateFilesystemTaskCacheRuntimeOptions as tt, MatrixLayer as u, TaskDefinition as v, RunnerExecutionError as w, TaskRunContext as x, TaskReporterEventPayload as y, AggregatedProviderSummary as z };
|
|
1190
|
+
//# sourceMappingURL=index-BDMEAmf2.d.mts.map
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { U as RunScore, W as RunScoreKind } from "./index-BDMEAmf2.mjs";
|
|
2
2
|
|
|
3
3
|
//#region src/core/assertions/index.d.ts
|
|
4
4
|
/**
|
|
@@ -311,4 +311,4 @@ declare function toRunScores(outcomes: readonly AssertionOutcome[]): RunScore[];
|
|
|
311
311
|
declare function collectFailedAssertions(outcomes: readonly AssertionOutcome[]): AssertionOutcome[];
|
|
312
312
|
//#endregion
|
|
313
313
|
export { expectToolCallArgs as C, expectStructuredOutput as S, toRunScores as T, expectMustExclude as _, CustomAssertionOptions as a, expectRegex as b, RegexAssertionOptions as c, StructuredOutputAssertionOptions as d, ToolCall as f, expectCustom as g, evaluateAssertions as h, AssertionState as i, RubricAssertionOptions as l, collectFailedAssertions as m, AssertionContext as n, MustExcludeAssertionOptions as o, ToolCallArgsAssertionOptions as p, AssertionOutcome as r, MustIncludeAssertionOptions as s, Assertion as t, RubricJudgeResult as u, expectMustInclude as v, normalizeMatchText as w, expectRubric as x, expectNot as y };
|
|
314
|
-
//# sourceMappingURL=index-oSXhM1zx.d.mts.map
|
|
314
|
+
//# sourceMappingURL=index-C3gPFmcR.d.mts.map
|
package/dist/index.d.mts
CHANGED
|
@@ -1,11 +1,302 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import {
|
|
1
|
+
import { H as RunResult, M as ModelDefinition, S as TaskRunOutput, W as RunScoreKind, Z as ScheduledTask, k as TaskExecutionContext, l as MatrixDefinition, q as InferenceExecutor, t as ConfigHookPlugin, u as MatrixLayer, x as TaskRunContext } from "./index-BDMEAmf2.mjs";
|
|
2
|
+
import { a as requiredEnvFrom } from "./env-DtpjACOW.mjs";
|
|
3
|
+
import { t as expect } from "./expect-B2vaoRVZ.mjs";
|
|
4
|
+
import * as _$c12 from "c12";
|
|
3
5
|
|
|
6
|
+
//#region src/cli/reporters/vitest-compat-reporter.d.ts
|
|
7
|
+
type Awaitable<T> = T | Promise<T>;
|
|
8
|
+
/**
|
|
9
|
+
* Normalized module-like entity delivered to vitest-compatible reporter hooks.
|
|
10
|
+
*/
|
|
11
|
+
interface VievalVitestCompatModule {
|
|
12
|
+
id: string;
|
|
13
|
+
name: string;
|
|
14
|
+
projectName: string;
|
|
15
|
+
}
|
|
16
|
+
/**
|
|
17
|
+
* Normalized test-case-like entity delivered to vitest-compatible reporter hooks.
|
|
18
|
+
*/
|
|
19
|
+
interface VievalVitestCompatCase {
|
|
20
|
+
id: string;
|
|
21
|
+
name: string;
|
|
22
|
+
module: VievalVitestCompatModule;
|
|
23
|
+
state: 'failed' | 'passed' | 'pending' | 'skipped';
|
|
24
|
+
}
|
|
25
|
+
/**
|
|
26
|
+
* Supported vitest-style reporter lifecycle hooks.
|
|
27
|
+
*
|
|
28
|
+
* Use when:
|
|
29
|
+
* - external reporter modules should observe vieval task/case lifecycle events
|
|
30
|
+
* - the project wants a familiar Vitest reporter callback model
|
|
31
|
+
*
|
|
32
|
+
* Expects:
|
|
33
|
+
* - hook handlers to be best-effort observers only
|
|
34
|
+
* - thrown errors are ignored to avoid interrupting eval execution
|
|
35
|
+
*/
|
|
36
|
+
interface VievalVitestCompatReporter {
|
|
37
|
+
onTestCaseReady?: (testCase: VievalVitestCompatCase) => Awaitable<void>;
|
|
38
|
+
onTestCaseResult?: (testCase: VievalVitestCompatCase) => Awaitable<void>;
|
|
39
|
+
onTestModuleCollected?: (module: VievalVitestCompatModule) => Awaitable<void>;
|
|
40
|
+
onTestModuleEnd?: (module: VievalVitestCompatModule) => Awaitable<void>;
|
|
41
|
+
onTestModuleQueued?: (module: VievalVitestCompatModule) => Awaitable<void>;
|
|
42
|
+
onTestModuleStart?: (module: VievalVitestCompatModule) => Awaitable<void>;
|
|
43
|
+
onTestRunEnd?: (modules: readonly VievalVitestCompatModule[], errors: readonly {
|
|
44
|
+
message: string;
|
|
45
|
+
}[], state: 'failed' | 'passed') => Awaitable<void>;
|
|
46
|
+
onTestRunStart?: (specifications: readonly {
|
|
47
|
+
moduleId: string;
|
|
48
|
+
projectName: string;
|
|
49
|
+
}[]) => Awaitable<void>;
|
|
50
|
+
}
|
|
51
|
+
/**
|
|
52
|
+
* Supported project reporter references.
|
|
53
|
+
*
|
|
54
|
+
* - String: module path or package name, default export used.
|
|
55
|
+
* - Reporter object: inline hook object (Vitest-style inline reporter).
|
|
56
|
+
* - Tuple: [string or reporter object, constructor options].
|
|
57
|
+
*
|
|
58
|
+
* Source permalink:
|
|
59
|
+
* `https://github.com/vitest-dev/vitest/blob/b865b4d83d1e7874607ba1b2d84b9e2d135ecd33/packages/vitest/src/node/config/resolveConfig.ts#L674-L713`
|
|
60
|
+
*/
|
|
61
|
+
type VievalVitestCompatReporterValue = string | VievalVitestCompatReporter;
|
|
62
|
+
type VievalVitestCompatReporterReference = VievalVitestCompatReporterValue | readonly [VievalVitestCompatReporterValue, unknown?];
|
|
63
|
+
//#endregion
|
|
4
64
|
//#region src/cli/config.d.ts
|
|
65
|
+
/**
|
|
66
|
+
* CLI plugin shape bound to the full CLI config object.
|
|
67
|
+
*/
|
|
68
|
+
type CliConfigPlugin = ConfigHookPlugin<CliConfig>;
|
|
69
|
+
/**
|
|
70
|
+
* Defines one project block for `vieval run`.
|
|
71
|
+
*/
|
|
72
|
+
interface CliProjectConfig {
|
|
73
|
+
/**
|
|
74
|
+
* Project label used in summary output.
|
|
75
|
+
*/
|
|
76
|
+
name: string;
|
|
77
|
+
/**
|
|
78
|
+
* Project root used for include/exclude glob matching.
|
|
79
|
+
*
|
|
80
|
+
* @default process cwd
|
|
81
|
+
*/
|
|
82
|
+
root?: string;
|
|
83
|
+
/**
|
|
84
|
+
* Glob patterns for eval file discovery.
|
|
85
|
+
*
|
|
86
|
+
* @default Common eval file globs for TypeScript and JavaScript module formats.
|
|
87
|
+
*/
|
|
88
|
+
include?: string[];
|
|
89
|
+
/**
|
|
90
|
+
* Glob patterns excluded from discovery.
|
|
91
|
+
*
|
|
92
|
+
* @default Common exclusion globs for dependencies, build output, and VCS directories.
|
|
93
|
+
*/
|
|
94
|
+
exclude?: string[];
|
|
95
|
+
/**
|
|
96
|
+
* Inference executors expanded by the scheduler.
|
|
97
|
+
*
|
|
98
|
+
* @default [{ id: 'default' }]
|
|
99
|
+
*/
|
|
100
|
+
inferenceExecutors?: InferenceExecutor[];
|
|
101
|
+
/**
|
|
102
|
+
* Model definitions available to project runtime execution.
|
|
103
|
+
*
|
|
104
|
+
* Inference executors control schedule fan-out, while models provide
|
|
105
|
+
* runtime lookup metadata for `context.model(...)` during task execution.
|
|
106
|
+
*
|
|
107
|
+
* @default inherited from top-level config models
|
|
108
|
+
*/
|
|
109
|
+
models?: ModelDefinition[];
|
|
110
|
+
/**
|
|
111
|
+
* Optional run-time matrix dimensions.
|
|
112
|
+
*/
|
|
113
|
+
runMatrix?: MatrixDefinition | MatrixLayer;
|
|
114
|
+
/**
|
|
115
|
+
* Optional eval-time matrix dimensions.
|
|
116
|
+
*/
|
|
117
|
+
evalMatrix?: MatrixDefinition | MatrixLayer;
|
|
118
|
+
/**
|
|
119
|
+
* Optional task executor.
|
|
120
|
+
*
|
|
121
|
+
* Use when this project should execute live inference-executor requests.
|
|
122
|
+
* If omitted, `vieval run` performs collection + scheduling only.
|
|
123
|
+
*/
|
|
124
|
+
executor?: (task: ScheduledTask, context: CliProjectExecutorContext) => Promise<RunResult>;
|
|
125
|
+
/**
|
|
126
|
+
* Optional project-local plugins.
|
|
127
|
+
*/
|
|
128
|
+
plugins?: CliConfigPlugin[];
|
|
129
|
+
/**
|
|
130
|
+
* Optional vitest-compatible reporter modules.
|
|
131
|
+
*
|
|
132
|
+
* Use when:
|
|
133
|
+
* - project runs should emit additional reporter callbacks using Vitest-style lifecycle names
|
|
134
|
+
*
|
|
135
|
+
* @default []
|
|
136
|
+
*/
|
|
137
|
+
reporters?: VievalVitestCompatReporterReference[];
|
|
138
|
+
}
|
|
139
|
+
/**
|
|
140
|
+
* One workspace descriptor for workspace-mode configs.
|
|
141
|
+
*/
|
|
142
|
+
interface CliWorkspaceConfig {
|
|
143
|
+
/**
|
|
144
|
+
* Workspace identifier.
|
|
145
|
+
*/
|
|
146
|
+
id: string;
|
|
147
|
+
/**
|
|
148
|
+
* Workspace root path.
|
|
149
|
+
*/
|
|
150
|
+
root: string;
|
|
151
|
+
}
|
|
152
|
+
/**
|
|
153
|
+
* One explicit comparison method descriptor.
|
|
154
|
+
*/
|
|
155
|
+
interface CliComparisonMethodConfig {
|
|
156
|
+
/**
|
|
157
|
+
* Method identifier shown in compare reports.
|
|
158
|
+
*/
|
|
159
|
+
id: string;
|
|
160
|
+
/**
|
|
161
|
+
* Workspace path containing this method's `vieval.config.*`.
|
|
162
|
+
*/
|
|
163
|
+
workspace: string;
|
|
164
|
+
/**
|
|
165
|
+
* Project name to execute inside workspace config.
|
|
166
|
+
*/
|
|
167
|
+
project: string;
|
|
168
|
+
/**
|
|
169
|
+
* Optional explicit config file path for this workspace.
|
|
170
|
+
*/
|
|
171
|
+
configFilePath?: string;
|
|
172
|
+
}
|
|
173
|
+
/**
|
|
174
|
+
* Benchmark identity and shared cache namespace.
|
|
175
|
+
*/
|
|
176
|
+
interface CliComparisonBenchmarkConfig {
|
|
177
|
+
/**
|
|
178
|
+
* Benchmark identifier used in report artifacts.
|
|
179
|
+
*/
|
|
180
|
+
id: string;
|
|
181
|
+
/**
|
|
182
|
+
* Shared cache namespace reused across method runs.
|
|
183
|
+
*/
|
|
184
|
+
sharedCaseNamespace: string;
|
|
185
|
+
}
|
|
186
|
+
/**
|
|
187
|
+
* One comparison entry loaded by `vieval compare`.
|
|
188
|
+
*/
|
|
189
|
+
interface CliComparisonConfig {
|
|
190
|
+
/**
|
|
191
|
+
* Comparison id selected by `--comparison`.
|
|
192
|
+
*/
|
|
193
|
+
id: string;
|
|
194
|
+
/**
|
|
195
|
+
* Benchmark metadata for reporting and shared cache coordination.
|
|
196
|
+
*/
|
|
197
|
+
benchmark: CliComparisonBenchmarkConfig;
|
|
198
|
+
/**
|
|
199
|
+
* Optional explicit method list.
|
|
200
|
+
*/
|
|
201
|
+
methods?: CliComparisonMethodConfig[];
|
|
202
|
+
/**
|
|
203
|
+
* Optional workspace glob(s) discovered relative to config directory.
|
|
204
|
+
*/
|
|
205
|
+
includesWorkspaces?: string | string[];
|
|
206
|
+
/**
|
|
207
|
+
* Optional workspace exclude glob(s), also relative to config directory.
|
|
208
|
+
*/
|
|
209
|
+
excludesWorkspaces?: string | string[];
|
|
210
|
+
}
|
|
211
|
+
/**
|
|
212
|
+
* Execution context exposed to project-level `executor` implementations.
|
|
213
|
+
*
|
|
214
|
+
* Use when:
|
|
215
|
+
* - a project executor needs the task-scoped model resolver plus case reporter hooks
|
|
216
|
+
* - custom scheduling logic wants the same hook shape as `TaskRunContext`
|
|
217
|
+
*
|
|
218
|
+
* Expects:
|
|
219
|
+
* - `model` resolves configured models for the current task
|
|
220
|
+
* - `reporterHooks` follows `TaskRunContext['reporterHooks']`
|
|
221
|
+
*/
|
|
222
|
+
interface CliProjectExecutorContext extends TaskExecutionContext {
|
|
223
|
+
reporterHooks?: TaskRunContext['reporterHooks'];
|
|
224
|
+
}
|
|
225
|
+
/**
|
|
226
|
+
* Base fields shared by every top-level CLI config mode loaded from `vieval.config.*`.
|
|
227
|
+
*/
|
|
228
|
+
interface CliConfigBase {
|
|
229
|
+
/**
|
|
230
|
+
* Global model definitions inherited by projects.
|
|
231
|
+
*
|
|
232
|
+
* @default []
|
|
233
|
+
*/
|
|
234
|
+
models?: ModelDefinition[];
|
|
235
|
+
/**
|
|
236
|
+
* Global config plugins.
|
|
237
|
+
*
|
|
238
|
+
* @default []
|
|
239
|
+
*/
|
|
240
|
+
plugins?: CliConfigPlugin[];
|
|
241
|
+
/**
|
|
242
|
+
* Global vitest-compatible reporter modules inherited by projects.
|
|
243
|
+
*
|
|
244
|
+
* @default []
|
|
245
|
+
*/
|
|
246
|
+
reporters?: VievalVitestCompatReporterReference[];
|
|
247
|
+
/**
|
|
248
|
+
* Environment variables injected into `process.env` during `vieval run`.
|
|
249
|
+
*
|
|
250
|
+
* Use when:
|
|
251
|
+
* - eval tasks depend on runtime env values (for example inferenceExecutor API keys)
|
|
252
|
+
* - config wants deterministic env values without shell-level exports
|
|
253
|
+
*
|
|
254
|
+
* @default {}
|
|
255
|
+
*/
|
|
256
|
+
env?: NodeJS.ProcessEnv;
|
|
257
|
+
}
|
|
258
|
+
/**
|
|
259
|
+
* Project mode config for `vieval run`.
|
|
260
|
+
*/
|
|
261
|
+
interface CliProjectModeConfig extends CliConfigBase {
|
|
262
|
+
/**
|
|
263
|
+
* Project list expanded by `vieval run`.
|
|
264
|
+
*
|
|
265
|
+
* @default [{ name: 'default' }]
|
|
266
|
+
*/
|
|
267
|
+
projects?: CliProjectConfig[];
|
|
268
|
+
comparisons?: never;
|
|
269
|
+
workspaces?: never;
|
|
270
|
+
}
|
|
271
|
+
/**
|
|
272
|
+
* Workspace mode config placeholder for future workspace orchestration.
|
|
273
|
+
*/
|
|
274
|
+
interface CliWorkspaceModeConfig extends CliConfigBase {
|
|
275
|
+
workspaces: CliWorkspaceConfig[];
|
|
276
|
+
projects?: never;
|
|
277
|
+
comparisons?: never;
|
|
278
|
+
}
|
|
279
|
+
/**
|
|
280
|
+
* Comparison mode config for `vieval compare`.
|
|
281
|
+
*/
|
|
282
|
+
interface CliComparisonModeConfig extends CliConfigBase {
|
|
283
|
+
comparisons: CliComparisonConfig[];
|
|
284
|
+
projects?: never;
|
|
285
|
+
workspaces?: never;
|
|
286
|
+
}
|
|
287
|
+
/**
|
|
288
|
+
* Top-level CLI config loaded from `vieval.config.*`.
|
|
289
|
+
*
|
|
290
|
+
* Exactly one top-level mode is allowed:
|
|
291
|
+
* - `projects`
|
|
292
|
+
* - `workspaces`
|
|
293
|
+
* - `comparisons`
|
|
294
|
+
*/
|
|
295
|
+
type CliConfig = CliProjectModeConfig | CliWorkspaceModeConfig | CliComparisonModeConfig;
|
|
5
296
|
/**
|
|
6
297
|
* Helper used by `vieval.config.*` for better type inference.
|
|
7
298
|
*/
|
|
8
|
-
declare const defineConfig:
|
|
299
|
+
declare const defineConfig: _$c12.DefineConfig<CliConfig, _$c12.ConfigLayerMeta>;
|
|
9
300
|
/**
|
|
10
301
|
* Loads `.env*` files using Vite's env resolution behavior.
|
|
11
302
|
*
|
|
@@ -33,6 +324,27 @@ interface CaseRunContext<TInput> extends TaskRunContext {
|
|
|
33
324
|
matrix: TaskRunContext['task']['matrix'] & {
|
|
34
325
|
inputs: TInput;
|
|
35
326
|
};
|
|
327
|
+
/**
|
|
328
|
+
* Overrides one case score family with a custom normalized value.
|
|
329
|
+
*
|
|
330
|
+
* Use when:
|
|
331
|
+
* - one case computes a benchmark-native score that should flow into run aggregation
|
|
332
|
+
*
|
|
333
|
+
* Expects:
|
|
334
|
+
* - `score` to stay in the `0..1` range
|
|
335
|
+
*/
|
|
336
|
+
score: (score: number, kind?: RunScoreKind) => void;
|
|
337
|
+
/**
|
|
338
|
+
* Emits one custom case metric into report events.
|
|
339
|
+
*
|
|
340
|
+
* Use when:
|
|
341
|
+
* - tasks need structured benchmark metadata beyond exact/judge score families
|
|
342
|
+
*
|
|
343
|
+
* Expects:
|
|
344
|
+
* - `name` to be a stable metric identifier
|
|
345
|
+
* - `value` to be JSON-serializable
|
|
346
|
+
*/
|
|
347
|
+
metric: (name: string, value: boolean | number | string | null) => void;
|
|
36
348
|
}
|
|
37
349
|
/**
|
|
38
350
|
* Callback for one task case.
|
|
@@ -45,7 +357,12 @@ interface DescribeTaskBuilder {
|
|
|
45
357
|
/**
|
|
46
358
|
* Registers one explicit case.
|
|
47
359
|
*/
|
|
48
|
-
caseOf:
|
|
360
|
+
caseOf: {
|
|
361
|
+
(name: string, run: CaseRunner<undefined>): void;
|
|
362
|
+
<TInput>(name: string, run: CaseRunner<TInput>, options: {
|
|
363
|
+
input: TInput;
|
|
364
|
+
}): void;
|
|
365
|
+
};
|
|
49
366
|
/**
|
|
50
367
|
* Registers multiple cases from input list.
|
|
51
368
|
*/
|
|
@@ -63,7 +380,10 @@ interface DescribeTaskOptions {
|
|
|
63
380
|
/**
|
|
64
381
|
* Registers one case in the currently active task scope.
|
|
65
382
|
*/
|
|
66
|
-
declare function caseOf
|
|
383
|
+
declare function caseOf(name: string, run: CaseRunner<undefined>): void;
|
|
384
|
+
declare function caseOf<TInput>(name: string, run: CaseRunner<TInput>, options: {
|
|
385
|
+
input: TInput;
|
|
386
|
+
}): void;
|
|
67
387
|
/**
|
|
68
388
|
* Registers multiple cases in the currently active task scope.
|
|
69
389
|
*/
|
|
@@ -88,5 +408,5 @@ declare function describeTask(name: string, build: ((builder: DescribeTaskBuilde
|
|
|
88
408
|
*/
|
|
89
409
|
declare const describeEval: typeof describeTask;
|
|
90
410
|
//#endregion
|
|
91
|
-
export { caseOf, casesFromInputs, defineConfig, describeEval, describeTask, expect, loadEnv };
|
|
411
|
+
export { caseOf, casesFromInputs, defineConfig, describeEval, describeTask, expect, loadEnv, requiredEnvFrom };
|
|
92
412
|
//# sourceMappingURL=index.d.mts.map
|