vieval 0.0.10 → 0.0.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +31 -31
- package/dist/bin/vieval.mjs +1 -1
- package/dist/bin/vieval.mjs.map +1 -1
- package/dist/cli/index.d.mts +1 -1
- package/dist/cli/index.mjs +1 -1
- package/dist/{cli-DTDgaqeI.mjs → cli-uzS81IPd.mjs} +1483 -1483
- package/dist/cli-uzS81IPd.mjs.map +1 -0
- package/dist/config.d.mts +1 -1
- package/dist/config.mjs +1 -1
- package/dist/config.mjs.map +1 -1
- package/dist/core/assertions/index.d.mts +156 -156
- package/dist/core/assertions/index.mjs +82 -82
- package/dist/core/assertions/index.mjs.map +1 -1
- package/dist/core/inference-executors/index.d.mts +37 -37
- package/dist/core/inference-executors/index.mjs +54 -53
- package/dist/core/inference-executors/index.mjs.map +1 -1
- package/dist/core/processors/results/index.d.mts +18 -18
- package/dist/core/processors/results/index.mjs.map +1 -1
- package/dist/core/runner/index.d.mts +2 -2
- package/dist/core/runner/index.mjs +259 -259
- package/dist/core/runner/index.mjs.map +1 -1
- package/dist/core/scheduler/index.d.mts +1 -1
- package/dist/core/scheduler/index.mjs +65 -65
- package/dist/core/scheduler/index.mjs.map +1 -1
- package/dist/{env-DfWZy_n4.d.mts → env-Br6jaWGL.d.mts} +9 -9
- package/dist/{env-nV5rVErX.mjs → env-egxaJtNn.mjs} +8 -8
- package/dist/env-egxaJtNn.mjs.map +1 -0
- package/dist/{expect-extensions-DCSqlneN.mjs → expect-extensions-BKdEPt3h.mjs} +46 -46
- package/dist/expect-extensions-BKdEPt3h.mjs.map +1 -0
- package/dist/expect.d.mts +1 -3
- package/dist/expect.mjs +1 -1
- package/dist/expect.mjs.map +1 -1
- package/dist/{index-D_aMeWqO.d.mts → index-BLIlhiWT.d.mts} +565 -565
- package/dist/{index-Bg0atWBF.d.mts → index-CIaJClcC.d.mts} +48 -48
- package/dist/index.d.mts +208 -197
- package/dist/index.mjs +148 -148
- package/dist/index.mjs.map +1 -1
- package/dist/{models-pBSRUZhY.mjs → models-CaCOUPZw.mjs} +1 -1
- package/dist/{models-pBSRUZhY.mjs.map → models-CaCOUPZw.mjs.map} +1 -1
- package/dist/plugins/chat-models/index.d.mts +279 -279
- package/dist/plugins/chat-models/index.mjs +360 -360
- package/dist/plugins/chat-models/index.mjs.map +1 -1
- package/dist/{queue-DsZQkZO_.mjs → queue-BL86z2W_.mjs} +1 -1
- package/dist/{queue-DsZQkZO_.mjs.map → queue-BL86z2W_.mjs.map} +1 -1
- package/dist/{registry-DMnwE_mY.mjs → registry-BK7k6X81.mjs} +294 -294
- package/dist/registry-BK7k6X81.mjs.map +1 -0
- package/dist/testing/expect-extensions.d.mts +27 -27
- package/dist/testing/expect-extensions.mjs +1 -1
- package/package.json +12 -12
- package/dist/cli-DTDgaqeI.mjs.map +0 -1
- package/dist/env-nV5rVErX.mjs.map +0 -1
- package/dist/expect-extensions-DCSqlneN.mjs.map +0 -1
- package/dist/registry-DMnwE_mY.mjs.map +0 -1
|
@@ -2,23 +2,6 @@ import { ReadStream, WriteStream } from "node:fs";
|
|
|
2
2
|
import { Buffer } from "node:buffer";
|
|
3
3
|
|
|
4
4
|
//#region src/core/cache/types.d.ts
|
|
5
|
-
/**
|
|
6
|
-
* Cache entry options used to derive one deterministic cache file path.
|
|
7
|
-
*/
|
|
8
|
-
interface CacheFileOptions {
|
|
9
|
-
/**
|
|
10
|
-
* Optional file extension for the cache artifact (for example: `json`, `txt`, `wav`).
|
|
11
|
-
*/
|
|
12
|
-
ext?: string;
|
|
13
|
-
/**
|
|
14
|
-
* Deterministic key segments used to build the relative cache path.
|
|
15
|
-
*/
|
|
16
|
-
key: readonly string[];
|
|
17
|
-
/**
|
|
18
|
-
* Optional media type hint used by adapters when extension is omitted.
|
|
19
|
-
*/
|
|
20
|
-
mediaType?: string;
|
|
21
|
-
}
|
|
22
5
|
/**
|
|
23
6
|
* One cache file handle exposed to task code.
|
|
24
7
|
*
|
|
@@ -34,18 +17,35 @@ interface CacheFileOptions {
|
|
|
34
17
|
* - read/write helpers over one deterministic cache artifact path
|
|
35
18
|
*/
|
|
36
19
|
interface CacheFileHandle {
|
|
37
|
-
path: string;
|
|
38
20
|
exists: () => Promise<boolean>;
|
|
21
|
+
loadAsCasesInput: <T>() => Promise<T[]>;
|
|
22
|
+
loadAsExpectFixture: <T>() => Promise<T>;
|
|
39
23
|
openReadStream: () => ReadStream;
|
|
40
24
|
openWriteStream: () => Promise<WriteStream>;
|
|
25
|
+
path: string;
|
|
41
26
|
readBuffer: () => Promise<Buffer>;
|
|
42
|
-
writeBuffer: (value: Buffer) => Promise<void>;
|
|
43
|
-
readText: (encoding?: BufferEncoding) => Promise<string>;
|
|
44
|
-
writeText: (value: string, encoding?: BufferEncoding) => Promise<void>;
|
|
45
27
|
readJson: <T>() => Promise<T>;
|
|
28
|
+
readText: (encoding?: BufferEncoding) => Promise<string>;
|
|
29
|
+
writeBuffer: (value: Buffer) => Promise<void>;
|
|
46
30
|
writeJson: (value: unknown) => Promise<void>;
|
|
47
|
-
|
|
48
|
-
|
|
31
|
+
writeText: (value: string, encoding?: BufferEncoding) => Promise<void>;
|
|
32
|
+
}
|
|
33
|
+
/**
|
|
34
|
+
* Cache entry options used to derive one deterministic cache file path.
|
|
35
|
+
*/
|
|
36
|
+
interface CacheFileOptions {
|
|
37
|
+
/**
|
|
38
|
+
* Optional file extension for the cache artifact (for example: `json`, `txt`, `wav`).
|
|
39
|
+
*/
|
|
40
|
+
ext?: string;
|
|
41
|
+
/**
|
|
42
|
+
* Deterministic key segments used to build the relative cache path.
|
|
43
|
+
*/
|
|
44
|
+
key: readonly string[];
|
|
45
|
+
/**
|
|
46
|
+
* Optional media type hint used by adapters when extension is omitted.
|
|
47
|
+
*/
|
|
48
|
+
mediaType?: string;
|
|
49
49
|
}
|
|
50
50
|
/**
|
|
51
51
|
* Namespaced cache accessor for deterministic cache artifacts.
|
|
@@ -78,16 +78,6 @@ interface CreateFilesystemTaskCacheRuntimeOptions {
|
|
|
78
78
|
*/
|
|
79
79
|
workspaceId: string;
|
|
80
80
|
}
|
|
81
|
-
/**
|
|
82
|
-
* Normalizes cache file options into deterministic relative path segments.
|
|
83
|
-
*
|
|
84
|
-
* Before:
|
|
85
|
-
* - `{ key: ['cases', 'dataset hash', 'v1'], ext: 'json' }`
|
|
86
|
-
*
|
|
87
|
-
* After:
|
|
88
|
-
* - `['cases', 'dataset-hash', 'v1.json']`
|
|
89
|
-
*/
|
|
90
|
-
declare function normalizeCacheFilePathSegments(options: CacheFileOptions): string[];
|
|
91
81
|
/**
|
|
92
82
|
* Creates a deterministic filesystem-backed task cache runtime.
|
|
93
83
|
*
|
|
@@ -104,6 +94,16 @@ declare function normalizeCacheFilePathSegments(options: CacheFileOptions): stri
|
|
|
104
94
|
* `<cacheRootDirectory>/<workspaceId>/<projectName>/<namespace>/...`
|
|
105
95
|
*/
|
|
106
96
|
declare function createFilesystemTaskCacheRuntime(options: CreateFilesystemTaskCacheRuntimeOptions): TaskCacheRuntime;
|
|
97
|
+
/**
|
|
98
|
+
* Normalizes cache file options into deterministic relative path segments.
|
|
99
|
+
*
|
|
100
|
+
* Before:
|
|
101
|
+
* - `{ key: ['cases', 'dataset hash', 'v1'], ext: 'json' }`
|
|
102
|
+
*
|
|
103
|
+
* After:
|
|
104
|
+
* - `['cases', 'dataset-hash', 'v1.json']`
|
|
105
|
+
*/
|
|
106
|
+
declare function normalizeCacheFilePathSegments(options: CacheFileOptions): string[];
|
|
107
107
|
//#endregion
|
|
108
108
|
//#region src/core/runner/schedule.d.ts
|
|
109
109
|
/**
|
|
@@ -116,30 +116,21 @@ interface InferenceExecutor {
|
|
|
116
116
|
id: string;
|
|
117
117
|
}
|
|
118
118
|
/**
|
|
119
|
-
*
|
|
119
|
+
* Maps matrix axis names to the values that should be expanded.
|
|
120
120
|
*/
|
|
121
|
-
type
|
|
121
|
+
type RunnerMatrixDefinition = MatrixDefinition;
|
|
122
122
|
/**
|
|
123
|
-
*
|
|
123
|
+
* Accepts either flat axis definitions or one layered matrix object.
|
|
124
124
|
*/
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
/**
|
|
131
|
-
* Stable row id for the resolved eval matrix selection.
|
|
132
|
-
*/
|
|
133
|
-
evalRowId: string;
|
|
134
|
-
}
|
|
125
|
+
type RunnerMatrixInput = MatrixLayer | RunnerMatrixDefinition;
|
|
126
|
+
/**
|
|
127
|
+
* Stores the selected value for each matrix axis.
|
|
128
|
+
*/
|
|
129
|
+
type RunnerMatrixSelection = Record<string, string>;
|
|
135
130
|
/**
|
|
136
131
|
* Stores the structured matrix payload for one scheduled task.
|
|
137
132
|
*/
|
|
138
133
|
interface ScheduledTaskMatrix {
|
|
139
|
-
/**
|
|
140
|
-
* Runtime matrix selection visible to task code.
|
|
141
|
-
*/
|
|
142
|
-
run: RunnerMatrixSelection;
|
|
143
134
|
/**
|
|
144
135
|
* Eval-time matrix selection visible to task code.
|
|
145
136
|
*/
|
|
@@ -148,35 +139,23 @@ interface ScheduledTaskMatrix {
|
|
|
148
139
|
* Stable row ids for both scopes.
|
|
149
140
|
*/
|
|
150
141
|
meta: ScheduledTaskMatrixMeta;
|
|
142
|
+
/**
|
|
143
|
+
* Runtime matrix selection visible to task code.
|
|
144
|
+
*/
|
|
145
|
+
run: RunnerMatrixSelection;
|
|
151
146
|
}
|
|
152
147
|
/**
|
|
153
|
-
*
|
|
154
|
-
*/
|
|
155
|
-
type RunnerMatrixDefinition = MatrixDefinition;
|
|
156
|
-
/**
|
|
157
|
-
* Accepts either flat axis definitions or one layered matrix object.
|
|
158
|
-
*/
|
|
159
|
-
type RunnerMatrixInput = RunnerMatrixDefinition | MatrixLayer;
|
|
160
|
-
/**
|
|
161
|
-
* Represents one fully expanded runner task.
|
|
148
|
+
* Stores stable row ids for one resolved scheduled task matrix.
|
|
162
149
|
*/
|
|
163
|
-
interface
|
|
164
|
-
/**
|
|
165
|
-
* Stable task id derived from the entry, inferenceExecutor, and matrix selection.
|
|
166
|
-
*/
|
|
167
|
-
id: string;
|
|
168
|
-
/**
|
|
169
|
-
* The collected eval entry to execute.
|
|
170
|
-
*/
|
|
171
|
-
entry: CollectedEvalEntry;
|
|
150
|
+
interface ScheduledTaskMatrixMeta {
|
|
172
151
|
/**
|
|
173
|
-
*
|
|
152
|
+
* Stable row id for the resolved eval matrix selection.
|
|
174
153
|
*/
|
|
175
|
-
|
|
154
|
+
evalRowId: string;
|
|
176
155
|
/**
|
|
177
|
-
*
|
|
156
|
+
* Stable row id for the resolved run matrix selection.
|
|
178
157
|
*/
|
|
179
|
-
|
|
158
|
+
runRowId: string;
|
|
180
159
|
}
|
|
181
160
|
/**
|
|
182
161
|
* Configures how the runner should expand its execution matrix.
|
|
@@ -186,6 +165,10 @@ interface CreateRunnerScheduleOptions {
|
|
|
186
165
|
* Collected eval entries that should be scheduled.
|
|
187
166
|
*/
|
|
188
167
|
entries: readonly CollectedEvalEntry[];
|
|
168
|
+
/**
|
|
169
|
+
* Optional eval-time matrix axes expanded as a cartesian product.
|
|
170
|
+
*/
|
|
171
|
+
evalMatrix?: RunnerMatrixInput;
|
|
189
172
|
/**
|
|
190
173
|
* Providers that should run each entry.
|
|
191
174
|
*/
|
|
@@ -194,10 +177,27 @@ interface CreateRunnerScheduleOptions {
|
|
|
194
177
|
* Optional run-time matrix axes expanded as a cartesian product.
|
|
195
178
|
*/
|
|
196
179
|
runMatrix?: RunnerMatrixInput;
|
|
180
|
+
}
|
|
181
|
+
/**
|
|
182
|
+
* Represents one fully expanded runner task.
|
|
183
|
+
*/
|
|
184
|
+
interface ScheduledTask {
|
|
197
185
|
/**
|
|
198
|
-
*
|
|
186
|
+
* The collected eval entry to execute.
|
|
199
187
|
*/
|
|
200
|
-
|
|
188
|
+
entry: CollectedEvalEntry;
|
|
189
|
+
/**
|
|
190
|
+
* Stable task id derived from the entry, inferenceExecutor, and matrix selection.
|
|
191
|
+
*/
|
|
192
|
+
id: string;
|
|
193
|
+
/**
|
|
194
|
+
* The inferenceExecutor selected for this task.
|
|
195
|
+
*/
|
|
196
|
+
inferenceExecutor: InferenceExecutor;
|
|
197
|
+
/**
|
|
198
|
+
* The concrete scoped matrix selection for this task.
|
|
199
|
+
*/
|
|
200
|
+
matrix: ScheduledTaskMatrix;
|
|
201
201
|
}
|
|
202
202
|
/**
|
|
203
203
|
* Expands collected entries into a stable runner schedule.
|
|
@@ -221,128 +221,128 @@ declare function createRunnerSchedule(options: CreateRunnerScheduleOptions): Sch
|
|
|
221
221
|
//#endregion
|
|
222
222
|
//#region src/core/runner/aggregate.d.ts
|
|
223
223
|
/**
|
|
224
|
-
*
|
|
225
|
-
*/
|
|
226
|
-
type RunScoreKind = 'exact' | 'judge';
|
|
227
|
-
/**
|
|
228
|
-
* Represents one normalized score emitted by a completed eval run.
|
|
224
|
+
* Stores inferenceExecutor-level score aggregates across multiple runs.
|
|
229
225
|
*/
|
|
230
|
-
interface
|
|
226
|
+
interface AggregatedProviderSummary {
|
|
231
227
|
/**
|
|
232
|
-
*
|
|
228
|
+
* Mean of all exact-match scores or `null` when absent.
|
|
233
229
|
*/
|
|
234
|
-
|
|
230
|
+
exactAverage: null | number;
|
|
235
231
|
/**
|
|
236
|
-
*
|
|
232
|
+
* Hybrid average derived from the inferenceExecutor exact and judge means.
|
|
237
233
|
*/
|
|
238
|
-
|
|
239
|
-
}
|
|
240
|
-
/**
|
|
241
|
-
* Captures the output of one scheduled runner task.
|
|
242
|
-
*/
|
|
243
|
-
interface RunResult {
|
|
234
|
+
hybridAverage: null | number;
|
|
244
235
|
/**
|
|
245
|
-
* Stable
|
|
236
|
+
* Stable inferenceExecutor id.
|
|
246
237
|
*/
|
|
247
|
-
|
|
238
|
+
inferenceExecutorId: string;
|
|
248
239
|
/**
|
|
249
|
-
*
|
|
240
|
+
* Mean of all judge-based scores or `null` when absent.
|
|
250
241
|
*/
|
|
251
|
-
|
|
242
|
+
judgeAverage: null | number;
|
|
252
243
|
/**
|
|
253
|
-
*
|
|
244
|
+
* Number of runs included in this inferenceExecutor bucket.
|
|
254
245
|
*/
|
|
255
|
-
|
|
246
|
+
runCount: number;
|
|
247
|
+
}
|
|
248
|
+
/**
|
|
249
|
+
* Stores the final aggregation output for a batch of runner results.
|
|
250
|
+
*/
|
|
251
|
+
interface AggregatedRunResults {
|
|
256
252
|
/**
|
|
257
|
-
*
|
|
253
|
+
* Provider-level summaries sorted by inferenceExecutor id.
|
|
258
254
|
*/
|
|
259
|
-
|
|
255
|
+
inferenceExecutors: AggregatedProviderSummary[];
|
|
260
256
|
/**
|
|
261
|
-
*
|
|
257
|
+
* Overall summary across every run.
|
|
262
258
|
*/
|
|
263
|
-
|
|
259
|
+
overall: {
|
|
260
|
+
exactAverage: null | number;
|
|
261
|
+
hybridAverage: null | number;
|
|
262
|
+
judgeAverage: null | number;
|
|
263
|
+
runCount: number;
|
|
264
|
+
};
|
|
265
|
+
/**
|
|
266
|
+
* Per-run normalized score summaries.
|
|
267
|
+
*/
|
|
268
|
+
runs: AggregatedRunSummary[];
|
|
264
269
|
}
|
|
265
270
|
/**
|
|
266
271
|
* Stores the per-run score averages after normalization.
|
|
267
272
|
*/
|
|
268
273
|
interface AggregatedRunSummary {
|
|
269
|
-
/**
|
|
270
|
-
* Stable run id.
|
|
271
|
-
*/
|
|
272
|
-
id: string;
|
|
273
274
|
/**
|
|
274
275
|
* Collected eval entry id.
|
|
275
276
|
*/
|
|
276
277
|
entryId: string;
|
|
277
278
|
/**
|
|
278
|
-
*
|
|
279
|
+
* Mean of exact-match scores or `null` when absent.
|
|
279
280
|
*/
|
|
280
|
-
|
|
281
|
+
exactAverage: null | number;
|
|
281
282
|
/**
|
|
282
|
-
*
|
|
283
|
+
* Hybrid average. Uses both families when present, otherwise falls back to the
|
|
284
|
+
* single available family.
|
|
283
285
|
*/
|
|
284
|
-
|
|
286
|
+
hybridAverage: null | number;
|
|
285
287
|
/**
|
|
286
|
-
*
|
|
288
|
+
* Stable run id.
|
|
287
289
|
*/
|
|
288
|
-
|
|
290
|
+
id: string;
|
|
291
|
+
/**
|
|
292
|
+
* Stable inferenceExecutor id.
|
|
293
|
+
*/
|
|
294
|
+
inferenceExecutorId: string;
|
|
289
295
|
/**
|
|
290
296
|
* Mean of judge-based scores or `null` when absent.
|
|
291
297
|
*/
|
|
292
|
-
judgeAverage:
|
|
298
|
+
judgeAverage: null | number;
|
|
293
299
|
/**
|
|
294
|
-
*
|
|
295
|
-
* single available family.
|
|
300
|
+
* Concrete matrix selection used by the run.
|
|
296
301
|
*/
|
|
297
|
-
|
|
302
|
+
matrix: ScheduledTaskMatrix;
|
|
298
303
|
}
|
|
299
304
|
/**
|
|
300
|
-
*
|
|
305
|
+
* Captures the output of one scheduled runner task.
|
|
301
306
|
*/
|
|
302
|
-
interface
|
|
307
|
+
interface RunResult {
|
|
303
308
|
/**
|
|
304
|
-
*
|
|
309
|
+
* Collected eval entry id.
|
|
305
310
|
*/
|
|
306
|
-
|
|
311
|
+
entryId: string;
|
|
307
312
|
/**
|
|
308
|
-
*
|
|
313
|
+
* Stable run id, usually copied from the scheduled task id.
|
|
309
314
|
*/
|
|
310
|
-
|
|
315
|
+
id: string;
|
|
311
316
|
/**
|
|
312
|
-
*
|
|
317
|
+
* Stable inferenceExecutor id.
|
|
313
318
|
*/
|
|
314
|
-
|
|
319
|
+
inferenceExecutorId: string;
|
|
315
320
|
/**
|
|
316
|
-
*
|
|
321
|
+
* Concrete matrix selection used by the run.
|
|
317
322
|
*/
|
|
318
|
-
|
|
323
|
+
matrix: ScheduledTaskMatrix;
|
|
319
324
|
/**
|
|
320
|
-
*
|
|
325
|
+
* Raw scores emitted by the eval.
|
|
321
326
|
*/
|
|
322
|
-
|
|
327
|
+
scores: readonly RunScore[];
|
|
323
328
|
}
|
|
324
329
|
/**
|
|
325
|
-
*
|
|
330
|
+
* Represents one normalized score emitted by a completed eval run.
|
|
326
331
|
*/
|
|
327
|
-
interface
|
|
328
|
-
/**
|
|
329
|
-
* Per-run normalized score summaries.
|
|
330
|
-
*/
|
|
331
|
-
runs: AggregatedRunSummary[];
|
|
332
|
+
interface RunScore {
|
|
332
333
|
/**
|
|
333
|
-
*
|
|
334
|
+
* Score family used for aggregation.
|
|
334
335
|
*/
|
|
335
|
-
|
|
336
|
+
kind: RunScoreKind;
|
|
336
337
|
/**
|
|
337
|
-
*
|
|
338
|
+
* Normalized score in the `0..1` range.
|
|
338
339
|
*/
|
|
339
|
-
|
|
340
|
-
exactAverage: number | null;
|
|
341
|
-
judgeAverage: number | null;
|
|
342
|
-
hybridAverage: number | null;
|
|
343
|
-
runCount: number;
|
|
344
|
-
};
|
|
340
|
+
score: number;
|
|
345
341
|
}
|
|
342
|
+
/**
|
|
343
|
+
* Identifies the scoring family for a single eval score.
|
|
344
|
+
*/
|
|
345
|
+
type RunScoreKind = 'exact' | 'judge';
|
|
346
346
|
/**
|
|
347
347
|
* Aggregates exact-match and judge-based scores into hybrid runner summaries.
|
|
348
348
|
*
|
|
@@ -365,19 +365,6 @@ interface AggregatedRunResults {
|
|
|
365
365
|
declare function aggregateRunResults(results: readonly RunResult[]): AggregatedRunResults;
|
|
366
366
|
//#endregion
|
|
367
367
|
//#region src/core/runner/runtime-context.d.ts
|
|
368
|
-
/**
|
|
369
|
-
* Shared runtime context used by the vieval runner.
|
|
370
|
-
*
|
|
371
|
-
* Use when:
|
|
372
|
-
* - runner services need stable path resolution without module-level side effects
|
|
373
|
-
* - call sites want deterministic control over workspace root detection
|
|
374
|
-
*/
|
|
375
|
-
interface RunnerRuntimeContext {
|
|
376
|
-
/**
|
|
377
|
-
* Absolute project root directory used for path normalization.
|
|
378
|
-
*/
|
|
379
|
-
projectRootDirectory: string;
|
|
380
|
-
}
|
|
381
368
|
/**
|
|
382
369
|
* Options used to construct the runner runtime context.
|
|
383
370
|
*/
|
|
@@ -395,6 +382,19 @@ interface CreateVievalRunnerRuntimeContextOptions {
|
|
|
395
382
|
*/
|
|
396
383
|
fallbackProjectRootDirectory?: string;
|
|
397
384
|
}
|
|
385
|
+
/**
|
|
386
|
+
* Shared runtime context used by the vieval runner.
|
|
387
|
+
*
|
|
388
|
+
* Use when:
|
|
389
|
+
* - runner services need stable path resolution without module-level side effects
|
|
390
|
+
* - call sites want deterministic control over workspace root detection
|
|
391
|
+
*/
|
|
392
|
+
interface RunnerRuntimeContext {
|
|
393
|
+
/**
|
|
394
|
+
* Absolute project root directory used for path normalization.
|
|
395
|
+
*/
|
|
396
|
+
projectRootDirectory: string;
|
|
397
|
+
}
|
|
398
398
|
/**
|
|
399
399
|
* Creates a side-effect-free runtime context for runner path normalization.
|
|
400
400
|
*
|
|
@@ -455,13 +455,17 @@ declare function collectEvalEntries(modules: EvalModuleMap, context: RunnerRunti
|
|
|
455
455
|
*/
|
|
456
456
|
interface ModelDefinition {
|
|
457
457
|
/**
|
|
458
|
-
*
|
|
458
|
+
* Alias names that can resolve this model.
|
|
459
459
|
*/
|
|
460
|
-
|
|
460
|
+
aliases: string[];
|
|
461
461
|
/**
|
|
462
|
-
*
|
|
462
|
+
* Optional execution policy hints attached to this model.
|
|
463
463
|
*/
|
|
464
|
-
|
|
464
|
+
executionPolicy?: TaskExecutionPolicy;
|
|
465
|
+
/**
|
|
466
|
+
* Stable model id.
|
|
467
|
+
*/
|
|
468
|
+
id: string;
|
|
465
469
|
/**
|
|
466
470
|
* Executor reference passed through config.
|
|
467
471
|
*
|
|
@@ -470,17 +474,13 @@ interface ModelDefinition {
|
|
|
470
474
|
*/
|
|
471
475
|
inferenceExecutor: unknown;
|
|
472
476
|
/**
|
|
473
|
-
*
|
|
474
|
-
*/
|
|
475
|
-
model: string;
|
|
476
|
-
/**
|
|
477
|
-
* Alias names that can resolve this model.
|
|
477
|
+
* Inference-executor id used for matching and reporting.
|
|
478
478
|
*/
|
|
479
|
-
|
|
479
|
+
inferenceExecutorId: string;
|
|
480
480
|
/**
|
|
481
|
-
*
|
|
481
|
+
* Concrete model name passed to the inference executor.
|
|
482
482
|
*/
|
|
483
|
-
|
|
483
|
+
model: string;
|
|
484
484
|
/**
|
|
485
485
|
* Optional model-level call parameters.
|
|
486
486
|
*/
|
|
@@ -495,6 +495,14 @@ interface ModelDefinition {
|
|
|
495
495
|
declare function resolveModelByName(models: readonly ModelDefinition[], name: string): ModelDefinition | undefined;
|
|
496
496
|
//#endregion
|
|
497
497
|
//#region src/core/runner/task-context.d.ts
|
|
498
|
+
/**
|
|
499
|
+
* Inputs used to build task execution context.
|
|
500
|
+
*/
|
|
501
|
+
interface CreateTaskExecutionContextOptions {
|
|
502
|
+
cache?: TaskCacheRuntime;
|
|
503
|
+
models: readonly ModelDefinition[];
|
|
504
|
+
task: ScheduledTask;
|
|
505
|
+
}
|
|
498
506
|
/**
|
|
499
507
|
* Task-scoped execution context exposed to runner executors.
|
|
500
508
|
*/
|
|
@@ -508,14 +516,6 @@ interface TaskExecutionContext {
|
|
|
508
516
|
*/
|
|
509
517
|
models: readonly ModelDefinition[];
|
|
510
518
|
}
|
|
511
|
-
/**
|
|
512
|
-
* Inputs used to build task execution context.
|
|
513
|
-
*/
|
|
514
|
-
interface CreateTaskExecutionContextOptions {
|
|
515
|
-
cache?: TaskCacheRuntime;
|
|
516
|
-
models: readonly ModelDefinition[];
|
|
517
|
-
task: ScheduledTask;
|
|
518
|
-
}
|
|
519
519
|
/**
|
|
520
520
|
* Creates task-scoped context data for runner execution.
|
|
521
521
|
*
|
|
@@ -528,20 +528,6 @@ interface CreateTaskExecutionContextOptions {
|
|
|
528
528
|
declare function createTaskExecutionContext(options: CreateTaskExecutionContextOptions): TaskExecutionContext;
|
|
529
529
|
//#endregion
|
|
530
530
|
//#region src/core/runner/run.d.ts
|
|
531
|
-
/**
|
|
532
|
-
* Executes one scheduled runner task and returns a normalized run result.
|
|
533
|
-
*
|
|
534
|
-
* Use when:
|
|
535
|
-
* - a scheduler already selected the task and execution context
|
|
536
|
-
* - the caller wants a typed executor contract for runner workers
|
|
537
|
-
*
|
|
538
|
-
* Expects:
|
|
539
|
-
* - the task context to be ready for model resolution and task-scoped work
|
|
540
|
-
*
|
|
541
|
-
* Returns:
|
|
542
|
-
* - a normalized run result with score entries ready for aggregation
|
|
543
|
-
*/
|
|
544
|
-
type ScheduledTaskExecutor = (task: ScheduledTask, context: TaskExecutionContext) => Promise<RunResult>;
|
|
545
531
|
/**
|
|
546
532
|
* Terminal task state reported by runner lifecycle hooks.
|
|
547
533
|
*
|
|
@@ -551,7 +537,7 @@ type ScheduledTaskExecutor = (task: ScheduledTask, context: TaskExecutionContext
|
|
|
551
537
|
* Expects:
|
|
552
538
|
* - hooks treat the value as final for the completed task
|
|
553
539
|
*/
|
|
554
|
-
type RunnerTaskState = '
|
|
540
|
+
type RunnerTaskState = 'failed' | 'passed';
|
|
555
541
|
/**
|
|
556
542
|
* Optional runner execution hooks used while processing scheduled tasks.
|
|
557
543
|
*
|
|
@@ -571,15 +557,11 @@ interface RunScheduledTasksOptions {
|
|
|
571
557
|
*/
|
|
572
558
|
createExecutionContext?: (task: ScheduledTask) => TaskExecutionContext;
|
|
573
559
|
/**
|
|
574
|
-
*
|
|
575
|
-
*
|
|
576
|
-
* Use when:
|
|
577
|
-
* - callers want to observe task activation before execution begins
|
|
560
|
+
* Maximum number of tasks to execute concurrently.
|
|
578
561
|
*
|
|
579
|
-
*
|
|
580
|
-
* - thrown errors abort the task before executor work starts
|
|
562
|
+
* @default 1
|
|
581
563
|
*/
|
|
582
|
-
|
|
564
|
+
maxConcurrency?: number;
|
|
583
565
|
/**
|
|
584
566
|
* Runs after the executor settles for a task.
|
|
585
567
|
*
|
|
@@ -592,12 +574,30 @@ interface RunScheduledTasksOptions {
|
|
|
592
574
|
*/
|
|
593
575
|
onTaskEnd?: (task: ScheduledTask, state: RunnerTaskState) => void;
|
|
594
576
|
/**
|
|
595
|
-
*
|
|
577
|
+
* Runs before the executor starts handling a task.
|
|
596
578
|
*
|
|
597
|
-
*
|
|
579
|
+
* Use when:
|
|
580
|
+
* - callers want to observe task activation before execution begins
|
|
581
|
+
*
|
|
582
|
+
* Expects:
|
|
583
|
+
* - thrown errors abort the task before executor work starts
|
|
598
584
|
*/
|
|
599
|
-
|
|
585
|
+
onTaskStart?: (task: ScheduledTask) => void;
|
|
600
586
|
}
|
|
587
|
+
/**
|
|
588
|
+
* Executes one scheduled runner task and returns a normalized run result.
|
|
589
|
+
*
|
|
590
|
+
* Use when:
|
|
591
|
+
* - a scheduler already selected the task and execution context
|
|
592
|
+
* - the caller wants a typed executor contract for runner workers
|
|
593
|
+
*
|
|
594
|
+
* Expects:
|
|
595
|
+
* - the task context to be ready for model resolution and task-scoped work
|
|
596
|
+
*
|
|
597
|
+
* Returns:
|
|
598
|
+
* - a normalized run result with score entries ready for aggregation
|
|
599
|
+
*/
|
|
600
|
+
type ScheduledTaskExecutor = (task: ScheduledTask, context: TaskExecutionContext) => Promise<RunResult>;
|
|
601
601
|
/**
|
|
602
602
|
* Error thrown when a scheduled run fails before producing a normalized result.
|
|
603
603
|
*/
|
|
@@ -633,10 +633,10 @@ declare class RunnerExecutionError extends Error {
|
|
|
633
633
|
declare function runScheduledTasks(tasks: readonly ScheduledTask[], executor: ScheduledTaskExecutor, options?: RunScheduledTasksOptions): Promise<AggregatedRunResults>;
|
|
634
634
|
//#endregion
|
|
635
635
|
//#region src/core/telemetry/types.d.ts
|
|
636
|
-
/** JSON-compatible scalar values accepted as telemetry attributes. */
|
|
637
|
-
type TelemetryAttributeValue = boolean | number | string | null | readonly TelemetryAttributeValue[];
|
|
638
636
|
/** Attribute map shared by local report projection and OpenTelemetry span calls. */
|
|
639
637
|
type TelemetryAttributes = Record<string, TelemetryAttributeValue | undefined>;
|
|
638
|
+
/** JSON-compatible scalar values accepted as telemetry attributes. */
|
|
639
|
+
type TelemetryAttributeValue = boolean | null | number | readonly TelemetryAttributeValue[] | string;
|
|
640
640
|
/**
|
|
641
641
|
* Internal Vieval telemetry runtime.
|
|
642
642
|
*
|
|
@@ -652,10 +652,10 @@ type TelemetryAttributes = Record<string, TelemetryAttributeValue | undefined>;
|
|
|
652
652
|
* - callback result, preserving thrown errors after telemetry records them
|
|
653
653
|
*/
|
|
654
654
|
interface TelemetryRuntime {
|
|
655
|
-
withSpan: <T>(name: string, attributes: TelemetryAttributes, callback: () => Promise<T>) => Promise<T>;
|
|
656
655
|
addEvent: (name: string, attributes?: TelemetryAttributes) => void;
|
|
657
|
-
setAttributes: (attributes: TelemetryAttributes) => void;
|
|
658
656
|
recordException: (error: unknown) => void;
|
|
657
|
+
setAttributes: (attributes: TelemetryAttributes) => void;
|
|
658
|
+
withSpan: <T>(name: string, attributes: TelemetryAttributes, callback: () => Promise<T>) => Promise<T>;
|
|
659
659
|
}
|
|
660
660
|
//#endregion
|
|
661
661
|
//#region src/config/types.d.ts
|
|
@@ -666,46 +666,94 @@ interface TelemetryRuntime {
|
|
|
666
666
|
*/
|
|
667
667
|
type Awaitable<T> = Promise<T> | T;
|
|
668
668
|
/**
|
|
669
|
-
*
|
|
670
|
-
*
|
|
671
|
-
* Use when:
|
|
672
|
-
* - defining axis values for canonical layered matrix config
|
|
673
|
-
* - preserving JSON-safe primitive values through config normalization
|
|
674
|
-
*
|
|
675
|
-
* Expects:
|
|
676
|
-
* - values remain serializable and comparable with stringified task ids
|
|
677
|
-
*
|
|
678
|
-
* Returns:
|
|
679
|
-
* - one JSON-friendly primitive matrix value
|
|
669
|
+
* OpenTelemetry reporting configuration managed by user config setup.
|
|
680
670
|
*/
|
|
681
|
-
|
|
671
|
+
interface CliOpenTelemetryReportingConfig {
|
|
672
|
+
/**
|
|
673
|
+
* Enables Vieval active span wrapping through `@opentelemetry/api`.
|
|
674
|
+
*
|
|
675
|
+
* @default false
|
|
676
|
+
*/
|
|
677
|
+
enabled?: boolean;
|
|
678
|
+
/**
|
|
679
|
+
* Called after all telemetry events and local report artifacts have been emitted.
|
|
680
|
+
*/
|
|
681
|
+
onRunEnd?: () => Awaitable<void>;
|
|
682
|
+
}
|
|
682
683
|
/**
|
|
683
|
-
*
|
|
684
|
-
*
|
|
685
|
-
* Use when:
|
|
686
|
-
* - declaring matrix axis values at the config boundary
|
|
687
|
-
*
|
|
688
|
-
* Expects:
|
|
689
|
-
* - values are normalized from config input without extra wrapping
|
|
690
|
-
*
|
|
691
|
-
* Returns:
|
|
692
|
-
* - a primitive cell value suitable for matrix expansion
|
|
684
|
+
* Reporting configuration for local artifacts and optional OpenTelemetry integration.
|
|
693
685
|
*/
|
|
694
|
-
|
|
686
|
+
interface CliReportingConfig {
|
|
687
|
+
/**
|
|
688
|
+
* Optional OpenTelemetry API integration.
|
|
689
|
+
*/
|
|
690
|
+
openTelemetry?: CliOpenTelemetryReportingConfig;
|
|
691
|
+
}
|
|
695
692
|
/**
|
|
696
|
-
*
|
|
697
|
-
*
|
|
698
|
-
* Use when:
|
|
699
|
-
* - storing the selected values for a resolved matrix row
|
|
700
|
-
* - passing task-level matrix context between layers
|
|
701
|
-
*
|
|
702
|
-
* Expects:
|
|
703
|
-
* - keys are axis names and values are resolved axis selections
|
|
704
|
-
*
|
|
705
|
-
* Returns:
|
|
706
|
-
* - one resolved row object
|
|
693
|
+
* Represents a normalized evaluation entry collected by the runner.
|
|
707
694
|
*/
|
|
708
|
-
type
|
|
695
|
+
type CollectedEvalEntry<TDefinition extends EvalDefinition = EvalDefinition> = TDefinition & {
|
|
696
|
+
directory: string;
|
|
697
|
+
filePath: string;
|
|
698
|
+
id: string;
|
|
699
|
+
};
|
|
700
|
+
/**
|
|
701
|
+
* Declares the metadata required for a single vieval evaluation module.
|
|
702
|
+
*/
|
|
703
|
+
interface EvalDefinition {
|
|
704
|
+
description: string;
|
|
705
|
+
/**
|
|
706
|
+
* Optional matrix layering for this eval definition.
|
|
707
|
+
*
|
|
708
|
+
* Use when:
|
|
709
|
+
* - one eval file needs control-group variants that differ from project defaults
|
|
710
|
+
*
|
|
711
|
+
* @example
|
|
712
|
+
* ```ts
|
|
713
|
+
* matrix: {
|
|
714
|
+
* runMatrix: {
|
|
715
|
+
* extend: {
|
|
716
|
+
* promptStyle: ['concise'],
|
|
717
|
+
* },
|
|
718
|
+
* override: {
|
|
719
|
+
* scenario: ['eval-scenario'],
|
|
720
|
+
* },
|
|
721
|
+
* },
|
|
722
|
+
* evalMatrix: {
|
|
723
|
+
* override: {
|
|
724
|
+
* rubric: ['strict'],
|
|
725
|
+
* },
|
|
726
|
+
* },
|
|
727
|
+
* }
|
|
728
|
+
* ```
|
|
729
|
+
*
|
|
730
|
+
* Context impact:
|
|
731
|
+
*
|
|
732
|
+
* ```txt
|
|
733
|
+
* project.runMatrix + eval.matrix.runMatrix + task.matrix.runMatrix
|
|
734
|
+
* => context.task.matrix.run
|
|
735
|
+
*
|
|
736
|
+
* project.evalMatrix + eval.matrix.evalMatrix + task.matrix.evalMatrix
|
|
737
|
+
* => context.task.matrix.eval
|
|
738
|
+
* ```
|
|
739
|
+
*/
|
|
740
|
+
matrix?: ScopedMatrices;
|
|
741
|
+
name: string;
|
|
742
|
+
/**
|
|
743
|
+
* Optional task implementation executed by runner.
|
|
744
|
+
*/
|
|
745
|
+
task?: TaskDefinition;
|
|
746
|
+
}
|
|
747
|
+
/**
|
|
748
|
+
* Describes the shape of an imported vieval evaluation module.
|
|
749
|
+
*/
|
|
750
|
+
interface EvalModule<TDefinition extends EvalDefinition = EvalDefinition> {
|
|
751
|
+
default: TDefinition;
|
|
752
|
+
}
|
|
753
|
+
/**
|
|
754
|
+
* Maps module URLs to their loaded vieval evaluation modules.
|
|
755
|
+
*/
|
|
756
|
+
type EvalModuleMap = Record<string, EvalModule>;
|
|
709
757
|
/**
|
|
710
758
|
* Canonical axis value list for one matrix definition.
|
|
711
759
|
*
|
|
@@ -767,6 +815,15 @@ type MatrixDefinition = Record<string, MatrixAxisValues>;
|
|
|
767
815
|
* ```
|
|
768
816
|
*/
|
|
769
817
|
interface MatrixLayer {
|
|
818
|
+
/**
|
|
819
|
+
* Matrix axes disabled at this layer.
|
|
820
|
+
*
|
|
821
|
+
* @example
|
|
822
|
+
* ```ts
|
|
823
|
+
* disable: ['temperatureProfile']
|
|
824
|
+
* ```
|
|
825
|
+
*/
|
|
826
|
+
disable?: readonly string[];
|
|
770
827
|
/**
|
|
771
828
|
* Matrix axes inherited or appended at this layer.
|
|
772
829
|
*
|
|
@@ -790,16 +847,48 @@ interface MatrixLayer {
|
|
|
790
847
|
* ```
|
|
791
848
|
*/
|
|
792
849
|
override?: MatrixDefinition;
|
|
793
|
-
/**
|
|
794
|
-
* Matrix axes disabled at this layer.
|
|
795
|
-
*
|
|
796
|
-
* @example
|
|
797
|
-
* ```ts
|
|
798
|
-
* disable: ['temperatureProfile']
|
|
799
|
-
* ```
|
|
800
|
-
*/
|
|
801
|
-
disable?: readonly string[];
|
|
802
850
|
}
|
|
851
|
+
/**
|
|
852
|
+
* Primitive value allowed in one matrix cell.
|
|
853
|
+
*
|
|
854
|
+
* Use when:
|
|
855
|
+
* - defining axis values for canonical layered matrix config
|
|
856
|
+
* - preserving JSON-safe primitive values through config normalization
|
|
857
|
+
*
|
|
858
|
+
* Expects:
|
|
859
|
+
* - values remain serializable and comparable with stringified task ids
|
|
860
|
+
*
|
|
861
|
+
* Returns:
|
|
862
|
+
* - one JSON-friendly primitive matrix value
|
|
863
|
+
*/
|
|
864
|
+
type MatrixPrimitive = boolean | number | string;
|
|
865
|
+
/**
|
|
866
|
+
* Canonical row payload for one matrix combination.
|
|
867
|
+
*
|
|
868
|
+
* Use when:
|
|
869
|
+
* - storing the selected values for a resolved matrix row
|
|
870
|
+
* - passing task-level matrix context between layers
|
|
871
|
+
*
|
|
872
|
+
* Expects:
|
|
873
|
+
* - keys are axis names and values are resolved axis selections
|
|
874
|
+
*
|
|
875
|
+
* Returns:
|
|
876
|
+
* - one resolved row object
|
|
877
|
+
*/
|
|
878
|
+
type MatrixRow = Record<string, MatrixValue>;
|
|
879
|
+
/**
|
|
880
|
+
* Canonical matrix value type.
|
|
881
|
+
*
|
|
882
|
+
* Use when:
|
|
883
|
+
* - declaring matrix axis values at the config boundary
|
|
884
|
+
*
|
|
885
|
+
* Expects:
|
|
886
|
+
* - values are normalized from config input without extra wrapping
|
|
887
|
+
*
|
|
888
|
+
* Returns:
|
|
889
|
+
* - a primitive cell value suitable for matrix expansion
|
|
890
|
+
*/
|
|
891
|
+
type MatrixValue = MatrixPrimitive;
|
|
803
892
|
/**
|
|
804
893
|
* Canonical run/eval matrix grouping.
|
|
805
894
|
*
|
|
@@ -845,88 +934,111 @@ interface MatrixLayer {
|
|
|
845
934
|
*/
|
|
846
935
|
interface ScopedMatrices {
|
|
847
936
|
/**
|
|
848
|
-
*
|
|
937
|
+
* Eval-time matrix scope.
|
|
849
938
|
*
|
|
850
939
|
* @example
|
|
851
940
|
* ```ts
|
|
852
|
-
*
|
|
853
|
-
*
|
|
854
|
-
*
|
|
941
|
+
* evalMatrix: {
|
|
942
|
+
* override: {
|
|
943
|
+
* rubric: ['strict'],
|
|
855
944
|
* },
|
|
856
945
|
* }
|
|
857
946
|
* ```
|
|
858
947
|
*/
|
|
859
|
-
|
|
948
|
+
evalMatrix?: MatrixLayer;
|
|
860
949
|
/**
|
|
861
|
-
*
|
|
950
|
+
* Runtime matrix scope.
|
|
862
951
|
*
|
|
863
952
|
* @example
|
|
864
953
|
* ```ts
|
|
865
|
-
*
|
|
866
|
-
*
|
|
867
|
-
*
|
|
954
|
+
* runMatrix: {
|
|
955
|
+
* extend: {
|
|
956
|
+
* promptLanguage: ['en', 'zh'],
|
|
868
957
|
* },
|
|
869
958
|
* }
|
|
870
959
|
* ```
|
|
871
960
|
*/
|
|
872
|
-
|
|
873
|
-
}
|
|
874
|
-
/**
|
|
875
|
-
* Output of one eval task execution.
|
|
876
|
-
*/
|
|
877
|
-
interface TaskRunOutput {
|
|
878
|
-
/**
|
|
879
|
-
* Scores emitted by this task run.
|
|
880
|
-
*/
|
|
881
|
-
scores: readonly RunScore[];
|
|
961
|
+
runMatrix?: MatrixLayer;
|
|
882
962
|
}
|
|
883
963
|
/**
|
|
884
964
|
* Delay policy for retries within one task case attempt.
|
|
885
965
|
*
|
|
886
966
|
* @param retryIndex Retry number where `1` is the first retry after the initial failure.
|
|
887
967
|
*/
|
|
888
|
-
type TaskAutoRetryDelay =
|
|
968
|
+
type TaskAutoRetryDelay = ((retryIndex: number) => number) | number;
|
|
889
969
|
/**
|
|
890
|
-
*
|
|
970
|
+
* Payload emitted when a task case ends.
|
|
891
971
|
*
|
|
892
972
|
* Use when:
|
|
893
|
-
* -
|
|
894
|
-
* - failures should retry within the current attempt or trigger a later full task attempt
|
|
973
|
+
* - reporter hooks need the case position plus terminal state
|
|
895
974
|
*
|
|
896
975
|
* Expects:
|
|
897
|
-
* - `
|
|
898
|
-
* - `
|
|
976
|
+
* - `name` is the declared DSL case label
|
|
977
|
+
* - `index` is the zero-based case position within the task
|
|
978
|
+
* - `total` is the total number of registered cases
|
|
979
|
+
* - `state` describes the final case result
|
|
980
|
+
*/
|
|
981
|
+
interface TaskCaseReporterEndPayload extends TaskCaseReporterPayload {
|
|
982
|
+
/**
|
|
983
|
+
* Optional failure message when `state` is `failed`.
|
|
984
|
+
*/
|
|
985
|
+
errorMessage?: string;
|
|
986
|
+
/**
|
|
987
|
+
* Optional case output returned by the task case callback.
|
|
988
|
+
*/
|
|
989
|
+
output?: unknown;
|
|
990
|
+
/**
|
|
991
|
+
* Final case state.
|
|
992
|
+
*/
|
|
993
|
+
state: TaskCaseState;
|
|
994
|
+
}
|
|
995
|
+
/**
|
|
996
|
+
* Payload emitted when a task case starts.
|
|
899
997
|
*
|
|
900
|
-
*
|
|
901
|
-
* - one
|
|
998
|
+
* Use when:
|
|
999
|
+
* - reporter hooks need a stable position for one case within the task
|
|
1000
|
+
*
|
|
1001
|
+
* Expects:
|
|
1002
|
+
* - `name` is the declared DSL case label
|
|
1003
|
+
* - `index` is the zero-based case position within the task
|
|
1004
|
+
* - `total` is the total number of registered cases
|
|
902
1005
|
*/
|
|
903
|
-
interface
|
|
1006
|
+
interface TaskCaseReporterPayload {
|
|
904
1007
|
/**
|
|
905
|
-
*
|
|
906
|
-
*
|
|
907
|
-
* @default 0
|
|
1008
|
+
* Maximum retry count configured for this case.
|
|
908
1009
|
*/
|
|
909
1010
|
autoRetry?: number;
|
|
910
1011
|
/**
|
|
911
|
-
*
|
|
912
|
-
*
|
|
913
|
-
* A number applies the same delay to every retry. A function receives the
|
|
914
|
-
* retry index where `1` is the first retry after the initial failure.
|
|
915
|
-
*
|
|
916
|
-
* @default retryIndex => 500 * 2 ** (retryIndex - 1)
|
|
1012
|
+
* Zero-based case position within the task.
|
|
917
1013
|
*/
|
|
918
|
-
|
|
1014
|
+
index: number;
|
|
919
1015
|
/**
|
|
920
|
-
*
|
|
921
|
-
*
|
|
922
|
-
* @default 0
|
|
1016
|
+
* Optional case input payload registered by the task DSL.
|
|
923
1017
|
*/
|
|
924
|
-
|
|
1018
|
+
input?: unknown;
|
|
925
1019
|
/**
|
|
926
|
-
*
|
|
1020
|
+
* Declared case label.
|
|
927
1021
|
*/
|
|
928
|
-
|
|
1022
|
+
name: string;
|
|
1023
|
+
/**
|
|
1024
|
+
* Current retry attempt index, where `0` is the first try.
|
|
1025
|
+
*/
|
|
1026
|
+
retryIndex?: number;
|
|
1027
|
+
/**
|
|
1028
|
+
* Total number of registered cases.
|
|
1029
|
+
*/
|
|
1030
|
+
total: number;
|
|
929
1031
|
}
|
|
1032
|
+
/**
|
|
1033
|
+
* Allowed terminal outcomes for one task case.
|
|
1034
|
+
*
|
|
1035
|
+
* Use when:
|
|
1036
|
+
* - emitting case lifecycle events from the task DSL
|
|
1037
|
+
*
|
|
1038
|
+
* Expects:
|
|
1039
|
+
* - consumers treat the value as the final state for the case
|
|
1040
|
+
*/
|
|
1041
|
+
type TaskCaseState = 'failed' | 'passed' | 'timeout';
|
|
930
1042
|
/**
|
|
931
1043
|
* Task-local concurrency metadata.
|
|
932
1044
|
*
|
|
@@ -951,194 +1063,124 @@ interface TaskConcurrencyConfig {
|
|
|
951
1063
|
case?: number;
|
|
952
1064
|
}
|
|
953
1065
|
/**
|
|
954
|
-
*
|
|
1066
|
+
* Eval task definition used by `defineTask`.
|
|
955
1067
|
*/
|
|
956
|
-
interface
|
|
1068
|
+
interface TaskDefinition {
|
|
957
1069
|
/**
|
|
958
|
-
* Optional
|
|
1070
|
+
* Optional task-local concurrency metadata.
|
|
1071
|
+
*
|
|
1072
|
+
* Use when:
|
|
1073
|
+
* - task declarations need to preserve task-scoped attempt/case caps for later scheduler wiring
|
|
1074
|
+
* - higher-level orchestration wants to inspect task-local concurrency without executing the task
|
|
1075
|
+
*
|
|
1076
|
+
* Expects:
|
|
1077
|
+
* - each provided value to be a positive integer chosen by the caller
|
|
1078
|
+
*
|
|
1079
|
+
* Returns:
|
|
1080
|
+
* - one partial task-local concurrency descriptor
|
|
959
1081
|
*/
|
|
960
|
-
|
|
961
|
-
}
|
|
962
|
-
/**
|
|
963
|
-
* OpenTelemetry reporting configuration managed by user config setup.
|
|
964
|
-
*/
|
|
965
|
-
interface CliOpenTelemetryReportingConfig {
|
|
1082
|
+
concurrency?: TaskConcurrencyConfig;
|
|
966
1083
|
/**
|
|
967
|
-
*
|
|
1084
|
+
* Optional task-local execution policy.
|
|
1085
|
+
*/
|
|
1086
|
+
executionPolicy?: TaskExecutionPolicy;
|
|
1087
|
+
/**
|
|
1088
|
+
* Stable task id for diagnostics.
|
|
1089
|
+
*/
|
|
1090
|
+
id: string;
|
|
1091
|
+
/**
|
|
1092
|
+
* Optional matrix layering for this task definition.
|
|
968
1093
|
*
|
|
969
|
-
*
|
|
1094
|
+
* Use when:
|
|
1095
|
+
* - task-local experiments should refine project/eval defaults
|
|
1096
|
+
*
|
|
1097
|
+
* @example
|
|
1098
|
+
* ```ts
|
|
1099
|
+
* matrix: {
|
|
1100
|
+
* runMatrix: {
|
|
1101
|
+
* override: {
|
|
1102
|
+
* model: ['gpt-4.1-mini'],
|
|
1103
|
+
* },
|
|
1104
|
+
* },
|
|
1105
|
+
* evalMatrix: {
|
|
1106
|
+
* extend: {
|
|
1107
|
+
* evaluator: ['default-judge'],
|
|
1108
|
+
* },
|
|
1109
|
+
* },
|
|
1110
|
+
* }
|
|
1111
|
+
* ```
|
|
970
1112
|
*/
|
|
971
|
-
|
|
1113
|
+
matrix?: ScopedMatrices;
|
|
972
1114
|
/**
|
|
973
|
-
*
|
|
1115
|
+
* Executes one scheduled eval task.
|
|
974
1116
|
*/
|
|
975
|
-
|
|
1117
|
+
run: (context: TaskRunContext) => Promise<TaskRunOutput> | TaskRunOutput;
|
|
976
1118
|
}
|
|
977
1119
|
/**
|
|
978
|
-
*
|
|
1120
|
+
* Execution policy applied to task and case callbacks.
|
|
1121
|
+
*
|
|
1122
|
+
* Use when:
|
|
1123
|
+
* - one task or case should time out after a bounded duration
|
|
1124
|
+
* - failures should retry within the current attempt or trigger a later full task attempt
|
|
1125
|
+
*
|
|
1126
|
+
* Expects:
|
|
1127
|
+
* - `timeout` to be a positive integer when provided
|
|
1128
|
+
* - `autoRetry` and `autoAttempt` to be non-negative integers when provided
|
|
1129
|
+
*
|
|
1130
|
+
* Returns:
|
|
1131
|
+
* - one partial execution policy descriptor
|
|
979
1132
|
*/
|
|
980
|
-
interface
|
|
1133
|
+
interface TaskExecutionPolicy {
|
|
981
1134
|
/**
|
|
982
|
-
*
|
|
1135
|
+
* Additional full task attempts allowed after the current attempt settles.
|
|
983
1136
|
*
|
|
984
|
-
*
|
|
985
|
-
* - benchmark setup needs deterministic artifact reuse across attempts
|
|
986
|
-
* - case-level logic needs typed text/json/binary cache loaders
|
|
1137
|
+
* @default 0
|
|
987
1138
|
*/
|
|
988
|
-
|
|
1139
|
+
autoAttempt?: number;
|
|
989
1140
|
/**
|
|
990
|
-
*
|
|
1141
|
+
* Additional retries allowed within the current attempt.
|
|
991
1142
|
*
|
|
992
|
-
*
|
|
1143
|
+
* @default 0
|
|
1144
|
+
*/
|
|
1145
|
+
autoRetry?: number;
|
|
1146
|
+
/**
|
|
1147
|
+
* Delay in milliseconds before a case auto retry starts.
|
|
993
1148
|
*
|
|
994
|
-
*
|
|
995
|
-
*
|
|
996
|
-
* -> scheduler expands run rows x eval rows
|
|
997
|
-
* -> one scheduled task per row pair
|
|
998
|
-
* -> context.task.matrix = {
|
|
999
|
-
* run: selected run-axis values,
|
|
1000
|
-
* eval: selected eval-axis values,
|
|
1001
|
-
* meta: { runRowId, evalRowId }
|
|
1002
|
-
* }
|
|
1003
|
-
* ```
|
|
1149
|
+
* A number applies the same delay to every retry. A function receives the
|
|
1150
|
+
* retry index where `1` is the first retry after the initial failure.
|
|
1004
1151
|
*
|
|
1005
|
-
*
|
|
1006
|
-
* - `runMatrix` axes appear under `context.task.matrix.run.*`
|
|
1007
|
-
* - `evalMatrix` axes appear under `context.task.matrix.eval.*`
|
|
1008
|
-
* - row ids are stable labels for grouping/aggregation under `context.task.matrix.meta.*`
|
|
1009
|
-
*
|
|
1010
|
-
* @example
|
|
1011
|
-
* ```ts
|
|
1012
|
-
* // If final selected rows are:
|
|
1013
|
-
* // run: { model: 'gpt-4.1-mini', scenario: 'stress', promptLanguage: 'zh' }
|
|
1014
|
-
* // eval: { rubric: 'strict', rubricModel: 'judge-large' }
|
|
1015
|
-
*
|
|
1016
|
-
* context.task.matrix.run.model // 'gpt-4.1-mini'
|
|
1017
|
-
* context.task.matrix.run.scenario // 'stress'
|
|
1018
|
-
* context.task.matrix.eval.rubric // 'strict'
|
|
1019
|
-
* context.task.matrix.meta.runRowId // stable encoded row id
|
|
1020
|
-
* ```
|
|
1021
|
-
*/
|
|
1022
|
-
task: ScheduledTask;
|
|
1023
|
-
/**
|
|
1024
|
-
* Configured model registrations available to model plugins.
|
|
1025
|
-
*
|
|
1026
|
-
* Use when:
|
|
1027
|
-
* - a plugin owns model selection semantics and needs access to registered models
|
|
1028
|
-
* - eval code resolves matrix-selected model axes through plugin helpers
|
|
1029
|
-
*/
|
|
1030
|
-
models: TaskExecutionContext['models'];
|
|
1031
|
-
/**
|
|
1032
|
-
* Optional reporter lifecycle hooks for task-local case events.
|
|
1033
|
-
*
|
|
1034
|
-
* Use when:
|
|
1035
|
-
* - a caller wants visibility into each case without coupling to the CLI reporter layer
|
|
1036
|
-
*
|
|
1037
|
-
* Expects:
|
|
1038
|
-
* - hooks are best-effort observers and should not affect task scoring
|
|
1039
|
-
*/
|
|
1040
|
-
reporterHooks?: TaskReporterHooks;
|
|
1041
|
-
/**
|
|
1042
|
-
* Optional telemetry runtime shared by runner, DSL, and reporter integrations.
|
|
1043
|
-
*
|
|
1044
|
-
* Use when:
|
|
1045
|
-
* - task execution should emit events to the currently active telemetry runtime
|
|
1046
|
-
* - enabled and disabled telemetry should keep the same execution path
|
|
1047
|
-
*
|
|
1048
|
-
* Expects:
|
|
1049
|
-
* - callers inject a no-op runtime when telemetry is disabled
|
|
1050
|
-
*/
|
|
1051
|
-
telemetry?: TelemetryRuntime;
|
|
1052
|
-
/**
|
|
1053
|
-
* Optional runtime scheduling overrides supplied by CLI or host execution.
|
|
1054
|
-
*
|
|
1055
|
-
* Use when:
|
|
1056
|
-
* - run operators need to override task/case concurrency without editing eval code
|
|
1057
|
-
* - DSL task runners need to distinguish runtime flags from code defaults
|
|
1058
|
-
*
|
|
1059
|
-
* Expects:
|
|
1060
|
-
* - values are positive integers when provided
|
|
1061
|
-
*
|
|
1062
|
-
* @default undefined
|
|
1063
|
-
*/
|
|
1064
|
-
runtimeConcurrency?: TaskConcurrencyConfig;
|
|
1065
|
-
/**
|
|
1066
|
-
* Cooperative abort signal for the current execution.
|
|
1067
|
-
*/
|
|
1068
|
-
signal?: AbortSignal;
|
|
1069
|
-
}
|
|
1070
|
-
/**
|
|
1071
|
-
* Allowed terminal outcomes for one task case.
|
|
1072
|
-
*
|
|
1073
|
-
* Use when:
|
|
1074
|
-
* - emitting case lifecycle events from the task DSL
|
|
1075
|
-
*
|
|
1076
|
-
* Expects:
|
|
1077
|
-
* - consumers treat the value as the final state for the case
|
|
1078
|
-
*/
|
|
1079
|
-
type TaskCaseState = 'passed' | 'failed' | 'timeout';
|
|
1080
|
-
/**
|
|
1081
|
-
* Payload emitted when a task case starts.
|
|
1082
|
-
*
|
|
1083
|
-
* Use when:
|
|
1084
|
-
* - reporter hooks need a stable position for one case within the task
|
|
1085
|
-
*
|
|
1086
|
-
* Expects:
|
|
1087
|
-
* - `name` is the declared DSL case label
|
|
1088
|
-
* - `index` is the zero-based case position within the task
|
|
1089
|
-
* - `total` is the total number of registered cases
|
|
1090
|
-
*/
|
|
1091
|
-
interface TaskCaseReporterPayload {
|
|
1092
|
-
/**
|
|
1093
|
-
* Maximum retry count configured for this case.
|
|
1094
|
-
*/
|
|
1095
|
-
autoRetry?: number;
|
|
1096
|
-
/**
|
|
1097
|
-
* Optional case input payload registered by the task DSL.
|
|
1098
|
-
*/
|
|
1099
|
-
input?: unknown;
|
|
1100
|
-
/**
|
|
1101
|
-
* Declared case label.
|
|
1102
|
-
*/
|
|
1103
|
-
name: string;
|
|
1104
|
-
/**
|
|
1105
|
-
* Current retry attempt index, where `0` is the first try.
|
|
1106
|
-
*/
|
|
1107
|
-
retryIndex?: number;
|
|
1108
|
-
/**
|
|
1109
|
-
* Zero-based case position within the task.
|
|
1152
|
+
* @default retryIndex => 500 * 2 ** (retryIndex - 1)
|
|
1110
1153
|
*/
|
|
1111
|
-
|
|
1154
|
+
autoRetryDelay?: TaskAutoRetryDelay;
|
|
1112
1155
|
/**
|
|
1113
|
-
*
|
|
1156
|
+
* Timeout in milliseconds for one case execution.
|
|
1114
1157
|
*/
|
|
1115
|
-
|
|
1158
|
+
timeout?: number;
|
|
1116
1159
|
}
|
|
1117
1160
|
/**
|
|
1118
|
-
* Payload emitted
|
|
1161
|
+
* Payload emitted by task code for custom report events.
|
|
1119
1162
|
*
|
|
1120
1163
|
* Use when:
|
|
1121
|
-
* -
|
|
1164
|
+
* - reporting runtime telemetry such as inference requests, responses, or tool calls
|
|
1165
|
+
* - attaching modality-specific metrics without coupling task logic to CLI internals
|
|
1122
1166
|
*
|
|
1123
1167
|
* Expects:
|
|
1124
|
-
* - `
|
|
1125
|
-
* - `
|
|
1126
|
-
* - `total` is the total number of registered cases
|
|
1127
|
-
* - `state` describes the final case result
|
|
1168
|
+
* - `event` to be a stable event name
|
|
1169
|
+
* - `data` to be JSON-serializable for report artifact persistence
|
|
1128
1170
|
*/
|
|
1129
|
-
interface
|
|
1171
|
+
interface TaskReporterEventPayload {
|
|
1130
1172
|
/**
|
|
1131
|
-
* Optional case
|
|
1173
|
+
* Optional stable case id when the event maps to one case lifecycle.
|
|
1132
1174
|
*/
|
|
1133
|
-
|
|
1175
|
+
caseId?: string;
|
|
1134
1176
|
/**
|
|
1135
|
-
*
|
|
1177
|
+
* Optional custom payload persisted under event `data`.
|
|
1136
1178
|
*/
|
|
1137
|
-
|
|
1179
|
+
data?: unknown;
|
|
1138
1180
|
/**
|
|
1139
|
-
*
|
|
1181
|
+
* Event name written into report event envelopes.
|
|
1140
1182
|
*/
|
|
1141
|
-
|
|
1183
|
+
event: string;
|
|
1142
1184
|
}
|
|
1143
1185
|
/**
|
|
1144
1186
|
* Reporter hooks invoked around each task case execution.
|
|
@@ -1151,14 +1193,14 @@ interface TaskCaseReporterEndPayload extends TaskCaseReporterPayload {
|
|
|
1151
1193
|
* - hooks observe case start/end events but do not influence scoring
|
|
1152
1194
|
*/
|
|
1153
1195
|
interface TaskReporterHooks {
|
|
1154
|
-
/**
|
|
1155
|
-
* Runs when a case is about to execute.
|
|
1156
|
-
*/
|
|
1157
|
-
onCaseStart?: (payload: TaskCaseReporterPayload) => void;
|
|
1158
1196
|
/**
|
|
1159
1197
|
* Runs after a case settles.
|
|
1160
1198
|
*/
|
|
1161
1199
|
onCaseEnd?: (payload: TaskCaseReporterEndPayload) => void;
|
|
1200
|
+
/**
|
|
1201
|
+
* Runs when a case is about to execute.
|
|
1202
|
+
*/
|
|
1203
|
+
onCaseStart?: (payload: TaskCaseReporterPayload) => void;
|
|
1162
1204
|
/**
|
|
1163
1205
|
* Runs when task code emits a custom telemetry/reporting event.
|
|
1164
1206
|
*
|
|
@@ -1169,149 +1211,107 @@ interface TaskReporterHooks {
|
|
|
1169
1211
|
onEvent?: (payload: TaskReporterEventPayload) => void;
|
|
1170
1212
|
}
|
|
1171
1213
|
/**
|
|
1172
|
-
*
|
|
1173
|
-
*
|
|
1174
|
-
* Use when:
|
|
1175
|
-
* - reporting runtime telemetry such as inference requests, responses, or tool calls
|
|
1176
|
-
* - attaching modality-specific metrics without coupling task logic to CLI internals
|
|
1177
|
-
*
|
|
1178
|
-
* Expects:
|
|
1179
|
-
* - `event` to be a stable event name
|
|
1180
|
-
* - `data` to be JSON-serializable for report artifact persistence
|
|
1214
|
+
* Runtime context passed into eval task `run`.
|
|
1181
1215
|
*/
|
|
1182
|
-
interface
|
|
1183
|
-
/**
|
|
1184
|
-
* Event name written into report event envelopes.
|
|
1185
|
-
*/
|
|
1186
|
-
event: string;
|
|
1187
|
-
/**
|
|
1188
|
-
* Optional custom payload persisted under event `data`.
|
|
1189
|
-
*/
|
|
1190
|
-
data?: unknown;
|
|
1216
|
+
interface TaskRunContext {
|
|
1191
1217
|
/**
|
|
1192
|
-
*
|
|
1218
|
+
* Task-scoped cache runtime.
|
|
1219
|
+
*
|
|
1220
|
+
* Use when:
|
|
1221
|
+
* - benchmark setup needs deterministic artifact reuse across attempts
|
|
1222
|
+
* - case-level logic needs typed text/json/binary cache loaders
|
|
1193
1223
|
*/
|
|
1194
|
-
|
|
1195
|
-
}
|
|
1196
|
-
/**
|
|
1197
|
-
* Eval task definition used by `defineTask`.
|
|
1198
|
-
*/
|
|
1199
|
-
interface TaskDefinition {
|
|
1224
|
+
cache: TaskExecutionContext['cache'];
|
|
1200
1225
|
/**
|
|
1201
|
-
*
|
|
1226
|
+
* Configured model registrations available to model plugins.
|
|
1227
|
+
*
|
|
1228
|
+
* Use when:
|
|
1229
|
+
* - a plugin owns model selection semantics and needs access to registered models
|
|
1230
|
+
* - eval code resolves matrix-selected model axes through plugin helpers
|
|
1202
1231
|
*/
|
|
1203
|
-
|
|
1232
|
+
models: TaskExecutionContext['models'];
|
|
1204
1233
|
/**
|
|
1205
|
-
* Optional task-local
|
|
1234
|
+
* Optional reporter lifecycle hooks for task-local case events.
|
|
1206
1235
|
*
|
|
1207
1236
|
* Use when:
|
|
1208
|
-
* -
|
|
1209
|
-
* - higher-level orchestration wants to inspect task-local concurrency without executing the task
|
|
1237
|
+
* - a caller wants visibility into each case without coupling to the CLI reporter layer
|
|
1210
1238
|
*
|
|
1211
1239
|
* Expects:
|
|
1212
|
-
* -
|
|
1213
|
-
*
|
|
1214
|
-
* Returns:
|
|
1215
|
-
* - one partial task-local concurrency descriptor
|
|
1216
|
-
*/
|
|
1217
|
-
concurrency?: TaskConcurrencyConfig;
|
|
1218
|
-
/**
|
|
1219
|
-
* Optional task-local execution policy.
|
|
1240
|
+
* - hooks are best-effort observers and should not affect task scoring
|
|
1220
1241
|
*/
|
|
1221
|
-
|
|
1242
|
+
reporterHooks?: TaskReporterHooks;
|
|
1222
1243
|
/**
|
|
1223
|
-
* Optional
|
|
1244
|
+
* Optional runtime scheduling overrides supplied by CLI or host execution.
|
|
1224
1245
|
*
|
|
1225
1246
|
* Use when:
|
|
1226
|
-
* - task
|
|
1247
|
+
* - run operators need to override task/case concurrency without editing eval code
|
|
1248
|
+
* - DSL task runners need to distinguish runtime flags from code defaults
|
|
1227
1249
|
*
|
|
1228
|
-
*
|
|
1229
|
-
*
|
|
1230
|
-
*
|
|
1231
|
-
*
|
|
1232
|
-
* override: {
|
|
1233
|
-
* model: ['gpt-4.1-mini'],
|
|
1234
|
-
* },
|
|
1235
|
-
* },
|
|
1236
|
-
* evalMatrix: {
|
|
1237
|
-
* extend: {
|
|
1238
|
-
* evaluator: ['default-judge'],
|
|
1239
|
-
* },
|
|
1240
|
-
* },
|
|
1241
|
-
* }
|
|
1242
|
-
* ```
|
|
1250
|
+
* Expects:
|
|
1251
|
+
* - values are positive integers when provided
|
|
1252
|
+
*
|
|
1253
|
+
* @default undefined
|
|
1243
1254
|
*/
|
|
1244
|
-
|
|
1255
|
+
runtimeConcurrency?: TaskConcurrencyConfig;
|
|
1245
1256
|
/**
|
|
1246
|
-
*
|
|
1257
|
+
* Cooperative abort signal for the current execution.
|
|
1247
1258
|
*/
|
|
1248
|
-
|
|
1249
|
-
}
|
|
1250
|
-
/**
|
|
1251
|
-
* Declares the metadata required for a single vieval evaluation module.
|
|
1252
|
-
*/
|
|
1253
|
-
interface EvalDefinition {
|
|
1254
|
-
description: string;
|
|
1255
|
-
name: string;
|
|
1259
|
+
signal?: AbortSignal;
|
|
1256
1260
|
/**
|
|
1257
|
-
*
|
|
1261
|
+
* Scheduled runner task metadata.
|
|
1258
1262
|
*
|
|
1259
|
-
*
|
|
1260
|
-
* - one eval file needs control-group variants that differ from project defaults
|
|
1263
|
+
* Matrix impact on runtime context:
|
|
1261
1264
|
*
|
|
1262
|
-
*
|
|
1263
|
-
*
|
|
1264
|
-
*
|
|
1265
|
-
*
|
|
1266
|
-
*
|
|
1267
|
-
*
|
|
1268
|
-
*
|
|
1269
|
-
*
|
|
1270
|
-
*
|
|
1271
|
-
* },
|
|
1272
|
-
* },
|
|
1273
|
-
* evalMatrix: {
|
|
1274
|
-
* override: {
|
|
1275
|
-
* rubric: ['strict'],
|
|
1276
|
-
* },
|
|
1277
|
-
* },
|
|
1278
|
-
* }
|
|
1265
|
+
* ```txt
|
|
1266
|
+
* project/eval/task matrix definitions
|
|
1267
|
+
* -> scheduler expands run rows x eval rows
|
|
1268
|
+
* -> one scheduled task per row pair
|
|
1269
|
+
* -> context.task.matrix = {
|
|
1270
|
+
* run: selected run-axis values,
|
|
1271
|
+
* eval: selected eval-axis values,
|
|
1272
|
+
* meta: { runRowId, evalRowId }
|
|
1273
|
+
* }
|
|
1279
1274
|
* ```
|
|
1280
1275
|
*
|
|
1281
|
-
*
|
|
1276
|
+
* Practical impact:
|
|
1277
|
+
* - `runMatrix` axes appear under `context.task.matrix.run.*`
|
|
1278
|
+
* - `evalMatrix` axes appear under `context.task.matrix.eval.*`
|
|
1279
|
+
* - row ids are stable labels for grouping/aggregation under `context.task.matrix.meta.*`
|
|
1282
1280
|
*
|
|
1283
|
-
*
|
|
1284
|
-
*
|
|
1285
|
-
*
|
|
1281
|
+
* @example
|
|
1282
|
+
* ```ts
|
|
1283
|
+
* // If final selected rows are:
|
|
1284
|
+
* // run: { model: 'gpt-4.1-mini', scenario: 'stress', promptLanguage: 'zh' }
|
|
1285
|
+
* // eval: { rubric: 'strict', rubricModel: 'judge-large' }
|
|
1286
1286
|
*
|
|
1287
|
-
*
|
|
1288
|
-
*
|
|
1287
|
+
* context.task.matrix.run.model // 'gpt-4.1-mini'
|
|
1288
|
+
* context.task.matrix.run.scenario // 'stress'
|
|
1289
|
+
* context.task.matrix.eval.rubric // 'strict'
|
|
1290
|
+
* context.task.matrix.meta.runRowId // stable encoded row id
|
|
1289
1291
|
* ```
|
|
1290
1292
|
*/
|
|
1291
|
-
|
|
1293
|
+
task: ScheduledTask;
|
|
1292
1294
|
/**
|
|
1293
|
-
* Optional
|
|
1295
|
+
* Optional telemetry runtime shared by runner, DSL, and reporter integrations.
|
|
1296
|
+
*
|
|
1297
|
+
* Use when:
|
|
1298
|
+
* - task execution should emit events to the currently active telemetry runtime
|
|
1299
|
+
* - enabled and disabled telemetry should keep the same execution path
|
|
1300
|
+
*
|
|
1301
|
+
* Expects:
|
|
1302
|
+
* - callers inject a no-op runtime when telemetry is disabled
|
|
1294
1303
|
*/
|
|
1295
|
-
|
|
1304
|
+
telemetry?: TelemetryRuntime;
|
|
1296
1305
|
}
|
|
1297
1306
|
/**
|
|
1298
|
-
*
|
|
1307
|
+
* Output of one eval task execution.
|
|
1299
1308
|
*/
|
|
1300
|
-
interface
|
|
1301
|
-
|
|
1309
|
+
interface TaskRunOutput {
|
|
1310
|
+
/**
|
|
1311
|
+
* Scores emitted by this task run.
|
|
1312
|
+
*/
|
|
1313
|
+
scores: readonly RunScore[];
|
|
1302
1314
|
}
|
|
1303
|
-
/**
|
|
1304
|
-
* Maps module URLs to their loaded vieval evaluation modules.
|
|
1305
|
-
*/
|
|
1306
|
-
type EvalModuleMap = Record<string, EvalModule>;
|
|
1307
|
-
/**
|
|
1308
|
-
* Represents a normalized evaluation entry collected by the runner.
|
|
1309
|
-
*/
|
|
1310
|
-
type CollectedEvalEntry<TDefinition extends EvalDefinition = EvalDefinition> = TDefinition & {
|
|
1311
|
-
directory: string;
|
|
1312
|
-
filePath: string;
|
|
1313
|
-
id: string;
|
|
1314
|
-
};
|
|
1315
1315
|
//#endregion
|
|
1316
1316
|
//#region src/config/define.d.ts
|
|
1317
1317
|
/**
|
|
@@ -1339,19 +1339,19 @@ declare function defineTask<const TDefinition extends TaskDefinition>(definition
|
|
|
1339
1339
|
* - a typed plugin shape bound to one config object
|
|
1340
1340
|
*/
|
|
1341
1341
|
interface ConfigHookPlugin<TConfig> {
|
|
1342
|
-
/**
|
|
1343
|
-
* Stable plugin name for diagnostics.
|
|
1344
|
-
*/
|
|
1345
|
-
name: string;
|
|
1346
1342
|
/**
|
|
1347
1343
|
* Optional config transform hook.
|
|
1348
1344
|
*/
|
|
1349
|
-
configVieval?: (config: TConfig) => TConfig | void |
|
|
1345
|
+
configVieval?: (config: TConfig) => Promise<TConfig | void> | TConfig | void;
|
|
1350
1346
|
/**
|
|
1351
1347
|
* Optional hook after config is finalized.
|
|
1352
1348
|
*/
|
|
1353
|
-
configVievalResolved?: (config: TConfig) => void |
|
|
1349
|
+
configVievalResolved?: (config: TConfig) => Promise<void> | void;
|
|
1350
|
+
/**
|
|
1351
|
+
* Stable plugin name for diagnostics.
|
|
1352
|
+
*/
|
|
1353
|
+
name: string;
|
|
1354
1354
|
}
|
|
1355
1355
|
//#endregion
|
|
1356
1356
|
export { InferenceExecutor as $, RunScheduledTasksOptions as A, asProjectRelativePath as B, TaskDefinition as C, TaskRunContext as D, TaskReporterHooks as E, CreateTaskExecutionContextOptions as F, AggregatedProviderSummary as G, CreateVievalRunnerRuntimeContextOptions as H, TaskExecutionContext as I, RunResult as J, AggregatedRunResults as K, createTaskExecutionContext as L, RunnerTaskState as M, ScheduledTaskExecutor as N, TaskRunOutput as O, runScheduledTasks as P, CreateRunnerScheduleOptions as Q, ModelDefinition as R, TaskConcurrencyConfig as S, TaskReporterEventPayload as T, RunnerRuntimeContext as U, collectEvalEntries as V, createRunnerRuntimeContext as W, RunScoreKind as X, RunScore as Y, aggregateRunResults as Z, ScopedMatrices as _, CliOpenTelemetryReportingConfig as a, ScheduledTaskMatrixMeta as at, TaskCaseReporterPayload as b, EvalDefinition as c, createFilesystemTaskCacheRuntime as ct, MatrixAxisValues as d, CacheFileOptions as dt, RunnerMatrixDefinition as et, MatrixDefinition as f, CacheNamespace as ft, MatrixValue as g, MatrixRow as h, Awaitable as i, ScheduledTaskMatrix as it, RunnerExecutionError as j, TelemetryAttributeValue as k, EvalModule as l, normalizeCacheFilePathSegments as lt, MatrixPrimitive as m, defineEval as n, RunnerMatrixSelection as nt, CliReportingConfig as o, createRunnerSchedule as ot, MatrixLayer as p, TaskCacheRuntime as pt, AggregatedRunSummary as q, defineTask as r, ScheduledTask as rt, CollectedEvalEntry as s, CreateFilesystemTaskCacheRuntimeOptions as st, ConfigHookPlugin as t, RunnerMatrixInput as tt, EvalModuleMap as u, CacheFileHandle as ut, TaskAutoRetryDelay as v, TaskExecutionPolicy as w, TaskCaseState as x, TaskCaseReporterEndPayload as y, resolveModelByName as z };
|
|
1357
|
-
//# sourceMappingURL=index-
|
|
1357
|
+
//# sourceMappingURL=index-BLIlhiWT.d.mts.map
|