vieval 0.0.11 → 0.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/README.md +31 -31
  2. package/dist/bin/vieval.mjs +1 -1
  3. package/dist/cli/index.d.mts +1 -1
  4. package/dist/cli/index.mjs +1 -1
  5. package/dist/{cli-CHFCF8UR.mjs → cli-uzS81IPd.mjs} +1529 -1529
  6. package/dist/cli-uzS81IPd.mjs.map +1 -0
  7. package/dist/config.d.mts +1 -1
  8. package/dist/core/assertions/index.d.mts +156 -156
  9. package/dist/core/assertions/index.mjs +82 -82
  10. package/dist/core/assertions/index.mjs.map +1 -1
  11. package/dist/core/inference-executors/index.d.mts +37 -37
  12. package/dist/core/inference-executors/index.mjs +53 -52
  13. package/dist/core/inference-executors/index.mjs.map +1 -1
  14. package/dist/core/processors/results/index.d.mts +18 -18
  15. package/dist/core/processors/results/index.mjs.map +1 -1
  16. package/dist/core/runner/index.d.mts +2 -2
  17. package/dist/core/runner/index.mjs +258 -258
  18. package/dist/core/runner/index.mjs.map +1 -1
  19. package/dist/core/scheduler/index.d.mts +1 -1
  20. package/dist/core/scheduler/index.mjs +64 -64
  21. package/dist/core/scheduler/index.mjs.map +1 -1
  22. package/dist/{env-bRH0K6fU.d.mts → env-Br6jaWGL.d.mts} +9 -9
  23. package/dist/{env-BVYeJhGA.mjs → env-egxaJtNn.mjs} +8 -8
  24. package/dist/env-egxaJtNn.mjs.map +1 -0
  25. package/dist/{expect-extensions-Mf1sMNBv.mjs → expect-extensions-BKdEPt3h.mjs} +46 -46
  26. package/dist/expect-extensions-BKdEPt3h.mjs.map +1 -0
  27. package/dist/expect.mjs +1 -1
  28. package/dist/{index-CwKBlCG9.d.mts → index-BLIlhiWT.d.mts} +565 -565
  29. package/dist/{index-Be5I1ZJL.d.mts → index-CIaJClcC.d.mts} +48 -48
  30. package/dist/index.d.mts +207 -195
  31. package/dist/index.mjs +147 -147
  32. package/dist/index.mjs.map +1 -1
  33. package/dist/models-CaCOUPZw.mjs.map +1 -1
  34. package/dist/plugins/chat-models/index.d.mts +279 -279
  35. package/dist/plugins/chat-models/index.mjs +359 -359
  36. package/dist/plugins/chat-models/index.mjs.map +1 -1
  37. package/dist/{registry-BSyjwZFx.mjs → registry-BK7k6X81.mjs} +293 -293
  38. package/dist/registry-BK7k6X81.mjs.map +1 -0
  39. package/dist/testing/expect-extensions.d.mts +27 -27
  40. package/dist/testing/expect-extensions.mjs +1 -1
  41. package/package.json +3 -3
  42. package/dist/cli-CHFCF8UR.mjs.map +0 -1
  43. package/dist/env-BVYeJhGA.mjs.map +0 -1
  44. package/dist/expect-extensions-Mf1sMNBv.mjs.map +0 -1
  45. package/dist/registry-BSyjwZFx.mjs.map +0 -1
package/dist/config.d.mts CHANGED
@@ -1,2 +1,2 @@
1
- import { C as TaskDefinition, D as TaskRunContext, E as TaskReporterHooks, O as TaskRunOutput, R as ModelDefinition, S as TaskConcurrencyConfig, T as TaskReporterEventPayload, _ as ScopedMatrices, a as CliOpenTelemetryReportingConfig, b as TaskCaseReporterPayload, c as EvalDefinition, d as MatrixAxisValues, f as MatrixDefinition, g as MatrixValue, h as MatrixRow, i as Awaitable, l as EvalModule, m as MatrixPrimitive, n as defineEval, o as CliReportingConfig, p as MatrixLayer, r as defineTask, s as CollectedEvalEntry, t as ConfigHookPlugin, u as EvalModuleMap, v as TaskAutoRetryDelay, w as TaskExecutionPolicy, x as TaskCaseState, y as TaskCaseReporterEndPayload, z as resolveModelByName } from "./index-CwKBlCG9.mjs";
1
+ import { C as TaskDefinition, D as TaskRunContext, E as TaskReporterHooks, O as TaskRunOutput, R as ModelDefinition, S as TaskConcurrencyConfig, T as TaskReporterEventPayload, _ as ScopedMatrices, a as CliOpenTelemetryReportingConfig, b as TaskCaseReporterPayload, c as EvalDefinition, d as MatrixAxisValues, f as MatrixDefinition, g as MatrixValue, h as MatrixRow, i as Awaitable, l as EvalModule, m as MatrixPrimitive, n as defineEval, o as CliReportingConfig, p as MatrixLayer, r as defineTask, s as CollectedEvalEntry, t as ConfigHookPlugin, u as EvalModuleMap, v as TaskAutoRetryDelay, w as TaskExecutionPolicy, x as TaskCaseState, y as TaskCaseReporterEndPayload, z as resolveModelByName } from "./index-BLIlhiWT.mjs";
2
2
  export { Awaitable, CliOpenTelemetryReportingConfig, CliReportingConfig, CollectedEvalEntry, ConfigHookPlugin, EvalDefinition, EvalModule, EvalModuleMap, MatrixAxisValues, MatrixDefinition, MatrixLayer, MatrixPrimitive, MatrixRow, MatrixValue, ModelDefinition, ScopedMatrices, TaskAutoRetryDelay, TaskCaseReporterEndPayload, TaskCaseReporterPayload, TaskCaseState, TaskConcurrencyConfig, TaskDefinition, TaskExecutionPolicy, TaskReporterEventPayload, TaskReporterHooks, TaskRunContext, TaskRunOutput, defineEval, defineTask, resolveModelByName };
@@ -1,47 +1,30 @@
1
- import { X as RunScoreKind, Y as RunScore } from "../../index-CwKBlCG9.mjs";
1
+ import { X as RunScoreKind, Y as RunScore } from "../../index-BLIlhiWT.mjs";
2
2
 
3
3
  //#region src/core/assertions/index.d.ts
4
4
  /**
5
- * Stores mutable evaluation state for stateful assertion flows.
6
- *
7
- * Use when:
8
- * - assertions need to share counters, rolling metrics, or memoized values
9
- * - a scenario evaluates multiple steps and expects state-aware checks
10
- */
11
- type AssertionState = Map<string, unknown>;
12
- /**
13
- * Represents one tool call emitted by a model response.
5
+ * Async assertion function used by eval scenarios.
14
6
  */
15
- interface ToolCall {
16
- /**
17
- * Tool name used by the call.
18
- */
19
- name: string;
20
- /**
21
- * Tool arguments payload.
22
- */
23
- args: unknown;
24
- }
7
+ type Assertion = (context: AssertionContext) => Promise<AssertionOutcome>;
25
8
  /**
26
9
  * Normalized assertion context for one model output.
27
10
  */
28
11
  interface AssertionContext {
29
12
  /**
30
- * Plain text model output used by text assertions.
13
+ * Shared mutable state for stateful assertion measurement.
31
14
  */
32
- text: string;
15
+ state: AssertionState;
33
16
  /**
34
17
  * Optional structured output parsed from the model response.
35
18
  */
36
19
  structuredOutput?: unknown;
37
20
  /**
38
- * Optional tool calls extracted from the model response.
21
+ * Plain text model output used by text assertions.
39
22
  */
40
- toolCalls?: readonly ToolCall[];
23
+ text: string;
41
24
  /**
42
- * Shared mutable state for stateful assertion measurement.
25
+ * Optional tool calls extracted from the model response.
43
26
  */
44
- state: AssertionState;
27
+ toolCalls?: readonly ToolCall[];
45
28
  }
46
29
  /**
47
30
  * Result for one assertion evaluation.
@@ -51,70 +34,66 @@ interface AssertionOutcome {
51
34
  * Stable assertion id.
52
35
  */
53
36
  id: string;
54
- /**
55
- * Assertion family emitted as run score kind.
56
- */
57
- scoreKind: RunScoreKind;
58
37
  /**
59
38
  * Whether the assertion passed.
60
39
  */
61
40
  pass: boolean;
41
+ /**
42
+ * Human-readable reason for logs and reports.
43
+ */
44
+ reason: string;
62
45
  /**
63
46
  * Normalized score in the `0..1` range.
64
47
  */
65
48
  score: number;
66
49
  /**
67
- * Human-readable reason for logs and reports.
50
+ * Assertion family emitted as run score kind.
68
51
  */
69
- reason: string;
52
+ scoreKind: RunScoreKind;
70
53
  }
71
54
  /**
72
- * Async assertion function used by eval scenarios.
73
- */
74
- type Assertion = (context: AssertionContext) => Promise<AssertionOutcome>;
75
- /**
76
- * Normalizes text for matching.
55
+ * Stores mutable evaluation state for stateful assertion flows.
77
56
  *
78
- * Before: `" Hello\nWorld "`
79
- * After: `"hello world"`
57
+ * Use when:
58
+ * - assertions need to share counters, rolling metrics, or memoized values
59
+ * - a scenario evaluates multiple steps and expects state-aware checks
80
60
  */
81
- declare function normalizeMatchText(value: string, caseSensitive: boolean): string;
61
+ type AssertionState = Map<string, unknown>;
82
62
  /**
83
- * Options for include-keyword assertions.
63
+ * Options for custom assertions.
84
64
  */
85
- interface MustIncludeAssertionOptions {
65
+ interface CustomAssertionOptions {
86
66
  /**
87
- * Stable assertion id.
67
+ * Custom evaluator callback.
88
68
  */
89
- id: string;
69
+ evaluate: (context: AssertionContext) => Promise<{
70
+ pass: boolean;
71
+ reason: string;
72
+ score: number;
73
+ }> | {
74
+ pass: boolean;
75
+ reason: string;
76
+ score: number;
77
+ };
90
78
  /**
91
- * Keywords that must be present.
79
+ * Stable assertion id.
92
80
  */
93
- keywords: readonly string[];
81
+ id: string;
94
82
  /**
95
- * Match mode for keywords.
96
- *
97
- * @default 'all'
83
+ * Score family emitted by this custom assertion.
98
84
  */
99
- mode?: 'all' | 'any';
85
+ scoreKind: RunScoreKind;
86
+ }
87
+ /**
88
+ * Options for exclude-keyword assertions.
89
+ */
90
+ interface MustExcludeAssertionOptions {
100
91
  /**
101
92
  * Case-sensitive matching toggle.
102
93
  *
103
94
  * @default false
104
95
  */
105
96
  caseSensitive?: boolean;
106
- }
107
- /**
108
- * Creates an assertion that requires specific keywords in model text.
109
- *
110
- * Example:
111
- * `expectMustInclude({ id: 'tone', keywords: ['calm', 'move'] })`
112
- */
113
- declare function expectMustInclude(options: MustIncludeAssertionOptions): Assertion;
114
- /**
115
- * Options for exclude-keyword assertions.
116
- */
117
- interface MustExcludeAssertionOptions {
118
97
  /**
119
98
  * Stable assertion id.
120
99
  */
@@ -123,156 +102,144 @@ interface MustExcludeAssertionOptions {
123
102
  * Keywords that must not appear.
124
103
  */
125
104
  keywords: readonly string[];
105
+ }
106
+ /**
107
+ * Options for include-keyword assertions.
108
+ */
109
+ interface MustIncludeAssertionOptions {
126
110
  /**
127
111
  * Case-sensitive matching toggle.
128
112
  *
129
113
  * @default false
130
114
  */
131
115
  caseSensitive?: boolean;
132
- }
133
- /**
134
- * Creates an assertion that forbids specific keywords.
135
- *
136
- * Example:
137
- * `expectMustExclude({ id: 'no-engine-dump', keywords: ['bestmove', 'ponder'] })`
138
- */
139
- declare function expectMustExclude(options: MustExcludeAssertionOptions): Assertion;
140
- /**
141
- * Options for regular-expression assertions.
142
- */
143
- interface RegexAssertionOptions {
144
116
  /**
145
117
  * Stable assertion id.
146
118
  */
147
119
  id: string;
148
120
  /**
149
- * Pattern to apply to model text.
121
+ * Keywords that must be present.
150
122
  */
151
- pattern: RegExp;
123
+ keywords: readonly string[];
124
+ /**
125
+ * Match mode for keywords.
126
+ *
127
+ * @default 'all'
128
+ */
129
+ mode?: 'all' | 'any';
152
130
  }
153
131
  /**
154
- * Creates an assertion based on a regular expression.
155
- *
156
- * Example:
157
- * `expectRegex({ id: 'starts-with-act', pattern: /^<\|ACT:/ })`
158
- */
159
- declare function expectRegex(options: RegexAssertionOptions): Assertion;
160
- /**
161
- * Options for structured-output assertions.
132
+ * Options for regular-expression assertions.
162
133
  */
163
- interface StructuredOutputAssertionOptions<TValue> {
134
+ interface RegexAssertionOptions {
164
135
  /**
165
136
  * Stable assertion id.
166
137
  */
167
138
  id: string;
168
139
  /**
169
- * Runtime validator for structured output.
170
- */
171
- validate: (value: unknown) => value is TValue;
172
- /**
173
- * Optional failure reason.
140
+ * Pattern to apply to model text.
174
141
  */
175
- failureReason?: string;
142
+ pattern: RegExp;
176
143
  }
177
144
  /**
178
- * Creates an assertion for structured model output.
179
- *
180
- * Example:
181
- * `expectStructuredOutput({ id: 'json-shape', validate: isMySchema })`
182
- */
183
- declare function expectStructuredOutput<TValue>(options: StructuredOutputAssertionOptions<TValue>): Assertion;
184
- /**
185
- * Options for tool-call argument assertions.
145
+ * Options for rubric assertions.
186
146
  */
187
- interface ToolCallArgsAssertionOptions {
147
+ interface RubricAssertionOptions {
188
148
  /**
189
149
  * Stable assertion id.
190
150
  */
191
151
  id: string;
192
152
  /**
193
- * Tool name to inspect.
153
+ * Async rubric judge callback.
194
154
  */
195
- toolName: string;
155
+ judge: (context: AssertionContext) => Promise<RubricJudgeResult>;
196
156
  /**
197
- * Runtime validator for tool arguments.
157
+ * Minimum passing score.
158
+ *
159
+ * @default 0.7
198
160
  */
199
- validate: (args: unknown) => boolean;
161
+ minScore?: number;
200
162
  }
201
- /**
202
- * Creates an assertion for validating tool-call arguments.
203
- *
204
- * Example:
205
- * `expectToolCallArgs({ id: 'spark-command-shape', toolName: 'builtIn_sparkCommand', validate: isSparkArgs })`
206
- */
207
- declare function expectToolCallArgs(options: ToolCallArgsAssertionOptions): Assertion;
208
163
  /**
209
164
  * Rubric judge result returned by teacher-model or rubric logic.
210
165
  */
211
166
  interface RubricJudgeResult {
212
167
  /**
213
- * Normalized score in the `0..1` range.
168
+ * Optional judge model id.
214
169
  */
215
- score: number;
170
+ judgeModel?: string;
216
171
  /**
217
172
  * Judge explanation text.
218
173
  */
219
174
  reason: string;
220
175
  /**
221
- * Optional judge model id.
176
+ * Normalized score in the `0..1` range.
222
177
  */
223
- judgeModel?: string;
178
+ score: number;
224
179
  }
225
180
  /**
226
- * Options for rubric assertions.
181
+ * Options for structured-output assertions.
227
182
  */
228
- interface RubricAssertionOptions {
183
+ interface StructuredOutputAssertionOptions<TValue> {
229
184
  /**
230
- * Stable assertion id.
185
+ * Optional failure reason.
231
186
  */
232
- id: string;
187
+ failureReason?: string;
233
188
  /**
234
- * Async rubric judge callback.
189
+ * Stable assertion id.
235
190
  */
236
- judge: (context: AssertionContext) => Promise<RubricJudgeResult>;
191
+ id: string;
237
192
  /**
238
- * Minimum passing score.
239
- *
240
- * @default 0.7
193
+ * Runtime validator for structured output.
241
194
  */
242
- minScore?: number;
195
+ validate: (value: unknown) => value is TValue;
243
196
  }
244
197
  /**
245
- * Creates a rubric assertion driven by teacher-model style scoring.
246
- *
247
- * Example:
248
- * `expectRubric({ id: 'human-like-tone', judge: judgeFn, minScore: 0.8 })`
198
+ * Represents one tool call emitted by a model response.
249
199
  */
250
- declare function expectRubric(options: RubricAssertionOptions): Assertion;
200
+ interface ToolCall {
201
+ /**
202
+ * Tool arguments payload.
203
+ */
204
+ args: unknown;
205
+ /**
206
+ * Tool name used by the call.
207
+ */
208
+ name: string;
209
+ }
251
210
  /**
252
- * Options for custom assertions.
211
+ * Options for tool-call argument assertions.
253
212
  */
254
- interface CustomAssertionOptions {
213
+ interface ToolCallArgsAssertionOptions {
255
214
  /**
256
215
  * Stable assertion id.
257
216
  */
258
217
  id: string;
259
218
  /**
260
- * Score family emitted by this custom assertion.
219
+ * Tool name to inspect.
261
220
  */
262
- scoreKind: RunScoreKind;
221
+ toolName: string;
263
222
  /**
264
- * Custom evaluator callback.
223
+ * Runtime validator for tool arguments.
265
224
  */
266
- evaluate: (context: AssertionContext) => Promise<{
267
- pass: boolean;
268
- reason: string;
269
- score: number;
270
- }> | {
271
- pass: boolean;
272
- reason: string;
273
- score: number;
274
- };
225
+ validate: (args: unknown) => boolean;
275
226
  }
227
+ /**
228
+ * Returns failing assertion outcomes in original order.
229
+ */
230
+ declare function collectFailedAssertions(outcomes: readonly AssertionOutcome[]): AssertionOutcome[];
231
+ /**
232
+ * Executes assertion list and returns all outcomes.
233
+ *
234
+ * Call stack:
235
+ *
236
+ * {@link evaluateAssertions}
237
+ * -> `assertion(context)`
238
+ * -> {@link AssertionOutcome}[]
239
+ */
240
+ declare function evaluateAssertions(assertions: readonly Assertion[], context: Omit<AssertionContext, 'state'> & {
241
+ state?: AssertionState;
242
+ }): Promise<AssertionOutcome[]>;
276
243
  /**
277
244
  * Creates a custom assertion with fully user-defined logic.
278
245
  *
@@ -280,6 +247,20 @@ interface CustomAssertionOptions {
280
247
  * `expectCustom({ id: 'stateful-window', scoreKind: 'exact', evaluate: (ctx) => ... })`
281
248
  */
282
249
  declare function expectCustom(options: CustomAssertionOptions): Assertion;
250
+ /**
251
+ * Creates an assertion that forbids specific keywords.
252
+ *
253
+ * Example:
254
+ * `expectMustExclude({ id: 'no-engine-dump', keywords: ['bestmove', 'ponder'] })`
255
+ */
256
+ declare function expectMustExclude(options: MustExcludeAssertionOptions): Assertion;
257
+ /**
258
+ * Creates an assertion that requires specific keywords in model text.
259
+ *
260
+ * Example:
261
+ * `expectMustInclude({ id: 'tone', keywords: ['calm', 'move'] })`
262
+ */
263
+ declare function expectMustInclude(options: MustIncludeAssertionOptions): Assertion;
283
264
  /**
284
265
  * Creates an inverse assertion.
285
266
  *
@@ -290,25 +271,44 @@ declare function expectNot(assertion: Assertion, options: {
290
271
  id: string;
291
272
  }): Assertion;
292
273
  /**
293
- * Executes assertion list and returns all outcomes.
274
+ * Creates an assertion based on a regular expression.
294
275
  *
295
- * Call stack:
276
+ * Example:
277
+ * `expectRegex({ id: 'starts-with-act', pattern: /^<\|ACT:/ })`
278
+ */
279
+ declare function expectRegex(options: RegexAssertionOptions): Assertion;
280
+ /**
281
+ * Creates a rubric assertion driven by teacher-model style scoring.
296
282
  *
297
- * {@link evaluateAssertions}
298
- * -> `assertion(context)`
299
- * -> {@link AssertionOutcome}[]
283
+ * Example:
284
+ * `expectRubric({ id: 'human-like-tone', judge: judgeFn, minScore: 0.8 })`
300
285
  */
301
- declare function evaluateAssertions(assertions: readonly Assertion[], context: Omit<AssertionContext, 'state'> & {
302
- state?: AssertionState;
303
- }): Promise<AssertionOutcome[]>;
286
+ declare function expectRubric(options: RubricAssertionOptions): Assertion;
304
287
  /**
305
- * Converts assertion outcomes to run-score tuples consumed by aggregation.
288
+ * Creates an assertion for structured model output.
289
+ *
290
+ * Example:
291
+ * `expectStructuredOutput({ id: 'json-shape', validate: isMySchema })`
306
292
  */
307
- declare function toRunScores(outcomes: readonly AssertionOutcome[]): RunScore[];
293
+ declare function expectStructuredOutput<TValue>(options: StructuredOutputAssertionOptions<TValue>): Assertion;
308
294
  /**
309
- * Returns failing assertion outcomes in original order.
295
+ * Creates an assertion for validating tool-call arguments.
296
+ *
297
+ * Example:
298
+ * `expectToolCallArgs({ id: 'spark-command-shape', toolName: 'builtIn_sparkCommand', validate: isSparkArgs })`
310
299
  */
311
- declare function collectFailedAssertions(outcomes: readonly AssertionOutcome[]): AssertionOutcome[];
300
+ declare function expectToolCallArgs(options: ToolCallArgsAssertionOptions): Assertion;
301
+ /**
302
+ * Normalizes text for matching.
303
+ *
304
+ * Before: `" Hello\nWorld "`
305
+ * After: `"hello world"`
306
+ */
307
+ declare function normalizeMatchText(value: string, caseSensitive: boolean): string;
308
+ /**
309
+ * Converts assertion outcomes to run-score tuples consumed by aggregation.
310
+ */
311
+ declare function toRunScores(outcomes: readonly AssertionOutcome[]): RunScore[];
312
312
  //#endregion
313
313
  export { Assertion, AssertionContext, AssertionOutcome, AssertionState, CustomAssertionOptions, MustExcludeAssertionOptions, MustIncludeAssertionOptions, RegexAssertionOptions, RubricAssertionOptions, RubricJudgeResult, StructuredOutputAssertionOptions, ToolCall, ToolCallArgsAssertionOptions, collectFailedAssertions, evaluateAssertions, expectCustom, expectMustExclude, expectMustInclude, expectNot, expectRegex, expectRubric, expectStructuredOutput, expectToolCallArgs, normalizeMatchText, toRunScores };
314
314
  //# sourceMappingURL=index.d.mts.map