vieval 0.0.10 → 0.0.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +31 -31
- package/dist/bin/vieval.mjs +1 -1
- package/dist/bin/vieval.mjs.map +1 -1
- package/dist/cli/index.d.mts +1 -1
- package/dist/cli/index.mjs +1 -1
- package/dist/{cli-DTDgaqeI.mjs → cli-uzS81IPd.mjs} +1483 -1483
- package/dist/cli-uzS81IPd.mjs.map +1 -0
- package/dist/config.d.mts +1 -1
- package/dist/config.mjs +1 -1
- package/dist/config.mjs.map +1 -1
- package/dist/core/assertions/index.d.mts +156 -156
- package/dist/core/assertions/index.mjs +82 -82
- package/dist/core/assertions/index.mjs.map +1 -1
- package/dist/core/inference-executors/index.d.mts +37 -37
- package/dist/core/inference-executors/index.mjs +54 -53
- package/dist/core/inference-executors/index.mjs.map +1 -1
- package/dist/core/processors/results/index.d.mts +18 -18
- package/dist/core/processors/results/index.mjs.map +1 -1
- package/dist/core/runner/index.d.mts +2 -2
- package/dist/core/runner/index.mjs +259 -259
- package/dist/core/runner/index.mjs.map +1 -1
- package/dist/core/scheduler/index.d.mts +1 -1
- package/dist/core/scheduler/index.mjs +65 -65
- package/dist/core/scheduler/index.mjs.map +1 -1
- package/dist/{env-DfWZy_n4.d.mts → env-Br6jaWGL.d.mts} +9 -9
- package/dist/{env-nV5rVErX.mjs → env-egxaJtNn.mjs} +8 -8
- package/dist/env-egxaJtNn.mjs.map +1 -0
- package/dist/{expect-extensions-DCSqlneN.mjs → expect-extensions-BKdEPt3h.mjs} +46 -46
- package/dist/expect-extensions-BKdEPt3h.mjs.map +1 -0
- package/dist/expect.d.mts +1 -3
- package/dist/expect.mjs +1 -1
- package/dist/expect.mjs.map +1 -1
- package/dist/{index-D_aMeWqO.d.mts → index-BLIlhiWT.d.mts} +565 -565
- package/dist/{index-Bg0atWBF.d.mts → index-CIaJClcC.d.mts} +48 -48
- package/dist/index.d.mts +208 -197
- package/dist/index.mjs +148 -148
- package/dist/index.mjs.map +1 -1
- package/dist/{models-pBSRUZhY.mjs → models-CaCOUPZw.mjs} +1 -1
- package/dist/{models-pBSRUZhY.mjs.map → models-CaCOUPZw.mjs.map} +1 -1
- package/dist/plugins/chat-models/index.d.mts +279 -279
- package/dist/plugins/chat-models/index.mjs +360 -360
- package/dist/plugins/chat-models/index.mjs.map +1 -1
- package/dist/{queue-DsZQkZO_.mjs → queue-BL86z2W_.mjs} +1 -1
- package/dist/{queue-DsZQkZO_.mjs.map → queue-BL86z2W_.mjs.map} +1 -1
- package/dist/{registry-DMnwE_mY.mjs → registry-BK7k6X81.mjs} +294 -294
- package/dist/registry-BK7k6X81.mjs.map +1 -0
- package/dist/testing/expect-extensions.d.mts +27 -27
- package/dist/testing/expect-extensions.mjs +1 -1
- package/package.json +12 -12
- package/dist/cli-DTDgaqeI.mjs.map +0 -1
- package/dist/env-nV5rVErX.mjs.map +0 -1
- package/dist/expect-extensions-DCSqlneN.mjs.map +0 -1
- package/dist/registry-DMnwE_mY.mjs.map +0 -1
package/dist/config.d.mts
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { C as TaskDefinition, D as TaskRunContext, E as TaskReporterHooks, O as TaskRunOutput, R as ModelDefinition, S as TaskConcurrencyConfig, T as TaskReporterEventPayload, _ as ScopedMatrices, a as CliOpenTelemetryReportingConfig, b as TaskCaseReporterPayload, c as EvalDefinition, d as MatrixAxisValues, f as MatrixDefinition, g as MatrixValue, h as MatrixRow, i as Awaitable, l as EvalModule, m as MatrixPrimitive, n as defineEval, o as CliReportingConfig, p as MatrixLayer, r as defineTask, s as CollectedEvalEntry, t as ConfigHookPlugin, u as EvalModuleMap, v as TaskAutoRetryDelay, w as TaskExecutionPolicy, x as TaskCaseState, y as TaskCaseReporterEndPayload, z as resolveModelByName } from "./index-D_aMeWqO.mjs";
|
|
1
|
+
import { C as TaskDefinition, D as TaskRunContext, E as TaskReporterHooks, O as TaskRunOutput, R as ModelDefinition, S as TaskConcurrencyConfig, T as TaskReporterEventPayload, _ as ScopedMatrices, a as CliOpenTelemetryReportingConfig, b as TaskCaseReporterPayload, c as EvalDefinition, d as MatrixAxisValues, f as MatrixDefinition, g as MatrixValue, h as MatrixRow, i as Awaitable, l as EvalModule, m as MatrixPrimitive, n as defineEval, o as CliReportingConfig, p as MatrixLayer, r as defineTask, s as CollectedEvalEntry, t as ConfigHookPlugin, u as EvalModuleMap, v as TaskAutoRetryDelay, w as TaskExecutionPolicy, x as TaskCaseState, y as TaskCaseReporterEndPayload, z as resolveModelByName } from "./index-BLIlhiWT.mjs";
|
|
2
2
|
export { Awaitable, CliOpenTelemetryReportingConfig, CliReportingConfig, CollectedEvalEntry, ConfigHookPlugin, EvalDefinition, EvalModule, EvalModuleMap, MatrixAxisValues, MatrixDefinition, MatrixLayer, MatrixPrimitive, MatrixRow, MatrixValue, ModelDefinition, ScopedMatrices, TaskAutoRetryDelay, TaskCaseReporterEndPayload, TaskCaseReporterPayload, TaskCaseState, TaskConcurrencyConfig, TaskDefinition, TaskExecutionPolicy, TaskReporterEventPayload, TaskReporterHooks, TaskRunContext, TaskRunOutput, defineEval, defineTask, resolveModelByName };
|
package/dist/config.mjs
CHANGED
package/dist/config.mjs.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"config.mjs","names":[],"sources":["../src/config/define.ts"],"sourcesContent":["import type { EvalDefinition, TaskDefinition } from './types'\n\n/**\n * Returns the provided vieval definition while preserving literal field types.\n */\nexport function defineEval<const TDefinition extends EvalDefinition>(definition: TDefinition): TDefinition {\n return definition\n}\n\n/**\n * Returns the provided task definition while preserving literal field types.\n */\nexport function defineTask<const TDefinition extends TaskDefinition>(definition: TDefinition): TDefinition {\n return definition\n}\n"],"mappings":";;;;;AAKA,SAAgB,WAAqD,YAAsC;
|
|
1
|
+
{"version":3,"file":"config.mjs","names":[],"sources":["../src/config/define.ts"],"sourcesContent":["import type { EvalDefinition, TaskDefinition } from './types'\n\n/**\n * Returns the provided vieval definition while preserving literal field types.\n */\nexport function defineEval<const TDefinition extends EvalDefinition>(definition: TDefinition): TDefinition {\n return definition\n}\n\n/**\n * Returns the provided task definition while preserving literal field types.\n */\nexport function defineTask<const TDefinition extends TaskDefinition>(definition: TDefinition): TDefinition {\n return definition\n}\n"],"mappings":";;;;;AAKA,SAAgB,WAAqD,YAAsC;CACzG,OAAO;AACT;;;;AAKA,SAAgB,WAAqD,YAAsC;CACzG,OAAO;AACT"}
|
|
package/dist/core/assertions/index.d.mts
CHANGED
|
@@ -1,47 +1,30 @@
|
|
|
1
|
-
import { X as RunScoreKind, Y as RunScore } from "../../index-D_aMeWqO.mjs";
|
|
1
|
+
import { X as RunScoreKind, Y as RunScore } from "../../index-BLIlhiWT.mjs";
|
|
2
2
|
|
|
3
3
|
//#region src/core/assertions/index.d.ts
|
|
4
4
|
/**
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
* Use when:
|
|
8
|
-
* - assertions need to share counters, rolling metrics, or memoized values
|
|
9
|
-
* - a scenario evaluates multiple steps and expects state-aware checks
|
|
10
|
-
*/
|
|
11
|
-
type AssertionState = Map<string, unknown>;
|
|
12
|
-
/**
|
|
13
|
-
* Represents one tool call emitted by a model response.
|
|
5
|
+
* Async assertion function used by eval scenarios.
|
|
14
6
|
*/
|
|
15
|
-
|
|
16
|
-
/**
|
|
17
|
-
* Tool name used by the call.
|
|
18
|
-
*/
|
|
19
|
-
name: string;
|
|
20
|
-
/**
|
|
21
|
-
* Tool arguments payload.
|
|
22
|
-
*/
|
|
23
|
-
args: unknown;
|
|
24
|
-
}
|
|
7
|
+
type Assertion = (context: AssertionContext) => Promise<AssertionOutcome>;
|
|
25
8
|
/**
|
|
26
9
|
* Normalized assertion context for one model output.
|
|
27
10
|
*/
|
|
28
11
|
interface AssertionContext {
|
|
29
12
|
/**
|
|
30
|
-
*
|
|
13
|
+
* Shared mutable state for stateful assertion measurement.
|
|
31
14
|
*/
|
|
32
|
-
|
|
15
|
+
state: AssertionState;
|
|
33
16
|
/**
|
|
34
17
|
* Optional structured output parsed from the model response.
|
|
35
18
|
*/
|
|
36
19
|
structuredOutput?: unknown;
|
|
37
20
|
/**
|
|
38
|
-
*
|
|
21
|
+
* Plain text model output used by text assertions.
|
|
39
22
|
*/
|
|
40
|
-
|
|
23
|
+
text: string;
|
|
41
24
|
/**
|
|
42
|
-
*
|
|
25
|
+
* Optional tool calls extracted from the model response.
|
|
43
26
|
*/
|
|
44
|
-
|
|
27
|
+
toolCalls?: readonly ToolCall[];
|
|
45
28
|
}
|
|
46
29
|
/**
|
|
47
30
|
* Result for one assertion evaluation.
|
|
@@ -51,70 +34,66 @@ interface AssertionOutcome {
|
|
|
51
34
|
* Stable assertion id.
|
|
52
35
|
*/
|
|
53
36
|
id: string;
|
|
54
|
-
/**
|
|
55
|
-
* Assertion family emitted as run score kind.
|
|
56
|
-
*/
|
|
57
|
-
scoreKind: RunScoreKind;
|
|
58
37
|
/**
|
|
59
38
|
* Whether the assertion passed.
|
|
60
39
|
*/
|
|
61
40
|
pass: boolean;
|
|
41
|
+
/**
|
|
42
|
+
* Human-readable reason for logs and reports.
|
|
43
|
+
*/
|
|
44
|
+
reason: string;
|
|
62
45
|
/**
|
|
63
46
|
* Normalized score in the `0..1` range.
|
|
64
47
|
*/
|
|
65
48
|
score: number;
|
|
66
49
|
/**
|
|
67
|
-
*
|
|
50
|
+
* Assertion family emitted as run score kind.
|
|
68
51
|
*/
|
|
69
|
-
|
|
52
|
+
scoreKind: RunScoreKind;
|
|
70
53
|
}
|
|
71
54
|
/**
|
|
72
|
-
*
|
|
73
|
-
*/
|
|
74
|
-
type Assertion = (context: AssertionContext) => Promise<AssertionOutcome>;
|
|
75
|
-
/**
|
|
76
|
-
* Normalizes text for matching.
|
|
55
|
+
* Stores mutable evaluation state for stateful assertion flows.
|
|
77
56
|
*
|
|
78
|
-
*
|
|
79
|
-
*
|
|
57
|
+
* Use when:
|
|
58
|
+
* - assertions need to share counters, rolling metrics, or memoized values
|
|
59
|
+
* - a scenario evaluates multiple steps and expects state-aware checks
|
|
80
60
|
*/
|
|
81
|
-
|
|
61
|
+
type AssertionState = Map<string, unknown>;
|
|
82
62
|
/**
|
|
83
|
-
* Options for
|
|
63
|
+
* Options for custom assertions.
|
|
84
64
|
*/
|
|
85
|
-
interface
|
|
65
|
+
interface CustomAssertionOptions {
|
|
86
66
|
/**
|
|
87
|
-
*
|
|
67
|
+
* Custom evaluator callback.
|
|
88
68
|
*/
|
|
89
|
-
|
|
69
|
+
evaluate: (context: AssertionContext) => Promise<{
|
|
70
|
+
pass: boolean;
|
|
71
|
+
reason: string;
|
|
72
|
+
score: number;
|
|
73
|
+
}> | {
|
|
74
|
+
pass: boolean;
|
|
75
|
+
reason: string;
|
|
76
|
+
score: number;
|
|
77
|
+
};
|
|
90
78
|
/**
|
|
91
|
-
*
|
|
79
|
+
* Stable assertion id.
|
|
92
80
|
*/
|
|
93
|
-
|
|
81
|
+
id: string;
|
|
94
82
|
/**
|
|
95
|
-
*
|
|
96
|
-
*
|
|
97
|
-
* @default 'all'
|
|
83
|
+
* Score family emitted by this custom assertion.
|
|
98
84
|
*/
|
|
99
|
-
|
|
85
|
+
scoreKind: RunScoreKind;
|
|
86
|
+
}
|
|
87
|
+
/**
|
|
88
|
+
* Options for exclude-keyword assertions.
|
|
89
|
+
*/
|
|
90
|
+
interface MustExcludeAssertionOptions {
|
|
100
91
|
/**
|
|
101
92
|
* Case-sensitive matching toggle.
|
|
102
93
|
*
|
|
103
94
|
* @default false
|
|
104
95
|
*/
|
|
105
96
|
caseSensitive?: boolean;
|
|
106
|
-
}
|
|
107
|
-
/**
|
|
108
|
-
* Creates an assertion that requires specific keywords in model text.
|
|
109
|
-
*
|
|
110
|
-
* Example:
|
|
111
|
-
* `expectMustInclude({ id: 'tone', keywords: ['calm', 'move'] })`
|
|
112
|
-
*/
|
|
113
|
-
declare function expectMustInclude(options: MustIncludeAssertionOptions): Assertion;
|
|
114
|
-
/**
|
|
115
|
-
* Options for exclude-keyword assertions.
|
|
116
|
-
*/
|
|
117
|
-
interface MustExcludeAssertionOptions {
|
|
118
97
|
/**
|
|
119
98
|
* Stable assertion id.
|
|
120
99
|
*/
|
|
@@ -123,156 +102,144 @@ interface MustExcludeAssertionOptions {
|
|
|
123
102
|
* Keywords that must not appear.
|
|
124
103
|
*/
|
|
125
104
|
keywords: readonly string[];
|
|
105
|
+
}
|
|
106
|
+
/**
|
|
107
|
+
* Options for include-keyword assertions.
|
|
108
|
+
*/
|
|
109
|
+
interface MustIncludeAssertionOptions {
|
|
126
110
|
/**
|
|
127
111
|
* Case-sensitive matching toggle.
|
|
128
112
|
*
|
|
129
113
|
* @default false
|
|
130
114
|
*/
|
|
131
115
|
caseSensitive?: boolean;
|
|
132
|
-
}
|
|
133
|
-
/**
|
|
134
|
-
* Creates an assertion that forbids specific keywords.
|
|
135
|
-
*
|
|
136
|
-
* Example:
|
|
137
|
-
* `expectMustExclude({ id: 'no-engine-dump', keywords: ['bestmove', 'ponder'] })`
|
|
138
|
-
*/
|
|
139
|
-
declare function expectMustExclude(options: MustExcludeAssertionOptions): Assertion;
|
|
140
|
-
/**
|
|
141
|
-
* Options for regular-expression assertions.
|
|
142
|
-
*/
|
|
143
|
-
interface RegexAssertionOptions {
|
|
144
116
|
/**
|
|
145
117
|
* Stable assertion id.
|
|
146
118
|
*/
|
|
147
119
|
id: string;
|
|
148
120
|
/**
|
|
149
|
-
*
|
|
121
|
+
* Keywords that must be present.
|
|
150
122
|
*/
|
|
151
|
-
|
|
123
|
+
keywords: readonly string[];
|
|
124
|
+
/**
|
|
125
|
+
* Match mode for keywords.
|
|
126
|
+
*
|
|
127
|
+
* @default 'all'
|
|
128
|
+
*/
|
|
129
|
+
mode?: 'all' | 'any';
|
|
152
130
|
}
|
|
153
131
|
/**
|
|
154
|
-
*
|
|
155
|
-
*
|
|
156
|
-
* Example:
|
|
157
|
-
* `expectRegex({ id: 'starts-with-act', pattern: /^<\|ACT:/ })`
|
|
158
|
-
*/
|
|
159
|
-
declare function expectRegex(options: RegexAssertionOptions): Assertion;
|
|
160
|
-
/**
|
|
161
|
-
* Options for structured-output assertions.
|
|
132
|
+
* Options for regular-expression assertions.
|
|
162
133
|
*/
|
|
163
|
-
interface
|
|
134
|
+
interface RegexAssertionOptions {
|
|
164
135
|
/**
|
|
165
136
|
* Stable assertion id.
|
|
166
137
|
*/
|
|
167
138
|
id: string;
|
|
168
139
|
/**
|
|
169
|
-
*
|
|
170
|
-
*/
|
|
171
|
-
validate: (value: unknown) => value is TValue;
|
|
172
|
-
/**
|
|
173
|
-
* Optional failure reason.
|
|
140
|
+
* Pattern to apply to model text.
|
|
174
141
|
*/
|
|
175
|
-
|
|
142
|
+
pattern: RegExp;
|
|
176
143
|
}
|
|
177
144
|
/**
|
|
178
|
-
*
|
|
179
|
-
*
|
|
180
|
-
* Example:
|
|
181
|
-
* `expectStructuredOutput({ id: 'json-shape', validate: isMySchema })`
|
|
182
|
-
*/
|
|
183
|
-
declare function expectStructuredOutput<TValue>(options: StructuredOutputAssertionOptions<TValue>): Assertion;
|
|
184
|
-
/**
|
|
185
|
-
* Options for tool-call argument assertions.
|
|
145
|
+
* Options for rubric assertions.
|
|
186
146
|
*/
|
|
187
|
-
interface
|
|
147
|
+
interface RubricAssertionOptions {
|
|
188
148
|
/**
|
|
189
149
|
* Stable assertion id.
|
|
190
150
|
*/
|
|
191
151
|
id: string;
|
|
192
152
|
/**
|
|
193
|
-
*
|
|
153
|
+
* Async rubric judge callback.
|
|
194
154
|
*/
|
|
195
|
-
|
|
155
|
+
judge: (context: AssertionContext) => Promise<RubricJudgeResult>;
|
|
196
156
|
/**
|
|
197
|
-
*
|
|
157
|
+
* Minimum passing score.
|
|
158
|
+
*
|
|
159
|
+
* @default 0.7
|
|
198
160
|
*/
|
|
199
|
-
|
|
161
|
+
minScore?: number;
|
|
200
162
|
}
|
|
201
|
-
/**
|
|
202
|
-
* Creates an assertion for validating tool-call arguments.
|
|
203
|
-
*
|
|
204
|
-
* Example:
|
|
205
|
-
* `expectToolCallArgs({ id: 'spark-command-shape', toolName: 'builtIn_sparkCommand', validate: isSparkArgs })`
|
|
206
|
-
*/
|
|
207
|
-
declare function expectToolCallArgs(options: ToolCallArgsAssertionOptions): Assertion;
|
|
208
163
|
/**
|
|
209
164
|
* Rubric judge result returned by teacher-model or rubric logic.
|
|
210
165
|
*/
|
|
211
166
|
interface RubricJudgeResult {
|
|
212
167
|
/**
|
|
213
|
-
*
|
|
168
|
+
* Optional judge model id.
|
|
214
169
|
*/
|
|
215
|
-
|
|
170
|
+
judgeModel?: string;
|
|
216
171
|
/**
|
|
217
172
|
* Judge explanation text.
|
|
218
173
|
*/
|
|
219
174
|
reason: string;
|
|
220
175
|
/**
|
|
221
|
-
*
|
|
176
|
+
* Normalized score in the `0..1` range.
|
|
222
177
|
*/
|
|
223
|
-
|
|
178
|
+
score: number;
|
|
224
179
|
}
|
|
225
180
|
/**
|
|
226
|
-
* Options for
|
|
181
|
+
* Options for structured-output assertions.
|
|
227
182
|
*/
|
|
228
|
-
interface
|
|
183
|
+
interface StructuredOutputAssertionOptions<TValue> {
|
|
229
184
|
/**
|
|
230
|
-
*
|
|
185
|
+
* Optional failure reason.
|
|
231
186
|
*/
|
|
232
|
-
|
|
187
|
+
failureReason?: string;
|
|
233
188
|
/**
|
|
234
|
-
*
|
|
189
|
+
* Stable assertion id.
|
|
235
190
|
*/
|
|
236
|
-
|
|
191
|
+
id: string;
|
|
237
192
|
/**
|
|
238
|
-
*
|
|
239
|
-
*
|
|
240
|
-
* @default 0.7
|
|
193
|
+
* Runtime validator for structured output.
|
|
241
194
|
*/
|
|
242
|
-
|
|
195
|
+
validate: (value: unknown) => value is TValue;
|
|
243
196
|
}
|
|
244
197
|
/**
|
|
245
|
-
*
|
|
246
|
-
*
|
|
247
|
-
* Example:
|
|
248
|
-
* `expectRubric({ id: 'human-like-tone', judge: judgeFn, minScore: 0.8 })`
|
|
198
|
+
* Represents one tool call emitted by a model response.
|
|
249
199
|
*/
|
|
250
|
-
|
|
200
|
+
interface ToolCall {
|
|
201
|
+
/**
|
|
202
|
+
* Tool arguments payload.
|
|
203
|
+
*/
|
|
204
|
+
args: unknown;
|
|
205
|
+
/**
|
|
206
|
+
* Tool name used by the call.
|
|
207
|
+
*/
|
|
208
|
+
name: string;
|
|
209
|
+
}
|
|
251
210
|
/**
|
|
252
|
-
* Options for
|
|
211
|
+
* Options for tool-call argument assertions.
|
|
253
212
|
*/
|
|
254
|
-
interface
|
|
213
|
+
interface ToolCallArgsAssertionOptions {
|
|
255
214
|
/**
|
|
256
215
|
* Stable assertion id.
|
|
257
216
|
*/
|
|
258
217
|
id: string;
|
|
259
218
|
/**
|
|
260
|
-
*
|
|
219
|
+
* Tool name to inspect.
|
|
261
220
|
*/
|
|
262
|
-
|
|
221
|
+
toolName: string;
|
|
263
222
|
/**
|
|
264
|
-
*
|
|
223
|
+
* Runtime validator for tool arguments.
|
|
265
224
|
*/
|
|
266
|
-
|
|
267
|
-
pass: boolean;
|
|
268
|
-
reason: string;
|
|
269
|
-
score: number;
|
|
270
|
-
}> | {
|
|
271
|
-
pass: boolean;
|
|
272
|
-
reason: string;
|
|
273
|
-
score: number;
|
|
274
|
-
};
|
|
225
|
+
validate: (args: unknown) => boolean;
|
|
275
226
|
}
|
|
227
|
+
/**
|
|
228
|
+
* Returns failing assertion outcomes in original order.
|
|
229
|
+
*/
|
|
230
|
+
declare function collectFailedAssertions(outcomes: readonly AssertionOutcome[]): AssertionOutcome[];
|
|
231
|
+
/**
|
|
232
|
+
* Executes assertion list and returns all outcomes.
|
|
233
|
+
*
|
|
234
|
+
* Call stack:
|
|
235
|
+
*
|
|
236
|
+
* {@link evaluateAssertions}
|
|
237
|
+
* -> `assertion(context)`
|
|
238
|
+
* -> {@link AssertionOutcome}[]
|
|
239
|
+
*/
|
|
240
|
+
declare function evaluateAssertions(assertions: readonly Assertion[], context: Omit<AssertionContext, 'state'> & {
|
|
241
|
+
state?: AssertionState;
|
|
242
|
+
}): Promise<AssertionOutcome[]>;
|
|
276
243
|
/**
|
|
277
244
|
* Creates a custom assertion with fully user-defined logic.
|
|
278
245
|
*
|
|
@@ -280,6 +247,20 @@ interface CustomAssertionOptions {
|
|
|
280
247
|
* `expectCustom({ id: 'stateful-window', scoreKind: 'exact', evaluate: (ctx) => ... })`
|
|
281
248
|
*/
|
|
282
249
|
declare function expectCustom(options: CustomAssertionOptions): Assertion;
|
|
250
|
+
/**
|
|
251
|
+
* Creates an assertion that forbids specific keywords.
|
|
252
|
+
*
|
|
253
|
+
* Example:
|
|
254
|
+
* `expectMustExclude({ id: 'no-engine-dump', keywords: ['bestmove', 'ponder'] })`
|
|
255
|
+
*/
|
|
256
|
+
declare function expectMustExclude(options: MustExcludeAssertionOptions): Assertion;
|
|
257
|
+
/**
|
|
258
|
+
* Creates an assertion that requires specific keywords in model text.
|
|
259
|
+
*
|
|
260
|
+
* Example:
|
|
261
|
+
* `expectMustInclude({ id: 'tone', keywords: ['calm', 'move'] })`
|
|
262
|
+
*/
|
|
263
|
+
declare function expectMustInclude(options: MustIncludeAssertionOptions): Assertion;
|
|
283
264
|
/**
|
|
284
265
|
* Creates an inverse assertion.
|
|
285
266
|
*
|
|
@@ -290,25 +271,44 @@ declare function expectNot(assertion: Assertion, options: {
|
|
|
290
271
|
id: string;
|
|
291
272
|
}): Assertion;
|
|
292
273
|
/**
|
|
293
|
-
*
|
|
274
|
+
* Creates an assertion based on a regular expression.
|
|
294
275
|
*
|
|
295
|
-
*
|
|
276
|
+
* Example:
|
|
277
|
+
* `expectRegex({ id: 'starts-with-act', pattern: /^<\|ACT:/ })`
|
|
278
|
+
*/
|
|
279
|
+
declare function expectRegex(options: RegexAssertionOptions): Assertion;
|
|
280
|
+
/**
|
|
281
|
+
* Creates a rubric assertion driven by teacher-model style scoring.
|
|
296
282
|
*
|
|
297
|
-
*
|
|
298
|
-
*
|
|
299
|
-
* -> {@link AssertionOutcome}[]
|
|
283
|
+
* Example:
|
|
284
|
+
* `expectRubric({ id: 'human-like-tone', judge: judgeFn, minScore: 0.8 })`
|
|
300
285
|
*/
|
|
301
|
-
declare function
|
|
302
|
-
state?: AssertionState;
|
|
303
|
-
}): Promise<AssertionOutcome[]>;
|
|
286
|
+
declare function expectRubric(options: RubricAssertionOptions): Assertion;
|
|
304
287
|
/**
|
|
305
|
-
*
|
|
288
|
+
* Creates an assertion for structured model output.
|
|
289
|
+
*
|
|
290
|
+
* Example:
|
|
291
|
+
* `expectStructuredOutput({ id: 'json-shape', validate: isMySchema })`
|
|
306
292
|
*/
|
|
307
|
-
declare function
|
|
293
|
+
declare function expectStructuredOutput<TValue>(options: StructuredOutputAssertionOptions<TValue>): Assertion;
|
|
308
294
|
/**
|
|
309
|
-
*
|
|
295
|
+
* Creates an assertion for validating tool-call arguments.
|
|
296
|
+
*
|
|
297
|
+
* Example:
|
|
298
|
+
* `expectToolCallArgs({ id: 'spark-command-shape', toolName: 'builtIn_sparkCommand', validate: isSparkArgs })`
|
|
310
299
|
*/
|
|
311
|
-
declare function
|
|
300
|
+
declare function expectToolCallArgs(options: ToolCallArgsAssertionOptions): Assertion;
|
|
301
|
+
/**
|
|
302
|
+
* Normalizes text for matching.
|
|
303
|
+
*
|
|
304
|
+
* Before: `" Hello\nWorld "`
|
|
305
|
+
* After: `"hello world"`
|
|
306
|
+
*/
|
|
307
|
+
declare function normalizeMatchText(value: string, caseSensitive: boolean): string;
|
|
308
|
+
/**
|
|
309
|
+
* Converts assertion outcomes to run-score tuples consumed by aggregation.
|
|
310
|
+
*/
|
|
311
|
+
declare function toRunScores(outcomes: readonly AssertionOutcome[]): RunScore[];
|
|
312
312
|
//#endregion
|
|
313
313
|
export { Assertion, AssertionContext, AssertionOutcome, AssertionState, CustomAssertionOptions, MustExcludeAssertionOptions, MustIncludeAssertionOptions, RegexAssertionOptions, RubricAssertionOptions, RubricJudgeResult, StructuredOutputAssertionOptions, ToolCall, ToolCallArgsAssertionOptions, collectFailedAssertions, evaluateAssertions, expectCustom, expectMustExclude, expectMustInclude, expectNot, expectRegex, expectRubric, expectStructuredOutput, expectToolCallArgs, normalizeMatchText, toRunScores };
|
|
314
314
|
//# sourceMappingURL=index.d.mts.map
|