vieval 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/README.md +290 -0
  2. package/dist/assertions-DcAjfVDA.mjs +183 -0
  3. package/dist/assertions-DcAjfVDA.mjs.map +1 -0
  4. package/dist/cli/index.d.mts +11 -0
  5. package/dist/cli/index.mjs +1434 -0
  6. package/dist/cli/index.mjs.map +1 -0
  7. package/dist/config-D2fe1SnT.mjs +17 -0
  8. package/dist/config-D2fe1SnT.mjs.map +1 -0
  9. package/dist/config.d.mts +3 -0
  10. package/dist/config.mjs +3 -0
  11. package/dist/core/assertions/index.d.mts +2 -0
  12. package/dist/core/assertions/index.mjs +2 -0
  13. package/dist/core/inference-executors/index.d.mts +273 -0
  14. package/dist/core/inference-executors/index.mjs +225 -0
  15. package/dist/core/inference-executors/index.mjs.map +1 -0
  16. package/dist/core/processors/results/index.d.mts +96 -0
  17. package/dist/core/processors/results/index.mjs +64 -0
  18. package/dist/core/processors/results/index.mjs.map +1 -0
  19. package/dist/core/runner/index.d.mts +2 -0
  20. package/dist/core/runner/index.mjs +2 -0
  21. package/dist/expect-0jPJ7Zio.d.mts +2318 -0
  22. package/dist/expect-extensions-CwPtgTz8.mjs +13471 -0
  23. package/dist/expect-extensions-CwPtgTz8.mjs.map +1 -0
  24. package/dist/expect-i9WZWGrA.mjs +17 -0
  25. package/dist/expect-i9WZWGrA.mjs.map +1 -0
  26. package/dist/expect.d.mts +2 -0
  27. package/dist/expect.mjs +2 -0
  28. package/dist/index-DP7jsORl.d.mts +947 -0
  29. package/dist/index-oSXhM1zx.d.mts +314 -0
  30. package/dist/index.d.mts +92 -0
  31. package/dist/index.mjs +150 -0
  32. package/dist/index.mjs.map +1 -0
  33. package/dist/magic-string.es-CH1jwzMg.mjs +1013 -0
  34. package/dist/magic-string.es-CH1jwzMg.mjs.map +1 -0
  35. package/dist/models-D_MsBtYw.mjs +14 -0
  36. package/dist/models-D_MsBtYw.mjs.map +1 -0
  37. package/dist/plugin-DVaRZY2x.d.mts +84 -0
  38. package/dist/plugins/chat-models/index.d.mts +90 -0
  39. package/dist/plugins/chat-models/index.mjs +48 -0
  40. package/dist/plugins/chat-models/index.mjs.map +1 -0
  41. package/dist/registry-ChOjjdEC.mjs +245 -0
  42. package/dist/registry-ChOjjdEC.mjs.map +1 -0
  43. package/dist/runner-4ZsOveoY.mjs +480 -0
  44. package/dist/runner-4ZsOveoY.mjs.map +1 -0
  45. package/dist/testing/expect-extensions.d.mts +86 -0
  46. package/dist/testing/expect-extensions.mjs +2 -0
  47. package/package.json +88 -0
@@ -0,0 +1,314 @@
1
+ import { B as RunScoreKind, z as RunScore } from "./index-DP7jsORl.mjs";
2
+
3
+ //#region src/core/assertions/index.d.ts
4
+ /**
5
+ * Stores mutable evaluation state for stateful assertion flows.
6
+ *
7
+ * Use when:
8
+ * - assertions need to share counters, rolling metrics, or memoized values
9
+ * - a scenario evaluates multiple steps and expects state-aware checks
10
+ */
11
+ type AssertionState = Map<string, unknown>;
12
+ /**
13
+ * Represents one tool call emitted by a model response.
14
+ */
15
+ interface ToolCall {
16
+ /**
17
+ * Tool name used by the call.
18
+ */
19
+ name: string;
20
+ /**
21
+ * Tool arguments payload.
22
+ */
23
+ args: unknown;
24
+ }
25
+ /**
26
+ * Normalized assertion context for one model output.
27
+ */
28
+ interface AssertionContext {
29
+ /**
30
+ * Plain text model output used by text assertions.
31
+ */
32
+ text: string;
33
+ /**
34
+ * Optional structured output parsed from the model response.
35
+ */
36
+ structuredOutput?: unknown;
37
+ /**
38
+ * Optional tool calls extracted from the model response.
39
+ */
40
+ toolCalls?: readonly ToolCall[];
41
+ /**
42
+ * Shared mutable state for stateful assertion measurement.
43
+ */
44
+ state: AssertionState;
45
+ }
46
+ /**
47
+ * Result for one assertion evaluation.
48
+ */
49
+ interface AssertionOutcome {
50
+ /**
51
+ * Stable assertion id.
52
+ */
53
+ id: string;
54
+ /**
55
+ * Assertion family emitted as run score kind.
56
+ */
57
+ scoreKind: RunScoreKind;
58
+ /**
59
+ * Whether the assertion passed.
60
+ */
61
+ pass: boolean;
62
+ /**
63
+ * Normalized score in the `0..1` range.
64
+ */
65
+ score: number;
66
+ /**
67
+ * Human-readable reason for logs and reports.
68
+ */
69
+ reason: string;
70
+ }
71
+ /**
72
+ * Async assertion function used by eval scenarios.
73
+ */
74
+ type Assertion = (context: AssertionContext) => Promise<AssertionOutcome>;
75
+ /**
76
+ * Normalizes text for matching.
77
+ *
78
+ * Before: `" Hello\nWorld "`
79
+ * After: `"hello world"`
80
+ */
81
+ declare function normalizeMatchText(value: string, caseSensitive: boolean): string;
82
+ /**
83
+ * Options for include-keyword assertions.
84
+ */
85
+ interface MustIncludeAssertionOptions {
86
+ /**
87
+ * Stable assertion id.
88
+ */
89
+ id: string;
90
+ /**
91
+ * Keywords that must be present.
92
+ */
93
+ keywords: readonly string[];
94
+ /**
95
+ * Match mode for keywords.
96
+ *
97
+ * @default 'all'
98
+ */
99
+ mode?: 'all' | 'any';
100
+ /**
101
+ * Case-sensitive matching toggle.
102
+ *
103
+ * @default false
104
+ */
105
+ caseSensitive?: boolean;
106
+ }
107
+ /**
108
+ * Creates an assertion that requires specific keywords in model text.
109
+ *
110
+ * Example:
111
+ * `expectMustInclude({ id: 'tone', keywords: ['calm', 'move'] })`
112
+ */
113
+ declare function expectMustInclude(options: MustIncludeAssertionOptions): Assertion;
114
+ /**
115
+ * Options for exclude-keyword assertions.
116
+ */
117
+ interface MustExcludeAssertionOptions {
118
+ /**
119
+ * Stable assertion id.
120
+ */
121
+ id: string;
122
+ /**
123
+ * Keywords that must not appear.
124
+ */
125
+ keywords: readonly string[];
126
+ /**
127
+ * Case-sensitive matching toggle.
128
+ *
129
+ * @default false
130
+ */
131
+ caseSensitive?: boolean;
132
+ }
133
+ /**
134
+ * Creates an assertion that forbids specific keywords.
135
+ *
136
+ * Example:
137
+ * `expectMustExclude({ id: 'no-engine-dump', keywords: ['bestmove', 'ponder'] })`
138
+ */
139
+ declare function expectMustExclude(options: MustExcludeAssertionOptions): Assertion;
140
+ /**
141
+ * Options for regular-expression assertions.
142
+ */
143
+ interface RegexAssertionOptions {
144
+ /**
145
+ * Stable assertion id.
146
+ */
147
+ id: string;
148
+ /**
149
+ * Pattern to apply to model text.
150
+ */
151
+ pattern: RegExp;
152
+ }
153
+ /**
154
+ * Creates an assertion based on a regular expression.
155
+ *
156
+ * Example:
157
+ * `expectRegex({ id: 'starts-with-act', pattern: /^<\|ACT:/ })`
158
+ */
159
+ declare function expectRegex(options: RegexAssertionOptions): Assertion;
160
+ /**
161
+ * Options for structured-output assertions.
162
+ */
163
+ interface StructuredOutputAssertionOptions<TValue> {
164
+ /**
165
+ * Stable assertion id.
166
+ */
167
+ id: string;
168
+ /**
169
+ * Runtime validator for structured output.
170
+ */
171
+ validate: (value: unknown) => value is TValue;
172
+ /**
173
+ * Optional failure reason.
174
+ */
175
+ failureReason?: string;
176
+ }
177
+ /**
178
+ * Creates an assertion for structured model output.
179
+ *
180
+ * Example:
181
+ * `expectStructuredOutput({ id: 'json-shape', validate: isMySchema })`
182
+ */
183
+ declare function expectStructuredOutput<TValue>(options: StructuredOutputAssertionOptions<TValue>): Assertion;
184
+ /**
185
+ * Options for tool-call argument assertions.
186
+ */
187
+ interface ToolCallArgsAssertionOptions {
188
+ /**
189
+ * Stable assertion id.
190
+ */
191
+ id: string;
192
+ /**
193
+ * Tool name to inspect.
194
+ */
195
+ toolName: string;
196
+ /**
197
+ * Runtime validator for tool arguments.
198
+ */
199
+ validate: (args: unknown) => boolean;
200
+ }
201
+ /**
202
+ * Creates an assertion for validating tool-call arguments.
203
+ *
204
+ * Example:
205
+ * `expectToolCallArgs({ id: 'spark-command-shape', toolName: 'builtIn_sparkCommand', validate: isSparkArgs })`
206
+ */
207
+ declare function expectToolCallArgs(options: ToolCallArgsAssertionOptions): Assertion;
208
+ /**
209
+ * Rubric judge result returned by teacher-model or rubric logic.
210
+ */
211
+ interface RubricJudgeResult {
212
+ /**
213
+ * Normalized score in the `0..1` range.
214
+ */
215
+ score: number;
216
+ /**
217
+ * Judge explanation text.
218
+ */
219
+ reason: string;
220
+ /**
221
+ * Optional judge model id.
222
+ */
223
+ judgeModel?: string;
224
+ }
225
+ /**
226
+ * Options for rubric assertions.
227
+ */
228
+ interface RubricAssertionOptions {
229
+ /**
230
+ * Stable assertion id.
231
+ */
232
+ id: string;
233
+ /**
234
+ * Async rubric judge callback.
235
+ */
236
+ judge: (context: AssertionContext) => Promise<RubricJudgeResult>;
237
+ /**
238
+ * Minimum passing score.
239
+ *
240
+ * @default 0.7
241
+ */
242
+ minScore?: number;
243
+ }
244
+ /**
245
+ * Creates a rubric assertion driven by teacher-model style scoring.
246
+ *
247
+ * Example:
248
+ * `expectRubric({ id: 'human-like-tone', judge: judgeFn, minScore: 0.8 })`
249
+ */
250
+ declare function expectRubric(options: RubricAssertionOptions): Assertion;
251
+ /**
252
+ * Options for custom assertions.
253
+ */
254
+ interface CustomAssertionOptions {
255
+ /**
256
+ * Stable assertion id.
257
+ */
258
+ id: string;
259
+ /**
260
+ * Score family emitted by this custom assertion.
261
+ */
262
+ scoreKind: RunScoreKind;
263
+ /**
264
+ * Custom evaluator callback.
265
+ */
266
+ evaluate: (context: AssertionContext) => Promise<{
267
+ pass: boolean;
268
+ reason: string;
269
+ score: number;
270
+ }> | {
271
+ pass: boolean;
272
+ reason: string;
273
+ score: number;
274
+ };
275
+ }
276
+ /**
277
+ * Creates a custom assertion with fully user-defined logic.
278
+ *
279
+ * Example:
280
+ * `expectCustom({ id: 'stateful-window', scoreKind: 'exact', evaluate: (ctx) => ... })`
281
+ */
282
+ declare function expectCustom(options: CustomAssertionOptions): Assertion;
283
+ /**
284
+ * Creates an inverse assertion.
285
+ *
286
+ * Example:
287
+ * `expectNot(expectMustInclude({ id: 'contains-engine-word', keywords: ['bestmove'] }), { id: 'no-engine-word' })`
288
+ */
289
+ declare function expectNot(assertion: Assertion, options: {
290
+ id: string;
291
+ }): Assertion;
292
+ /**
293
+ * Executes assertion list and returns all outcomes.
294
+ *
295
+ * Call stack:
296
+ *
297
+ * {@link evaluateAssertions}
298
+ * -> `assertion(context)`
299
+ * -> {@link AssertionOutcome}[]
300
+ */
301
+ declare function evaluateAssertions(assertions: readonly Assertion[], context: Omit<AssertionContext, 'state'> & {
302
+ state?: AssertionState;
303
+ }): Promise<AssertionOutcome[]>;
304
+ /**
305
+ * Converts assertion outcomes to run-score tuples consumed by aggregation.
306
+ */
307
+ declare function toRunScores(outcomes: readonly AssertionOutcome[]): RunScore[];
308
+ /**
309
+ * Returns failing assertion outcomes in original order.
310
+ */
311
+ declare function collectFailedAssertions(outcomes: readonly AssertionOutcome[]): AssertionOutcome[];
312
+ //#endregion
313
+ export { expectToolCallArgs as C, expectStructuredOutput as S, toRunScores as T, expectMustExclude as _, CustomAssertionOptions as a, expectRegex as b, RegexAssertionOptions as c, StructuredOutputAssertionOptions as d, ToolCall as f, expectCustom as g, evaluateAssertions as h, AssertionState as i, RubricAssertionOptions as l, collectFailedAssertions as m, AssertionContext as n, MustExcludeAssertionOptions as o, ToolCallArgsAssertionOptions as p, AssertionOutcome as r, MustIncludeAssertionOptions as s, Assertion as t, RubricJudgeResult as u, expectMustInclude as v, normalizeMatchText as w, expectRubric as x, expectNot as y };
314
+ //# sourceMappingURL=index-oSXhM1zx.d.mts.map
@@ -0,0 +1,92 @@
1
+ import { b as TaskRunOutput, y as TaskRunContext } from "./index-DP7jsORl.mjs";
2
+ import { t as expect } from "./expect-0jPJ7Zio.mjs";
3
+
4
+ //#region src/cli/config.d.ts
5
+ /**
6
+ * Helper used by `vieval.config.*` for better type inference.
7
+ */
8
+ declare const defineConfig: any;
9
+ /**
10
+ * Loads `.env*` files using Vite's env resolution behavior.
11
+ *
12
+ * Use when:
13
+ * - `vieval.config.*` should mirror Vitest/Vite env loading semantics
14
+ * - config wants to populate top-level `env` via file-based values
15
+ *
16
+ * Expects:
17
+ * - `mode` to match the env file suffix (`.env.<mode>`)
18
+ * - `envDir` to point at the directory containing `.env` files
19
+ *
20
+ * Returns:
21
+ * - Key/value map compatible with `CliConfig['env']`
22
+ */
23
+ declare function loadEnv(mode: string, envDir: string, prefixes?: string | string[]): NodeJS.ProcessEnv;
24
+ //#endregion
25
+ //#region src/dsl/task.d.ts
26
+ /**
27
+ * Runtime context provided to a task case callback.
28
+ */
29
+ interface CaseRunContext<TInput> extends TaskRunContext {
30
+ /**
31
+ * Case-scoped matrix payload.
32
+ */
33
+ matrix: TaskRunContext['task']['matrix'] & {
34
+ inputs: TInput;
35
+ };
36
+ }
37
+ /**
38
+ * Callback for one task case.
39
+ */
40
+ type CaseRunner<TInput> = (context: CaseRunContext<TInput>) => Promise<void> | void;
41
+ /**
42
+ * Builder callbacks passed into `describeTask`.
43
+ */
44
+ interface DescribeTaskBuilder {
45
+ /**
46
+ * Registers one explicit case.
47
+ */
48
+ caseOf: <TInput>(name: string, run: CaseRunner<TInput>, input: TInput) => void;
49
+ /**
50
+ * Registers multiple cases from input list.
51
+ */
52
+ casesFromInputs: <TInput>(namePrefix: string, inputs: readonly TInput[], run: CaseRunner<TInput>) => void;
53
+ }
54
+ /**
55
+ * Options for `describeTask`.
56
+ */
57
+ interface DescribeTaskOptions {
58
+ /**
59
+ * Optional description override.
60
+ */
61
+ description?: string;
62
+ }
63
+ /**
64
+ * Registers one case in the currently active task scope.
65
+ */
66
+ declare function caseOf<TInput>(name: string, run: CaseRunner<TInput>, input: TInput): void;
67
+ /**
68
+ * Registers multiple cases in the currently active task scope.
69
+ */
70
+ declare function casesFromInputs<TInput>(namePrefix: string, inputs: readonly TInput[], run: CaseRunner<TInput>): void;
71
+ /**
72
+ * Defines one eval task with task/case semantics similar to Vitest.
73
+ *
74
+ * Use when:
75
+ * - task behavior should be declared with `caseOf` and `casesFromInputs`
76
+ * - business agent code should be imported and run from eval task files
77
+ */
78
+ declare function describeTask(name: string, build: ((builder: DescribeTaskBuilder) => void) | (() => void), options?: DescribeTaskOptions): {
79
+ readonly description: string;
80
+ readonly name: string;
81
+ readonly task: {
82
+ readonly id: string;
83
+ readonly run: (context: TaskRunContext) => Promise<TaskRunOutput>;
84
+ };
85
+ };
86
+ /**
87
+ * Alias of `describeTask` for eval-centric naming.
88
+ */
89
+ declare const describeEval: typeof describeTask;
90
+ //#endregion
91
+ export { caseOf, casesFromInputs, defineConfig, describeEval, describeTask, expect, loadEnv };
92
+ //# sourceMappingURL=index.d.mts.map
package/dist/index.mjs ADDED
@@ -0,0 +1,150 @@
1
+ import { a as defineConfig, i as registerEvalDefinition, o as loadEnv } from "./registry-ChOjjdEC.mjs";
2
+ import { n as defineTask, t as defineEval } from "./config-D2fe1SnT.mjs";
3
+ import { t as expect } from "./expect-i9WZWGrA.mjs";
4
+ //#region src/dsl/task.ts
5
+ function cloneCaseMatrix(matrix) {
6
+ return {
7
+ eval: { ...matrix.eval },
8
+ meta: { ...matrix.meta },
9
+ run: { ...matrix.run }
10
+ };
11
+ }
12
+ function emitCaseStart(hooks, payload) {
13
+ try {
14
+ hooks?.onCaseStart?.(payload);
15
+ } catch {}
16
+ }
17
+ function emitCaseEnd(hooks, payload) {
18
+ try {
19
+ hooks?.onCaseEnd?.(payload);
20
+ } catch {}
21
+ }
22
+ function createCaseBuilder(registeredCases) {
23
+ return {
24
+ caseOf(name, run, input) {
25
+ registeredCases.push({
26
+ input,
27
+ name,
28
+ run
29
+ });
30
+ },
31
+ casesFromInputs(namePrefix, inputs, run) {
32
+ inputs.forEach((input, index) => {
33
+ registeredCases.push({
34
+ input,
35
+ name: `${namePrefix} #${index + 1}`,
36
+ run
37
+ });
38
+ });
39
+ }
40
+ };
41
+ }
42
+ let activeCasesStack = [];
43
+ function withActiveCases(cases, callback) {
44
+ activeCasesStack = [...activeCasesStack, cases];
45
+ try {
46
+ return callback();
47
+ } finally {
48
+ activeCasesStack = activeCasesStack.slice(0, -1);
49
+ }
50
+ }
51
+ function getActiveCases() {
52
+ const active = activeCasesStack.at(-1);
53
+ if (active == null) throw new Error("caseOf/casesFromInputs must be called inside describeTask/describeEval.");
54
+ return active;
55
+ }
56
+ /**
57
+ * Registers one case in the currently active task scope.
58
+ */
59
+ function caseOf(name, run, input) {
60
+ getActiveCases().push({
61
+ input,
62
+ name,
63
+ run
64
+ });
65
+ }
66
+ /**
67
+ * Registers multiple cases in the currently active task scope.
68
+ */
69
+ function casesFromInputs(namePrefix, inputs, run) {
70
+ inputs.forEach((input, index) => {
71
+ getActiveCases().push({
72
+ input,
73
+ name: `${namePrefix} #${index + 1}`,
74
+ run
75
+ });
76
+ });
77
+ }
78
+ /**
79
+ * Defines one eval task with task/case semantics similar to Vitest.
80
+ *
81
+ * Use when:
82
+ * - task behavior should be declared with `caseOf` and `casesFromInputs`
83
+ * - business agent code should be imported and run from eval task files
84
+ */
85
+ function describeTask(name, build, options = {}) {
86
+ const registeredCases = [];
87
+ const builder = createCaseBuilder(registeredCases);
88
+ withActiveCases(registeredCases, () => {
89
+ if (build.length > 0) {
90
+ build(builder);
91
+ return;
92
+ }
93
+ build();
94
+ });
95
+ const definition = defineEval({
96
+ description: options.description ?? name,
97
+ name,
98
+ task: defineTask({
99
+ id: name,
100
+ async run(context) {
101
+ if (registeredCases.length === 0) return { scores: [{
102
+ kind: "exact",
103
+ score: 1
104
+ }] };
105
+ const totalCases = registeredCases.length;
106
+ const caseScores = await Promise.all(registeredCases.map(async (taskCase, index) => {
107
+ emitCaseStart(context.reporterHooks, {
108
+ index,
109
+ name: taskCase.name,
110
+ total: totalCases
111
+ });
112
+ let state = "passed";
113
+ try {
114
+ await taskCase.run({
115
+ ...context,
116
+ matrix: {
117
+ ...cloneCaseMatrix(context.task.matrix),
118
+ inputs: taskCase.input
119
+ }
120
+ });
121
+ } catch {
122
+ state = "failed";
123
+ } finally {
124
+ emitCaseEnd(context.reporterHooks, {
125
+ index,
126
+ state,
127
+ name: taskCase.name,
128
+ total: totalCases
129
+ });
130
+ }
131
+ return state === "passed" ? 1 : 0;
132
+ }));
133
+ return { scores: [{
134
+ kind: "exact",
135
+ score: caseScores.reduce((sum, score) => sum + score, 0) / caseScores.length
136
+ }] };
137
+ }
138
+ })
139
+ });
140
+ registerEvalDefinition(definition);
141
+ return definition;
142
+ }
143
+ /**
144
+ * Alias of `describeTask` for eval-centric naming.
145
+ */
146
+ const describeEval = describeTask;
147
+ //#endregion
148
+ export { caseOf, casesFromInputs, defineConfig, describeEval, describeTask, expect, loadEnv };
149
+
150
+ //# sourceMappingURL=index.mjs.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.mjs","names":[],"sources":["../src/dsl/task.ts"],"sourcesContent":["import type { TaskRunContext, TaskRunOutput } from '../config'\n\nimport { defineEval, defineTask } from '../config'\nimport { registerEvalDefinition } from './registry'\n\n/**\n * Runtime context provided to a task case callback.\n */\nexport interface CaseRunContext<TInput> extends TaskRunContext {\n /**\n * Case-scoped matrix payload.\n */\n matrix: TaskRunContext['task']['matrix'] & { inputs: TInput }\n}\n\n/**\n * Callback for one task case.\n */\nexport type CaseRunner<TInput> = (context: CaseRunContext<TInput>) => Promise<void> | void\n\ninterface RegisteredCase<TInput> {\n input: TInput\n name: string\n run: CaseRunner<TInput>\n}\n\nfunction cloneCaseMatrix(matrix: TaskRunContext['task']['matrix']): TaskRunContext['task']['matrix'] {\n return {\n eval: {\n ...matrix.eval,\n },\n meta: {\n ...matrix.meta,\n },\n run: {\n ...matrix.run,\n },\n }\n}\n\nfunction emitCaseStart(\n hooks: TaskRunContext['reporterHooks'] | undefined,\n payload: {\n index: number\n name: string\n total: number\n },\n): void {\n try {\n hooks?.onCaseStart?.(payload)\n }\n catch {\n // Reporter hooks must never affect task scoring.\n }\n}\n\nfunction emitCaseEnd(\n hooks: TaskRunContext['reporterHooks'] | undefined,\n payload: {\n index: number\n state: 'passed' | 'failed'\n name: string\n total: number\n },\n): void {\n try {\n hooks?.onCaseEnd?.(payload)\n }\n catch {\n // Reporter hooks must never affect task scoring.\n }\n}\n\n/**\n * Builder callbacks passed into `describeTask`.\n */\nexport interface DescribeTaskBuilder {\n /**\n * Registers one explicit case.\n */\n caseOf: <TInput>(name: string, run: CaseRunner<TInput>, input: TInput) => void\n /**\n * Registers multiple cases from input list.\n */\n casesFromInputs: <TInput>(\n namePrefix: string,\n inputs: readonly TInput[],\n run: CaseRunner<TInput>,\n ) => void\n}\n\n/**\n * Options for `describeTask`.\n */\nexport interface DescribeTaskOptions {\n /**\n * Optional description override.\n */\n description?: string\n}\n\nfunction createCaseBuilder(registeredCases: RegisteredCase<unknown>[]): DescribeTaskBuilder {\n return {\n caseOf(name, run, input) {\n registeredCases.push({\n input,\n name,\n run: run as CaseRunner<unknown>,\n })\n },\n casesFromInputs(namePrefix, inputs, run) {\n inputs.forEach((input, index) => {\n registeredCases.push({\n input,\n name: `${namePrefix} #${index + 1}`,\n run: run as CaseRunner<unknown>,\n })\n })\n },\n }\n}\n\nlet activeCasesStack: RegisteredCase<unknown>[][] = []\n\nfunction withActiveCases<T>(cases: RegisteredCase<unknown>[], callback: () => T): T {\n activeCasesStack = [...activeCasesStack, cases]\n\n try {\n return callback()\n }\n finally {\n activeCasesStack = activeCasesStack.slice(0, -1)\n }\n}\n\nfunction getActiveCases(): RegisteredCase<unknown>[] {\n const active = activeCasesStack.at(-1)\n if (active == null) {\n throw new Error('caseOf/casesFromInputs must be called inside describeTask/describeEval.')\n }\n\n return active\n}\n\n/**\n * Registers one case in the currently active task scope.\n */\nexport function caseOf<TInput>(\n name: string,\n run: CaseRunner<TInput>,\n input: TInput,\n): void {\n getActiveCases().push({\n input,\n name,\n run: run as CaseRunner<unknown>,\n })\n}\n\n/**\n * Registers multiple cases in the currently active task scope.\n */\nexport function casesFromInputs<TInput>(\n namePrefix: string,\n inputs: readonly TInput[],\n run: CaseRunner<TInput>,\n): void {\n inputs.forEach((input, index) => {\n getActiveCases().push({\n input,\n name: `${namePrefix} #${index + 1}`,\n run: run as CaseRunner<unknown>,\n })\n })\n}\n\n/**\n * Defines one eval task with task/case semantics similar to Vitest.\n *\n * Use when:\n * - task behavior should be declared with `caseOf` and `casesFromInputs`\n * - business agent code should be imported and run from eval task files\n */\nexport function describeTask(\n name: string,\n build: ((builder: DescribeTaskBuilder) => void) | (() => void),\n options: DescribeTaskOptions = {},\n) {\n const registeredCases: RegisteredCase<unknown>[] = []\n const builder = createCaseBuilder(registeredCases)\n withActiveCases(registeredCases, () => {\n if (build.length > 0) {\n (build as (builder: DescribeTaskBuilder) => void)(builder)\n return\n }\n\n ;(build as () => void)()\n })\n\n const description = options.description ?? name\n\n const definition = defineEval({\n description,\n name,\n task: defineTask({\n id: name,\n async run(context): Promise<TaskRunOutput> {\n if (registeredCases.length === 0) {\n return {\n scores: [{ kind: 'exact', score: 1 }],\n }\n }\n\n const totalCases = registeredCases.length\n\n const caseScores: number[] = await Promise.all(\n registeredCases.map(async (taskCase, index) => {\n emitCaseStart(context.reporterHooks, {\n index,\n name: taskCase.name,\n total: totalCases,\n })\n\n let state: 'passed' | 'failed' = 'passed'\n\n try {\n await taskCase.run({\n ...context,\n matrix: {\n ...cloneCaseMatrix(context.task.matrix),\n inputs: taskCase.input,\n },\n })\n }\n catch {\n state = 'failed'\n }\n finally {\n emitCaseEnd(context.reporterHooks, {\n index,\n state,\n name: taskCase.name,\n total: totalCases,\n })\n }\n\n return state === 'passed' ? 1 : 0\n }),\n )\n\n const averageScore = caseScores.reduce((sum, score) => sum + score, 0) / caseScores.length\n\n return {\n scores: [{ kind: 'exact', score: averageScore }],\n }\n },\n }),\n })\n\n registerEvalDefinition(definition)\n\n return definition\n}\n\n/**\n * Alias of `describeTask` for eval-centric naming.\n */\nexport const describeEval = describeTask\n"],"mappings":";;;;AA0BA,SAAS,gBAAgB,QAA4E;AACnG,QAAO;EACL,MAAM,EACJ,GAAG,OAAO,MACX;EACD,MAAM,EACJ,GAAG,OAAO,MACX;EACD,KAAK,EACH,GAAG,OAAO,KACX;EACF;;AAGH,SAAS,cACP,OACA,SAKM;AACN,KAAI;AACF,SAAO,cAAc,QAAQ;SAEzB;;AAKR,SAAS,YACP,OACA,SAMM;AACN,KAAI;AACF,SAAO,YAAY,QAAQ;SAEvB;;AAiCR,SAAS,kBAAkB,iBAAiE;AAC1F,QAAO;EACL,OAAO,MAAM,KAAK,OAAO;AACvB,mBAAgB,KAAK;IACnB;IACA;IACK;IACN,CAAC;;EAEJ,gBAAgB,YAAY,QAAQ,KAAK;AACvC,UAAO,SAAS,OAAO,UAAU;AAC/B,oBAAgB,KAAK;KACnB;KACA,MAAM,GAAG,WAAW,IAAI,QAAQ;KAC3B;KACN,CAAC;KACF;;EAEL;;AAGH,IAAI,mBAAgD,EAAE;AAEtD,SAAS,gBAAmB,OAAkC,UAAsB;AAClF,oBAAmB,CAAC,GAAG,kBAAkB,MAAM;AAE/C,KAAI;AACF,SAAO,UAAU;WAEX;AACN,qBAAmB,iBAAiB,MAAM,GAAG,GAAG;;;AAIpD,SAAS,iBAA4C;CACnD,MAAM,SAAS,iBAAiB,GAAG,GAAG;AACtC,KAAI,UAAU,KACZ,OAAM,IAAI,MAAM,0EAA0E;AAG5F,QAAO;;;;;AAMT,SAAgB,OACd,MACA,KACA,OACM;AACN,iBAAgB,CAAC,KAAK;EACpB;EACA;EACK;EACN,CAAC;;;;;AAMJ,SAAgB,gBACd,YACA,QACA,KACM;AACN,QAAO,SAAS,OAAO,UAAU;AAC/B,kBAAgB,CAAC,KAAK;GACpB;GACA,MAAM,GAAG,WAAW,IAAI,QAAQ;GAC3B;GACN,CAAC;GACF;;;;;;;;;AAUJ,SAAgB,aACd,MACA,OACA,UAA+B,EAAE,EACjC;CACA,MAAM,kBAA6C,EAAE;CACrD,MAAM,UAAU,kBAAkB,gBAAgB;AAClD,iBAAgB,uBAAuB;AACrC,MAAI,MAAM,SAAS,GAAG;AACnB,SAAiD,QAAQ;AAC1D;;AAGA,SAAsB;GACxB;CAIF,MAAM,aAAa,WAAW;EAC5B,aAHkB,QAAQ,eAAe;EAIzC;EACA,MAAM,WAAW;GACf,IAAI;GACJ,MAAM,IAAI,SAAiC;AACzC,QAAI,gBAAgB,WAAW,EAC7B,QAAO,EACL,QAAQ,CAAC;KAAE,MAAM;KAAS,OAAO;KAAG,CAAC,EACtC;IAGH,MAAM,aAAa,gBAAgB;IAEnC,MAAM,aAAuB,MAAM,QAAQ,IACzC,gBAAgB,IAAI,OAAO,UAAU,UAAU;AAC7C,mBAAc,QAAQ,eAAe;MACnC;MACA,MAAM,SAAS;MACf,OAAO;MACR,CAAC;KAEF,IAAI,QAA6B;AAEjC,SAAI;AACF,YAAM,SAAS,IAAI;OACjB,GAAG;OACH,QAAQ;QACN,GAAG,gBAAgB,QAAQ,KAAK,OAAO;QACvC,QAAQ,SAAS;QAClB;OACF,CAAC;aAEE;AACJ,cAAQ;eAEF;AACN,kBAAY,QAAQ,eAAe;OACjC;OACA;OACA,MAAM,SAAS;OACf,OAAO;OACR,CAAC;;AAGJ,YAAO,UAAU,WAAW,IAAI;MAChC,CACH;AAID,WAAO,EACL,QAAQ,CAAC;KAAE,MAAM;KAAS,OAHP,WAAW,QAAQ,KAAK,UAAU,MAAM,OAAO,EAAE,GAAG,WAAW;KAGnC,CAAC,EACjD;;GAEJ,CAAC;EACH,CAAC;AAEF,wBAAuB,WAAW;AAElC,QAAO;;;;;AAMT,MAAa,eAAe"}