vieval 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/README.md +290 -0
  2. package/dist/assertions-DcAjfVDA.mjs +183 -0
  3. package/dist/assertions-DcAjfVDA.mjs.map +1 -0
  4. package/dist/cli/index.d.mts +11 -0
  5. package/dist/cli/index.mjs +1434 -0
  6. package/dist/cli/index.mjs.map +1 -0
  7. package/dist/config-D2fe1SnT.mjs +17 -0
  8. package/dist/config-D2fe1SnT.mjs.map +1 -0
  9. package/dist/config.d.mts +3 -0
  10. package/dist/config.mjs +3 -0
  11. package/dist/core/assertions/index.d.mts +2 -0
  12. package/dist/core/assertions/index.mjs +2 -0
  13. package/dist/core/inference-executors/index.d.mts +273 -0
  14. package/dist/core/inference-executors/index.mjs +225 -0
  15. package/dist/core/inference-executors/index.mjs.map +1 -0
  16. package/dist/core/processors/results/index.d.mts +96 -0
  17. package/dist/core/processors/results/index.mjs +64 -0
  18. package/dist/core/processors/results/index.mjs.map +1 -0
  19. package/dist/core/runner/index.d.mts +2 -0
  20. package/dist/core/runner/index.mjs +2 -0
  21. package/dist/expect-0jPJ7Zio.d.mts +2318 -0
  22. package/dist/expect-extensions-CwPtgTz8.mjs +13471 -0
  23. package/dist/expect-extensions-CwPtgTz8.mjs.map +1 -0
  24. package/dist/expect-i9WZWGrA.mjs +17 -0
  25. package/dist/expect-i9WZWGrA.mjs.map +1 -0
  26. package/dist/expect.d.mts +2 -0
  27. package/dist/expect.mjs +2 -0
  28. package/dist/index-DP7jsORl.d.mts +947 -0
  29. package/dist/index-oSXhM1zx.d.mts +314 -0
  30. package/dist/index.d.mts +92 -0
  31. package/dist/index.mjs +150 -0
  32. package/dist/index.mjs.map +1 -0
  33. package/dist/magic-string.es-CH1jwzMg.mjs +1013 -0
  34. package/dist/magic-string.es-CH1jwzMg.mjs.map +1 -0
  35. package/dist/models-D_MsBtYw.mjs +14 -0
  36. package/dist/models-D_MsBtYw.mjs.map +1 -0
  37. package/dist/plugin-DVaRZY2x.d.mts +84 -0
  38. package/dist/plugins/chat-models/index.d.mts +90 -0
  39. package/dist/plugins/chat-models/index.mjs +48 -0
  40. package/dist/plugins/chat-models/index.mjs.map +1 -0
  41. package/dist/registry-ChOjjdEC.mjs +245 -0
  42. package/dist/registry-ChOjjdEC.mjs.map +1 -0
  43. package/dist/runner-4ZsOveoY.mjs +480 -0
  44. package/dist/runner-4ZsOveoY.mjs.map +1 -0
  45. package/dist/testing/expect-extensions.d.mts +86 -0
  46. package/dist/testing/expect-extensions.mjs +2 -0
  47. package/package.json +88 -0
package/README.md ADDED
@@ -0,0 +1,290 @@
1
+ # Vieval
2
+
3
+ [![npm version][npm-version-src]][npm-version-href]
4
+ [![npm downloads][npm-downloads-src]][npm-downloads-href]
5
+ [![bundle][bundle-src]][bundle-href]
6
+ [![JSDocs][jsdocs-src]][jsdocs-href]
7
+ [![License][license-src]][license-href]
8
+ [![Ask DeepWiki][deepwiki-src]][deepwiki-href]
9
+
10
+ Vitest-style evaluation framework for agents, models, and task pipelines.
11
+
12
+ `vieval` keeps eval authoring close to product code while giving you repeatable project/eval/task matrix runs and a CLI summary experience.
13
+
14
+ ## Why Vieval
15
+
16
+ - Familiar authoring model (`describeEval`, `caseOf`, `expect`) instead of a separate eval DSL.
17
+ - Matrix control at three levels (project, eval, task) with deterministic merge rules.
18
+ - Works for chat and non-chat workloads through custom `projects[].executor`.
19
+ - Human-readable TTY output and machine-readable JSON output from the same command.
20
+
21
+ ## Quick Start
22
+
23
+ ### 1) Create a config
24
+
25
+ ```ts
26
+ // vieval.config.ts
27
+ import { defineConfig } from 'vieval'
28
+
29
+ export default defineConfig({
30
+ projects: [
31
+ {
32
+ name: 'default',
33
+ root: '.',
34
+ include: ['evals/*.eval.ts'],
35
+ },
36
+ ],
37
+ })
38
+ ```
39
+
40
+ ### 2) Create an eval
41
+
42
+ ```ts
43
+ // evals/smoke.eval.ts
44
+ import { caseOf, describeEval, expect } from 'vieval'
45
+
46
+ export default describeEval('smoke', () => {
47
+ caseOf('2 + 2 = 4', () => {
48
+ expect(2 + 2).toBe(4)
49
+ }, {})
50
+ })
51
+ ```
52
+
53
+ ### 3) Run
54
+
55
+ ```bash
56
+ pnpm -F vieval eval:run -- --config ./vieval.config.ts
57
+ ```
58
+
59
+ ## Core Concepts
60
+
61
+ ### Matrix layering
62
+
63
+ `vieval` expands matrices in scope order:
64
+
65
+ 1. `project` from `vieval.config.*`
66
+ 2. `eval` from `*.eval.ts`
67
+ 3. `task` from `defineTask(...)`
68
+
69
+ Within each scope, matrix layers apply in this order:
70
+
71
+ 1. `disable`
72
+ 2. `extend`
73
+ 3. `override`
74
+
75
+ Both `runMatrix` and `evalMatrix` are supported at each scope.
76
+
77
+ ### Matrix compatibility alias
78
+
79
+ `matrix` remains supported as a compatibility alias for `runMatrix.extend`.
80
+
81
+ ### Stable matrix artifact
82
+
83
+ Each scheduled run includes:
84
+
85
+ - `matrix.run`
86
+ - `matrix.eval`
87
+ - `matrix.meta.runRowId`
88
+ - `matrix.meta.evalRowId`
89
+
90
+ Use these fields to group and compare runs across models, rubrics, and scenarios.
91
+
92
+ ## Architecture
93
+
94
+ ```mermaid
95
+ flowchart LR
96
+ CLI["src/cli/index.ts\n(runTopLevelCli)"] --> RUN["src/cli/run.ts\n(runVievalCli + formatter)"]
97
+ RUN --> CFG["src/cli/config.ts\n(loadVievalCliConfig)"]
98
+ RUN --> DISC["src/cli/discovery.ts\n(discoverEvalFiles)"]
99
+ RUN --> REG["src/dsl/registry.ts\n(module registrations)"]
100
+ RUN --> DSL["src/dsl/task.ts\n(describeTask/caseOf hooks)"]
101
+ RUN --> REP["src/cli/reporters/*\n(summary + windowed + noop)"]
102
+
103
+ RUN --> COLLECT["src/core/runner/collect.ts\n(collectEvalEntries)"]
104
+ RUN --> SCHEDULE["src/core/runner/schedule.ts\n(createRunnerSchedule)"]
105
+ RUN --> EXEC["src/core/runner/run.ts\n(runScheduledTasks)"]
106
+ EXEC --> CTX["src/core/runner/task-context.ts\n(createTaskExecutionContext)"]
107
+ EXEC --> AGG["src/core/runner/aggregate.ts\n(aggregateRunResults)"]
108
+
109
+ AGG --> POLICY["src/core/processors/results/*\n(hybrid-threshold, max-failed-runs)"]
110
+ RUN --> POLICY
111
+
112
+ PROVIDERS["src/core/inference-executors/*\n(env, adapters, retry, openai)"] --> CTX
113
+ PLUGINS["src/plugins/chat-models/*\n(model aliases/plugins)"] --> CFG
114
+
115
+ TESTS["src/**/*.test.ts + tests/projects/*"] --> CLI
116
+ TESTS --> RUN
117
+ TESTS --> EXEC
118
+ TESTS --> DSL
119
+ TESTS --> REP
120
+ ```
121
+
122
+ ### Connection Notes
123
+
124
+ - `src/cli/run.ts` is the integration hub: it loads config, discovers eval files, prepares schedules, runs tasks, emits live reporter events, and formats static summaries.
125
+ - `src/dsl/task.ts` emits case lifecycle hooks (`onCaseStart` / `onCaseEnd`) that feed the live reporter when `reporterHooks` is present in task context.
126
+ - `src/core/runner/run.ts` owns task lifecycle (`onTaskStart` / `onTaskEnd`) and result aggregation boundaries.
127
+ - `src/cli/reporters/summary-reporter.ts` and `src/cli/reporters/renderers/windowed-renderer.ts` provide the Vitest-style live TTY experience; non-TTY falls back to noop reporter + final static formatter.
128
+
129
+ ### Runtime Sequence (`eval:run`)
130
+
131
+ ```mermaid
132
+ sequenceDiagram
133
+ participant U as User
134
+ participant C as src/cli/index.ts
135
+ participant R as src/cli/run.ts
136
+ participant L as src/cli/config.ts
137
+ participant D as src/cli/discovery.ts
138
+ participant S as src/core/runner/*
139
+ participant T as src/dsl/task.ts
140
+ participant P as src/cli/reporters/*
141
+
142
+ U->>C: pnpm run eval:run -- --config ...
143
+ C->>R: runVievalCli(options)
144
+ R->>L: loadVievalCliConfig()
145
+ R->>D: discoverEvalFiles()
146
+ R->>S: collectEvalEntries() + createRunnerSchedule()
147
+ R->>P: createCliReporter(isTTY)
148
+ R->>P: onRunStart + onTaskQueued
149
+ R->>S: runScheduledTasks(...)
150
+ S->>P: onTaskStart / onTaskEnd
151
+ S->>T: task.run(context)
152
+ T->>P: reporterHooks.onCaseStart / onCaseEnd
153
+ S-->>R: aggregated run results
154
+ R->>P: onRunEnd + dispose
155
+ R-->>C: CliRunOutput
156
+ C->>U: static summary (or JSON)
157
+ ```
158
+
159
+ ## Config Example (Control Group Style)
160
+
161
+ ```ts
162
+ import { defineConfig } from 'vieval'
163
+
164
+ export default defineConfig({
165
+ projects: [
166
+ {
167
+ name: 'chat-evals',
168
+ runMatrix: {
169
+ extend: {
170
+ model: ['gpt-4.1-mini', 'gpt-4.1'],
171
+ promptLanguage: ['en', 'zh'],
172
+ scenario: ['baseline', 'stress'],
173
+ },
174
+ },
175
+ evalMatrix: {
176
+ extend: {
177
+ rubric: ['strict', 'lenient'],
178
+ rubricModel: ['judge-mini', 'judge-large'],
179
+ },
180
+ },
181
+ },
182
+ ],
183
+ })
184
+ ```
185
+
186
+ ## Custom Executor Example
187
+
188
+ Use `projects[].executor` for non-chat workloads such as ASR, TTS, image, motion, or other domain-specific evaluators.
189
+
190
+ ```ts
191
+ import { defineConfig } from 'vieval'
192
+
193
+ export default defineConfig({
194
+ projects: [
195
+ {
196
+ name: 'motion-evals',
197
+ inferenceExecutors: [{ id: 'motion-engine' }],
198
+ models: [
199
+ {
200
+ id: 'motion-engine:v2',
201
+ inferenceExecutor: 'motion-engine',
202
+ inferenceExecutorId: 'motion-engine',
203
+ model: 'v2',
204
+ aliases: ['motion-default'],
205
+ },
206
+ ],
207
+ async executor(task, context) {
208
+ const model = context.model()
209
+ const success = model.model === 'v2' && task.matrix.run.scenario === 'baseline'
210
+
211
+ return {
212
+ id: task.id,
213
+ entryId: task.entry.id,
214
+ inferenceExecutorId: task.inferenceExecutor.id,
215
+ matrix: task.matrix,
216
+ scores: [{ kind: 'exact', score: success ? 1 : 0 }],
217
+ }
218
+ },
219
+ },
220
+ ],
221
+ })
222
+ ```
223
+
224
+ ## CLI
225
+
226
+ ```bash
227
+ vieval run [--config <path>] [--project <name>] [--json]
228
+ ```
229
+
230
+ Common usage:
231
+
232
+ ```bash
233
+ pnpm -F vieval eval:run
234
+ pnpm -F vieval eval:run -- --config ./vieval.config.ts
235
+ pnpm -F vieval eval:run -- --config ./vieval.config.ts --project chess --project moderation
236
+ pnpm -F vieval eval:run -- --json
237
+ pnpm -F vieval eval:run -- --help
238
+ ```
239
+
240
+ ## Examples In This Repository
241
+
242
+ - `packages/vieval/tests/projects/example-api-defining-new-task`
243
+ - `packages/vieval/tests/projects/example-api-config-matrix`
244
+ - `packages/vieval/tests/projects/example-api-load-datasource-as-cases`
245
+ - `packages/vieval/tests/projects/example-pattern-byoa-bring-your-own-agent`
246
+
247
+ ## Development
248
+
249
+ ```bash
250
+ pnpm install
251
+ pnpm -F vieval test:run
252
+ pnpm -F vieval typecheck
253
+ pnpm lint:fix
254
+ ```
255
+
256
+ ## When To Use / Not Use
257
+
258
+ Use `vieval` when:
259
+
260
+ - you want evals close to app code with Vitest-like ergonomics;
261
+ - you need matrix experiments and repeatable run metadata;
262
+ - you want one CLI for local diagnostics and CI export (`--json`).
263
+
264
+ Do not use `vieval` when:
265
+
266
+ - you need hosted dataset management, annotation UI, or SaaS observability out of the box;
267
+ - you only need one-off scripts without reusable eval definitions or matrix scheduling.
268
+
269
+ ## Acknowledgements
270
+
271
+ - [Vitest](https://github.com/vitest-dev/vitest)
272
+ - [LobeHub](https://github.com/lobehub/lobehub)
273
+ - [EvalSys](https://github.com/evalsys)
274
+
275
+ ## License
276
+
277
+ MIT
278
+
279
+ [npm-version-src]: https://img.shields.io/npm/v/vieval?style=flat&colorA=080f12&colorB=1fa669
280
+ [npm-version-href]: https://npmjs.com/package/vieval
281
+ [npm-downloads-src]: https://img.shields.io/npm/dm/vieval?style=flat&colorA=080f12&colorB=1fa669
282
+ [npm-downloads-href]: https://npmjs.com/package/vieval
283
+ [bundle-src]: https://img.shields.io/bundlephobia/minzip/vieval?style=flat&colorA=080f12&colorB=1fa669&label=minzip
284
+ [bundle-href]: https://bundlephobia.com/result?p=vieval
285
+ [license-src]: https://img.shields.io/github/license/vieval-dev/vieval.svg?style=flat&colorA=080f12&colorB=1fa669
286
+ [license-href]: https://github.com/vieval-dev/vieval/blob/main/LICENSE
287
+ [jsdocs-src]: https://img.shields.io/badge/jsdocs-reference-080f12?style=flat&colorA=080f12&colorB=1fa669
288
+ [jsdocs-href]: https://www.jsdocs.io/package/vieval
289
+ [deepwiki-src]: https://deepwiki.com/badge.svg
290
+ [deepwiki-href]: https://deepwiki.com/vieval-dev/vieval
@@ -0,0 +1,183 @@
1
+ //#region src/core/assertions/index.ts
/**
 * Canonicalizes text before keyword matching.
 *
 * Surrounding whitespace is stripped, every internal run of whitespace
 * (spaces, tabs, newlines) collapses to one space, and the result is
 * lowercased unless `caseSensitive` is true.
 *
 * Before: `"  Hello\nWorld  "`
 * After: `"hello world"`
 *
 * @param value - Raw text to normalize.
 * @param caseSensitive - When true, the original casing is preserved.
 * @returns The normalized text.
 */
function normalizeMatchText(value, caseSensitive) {
  // After trim() there is no leading/trailing whitespace, so splitting on
  // whitespace runs and re-joining with a single space collapses them.
  const collapsed = value.trim().split(/\s+/).join(" ");
  return caseSensitive ? collapsed : collapsed.toLowerCase();
}
/**
 * Restricts a score to the closed `0..1` range.
 *
 * `NaN` and negative values become `0`; values above `1` become `1`;
 * anything already in range is returned unchanged.
 *
 * @param score - Raw score value.
 * @returns The clamped score.
 */
function clampScore(score) {
  const belowRange = Number.isNaN(score) || score < 0;
  if (belowRange) return 0;
  return score > 1 ? 1 : score;
}
/**
 * Builds an assertion outcome record.
 *
 * The score is passed through `clampScore`, so the stored value is always
 * within `0..1`; all other fields are stored as given.
 *
 * @param id - Assertion id.
 * @param scoreKind - Score family recorded on the outcome.
 * @param pass - Whether the assertion passed.
 * @param score - Raw score; clamped before being stored.
 * @param reason - Human-readable explanation.
 * @returns The outcome object.
 */
function createOutcome(id, scoreKind, pass, score, reason) {
  const outcome = {
    id,
    pass,
    reason,
    score: clampScore(score),
    scoreKind
  };
  return outcome;
}
/**
 * Creates an assertion that requires specific keywords in model text.
 *
 * Both the text and each keyword are normalized with `normalizeMatchText`
 * before a substring check. In `'all'` mode (the default) every keyword must
 * be present; in any other mode one match is enough. The score is always the
 * fraction of keywords matched, regardless of mode.
 *
 * Example:
 * `expectMustInclude({ id: 'tone', keywords: ['calm', 'move'] })`
 *
 * @param options - Assertion id, keywords, optional `mode` and `caseSensitive`.
 * @returns An async assertion function taking the assertion context.
 */
function expectMustInclude(options) {
  return async (context) => {
    const { id, keywords } = options;
    const total = keywords.length;
    // Vacuous pass; this guard also keeps the division below away from zero.
    if (total === 0) return createOutcome(id, "exact", true, 1, "No required keywords configured.");
    const caseSensitive = options.caseSensitive ?? false;
    const haystack = normalizeMatchText(context.text, caseSensitive);
    const matched = keywords.filter((keyword) => haystack.includes(normalizeMatchText(keyword, caseSensitive)));
    const requireAll = (options.mode ?? "all") === "all";
    const pass = requireAll ? matched.length === total : matched.length > 0;
    // The reason text is the same for pass and fail.
    return createOutcome(id, "exact", pass, matched.length / total, `Matched ${matched.length}/${total} required keywords.`);
  };
}
/**
 * Creates an assertion that forbids specific keywords.
 *
 * Text and keywords are normalized with `normalizeMatchText` and compared by
 * substring. The outcome is all-or-nothing: score `1` when no keyword is
 * found, `0` otherwise, with the offending keywords listed in the reason.
 *
 * Example:
 * `expectMustExclude({ id: 'no-engine-dump', keywords: ['bestmove', 'ponder'] })`
 *
 * @param options - Assertion id, forbidden keywords, optional `caseSensitive`.
 * @returns An async assertion function taking the assertion context.
 */
function expectMustExclude(options) {
  return async (context) => {
    const { id, keywords } = options;
    if (keywords.length === 0) {
      return createOutcome(id, "exact", true, 1, "No excluded keywords configured.");
    }
    const caseSensitive = options.caseSensitive ?? false;
    const haystack = normalizeMatchText(context.text, caseSensitive);
    const offenders = keywords.filter((keyword) => haystack.includes(normalizeMatchText(keyword, caseSensitive)));
    if (offenders.length === 0) {
      return createOutcome(id, "exact", true, 1, "No forbidden keywords found.");
    }
    return createOutcome(id, "exact", false, 0, `Forbidden keywords found: ${offenders.join(", ")}`);
  };
}
/**
 * Creates an assertion based on a regular expression.
 *
 * The pattern is tested against the raw (un-normalized) response text.
 *
 * Example:
 * `expectRegex({ id: 'starts-with-act', pattern: /^<\|ACT:/ })`
 *
 * @param options - Assertion id and the `pattern` to test.
 * @returns An async assertion function taking the assertion context.
 */
function expectRegex(options) {
  return async (context) => {
    const { pattern } = options;
    // Global/sticky patterns carry `lastIndex` between `test()` calls, so a
    // reused assertion could start matching mid-string and give different
    // results on identical text. Reset only for those flags so other
    // patterns (including frozen ones) are never written to.
    const stateful = pattern.global || pattern.sticky;
    if (stateful) pattern.lastIndex = 0;
    const pass = pattern.test(context.text);
    // Leave the caller's pattern in a clean state as well.
    if (stateful) pattern.lastIndex = 0;
    return createOutcome(options.id, "exact", pass, pass ? 1 : 0, pass ? "Regex matched response text." : `Regex did not match: ${pattern}`);
  };
}
/**
 * Creates an assertion for structured model output.
 *
 * `options.validate` receives `context.structuredOutput`; its result decides
 * pass/fail. On failure, `options.failureReason` is used when provided.
 *
 * Example:
 * `expectStructuredOutput({ id: 'json-shape', validate: isMySchema })`
 *
 * @param options - Assertion id, validator, optional failure reason.
 * @returns An async assertion function taking the assertion context.
 */
function expectStructuredOutput(options) {
  return async (context) => {
    const pass = options.validate(context.structuredOutput);
    if (pass) {
      return createOutcome(options.id, "exact", pass, 1, "Structured output matched validator.");
    }
    return createOutcome(options.id, "exact", pass, 0, options.failureReason ?? "Structured output validation failed.");
  };
}
/**
 * Creates an assertion for validating tool-call arguments.
 *
 * Only the first tool call whose `name` equals `options.toolName` is
 * inspected. A missing call fails the assertion with score `0`.
 *
 * Example:
 * `expectToolCallArgs({ id: 'spark-command-shape', toolName: 'builtIn_sparkCommand', validate: isSparkArgs })`
 *
 * @param options - Assertion id, tool name, and argument validator.
 * @returns An async assertion function taking the assertion context.
 */
function expectToolCallArgs(options) {
  return async (context) => {
    const calls = context.toolCalls ?? [];
    const matchingCall = calls.find((call) => call.name === options.toolName);
    if (matchingCall == null) {
      return createOutcome(options.id, "exact", false, 0, `Missing tool call: ${options.toolName}`);
    }
    const pass = options.validate(matchingCall.args);
    const reason = pass
      ? `Tool call args validated for ${options.toolName}.`
      : `Tool call args validation failed for ${options.toolName}.`;
    return createOutcome(options.id, "exact", pass, pass ? 1 : 0, reason);
  };
}
/**
 * Creates a rubric assertion driven by teacher-model style scoring.
 *
 * The judge's score is clamped to `0..1` and compared against
 * `options.minScore` (default `0.7`). The reason is the judge's reason,
 * suffixed with the judge model id when one is reported.
 *
 * Example:
 * `expectRubric({ id: 'human-like-tone', judge: judgeFn, minScore: 0.8 })`
 *
 * @param options - Assertion id, async `judge` callback, optional `minScore`.
 * @returns An async assertion function taking the assertion context.
 */
function expectRubric(options) {
  return async (context) => {
    const verdict = await options.judge(context);
    const threshold = options.minScore ?? .7;
    const score = clampScore(verdict.score);
    const judgeSuffix = verdict.judgeModel ? ` (judge: ${verdict.judgeModel})` : "";
    return createOutcome(options.id, "judge", score >= threshold, score, `${verdict.reason}${judgeSuffix}`);
  };
}
/**
 * Creates a custom assertion with fully user-defined logic.
 *
 * `options.evaluate` may be sync or async; its `pass`, `score`, and `reason`
 * are forwarded into the outcome under `options.id` / `options.scoreKind`.
 *
 * Example:
 * `expectCustom({ id: 'stateful-window', scoreKind: 'exact', evaluate: (ctx) => ... })`
 *
 * @param options - Assertion id, score kind, and evaluator callback.
 * @returns An async assertion function taking the assertion context.
 */
function expectCustom(options) {
  return async (context) => {
    const { pass, score, reason } = await options.evaluate(context);
    return createOutcome(options.id, options.scoreKind, pass, score, reason);
  };
}
/**
 * Creates an inverse assertion.
 *
 * Runs the wrapped assertion, then flips its pass flag and mirrors its score
 * (`1 - score`). The wrapped outcome's score kind is kept, and its id and
 * reason are embedded in the new reason.
 *
 * Example:
 * `expectNot(expectMustInclude({ id: 'contains-engine-word', keywords: ['bestmove'] }), { id: 'no-engine-word' })`
 *
 * @param assertion - Assertion to invert.
 * @param options - Carries the id for the inverted outcome.
 * @returns An async assertion function taking the assertion context.
 */
function expectNot(assertion, options) {
  return async (context) => {
    const inner = await assertion(context);
    const invertedPass = !inner.pass;
    const invertedScore = 1 - inner.score;
    return createOutcome(options.id, inner.scoreKind, invertedPass, invertedScore, `NOT(${inner.id}): ${inner.reason}`);
  };
}
145
+ /**
146
+ * Executes assertion list and returns all outcomes.
147
+ *
148
+ * Call stack:
149
+ *
150
+ * {@link evaluateAssertions}
151
+ * -> `assertion(context)`
152
+ * -> {@link AssertionOutcome}[]
153
+ */
154
+ async function evaluateAssertions(assertions, context) {
155
+ const normalizedContext = {
156
+ state: context.state ?? /* @__PURE__ */ new Map(),
157
+ structuredOutput: context.structuredOutput,
158
+ text: context.text,
159
+ toolCalls: context.toolCalls
160
+ };
161
+ const outcomes = [];
162
+ for (const assertion of assertions) outcomes.push(await assertion(normalizedContext));
163
+ return outcomes;
164
+ }
/**
 * Converts assertion outcomes to run-score tuples consumed by aggregation.
 *
 * Each outcome contributes `{ kind, score }`, where `kind` is the outcome's
 * `scoreKind`; order is preserved.
 *
 * @param outcomes - Assertion outcomes.
 * @returns One run score per outcome.
 */
function toRunScores(outcomes) {
  return outcomes.map(({ scoreKind, score }) => ({ kind: scoreKind, score }));
}
/**
 * Returns failing assertion outcomes in original order.
 *
 * An outcome counts as failing when its `pass` field is falsy.
 *
 * @param outcomes - Assertion outcomes.
 * @returns A new array holding only the failing outcomes.
 */
function collectFailedAssertions(outcomes) {
  return outcomes.filter(({ pass }) => !pass);
}
180
+ //#endregion
181
+ export { expectMustInclude as a, expectRubric as c, normalizeMatchText as d, toRunScores as f, expectMustExclude as i, expectStructuredOutput as l, evaluateAssertions as n, expectNot as o, expectCustom as r, expectRegex as s, collectFailedAssertions as t, expectToolCallArgs as u };
182
+
183
+ //# sourceMappingURL=assertions-DcAjfVDA.mjs.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"assertions-DcAjfVDA.mjs","names":[],"sources":["../src/core/assertions/index.ts"],"sourcesContent":["import type { RunScore, RunScoreKind } from '../runner/aggregate'\n\n/**\n * Stores mutable evaluation state for stateful assertion flows.\n *\n * Use when:\n * - assertions need to share counters, rolling metrics, or memoized values\n * - a scenario evaluates multiple steps and expects state-aware checks\n */\nexport type AssertionState = Map<string, unknown>\n\n/**\n * Represents one tool call emitted by a model response.\n */\nexport interface ToolCall {\n /**\n * Tool name used by the call.\n */\n name: string\n /**\n * Tool arguments payload.\n */\n args: unknown\n}\n\n/**\n * Normalized assertion context for one model output.\n */\nexport interface AssertionContext {\n /**\n * Plain text model output used by text assertions.\n */\n text: string\n /**\n * Optional structured output parsed from the model response.\n */\n structuredOutput?: unknown\n /**\n * Optional tool calls extracted from the model response.\n */\n toolCalls?: readonly ToolCall[]\n /**\n * Shared mutable state for stateful assertion measurement.\n */\n state: AssertionState\n}\n\n/**\n * Result for one assertion evaluation.\n */\nexport interface AssertionOutcome {\n /**\n * Stable assertion id.\n */\n id: string\n /**\n * Assertion family emitted as run score kind.\n */\n scoreKind: RunScoreKind\n /**\n * Whether the assertion passed.\n */\n pass: boolean\n /**\n * Normalized score in the `0..1` range.\n */\n score: number\n /**\n * Human-readable reason for logs and reports.\n */\n reason: string\n}\n\n/**\n * Async assertion function used by eval scenarios.\n */\nexport type Assertion = (context: AssertionContext) => Promise<AssertionOutcome>\n\n/**\n * Normalizes text for matching.\n *\n * Before: `\" Hello\\nWorld \"`\n * After: `\"hello world\"`\n */\nexport function normalizeMatchText(value: string, caseSensitive: boolean): string {\n const compactedWhitespace = 
value.trim().replaceAll(/\\s+/g, ' ')\n\n if (caseSensitive) {\n return compactedWhitespace\n }\n\n return compactedWhitespace.toLowerCase()\n}\n\nfunction clampScore(score: number): number {\n if (Number.isNaN(score)) {\n return 0\n }\n\n if (score < 0) {\n return 0\n }\n\n if (score > 1) {\n return 1\n }\n\n return score\n}\n\nfunction createOutcome(\n id: string,\n scoreKind: RunScoreKind,\n pass: boolean,\n score: number,\n reason: string,\n): AssertionOutcome {\n return {\n id,\n pass,\n reason,\n score: clampScore(score),\n scoreKind,\n }\n}\n\n/**\n * Options for include-keyword assertions.\n */\nexport interface MustIncludeAssertionOptions {\n /**\n * Stable assertion id.\n */\n id: string\n /**\n * Keywords that must be present.\n */\n keywords: readonly string[]\n /**\n * Match mode for keywords.\n *\n * @default 'all'\n */\n mode?: 'all' | 'any'\n /**\n * Case-sensitive matching toggle.\n *\n * @default false\n */\n caseSensitive?: boolean\n}\n\n/**\n * Creates an assertion that requires specific keywords in model text.\n *\n * Example:\n * `expectMustInclude({ id: 'tone', keywords: ['calm', 'move'] })`\n */\nexport function expectMustInclude(options: MustIncludeAssertionOptions): Assertion {\n return async (context) => {\n if (options.keywords.length === 0) {\n return createOutcome(options.id, 'exact', true, 1, 'No required keywords configured.')\n }\n\n const caseSensitive = options.caseSensitive ?? false\n const normalizedText = normalizeMatchText(context.text, caseSensitive)\n const matches = options.keywords.filter((keyword) => {\n const normalizedKeyword = normalizeMatchText(keyword, caseSensitive)\n return normalizedText.includes(normalizedKeyword)\n })\n\n const mode = options.mode ?? 'all'\n const pass = mode === 'all'\n ? matches.length === options.keywords.length\n : matches.length > 0\n\n const score = options.keywords.length === 0 ? 
1 : matches.length / options.keywords.length\n\n return createOutcome(\n options.id,\n 'exact',\n pass,\n score,\n pass\n ? `Matched ${matches.length}/${options.keywords.length} required keywords.`\n : `Matched ${matches.length}/${options.keywords.length} required keywords.`,\n )\n }\n}\n\n/**\n * Options for exclude-keyword assertions.\n */\nexport interface MustExcludeAssertionOptions {\n /**\n * Stable assertion id.\n */\n id: string\n /**\n * Keywords that must not appear.\n */\n keywords: readonly string[]\n /**\n * Case-sensitive matching toggle.\n *\n * @default false\n */\n caseSensitive?: boolean\n}\n\n/**\n * Creates an assertion that forbids specific keywords.\n *\n * Example:\n * `expectMustExclude({ id: 'no-engine-dump', keywords: ['bestmove', 'ponder'] })`\n */\nexport function expectMustExclude(options: MustExcludeAssertionOptions): Assertion {\n return async (context) => {\n if (options.keywords.length === 0) {\n return createOutcome(options.id, 'exact', true, 1, 'No excluded keywords configured.')\n }\n\n const caseSensitive = options.caseSensitive ?? false\n const normalizedText = normalizeMatchText(context.text, caseSensitive)\n const forbiddenMatches = options.keywords.filter((keyword) => {\n const normalizedKeyword = normalizeMatchText(keyword, caseSensitive)\n return normalizedText.includes(normalizedKeyword)\n })\n\n const pass = forbiddenMatches.length === 0\n const score = pass ? 1 : 0\n\n return createOutcome(\n options.id,\n 'exact',\n pass,\n score,\n pass\n ? 
'No forbidden keywords found.'\n : `Forbidden keywords found: ${forbiddenMatches.join(', ')}`,\n )\n }\n}\n\n/**\n * Options for regular-expression assertions.\n */\nexport interface RegexAssertionOptions {\n /**\n * Stable assertion id.\n */\n id: string\n /**\n * Pattern to apply to model text.\n */\n pattern: RegExp\n}\n\n/**\n * Creates an assertion based on a regular expression.\n *\n * Example:\n * `expectRegex({ id: 'starts-with-act', pattern: /^<\\|ACT:/ })`\n */\nexport function expectRegex(options: RegexAssertionOptions): Assertion {\n return async (context) => {\n const pass = options.pattern.test(context.text)\n\n return createOutcome(\n options.id,\n 'exact',\n pass,\n pass ? 1 : 0,\n pass ? 'Regex matched response text.' : `Regex did not match: ${options.pattern}`,\n )\n }\n}\n\n/**\n * Options for structured-output assertions.\n */\nexport interface StructuredOutputAssertionOptions<TValue> {\n /**\n * Stable assertion id.\n */\n id: string\n /**\n * Runtime validator for structured output.\n */\n validate: (value: unknown) => value is TValue\n /**\n * Optional failure reason.\n */\n failureReason?: string\n}\n\n/**\n * Creates an assertion for structured model output.\n *\n * Example:\n * `expectStructuredOutput({ id: 'json-shape', validate: isMySchema })`\n */\nexport function expectStructuredOutput<TValue>(options: StructuredOutputAssertionOptions<TValue>): Assertion {\n return async (context) => {\n const pass = options.validate(context.structuredOutput)\n\n return createOutcome(\n options.id,\n 'exact',\n pass,\n pass ? 1 : 0,\n pass ? 'Structured output matched validator.' : (options.failureReason ?? 
'Structured output validation failed.'),\n )\n }\n}\n\n/**\n * Options for tool-call argument assertions.\n */\nexport interface ToolCallArgsAssertionOptions {\n /**\n * Stable assertion id.\n */\n id: string\n /**\n * Tool name to inspect.\n */\n toolName: string\n /**\n * Runtime validator for tool arguments.\n */\n validate: (args: unknown) => boolean\n}\n\n/**\n * Creates an assertion for validating tool-call arguments.\n *\n * Example:\n * `expectToolCallArgs({ id: 'spark-command-shape', toolName: 'builtIn_sparkCommand', validate: isSparkArgs })`\n */\nexport function expectToolCallArgs(options: ToolCallArgsAssertionOptions): Assertion {\n return async (context) => {\n const targetCall = (context.toolCalls ?? []).find(call => call.name === options.toolName)\n\n if (targetCall == null) {\n return createOutcome(options.id, 'exact', false, 0, `Missing tool call: ${options.toolName}`)\n }\n\n const pass = options.validate(targetCall.args)\n\n return createOutcome(\n options.id,\n 'exact',\n pass,\n pass ? 1 : 0,\n pass ? 
`Tool call args validated for ${options.toolName}.` : `Tool call args validation failed for ${options.toolName}.`,\n )\n }\n}\n\n/**\n * Rubric judge result returned by teacher-model or rubric logic.\n */\nexport interface RubricJudgeResult {\n /**\n * Normalized score in the `0..1` range.\n */\n score: number\n /**\n * Judge explanation text.\n */\n reason: string\n /**\n * Optional judge model id.\n */\n judgeModel?: string\n}\n\n/**\n * Options for rubric assertions.\n */\nexport interface RubricAssertionOptions {\n /**\n * Stable assertion id.\n */\n id: string\n /**\n * Async rubric judge callback.\n */\n judge: (context: AssertionContext) => Promise<RubricJudgeResult>\n /**\n * Minimum passing score.\n *\n * @default 0.7\n */\n minScore?: number\n}\n\n/**\n * Creates a rubric assertion driven by teacher-model style scoring.\n *\n * Example:\n * `expectRubric({ id: 'human-like-tone', judge: judgeFn, minScore: 0.8 })`\n */\nexport function expectRubric(options: RubricAssertionOptions): Assertion {\n return async (context) => {\n const result = await options.judge(context)\n const minScore = options.minScore ?? 0.7\n const normalizedScore = clampScore(result.score)\n const pass = normalizedScore >= minScore\n\n return createOutcome(\n options.id,\n 'judge',\n pass,\n normalizedScore,\n `${result.reason}${result.judgeModel ? ` (judge: ${result.judgeModel})` : ''}`,\n )\n }\n}\n\n/**\n * Options for custom assertions.\n */\nexport interface CustomAssertionOptions {\n /**\n * Stable assertion id.\n */\n id: string\n /**\n * Score family emitted by this custom assertion.\n */\n scoreKind: RunScoreKind\n /**\n * Custom evaluator callback.\n */\n evaluate: (context: AssertionContext) => Promise<{ pass: boolean, reason: string, score: number }> | { pass: boolean, reason: string, score: number }\n}\n\n/**\n * Creates a custom assertion with fully user-defined logic.\n *\n * Example:\n * `expectCustom({ id: 'stateful-window', scoreKind: 'exact', evaluate: (ctx) => ... 
})`\n */\nexport function expectCustom(options: CustomAssertionOptions): Assertion {\n return async (context) => {\n const result = await options.evaluate(context)\n\n return createOutcome(options.id, options.scoreKind, result.pass, result.score, result.reason)\n }\n}\n\n/**\n * Creates an inverse assertion.\n *\n * Example:\n * `expectNot(expectMustInclude({ id: 'contains-engine-word', keywords: ['bestmove'] }), { id: 'no-engine-word' })`\n */\nexport function expectNot(assertion: Assertion, options: { id: string }): Assertion {\n return async (context) => {\n const baseOutcome = await assertion(context)\n\n return createOutcome(\n options.id,\n baseOutcome.scoreKind,\n !baseOutcome.pass,\n 1 - baseOutcome.score,\n `NOT(${baseOutcome.id}): ${baseOutcome.reason}`,\n )\n }\n}\n\n/**\n * Executes assertion list and returns all outcomes.\n *\n * Call stack:\n *\n * {@link evaluateAssertions}\n * -> `assertion(context)`\n * -> {@link AssertionOutcome}[]\n */\nexport async function evaluateAssertions(\n assertions: readonly Assertion[],\n context: Omit<AssertionContext, 'state'> & { state?: AssertionState },\n): Promise<AssertionOutcome[]> {\n const state = context.state ?? 
new Map<string, unknown>()\n const normalizedContext: AssertionContext = {\n state,\n structuredOutput: context.structuredOutput,\n text: context.text,\n toolCalls: context.toolCalls,\n }\n\n const outcomes: AssertionOutcome[] = []\n\n for (const assertion of assertions) {\n outcomes.push(await assertion(normalizedContext))\n }\n\n return outcomes\n}\n\n/**\n * Converts assertion outcomes to run-score tuples consumed by aggregation.\n */\nexport function toRunScores(outcomes: readonly AssertionOutcome[]): RunScore[] {\n return outcomes.map(outcome => ({\n kind: outcome.scoreKind,\n score: outcome.score,\n }))\n}\n\n/**\n * Returns failing assertion outcomes in original order.\n */\nexport function collectFailedAssertions(outcomes: readonly AssertionOutcome[]): AssertionOutcome[] {\n return outcomes.filter(outcome => !outcome.pass)\n}\n"],"mappings":";;;;;;;AAoFA,SAAgB,mBAAmB,OAAe,eAAgC;CAChF,MAAM,sBAAsB,MAAM,MAAM,CAAC,WAAW,QAAQ,IAAI;AAEhE,KAAI,cACF,QAAO;AAGT,QAAO,oBAAoB,aAAa;;AAG1C,SAAS,WAAW,OAAuB;AACzC,KAAI,OAAO,MAAM,MAAM,CACrB,QAAO;AAGT,KAAI,QAAQ,EACV,QAAO;AAGT,KAAI,QAAQ,EACV,QAAO;AAGT,QAAO;;AAGT,SAAS,cACP,IACA,WACA,MACA,OACA,QACkB;AAClB,QAAO;EACL;EACA;EACA;EACA,OAAO,WAAW,MAAM;EACxB;EACD;;;;;;;;AAmCH,SAAgB,kBAAkB,SAAiD;AACjF,QAAO,OAAO,YAAY;AACxB,MAAI,QAAQ,SAAS,WAAW,EAC9B,QAAO,cAAc,QAAQ,IAAI,SAAS,MAAM,GAAG,mCAAmC;EAGxF,MAAM,gBAAgB,QAAQ,iBAAiB;EAC/C,MAAM,iBAAiB,mBAAmB,QAAQ,MAAM,cAAc;EACtE,MAAM,UAAU,QAAQ,SAAS,QAAQ,YAAY;GACnD,MAAM,oBAAoB,mBAAmB,SAAS,cAAc;AACpE,UAAO,eAAe,SAAS,kBAAkB;IACjD;EAGF,MAAM,QADO,QAAQ,QAAQ,WACP,QAClB,QAAQ,WAAW,QAAQ,SAAS,SACpC,QAAQ,SAAS;EAErB,MAAM,QAAQ,QAAQ,SAAS,WAAW,IAAI,IAAI,QAAQ,SAAS,QAAQ,SAAS;AAEpF,SAAO,cACL,QAAQ,IACR,SACA,MACA,OACA,OACI,WAAW,QAAQ,OAAO,GAAG,QAAQ,SAAS,OAAO,uBACrD,WAAW,QAAQ,OAAO,GAAG,QAAQ,SAAS,OAAO,qBAC1D;;;;;;;;;AA8BL,SAAgB,kBAAkB,SAAiD;AACjF,QAAO,OAAO,YAAY;AACxB,MAAI,QAAQ,SAAS,WAAW,EAC9B,QAAO,cAAc,QAAQ,IAAI,SAAS,MAAM,GAAG,mCAAmC;EAGxF,MAAM,gBAAgB,QAAQ,iBAAiB;EAC/C,MAAM,iBAAiB,mBAAmB,QAAQ,MAAM,cAAc;EACtE,MAAM,mBAAmB,QAAQ,SAAS,
QAAQ,YAAY;GAC5D,MAAM,oBAAoB,mBAAmB,SAAS,cAAc;AACpE,UAAO,eAAe,SAAS,kBAAkB;IACjD;EAEF,MAAM,OAAO,iBAAiB,WAAW;EACzC,MAAM,QAAQ,OAAO,IAAI;AAEzB,SAAO,cACL,QAAQ,IACR,SACA,MACA,OACA,OACI,iCACA,6BAA6B,iBAAiB,KAAK,KAAK,GAC7D;;;;;;;;;AAwBL,SAAgB,YAAY,SAA2C;AACrE,QAAO,OAAO,YAAY;EACxB,MAAM,OAAO,QAAQ,QAAQ,KAAK,QAAQ,KAAK;AAE/C,SAAO,cACL,QAAQ,IACR,SACA,MACA,OAAO,IAAI,GACX,OAAO,iCAAiC,wBAAwB,QAAQ,UACzE;;;;;;;;;AA4BL,SAAgB,uBAA+B,SAA8D;AAC3G,QAAO,OAAO,YAAY;EACxB,MAAM,OAAO,QAAQ,SAAS,QAAQ,iBAAiB;AAEvD,SAAO,cACL,QAAQ,IACR,SACA,MACA,OAAO,IAAI,GACX,OAAO,yCAA0C,QAAQ,iBAAiB,uCAC3E;;;;;;;;;AA4BL,SAAgB,mBAAmB,SAAkD;AACnF,QAAO,OAAO,YAAY;EACxB,MAAM,cAAc,QAAQ,aAAa,EAAE,EAAE,MAAK,SAAQ,KAAK,SAAS,QAAQ,SAAS;AAEzF,MAAI,cAAc,KAChB,QAAO,cAAc,QAAQ,IAAI,SAAS,OAAO,GAAG,sBAAsB,QAAQ,WAAW;EAG/F,MAAM,OAAO,QAAQ,SAAS,WAAW,KAAK;AAE9C,SAAO,cACL,QAAQ,IACR,SACA,MACA,OAAO,IAAI,GACX,OAAO,gCAAgC,QAAQ,SAAS,KAAK,wCAAwC,QAAQ,SAAS,GACvH;;;;;;;;;AAgDL,SAAgB,aAAa,SAA4C;AACvE,QAAO,OAAO,YAAY;EACxB,MAAM,SAAS,MAAM,QAAQ,MAAM,QAAQ;EAC3C,MAAM,WAAW,QAAQ,YAAY;EACrC,MAAM,kBAAkB,WAAW,OAAO,MAAM;EAChD,MAAM,OAAO,mBAAmB;AAEhC,SAAO,cACL,QAAQ,IACR,SACA,MACA,iBACA,GAAG,OAAO,SAAS,OAAO,aAAa,YAAY,OAAO,WAAW,KAAK,KAC3E;;;;;;;;;AA4BL,SAAgB,aAAa,SAA4C;AACvE,QAAO,OAAO,YAAY;EACxB,MAAM,SAAS,MAAM,QAAQ,SAAS,QAAQ;AAE9C,SAAO,cAAc,QAAQ,IAAI,QAAQ,WAAW,OAAO,MAAM,OAAO,OAAO,OAAO,OAAO;;;;;;;;;AAUjG,SAAgB,UAAU,WAAsB,SAAoC;AAClF,QAAO,OAAO,YAAY;EACxB,MAAM,cAAc,MAAM,UAAU,QAAQ;AAE5C,SAAO,cACL,QAAQ,IACR,YAAY,WACZ,CAAC,YAAY,MACb,IAAI,YAAY,OAChB,OAAO,YAAY,GAAG,KAAK,YAAY,SACxC;;;;;;;;;;;;AAaL,eAAsB,mBACpB,YACA,SAC6B;CAE7B,MAAM,oBAAsC;EAC1C,OAFY,QAAQ,yBAAS,IAAI,KAAsB;EAGvD,kBAAkB,QAAQ;EAC1B,MAAM,QAAQ;EACd,WAAW,QAAQ;EACpB;CAED,MAAM,WAA+B,EAAE;AAEvC,MAAK,MAAM,aAAa,WACtB,UAAS,KAAK,MAAM,UAAU,kBAAkB,CAAC;AAGnD,QAAO;;;;;AAMT,SAAgB,YAAY,UAAmD;AAC7E,QAAO,SAAS,KAAI,aAAY;EAC9B,MAAM,QAAQ;EACd,OAAO,QAAQ;EAChB,EAAE;;;;;AAML,SAAgB,wBAAwB,UAA2D;AACjG,QAAO,SAAS,QAAO,YAAW,CAAC,QAAQ,KAAK"}
@@ -0,0 +1,11 @@
1
+ //#region src/cli/index.d.ts
2
/** Command names declared for the top-level CLI; `run` is the only one listed here. */
+ type Command = 'run';
3
/** Result shape returned by `parseTopLevelCliArguments`. */
+ interface ParsedTopLevelCliArguments {
4
/** A declared command, or `'help'`. */
+ command: Command | 'help';
5
/** NOTE(review): presumably the arguments left over for the selected command — confirm against the implementation. */
+ commandArgv: string[];
6
+ }
7
/** Parses a raw argument list into a command plus `commandArgv`. */
+ declare function parseTopLevelCliArguments(argv: readonly string[]): ParsedTopLevelCliArguments;
8
/** Top-level CLI entry point for the given argument list; the returned promise carries no value. */
+ declare function runTopLevelCli(argv: readonly string[]): Promise<void>;
9
+ //#endregion
10
+ export { parseTopLevelCliArguments, runTopLevelCli };
11
+ //# sourceMappingURL=index.d.mts.map