vieval 0.0.6 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +219 -109
- package/dist/bin/vieval.mjs +1 -1
- package/dist/cli/index.mjs +1 -1
- package/dist/{cli-sanbKtQq.mjs → cli-ImxGpoYQ.mjs} +1185 -161
- package/dist/cli-ImxGpoYQ.mjs.map +1 -0
- package/dist/config.d.mts +2 -2
- package/dist/core/assertions/index.d.mts +1 -1
- package/dist/core/processors/results/index.d.mts +1 -1
- package/dist/core/runner/index.d.mts +1 -1
- package/dist/{index-DBZKkpBe.d.mts → index-5R1_k2nv.d.mts} +92 -2
- package/dist/index.d.mts +11 -3
- package/dist/index.mjs +109 -38
- package/dist/index.mjs.map +1 -1
- package/dist/plugins/chat-models/index.d.mts +7 -1
- package/dist/plugins/chat-models/index.mjs +2 -0
- package/dist/plugins/chat-models/index.mjs.map +1 -1
- package/dist/{registry-CcKZqDJY.mjs → registry-BHGMxjpA.mjs} +140 -4
- package/dist/registry-BHGMxjpA.mjs.map +1 -0
- package/package.json +2 -1
- package/dist/cli-sanbKtQq.mjs.map +0 -1
- package/dist/registry-CcKZqDJY.mjs.map +0 -1
package/README.md
CHANGED
|
@@ -9,168 +9,192 @@
|
|
|
9
9
|
|
|
10
10
|
Vitest-style evaluation framework for agents, models, and task pipelines.
|
|
11
11
|
|
|
12
|
-
`vieval` keeps eval authoring close to product code while giving you repeatable
|
|
12
|
+
`vieval` keeps eval authoring close to product code while giving you repeatable task discovery, matrix scheduling, live CLI output, JSON artifacts, and report commands.
|
|
13
13
|
|
|
14
14
|
## Why Vieval
|
|
15
15
|
|
|
16
|
-
- Familiar
|
|
17
|
-
-
|
|
18
|
-
-
|
|
19
|
-
- Human-readable
|
|
16
|
+
- Familiar eval files with `describeTask`, `caseOf`, `casesFromInputs`, and `expect`.
|
|
17
|
+
- Project, eval, and task matrix layers for model, scenario, rubric, and dataset variants.
|
|
18
|
+
- Built-in chat-model registration through `ChatModels`, plus custom project executors for non-chat workloads.
|
|
19
|
+
- Human-readable terminal output and machine-readable JSON/report artifacts from the same CLI.
|
|
20
|
+
- Importable runner, scheduler, assertion, config, plugin, and testing entrypoints for advanced integration.
|
|
20
21
|
|
|
21
22
|
## Quick Start
|
|
22
23
|
|
|
23
|
-
### 1
|
|
24
|
+
### Step 1. Create a config
|
|
24
25
|
|
|
25
26
|
```ts
|
|
26
27
|
// vieval.config.ts
|
|
27
28
|
import { defineConfig } from 'vieval'
|
|
29
|
+
import { chatModelFrom, ChatModels } from 'vieval/plugins/chat-models'
|
|
28
30
|
|
|
29
31
|
export default defineConfig({
|
|
32
|
+
plugins: [
|
|
33
|
+
ChatModels({
|
|
34
|
+
models: [
|
|
35
|
+
chatModelFrom({
|
|
36
|
+
aliases: ['agent-mini', 'judge-mini'],
|
|
37
|
+
inferenceExecutor: 'openai',
|
|
38
|
+
model: 'gpt-4.1-mini',
|
|
39
|
+
}),
|
|
40
|
+
],
|
|
41
|
+
}),
|
|
42
|
+
],
|
|
30
43
|
projects: [
|
|
31
44
|
{
|
|
32
45
|
name: 'default',
|
|
33
46
|
root: '.',
|
|
34
47
|
include: ['evals/*.eval.ts'],
|
|
48
|
+
runMatrix: {
|
|
49
|
+
extend: {
|
|
50
|
+
model: ['agent-mini'],
|
|
51
|
+
scenario: ['baseline'],
|
|
52
|
+
},
|
|
53
|
+
},
|
|
54
|
+
evalMatrix: {
|
|
55
|
+
extend: {
|
|
56
|
+
rubric: ['default'],
|
|
57
|
+
},
|
|
58
|
+
},
|
|
35
59
|
},
|
|
36
60
|
],
|
|
37
61
|
})
|
|
38
62
|
```
|
|
39
63
|
|
|
40
|
-
### 2
|
|
64
|
+
### Step 2. Create an eval task
|
|
41
65
|
|
|
42
66
|
```ts
|
|
43
67
|
// evals/smoke.eval.ts
|
|
44
|
-
import { caseOf,
|
|
68
|
+
import { caseOf, describeTask, expect } from 'vieval'
|
|
45
69
|
|
|
46
|
-
|
|
47
|
-
caseOf('
|
|
70
|
+
describeTask('smoke', () => {
|
|
71
|
+
caseOf('arithmetic-default', (context) => {
|
|
72
|
+
expect(context.task.matrix.run.scenario).toBe('baseline')
|
|
48
73
|
expect(2 + 2).toBe(4)
|
|
74
|
+
}, {
|
|
75
|
+
input: {
|
|
76
|
+
prompt: 'Check simple arithmetic.',
|
|
77
|
+
},
|
|
49
78
|
})
|
|
50
79
|
})
|
|
51
80
|
```
|
|
52
81
|
|
|
53
|
-
### 3
|
|
82
|
+
### Step 3. Run
|
|
54
83
|
|
|
55
84
|
```bash
|
|
56
85
|
pnpm -F vieval eval:run -- --config ./vieval.config.ts
|
|
57
86
|
```
|
|
58
87
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
### Matrix layering
|
|
62
|
-
|
|
63
|
-
`vieval` expands matrices in scope order:
|
|
88
|
+
The equivalent command using the published `vieval` binary is:
|
|
64
89
|
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
90
|
+
```bash
|
|
91
|
+
vieval run --config ./vieval.config.ts
|
|
92
|
+
```
|
|
68
93
|
|
|
69
|
-
|
|
94
|
+
## Authoring API
|
|
70
95
|
|
|
71
|
-
|
|
72
|
-
2. `extend`
|
|
73
|
-
3. `override`
|
|
74
|
-
|
|
75
|
-
Both `runMatrix` and `evalMatrix` are supported at each scope.
|
|
96
|
+
Use `describeTask` for the common Vitest-like authoring path:
|
|
76
97
|
|
|
77
|
-
|
|
98
|
+
```ts
|
|
99
|
+
import { caseOf, describeTask, expect } from 'vieval'
|
|
100
|
+
|
|
101
|
+
describeTask('prompt-language-ablation', () => {
|
|
102
|
+
caseOf('resolves matrix axes', async (context) => {
|
|
103
|
+
const selectedModel = context.model()
|
|
104
|
+
const language = context.task.matrix.run.promptLanguage
|
|
105
|
+
const scenario = context.task.matrix.run.scenario
|
|
106
|
+
|
|
107
|
+
expect(selectedModel.id.length).toBeGreaterThan(0)
|
|
108
|
+
expect(language).toBeDefined()
|
|
109
|
+
expect(scenario).toBeDefined()
|
|
110
|
+
}, {
|
|
111
|
+
input: {
|
|
112
|
+
prompt: 'Summarize the position in one sentence.',
|
|
113
|
+
},
|
|
114
|
+
})
|
|
115
|
+
})
|
|
116
|
+
```
|
|
78
117
|
|
|
79
|
-
|
|
118
|
+
Use the builder style (`casesFromInputs`) when loading a batch of inputs:
|
|
80
119
|
|
|
81
|
-
|
|
120
|
+
```ts
|
|
121
|
+
import { describeTask, expect } from 'vieval'
|
|
82
122
|
|
|
83
|
-
|
|
123
|
+
const arithmeticCases = [
|
|
124
|
+
{ name: 'addition-small', input: { a: 1, b: 2, expected: 3 } },
|
|
125
|
+
{ name: 'addition-large', input: { a: 20, b: 22, expected: 42 } },
|
|
126
|
+
]
|
|
84
127
|
|
|
85
|
-
-
|
|
86
|
-
-
|
|
87
|
-
|
|
88
|
-
|
|
128
|
+
describeTask('arithmetic-quality', ({ casesFromInputs }) => {
|
|
129
|
+
casesFromInputs('arithmetic-case', arithmeticCases, ({ matrix }) => {
|
|
130
|
+
const result = matrix.inputs.input.a + matrix.inputs.input.b
|
|
131
|
+
expect(result).toBe(matrix.inputs.input.expected)
|
|
132
|
+
})
|
|
133
|
+
})
|
|
134
|
+
```
|
|
89
135
|
|
|
90
|
-
|
|
136
|
+
`describeEval` remains exported as an alias of `describeTask`, but new examples should prefer `describeTask` because task/case semantics are the primary runtime model.
|
|
91
137
|
|
|
92
|
-
##
|
|
138
|
+
## Matrix Model
|
|
93
139
|
|
|
94
|
-
|
|
95
|
-
flowchart LR
|
|
96
|
-
BIN["src/bin/vieval.ts\n(executable shim)"] --> CLI["src/cli/index.ts\n(runTopLevelCli)"]
|
|
97
|
-
CLI --> RUN["src/cli/run.ts\n(runVievalCli + formatter)"]
|
|
98
|
-
RUN --> CFG["src/cli/config.ts\n(loadVievalCliConfig)"]
|
|
99
|
-
RUN --> DISC["src/cli/discovery.ts\n(discoverEvalFiles)"]
|
|
100
|
-
RUN --> REG["src/dsl/registry.ts\n(module registrations)"]
|
|
101
|
-
RUN --> DSL["src/dsl/task.ts\n(describeTask/caseOf hooks)"]
|
|
102
|
-
RUN --> REP["src/cli/reporters/*\n(summary + windowed + noop)"]
|
|
103
|
-
|
|
104
|
-
RUN --> COLLECT["src/core/runner/collect.ts\n(collectEvalEntries)"]
|
|
105
|
-
RUN --> SCHEDULE["src/core/runner/schedule.ts\n(createRunnerSchedule)"]
|
|
106
|
-
RUN --> EXEC["src/core/runner/run.ts\n(runScheduledTasks)"]
|
|
107
|
-
EXEC --> CTX["src/core/runner/task-context.ts\n(createTaskExecutionContext)"]
|
|
108
|
-
EXEC --> AGG["src/core/runner/aggregate.ts\n(aggregateRunResults)"]
|
|
140
|
+
`vieval` expands matrix scopes in this order:
|
|
109
141
|
|
|
110
|
-
|
|
111
|
-
|
|
142
|
+
1. Project config from `vieval.config.*`.
|
|
143
|
+
2. Eval definition from `defineEval(...)`.
|
|
144
|
+
3. Task definition from `defineTask(...)`.
|
|
112
145
|
|
|
113
|
-
|
|
114
|
-
PLUGINS["src/plugins/chat-models/*\n(model aliases/plugins)"] --> CFG
|
|
146
|
+
Within each scope, matrix layers resolve in this order:
|
|
115
147
|
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
TESTS --> DSL
|
|
120
|
-
TESTS --> REP
|
|
121
|
-
```
|
|
148
|
+
1. `disable`
|
|
149
|
+
2. `extend`
|
|
150
|
+
3. `override`
|
|
122
151
|
|
|
123
|
-
|
|
152
|
+
Both `runMatrix` and `evalMatrix` are supported at project, eval, and task scope. A flat object such as `runMatrix: { scenario: ['baseline'] }` is normalized to `runMatrix.extend`, i.e. it is treated as `runMatrix: { extend: { scenario: ['baseline'] } }`; the layered form is preferred for new docs and examples.
|
|
124
153
|
|
|
125
|
-
|
|
126
|
-
- `src/dsl/task.ts` emits case lifecycle hooks (`onCaseStart` / `onCaseEnd`) that feed the live reporter when `reporterHooks` is present in task context.
|
|
127
|
-
- `src/core/runner/run.ts` owns task lifecycle (`onTaskStart` / `onTaskEnd`) and result aggregation boundaries.
|
|
128
|
-
- `src/cli/reporters/summary-reporter.ts` and `src/cli/reporters/renderers/windowed-renderer.ts` provide the Vitest-style live TTY experience; non-TTY falls back to noop reporter + final static formatter.
|
|
154
|
+
Each scheduled task receives stable matrix metadata:
|
|
129
155
|
|
|
130
|
-
|
|
156
|
+
- `task.matrix.run`
|
|
157
|
+
- `task.matrix.eval`
|
|
158
|
+
- `task.matrix.meta.runRowId`
|
|
159
|
+
- `task.matrix.meta.evalRowId`
|
|
160
|
+
- `task.matrix.inputs`, populated from `caseOf(..., { input })` and `casesFromInputs(...)`
|
|
131
161
|
|
|
132
|
-
|
|
133
|
-
sequenceDiagram
|
|
134
|
-
participant U as User
|
|
135
|
-
participant B as src/bin/vieval.ts
|
|
136
|
-
participant C as src/cli/index.ts
|
|
137
|
-
participant R as src/cli/run.ts
|
|
138
|
-
participant L as src/cli/config.ts
|
|
139
|
-
participant D as src/cli/discovery.ts
|
|
140
|
-
participant S as src/core/runner/*
|
|
141
|
-
participant T as src/dsl/task.ts
|
|
142
|
-
participant P as src/cli/reporters/*
|
|
143
|
-
|
|
144
|
-
U->>B: pnpm run eval:run -- --config ...
|
|
145
|
-
B->>C: runTopLevelCli(argv)
|
|
146
|
-
C->>R: runVievalCli(options)
|
|
147
|
-
R->>L: loadVievalCliConfig()
|
|
148
|
-
R->>D: discoverEvalFiles()
|
|
149
|
-
R->>S: collectEvalEntries() + createRunnerSchedule()
|
|
150
|
-
R->>P: createCliReporter(isTTY)
|
|
151
|
-
R->>P: onRunStart + onTaskQueued
|
|
152
|
-
R->>S: runScheduledTasks(...)
|
|
153
|
-
S->>P: onTaskStart / onTaskEnd
|
|
154
|
-
S->>T: task.run(context)
|
|
155
|
-
T->>P: reporterHooks.onCaseStart / onCaseEnd
|
|
156
|
-
S-->>R: aggregated run results
|
|
157
|
-
R->>P: onRunEnd + dispose
|
|
158
|
-
R-->>C: CliRunOutput
|
|
159
|
-
C->>U: static summary (or JSON)
|
|
160
|
-
```
|
|
161
|
-
|
|
162
|
-
## Config Example (Control Group Style)
|
|
162
|
+
## Config Example
|
|
163
163
|
|
|
164
164
|
```ts
|
|
165
165
|
import { defineConfig } from 'vieval'
|
|
166
|
+
import { chatModelFrom, ChatModels } from 'vieval/plugins/chat-models'
|
|
166
167
|
|
|
167
168
|
export default defineConfig({
|
|
169
|
+
plugins: [
|
|
170
|
+
ChatModels({
|
|
171
|
+
models: [
|
|
172
|
+
chatModelFrom({
|
|
173
|
+
aliases: ['agent-mini', 'judge-mini'],
|
|
174
|
+
inferenceExecutor: 'openai',
|
|
175
|
+
model: 'gpt-4.1-mini',
|
|
176
|
+
}),
|
|
177
|
+
chatModelFrom({
|
|
178
|
+
aliases: ['agent-large', 'judge-large'],
|
|
179
|
+
inferenceExecutor: 'openai',
|
|
180
|
+
model: 'gpt-4.1',
|
|
181
|
+
}),
|
|
182
|
+
chatModelFrom({
|
|
183
|
+
aliases: ['agent-openrouter-mini'],
|
|
184
|
+
inferenceExecutor: 'openrouter',
|
|
185
|
+
model: 'openai/gpt-4.1-mini',
|
|
186
|
+
}),
|
|
187
|
+
],
|
|
188
|
+
}),
|
|
189
|
+
],
|
|
168
190
|
projects: [
|
|
169
191
|
{
|
|
170
192
|
name: 'chat-evals',
|
|
193
|
+
root: '.',
|
|
194
|
+
include: ['evals/*.eval.ts'],
|
|
171
195
|
runMatrix: {
|
|
172
196
|
extend: {
|
|
173
|
-
model: ['
|
|
197
|
+
model: ['agent-mini', 'agent-large'],
|
|
174
198
|
promptLanguage: ['en', 'zh'],
|
|
175
199
|
scenario: ['baseline', 'stress'],
|
|
176
200
|
},
|
|
@@ -186,9 +210,9 @@ export default defineConfig({
|
|
|
186
210
|
})
|
|
187
211
|
```
|
|
188
212
|
|
|
189
|
-
## Custom Executor
|
|
213
|
+
## Custom Executor
|
|
190
214
|
|
|
191
|
-
|
|
215
|
+
If a project provides no `executor`, `vieval run` still discovers eval files, schedules tasks, and executes module-defined task callbacks. Provide `projects[].executor` when a project needs custom execution for ASR, TTS, image, motion, hosted agents, or another domain runtime.
|
|
192
216
|
|
|
193
217
|
```ts
|
|
194
218
|
import { defineConfig } from 'vieval'
|
|
@@ -197,18 +221,20 @@ export default defineConfig({
|
|
|
197
221
|
projects: [
|
|
198
222
|
{
|
|
199
223
|
name: 'motion-evals',
|
|
224
|
+
root: '.',
|
|
225
|
+
include: ['evals/*.eval.ts'],
|
|
200
226
|
inferenceExecutors: [{ id: 'motion-engine' }],
|
|
201
227
|
models: [
|
|
202
228
|
{
|
|
203
229
|
id: 'motion-engine:v2',
|
|
230
|
+
aliases: ['motion-default'],
|
|
204
231
|
inferenceExecutor: 'motion-engine',
|
|
205
232
|
inferenceExecutorId: 'motion-engine',
|
|
206
233
|
model: 'v2',
|
|
207
|
-
aliases: ['motion-default'],
|
|
208
234
|
},
|
|
209
235
|
],
|
|
210
236
|
async executor(task, context) {
|
|
211
|
-
const model = context.model()
|
|
237
|
+
const model = context.model({ name: 'motion-default' })
|
|
212
238
|
const success = model.model === 'v2' && task.matrix.run.scenario === 'baseline'
|
|
213
239
|
|
|
214
240
|
return {
|
|
@@ -227,26 +253,109 @@ export default defineConfig({
|
|
|
227
253
|
## CLI
|
|
228
254
|
|
|
229
255
|
```bash
|
|
230
|
-
vieval run [--config <path>] [--project <name>] [--json]
|
|
256
|
+
vieval run [--config <path>] [--project <name>] [--json] [--report-out <path>]
|
|
231
257
|
vieval compare [--config <path>] [--comparison <id>] [--output <path>] [--format table|json]
|
|
258
|
+
vieval report analyze <report-directory>
|
|
259
|
+
vieval report index <report-directory> [--output <path>] [--format table|json|jsonl]
|
|
260
|
+
vieval report cases <report-directory> [--where <key=value>] [--group-by <key>] [--format table|json|jsonl]
|
|
261
|
+
vieval report compare <left-report-directory> <right-report-directory> [--case-key <key>] [--score-kind <kind>] [--format table|json]
|
|
232
262
|
```
|
|
233
263
|
|
|
234
|
-
Common
|
|
264
|
+
Common workspace commands:
|
|
235
265
|
|
|
236
266
|
```bash
|
|
267
|
+
pnpm install
|
|
237
268
|
pnpm -F vieval eval:run
|
|
238
269
|
pnpm -F vieval eval:run -- --config ./vieval.config.ts
|
|
239
270
|
pnpm -F vieval eval:run -- --config ./vieval.config.ts --project chess --project moderation
|
|
240
271
|
pnpm -F vieval eval:run -- --json
|
|
241
|
-
pnpm -F vieval
|
|
272
|
+
pnpm -F vieval eval:run -- --report-out .vieval/reports --workspace local --experiment prompt-v2 --attempt attempt-a
|
|
273
|
+
pnpm -F vieval exec tsx src/bin/vieval.ts compare --config ./vieval.config.ts --comparison agent-memory
|
|
274
|
+
pnpm -F vieval exec tsx src/bin/vieval.ts report analyze .vieval/reports/my-run
|
|
242
275
|
pnpm -F vieval eval:run -- --help
|
|
243
276
|
```
|
|
244
277
|
|
|
278
|
+
Concurrency flags are available on `vieval run`:
|
|
279
|
+
|
|
280
|
+
- `--workspace-concurrency`
|
|
281
|
+
- `--project-concurrency`
|
|
282
|
+
- `--task-concurrency`
|
|
283
|
+
- `--attempt-concurrency`
|
|
284
|
+
- `--case-concurrency`
|
|
285
|
+
|
|
286
|
+
## Public Entrypoints
|
|
287
|
+
|
|
288
|
+
- `vieval`: `defineConfig`, `loadEnv`, `requiredEnvFrom`, `describeTask`, `describeEval`, `caseOf`, `casesFromInputs`, and `expect`.
|
|
289
|
+
- `vieval/config`: lower-level `defineEval`, `defineTask`, matrix types, task context types, model definitions, and plugin contracts.
|
|
290
|
+
- `vieval/plugins/chat-models`: `ChatModels`, `ChatProviders`, `chatModelFrom`, `chatProviderFrom`, `chatModelMatrix`, runtime config helpers, and chat telemetry helpers.
|
|
291
|
+
- `vieval/core/runner`: collection, scheduling, task context, cache runtime, scheduler runtime, execution, and aggregation utilities.
|
|
292
|
+
- `vieval/core/assertions`: assertion primitives and pipeline helpers.
|
|
293
|
+
- `vieval/core/inference-executors`: env helpers and remote provider executors.
|
|
294
|
+
- `vieval/testing/expect-extensions`: Vitest expect extensions for testing eval behavior.
|
|
295
|
+
|
|
296
|
+
## Architecture
|
|
297
|
+
|
|
298
|
+
```mermaid
|
|
299
|
+
flowchart LR
|
|
300
|
+
CLI["src/cli/index.ts\n(runTopLevelCli)"] --> RUN["src/cli/eval-run.ts\n(runEvalRunCli)"]
|
|
301
|
+
CLI --> COMPARE["src/cli/compare.ts\n(runCompareCli)"]
|
|
302
|
+
CLI --> REPORT["src/cli/report-*.ts\n(report commands)"]
|
|
303
|
+
RUN --> ORCH["src/cli/run.ts\n(runVievalCli)"]
|
|
304
|
+
ORCH --> CFG["src/cli/config.ts\n(loadVievalCliConfig)"]
|
|
305
|
+
ORCH --> DISC["src/cli/discovery.ts\n(discoverEvalFiles)"]
|
|
306
|
+
ORCH --> MODULES["src/cli/module-runtime.ts\n(load eval modules)"]
|
|
307
|
+
MODULES --> DSL["src/dsl/task.ts\n(describeTask/caseOf/casesFromInputs)"]
|
|
308
|
+
ORCH --> SCHEDULE["src/core/runner/schedule.ts\n(createRunnerSchedule)"]
|
|
309
|
+
ORCH --> EXEC["src/core/runner/run.ts\n(runScheduledTasks)"]
|
|
310
|
+
EXEC --> CTX["src/core/runner/task-context.ts\n(createTaskExecutionContext)"]
|
|
311
|
+
EXEC --> AGG["src/core/runner/aggregate.ts\n(aggregateRunResults)"]
|
|
312
|
+
ORCH --> REPORTERS["src/cli/reporters/*\n(live reporter + Vitest bridge)"]
|
|
313
|
+
ORCH --> ARTIFACTS["src/cli/report-artifacts.ts\n(JSONL report artifacts)"]
|
|
314
|
+
CHAT["src/plugins/chat-models/*\n(model/provider plugins)"] --> CFG
|
|
315
|
+
PROVIDERS["src/core/inference-executors/*\n(provider adapters + env)"] --> CTX
|
|
316
|
+
```
|
|
317
|
+
|
|
318
|
+
### Runtime Sequence
|
|
319
|
+
|
|
320
|
+
```mermaid
|
|
321
|
+
sequenceDiagram
|
|
322
|
+
participant U as User
|
|
323
|
+
participant C as src/cli/index.ts
|
|
324
|
+
participant E as src/cli/eval-run.ts
|
|
325
|
+
participant R as src/cli/run.ts
|
|
326
|
+
participant L as src/cli/config.ts
|
|
327
|
+
participant D as src/cli/discovery.ts
|
|
328
|
+
participant M as src/cli/module-runtime.ts
|
|
329
|
+
participant S as src/core/runner/*
|
|
330
|
+
participant T as src/dsl/task.ts
|
|
331
|
+
participant P as src/cli/reporters/*
|
|
332
|
+
|
|
333
|
+
U->>C: vieval run --config ...
|
|
334
|
+
C->>E: runEvalRunCli(argv)
|
|
335
|
+
E->>R: runVievalCli(options)
|
|
336
|
+
R->>L: loadVievalCliConfig()
|
|
337
|
+
R->>D: discoverEvalFiles()
|
|
338
|
+
R->>M: loadEvalModulesWithVitestRuntime()
|
|
339
|
+
M->>T: register describeTask definitions
|
|
340
|
+
R->>S: collectEvalEntries() + createRunnerSchedule()
|
|
341
|
+
R->>P: createCliReporter(isTTY)
|
|
342
|
+
R->>P: onRunStart + onTaskQueued
|
|
343
|
+
R->>S: runScheduledTasks(...)
|
|
344
|
+
S->>P: onTaskStart / onTaskEnd
|
|
345
|
+
S->>T: task.run(context)
|
|
346
|
+
T->>P: reporterHooks.onCaseStart / onCaseEnd
|
|
347
|
+
S-->>R: aggregated run results
|
|
348
|
+
R->>P: onRunEnd + dispose
|
|
349
|
+
R-->>E: CliRunOutput
|
|
350
|
+
E->>U: static summary or JSON
|
|
351
|
+
```
|
|
352
|
+
|
|
245
353
|
## Examples In This Repository
|
|
246
354
|
|
|
247
355
|
- [Define a custom eval task API](tests/projects/example-api-defining-new-task)
|
|
248
356
|
- [Configure run/eval matrix combinations](tests/projects/example-api-config-matrix)
|
|
249
357
|
- [Load datasource records as task cases](tests/projects/example-api-load-datasource-as-cases)
|
|
358
|
+
- [Use assertion helpers and Vitest expect extensions](tests/projects/example-api-expect)
|
|
250
359
|
- [Compare reporters and experiment/attempt layering](tests/projects/example-api-reporters-and-experiments)
|
|
251
360
|
- [Bring your own agent execution pattern](tests/projects/example-pattern-byoa-bring-your-own-agent)
|
|
252
361
|
|
|
@@ -256,7 +365,7 @@ pnpm -F vieval eval:run -- --help
|
|
|
256
365
|
pnpm install
|
|
257
366
|
pnpm -F vieval test:run
|
|
258
367
|
pnpm -F vieval typecheck
|
|
259
|
-
pnpm lint
|
|
368
|
+
pnpm lint
|
|
260
369
|
```
|
|
261
370
|
|
|
262
371
|
## When To Use / Not Use
|
|
@@ -264,13 +373,14 @@ pnpm lint:fix
|
|
|
264
373
|
Use `vieval` when:
|
|
265
374
|
|
|
266
375
|
- you want evals close to app code with Vitest-like ergonomics;
|
|
267
|
-
- you need matrix experiments and
|
|
268
|
-
- you want
|
|
376
|
+
- you need repeatable matrix experiments and stable run metadata;
|
|
377
|
+
- you want local diagnostics, CI JSON, and report artifacts from one runner;
|
|
378
|
+
- you need to evaluate product code or custom agent flows without moving them into a hosted eval system.
|
|
269
379
|
|
|
270
380
|
Do not use `vieval` when:
|
|
271
381
|
|
|
272
382
|
- you need hosted dataset management, annotation UI, or SaaS observability out of the box;
|
|
273
|
-
- you only need one-off
|
|
383
|
+
- you only need a one-off script without reusable eval definitions or matrix scheduling.
|
|
274
384
|
|
|
275
385
|
## Acknowledgements
|
|
276
386
|
|
package/dist/bin/vieval.mjs
CHANGED
package/dist/cli/index.mjs
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as runTopLevelCli, t as parseTopLevelCliArguments } from "../cli-
|
|
1
|
+
import { n as runTopLevelCli, t as parseTopLevelCliArguments } from "../cli-ImxGpoYQ.mjs";
|
|
2
2
|
export { parseTopLevelCliArguments, runTopLevelCli };
|