vieval 0.0.8 → 0.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +51 -4
- package/dist/bin/vieval.mjs +1 -1
- package/dist/cli/index.mjs +1 -1
- package/dist/{cli-Dao25VxV.mjs → cli-DTDgaqeI.mjs} +669 -599
- package/dist/cli-DTDgaqeI.mjs.map +1 -0
- package/dist/config.d.mts +1 -1
- package/dist/core/assertions/index.d.mts +1 -1
- package/dist/core/inference-executors/index.d.mts +1 -1
- package/dist/core/inference-executors/index.mjs +10 -4
- package/dist/core/inference-executors/index.mjs.map +1 -1
- package/dist/core/processors/results/index.d.mts +1 -1
- package/dist/core/runner/index.d.mts +2 -2
- package/dist/core/runner/index.mjs +2 -2
- package/dist/core/scheduler/index.d.mts +1 -1
- package/dist/core/scheduler/index.mjs +2 -2
- package/dist/core/scheduler/index.mjs.map +1 -1
- package/dist/{env-BeHv_5mo.d.mts → env-DfWZy_n4.d.mts} +14 -9
- package/dist/env-nV5rVErX.mjs +35 -0
- package/dist/env-nV5rVErX.mjs.map +1 -0
- package/dist/{index-fakXoZEe.d.mts → index-Bg0atWBF.d.mts} +4 -3
- package/dist/{index-BkjyCInx.d.mts → index-D_aMeWqO.d.mts} +2 -2
- package/dist/index.d.mts +2 -2
- package/dist/index.mjs +21 -26
- package/dist/index.mjs.map +1 -1
- package/dist/plugins/chat-models/index.d.mts +1 -1
- package/dist/plugins/chat-models/index.mjs +15 -13
- package/dist/plugins/chat-models/index.mjs.map +1 -1
- package/dist/{registry-BHGMxjpA.mjs → registry-DMnwE_mY.mjs} +54 -10
- package/dist/registry-DMnwE_mY.mjs.map +1 -0
- package/package.json +1 -1
- package/dist/cli-Dao25VxV.mjs.map +0 -1
- package/dist/env-BFSjny07.mjs +0 -41
- package/dist/env-BFSjny07.mjs.map +0 -1
- package/dist/registry-BHGMxjpA.mjs.map +0 -1
package/README.md
CHANGED
|
@@ -25,7 +25,9 @@ Vitest-style evaluation framework for agents, models, and task pipelines.
|
|
|
25
25
|
|
|
26
26
|
```ts
|
|
27
27
|
// vieval.config.ts
|
|
28
|
-
import {
|
|
28
|
+
import { cwd } from 'node:process'
|
|
29
|
+
|
|
30
|
+
import { defineConfig, loadEnv, requiredEnvFrom } from 'vieval'
|
|
29
31
|
import { chatModelFrom, ChatModels } from 'vieval/plugins/chat-models'
|
|
30
32
|
|
|
31
33
|
export default defineConfig({
|
|
@@ -34,12 +36,17 @@ export default defineConfig({
|
|
|
34
36
|
models: [
|
|
35
37
|
chatModelFrom({
|
|
36
38
|
aliases: ['agent-mini', 'judge-mini'],
|
|
39
|
+
apiKey: config => requiredEnvFrom(config.env, {
|
|
40
|
+
name: 'OPENAI_API_KEY',
|
|
41
|
+
type: 'string',
|
|
42
|
+
}),
|
|
37
43
|
inferenceExecutor: 'openai',
|
|
38
44
|
model: 'gpt-4.1-mini',
|
|
39
45
|
}),
|
|
40
46
|
],
|
|
41
47
|
}),
|
|
42
48
|
],
|
|
49
|
+
env: loadEnv('test', cwd(), ''),
|
|
43
50
|
projects: [
|
|
44
51
|
{
|
|
45
52
|
name: 'default',
|
|
@@ -97,10 +104,11 @@ Use `describeTask` for the common Vitest-like authoring path:
|
|
|
97
104
|
|
|
98
105
|
```ts
|
|
99
106
|
import { caseOf, describeTask, expect } from 'vieval'
|
|
107
|
+
import { modelFromRun } from 'vieval/plugins/chat-models'
|
|
100
108
|
|
|
101
109
|
describeTask('prompt-language-ablation', () => {
|
|
102
110
|
caseOf('resolves matrix axes', async (context) => {
|
|
103
|
-
const selectedModel = context
|
|
111
|
+
const selectedModel = modelFromRun(context, { axis: 'model' })
|
|
104
112
|
const language = context.task.matrix.run.promptLanguage
|
|
105
113
|
const scenario = context.task.matrix.run.scenario
|
|
106
114
|
|
|
@@ -159,10 +167,27 @@ Each scheduled task receives stable matrix metadata:
|
|
|
159
167
|
- `task.matrix.meta.evalRowId`
|
|
160
168
|
- `task.matrix.inputs` for `caseOf(..., { input })` and `casesFromInputs(...)`
|
|
161
169
|
|
|
170
|
+
## Orchestration Model
|
|
171
|
+
|
|
172
|
+
`vieval` separates benchmark management from run reliability:
|
|
173
|
+
|
|
174
|
+
- `comparison`: cross-project or cross-workspace benchmark. Use it for horizontal evaluation across multiple agent, memory, model, paper, or backend implementations. Methods do not need perfect project/case alignment; instead, the compare artifacts report project and case coverage.
|
|
175
|
+
- `workspace`: a batch-management boundary for related eval projects. Use it when one benchmark family spans multiple task projects, roots, env settings, or model registrations.
|
|
176
|
+
- `project`: one eval task project with discovery rules, model registrations, optional executor, matrix layers, and scoring/reporting behavior.
|
|
177
|
+
- `experiment`: run metadata derived from explicit `--experiment` or, when omitted, stable matrix row metadata. It does not create an extra scheduler layer.
|
|
178
|
+
- `task`: one eval definition discovered from files and expanded across inference executors plus run/eval matrix rows.
|
|
179
|
+
- `case`: the scoring and evidence source inside a task. `context.score(...)` contributes normalized score evidence; `context.metric(...)` emits benchmark metadata for reports.
|
|
180
|
+
- `attempt`: a full task rerun used to estimate reliability. With `autoAttempt`, each full attempt contributes evidence, so a fail-then-pass pair is scored as a success rate across both attempts rather than the later pass replacing the earlier failure.
|
|
181
|
+
- `retry`: an in-case retry. With `autoRetry`, a case can recover inside one attempt; a retry pass still counts as that attempt passing.
|
|
182
|
+
|
|
183
|
+
Config inheritance follows the same outside-in model: top-level defaults apply first, workspace/project entries refine them, and project-local plugins can append or override project-local models/reporters/concurrency without leaking to sibling projects.
|
|
184
|
+
|
|
162
185
|
## Config Example
|
|
163
186
|
|
|
164
187
|
```ts
|
|
165
|
-
import {
|
|
188
|
+
import { cwd } from 'node:process'
|
|
189
|
+
|
|
190
|
+
import { defineConfig, loadEnv, requiredEnvFrom } from 'vieval'
|
|
166
191
|
import { chatModelFrom, ChatModels } from 'vieval/plugins/chat-models'
|
|
167
192
|
|
|
168
193
|
export default defineConfig({
|
|
@@ -171,22 +196,35 @@ export default defineConfig({
|
|
|
171
196
|
models: [
|
|
172
197
|
chatModelFrom({
|
|
173
198
|
aliases: ['agent-mini', 'judge-mini'],
|
|
199
|
+
apiKey: config => requiredEnvFrom(config.env, {
|
|
200
|
+
name: 'OPENAI_API_KEY',
|
|
201
|
+
type: 'string',
|
|
202
|
+
}),
|
|
174
203
|
inferenceExecutor: 'openai',
|
|
175
204
|
model: 'gpt-4.1-mini',
|
|
176
205
|
}),
|
|
177
206
|
chatModelFrom({
|
|
178
207
|
aliases: ['agent-large', 'judge-large'],
|
|
208
|
+
apiKey: config => requiredEnvFrom(config.env, {
|
|
209
|
+
name: 'OPENAI_API_KEY',
|
|
210
|
+
type: 'string',
|
|
211
|
+
}),
|
|
179
212
|
inferenceExecutor: 'openai',
|
|
180
213
|
model: 'gpt-4.1',
|
|
181
214
|
}),
|
|
182
215
|
chatModelFrom({
|
|
183
216
|
aliases: ['agent-openrouter-mini'],
|
|
217
|
+
apiKey: config => requiredEnvFrom(config.env, {
|
|
218
|
+
name: 'OPENROUTER_API_KEY',
|
|
219
|
+
type: 'string',
|
|
220
|
+
}),
|
|
184
221
|
inferenceExecutor: 'openrouter',
|
|
185
222
|
model: 'openai/gpt-4.1-mini',
|
|
186
223
|
}),
|
|
187
224
|
],
|
|
188
225
|
}),
|
|
189
226
|
],
|
|
227
|
+
env: loadEnv('test', cwd(), ''),
|
|
190
228
|
projects: [
|
|
191
229
|
{
|
|
192
230
|
name: 'chat-evals',
|
|
@@ -234,7 +272,16 @@ export default defineConfig({
|
|
|
234
272
|
},
|
|
235
273
|
],
|
|
236
274
|
async executor(task, context) {
|
|
237
|
-
const model = context.model
|
|
275
|
+
const model = context.models.find(model =>
|
|
276
|
+
model.id === 'motion-default'
|
|
277
|
+
|| model.model === 'motion-default'
|
|
278
|
+
|| model.aliases.includes('motion-default'),
|
|
279
|
+
)
|
|
280
|
+
|
|
281
|
+
if (model == null) {
|
|
282
|
+
throw new Error('Missing configured model "motion-default".')
|
|
283
|
+
}
|
|
284
|
+
|
|
238
285
|
const success = model.model === 'v2' && task.matrix.run.scenario === 'baseline'
|
|
239
286
|
|
|
240
287
|
return {
|
package/dist/bin/vieval.mjs
CHANGED
package/dist/cli/index.mjs
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as runTopLevelCli, t as parseTopLevelCliArguments } from "../cli-Dao25VxV.mjs";
|
|
1
|
+
import { n as runTopLevelCli, t as parseTopLevelCliArguments } from "../cli-DTDgaqeI.mjs";
|
|
2
2
|
export { parseTopLevelCliArguments, runTopLevelCli };
|