vieval 0.0.9 → 0.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +27 -2
- package/dist/bin/vieval.mjs +1 -1
- package/dist/cli/index.mjs +1 -1
- package/dist/{cli-Dao25VxV.mjs → cli-DTDgaqeI.mjs} +669 -599
- package/dist/cli-DTDgaqeI.mjs.map +1 -0
- package/dist/config.d.mts +1 -1
- package/dist/core/assertions/index.d.mts +1 -1
- package/dist/core/processors/results/index.d.mts +1 -1
- package/dist/core/runner/index.d.mts +2 -2
- package/dist/core/runner/index.mjs +2 -2
- package/dist/core/scheduler/index.d.mts +1 -1
- package/dist/core/scheduler/index.mjs +2 -2
- package/dist/core/scheduler/index.mjs.map +1 -1
- package/dist/{index-fakXoZEe.d.mts → index-Bg0atWBF.d.mts} +4 -3
- package/dist/{index-BkjyCInx.d.mts → index-D_aMeWqO.d.mts} +2 -2
- package/dist/index.d.mts +1 -1
- package/dist/index.mjs +20 -25
- package/dist/index.mjs.map +1 -1
- package/dist/plugins/chat-models/index.d.mts +1 -1
- package/dist/{registry-BHGMxjpA.mjs → registry-DMnwE_mY.mjs} +54 -10
- package/dist/registry-DMnwE_mY.mjs.map +1 -0
- package/package.json +1 -1
- package/dist/cli-Dao25VxV.mjs.map +0 -1
- package/dist/registry-BHGMxjpA.mjs.map +0 -1
package/README.md
CHANGED
|
@@ -104,10 +104,11 @@ Use `describeTask` for the common Vitest-like authoring path:
|
|
|
104
104
|
|
|
105
105
|
```ts
|
|
106
106
|
import { caseOf, describeTask, expect } from 'vieval'
|
|
107
|
+
import { modelFromRun } from 'vieval/plugins/chat-models'
|
|
107
108
|
|
|
108
109
|
describeTask('prompt-language-ablation', () => {
|
|
109
110
|
caseOf('resolves matrix axes', async (context) => {
|
|
110
|
-
const selectedModel = context
|
|
111
|
+
const selectedModel = modelFromRun(context, { axis: 'model' })
|
|
111
112
|
const language = context.task.matrix.run.promptLanguage
|
|
112
113
|
const scenario = context.task.matrix.run.scenario
|
|
113
114
|
|
|
@@ -166,6 +167,21 @@ Each scheduled task receives stable matrix metadata:
|
|
|
166
167
|
- `task.matrix.meta.evalRowId`
|
|
167
168
|
- `task.matrix.inputs` for `caseOf(..., { input })` and `casesFromInputs(...)`
|
|
168
169
|
|
|
170
|
+
## Orchestration Model
|
|
171
|
+
|
|
172
|
+
`vieval` separates benchmark management from run reliability:
|
|
173
|
+
|
|
174
|
+
- `comparison`: cross-project or cross-workspace benchmark. Use it for horizontal evaluation across multiple agent, memory, model, paper, or backend implementations. Methods do not need perfect project/case alignment; compare artifacts report project and case coverage.
|
|
175
|
+
- `workspace`: a batch-management boundary for related eval projects. Use it when one benchmark family spans multiple task projects, roots, env settings, or model registrations.
|
|
176
|
+
- `project`: one eval task project with discovery rules, model registrations, optional executor, matrix layers, and scoring/reporting behavior.
|
|
177
|
+
- `experiment`: run metadata derived from explicit `--experiment` or, when omitted, stable matrix row metadata. It does not create an extra scheduler layer.
|
|
178
|
+
- `task`: one eval definition discovered from files and expanded across inference executors plus run/eval matrix rows.
|
|
179
|
+
- `case`: the scoring and evidence source inside a task. `context.score(...)` contributes normalized score evidence; `context.metric(...)` emits benchmark metadata for reports.
|
|
180
|
+
- `attempt`: a full task rerun used to estimate reliability. With `autoAttempt`, each full attempt contributes evidence, so a fail-then-pass pair scores as a success rate rather than replacing the earlier failure.
|
|
181
|
+
- `retry`: an in-case retry. With `autoRetry`, a case can recover inside one attempt; a retry pass still counts as that attempt passing.
|
|
182
|
+
|
|
183
|
+
Config inheritance follows the same outside-in model: top-level defaults apply first, workspace/project entries refine them, and project-local plugins can append or override project-local models/reporters/concurrency without leaking to sibling projects.
|
|
184
|
+
|
|
169
185
|
## Config Example
|
|
170
186
|
|
|
171
187
|
```ts
|
|
@@ -256,7 +272,16 @@ export default defineConfig({
|
|
|
256
272
|
},
|
|
257
273
|
],
|
|
258
274
|
async executor(task, context) {
|
|
259
|
-
const model = context.model
|
|
275
|
+
const model = context.models.find(model =>
|
|
276
|
+
model.id === 'motion-default'
|
|
277
|
+
|| model.model === 'motion-default'
|
|
278
|
+
|| model.aliases.includes('motion-default'),
|
|
279
|
+
)
|
|
280
|
+
|
|
281
|
+
if (model == null) {
|
|
282
|
+
throw new Error('Missing configured model "motion-default".')
|
|
283
|
+
}
|
|
284
|
+
|
|
260
285
|
const success = model.model === 'v2' && task.matrix.run.scenario === 'baseline'
|
|
261
286
|
|
|
262
287
|
return {
|
package/dist/bin/vieval.mjs
CHANGED
package/dist/cli/index.mjs
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as runTopLevelCli, t as parseTopLevelCliArguments } from "../cli-Dao25VxV.mjs";
|
|
1
|
+
import { n as runTopLevelCli, t as parseTopLevelCliArguments } from "../cli-DTDgaqeI.mjs";
|
|
2
2
|
export { parseTopLevelCliArguments, runTopLevelCli };
|