vieval 0.0.8 → 0.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. package/README.md +51 -4
  2. package/dist/bin/vieval.mjs +1 -1
  3. package/dist/cli/index.mjs +1 -1
  4. package/dist/{cli-Dao25VxV.mjs → cli-DTDgaqeI.mjs} +669 -599
  5. package/dist/cli-DTDgaqeI.mjs.map +1 -0
  6. package/dist/config.d.mts +1 -1
  7. package/dist/core/assertions/index.d.mts +1 -1
  8. package/dist/core/inference-executors/index.d.mts +1 -1
  9. package/dist/core/inference-executors/index.mjs +10 -4
  10. package/dist/core/inference-executors/index.mjs.map +1 -1
  11. package/dist/core/processors/results/index.d.mts +1 -1
  12. package/dist/core/runner/index.d.mts +2 -2
  13. package/dist/core/runner/index.mjs +2 -2
  14. package/dist/core/scheduler/index.d.mts +1 -1
  15. package/dist/core/scheduler/index.mjs +2 -2
  16. package/dist/core/scheduler/index.mjs.map +1 -1
  17. package/dist/{env-BeHv_5mo.d.mts → env-DfWZy_n4.d.mts} +14 -9
  18. package/dist/env-nV5rVErX.mjs +35 -0
  19. package/dist/env-nV5rVErX.mjs.map +1 -0
  20. package/dist/{index-fakXoZEe.d.mts → index-Bg0atWBF.d.mts} +4 -3
  21. package/dist/{index-BkjyCInx.d.mts → index-D_aMeWqO.d.mts} +2 -2
  22. package/dist/index.d.mts +2 -2
  23. package/dist/index.mjs +21 -26
  24. package/dist/index.mjs.map +1 -1
  25. package/dist/plugins/chat-models/index.d.mts +1 -1
  26. package/dist/plugins/chat-models/index.mjs +15 -13
  27. package/dist/plugins/chat-models/index.mjs.map +1 -1
  28. package/dist/{registry-BHGMxjpA.mjs → registry-DMnwE_mY.mjs} +54 -10
  29. package/dist/registry-DMnwE_mY.mjs.map +1 -0
  30. package/package.json +1 -1
  31. package/dist/cli-Dao25VxV.mjs.map +0 -1
  32. package/dist/env-BFSjny07.mjs +0 -41
  33. package/dist/env-BFSjny07.mjs.map +0 -1
  34. package/dist/registry-BHGMxjpA.mjs.map +0 -1
package/README.md CHANGED
@@ -25,7 +25,9 @@ Vitest-style evaluation framework for agents, models, and task pipelines.
25
25
 
26
26
  ```ts
27
27
  // vieval.config.ts
28
- import { defineConfig } from 'vieval'
28
+ import { cwd } from 'node:process'
29
+
30
+ import { defineConfig, loadEnv, requiredEnvFrom } from 'vieval'
29
31
  import { chatModelFrom, ChatModels } from 'vieval/plugins/chat-models'
30
32
 
31
33
  export default defineConfig({
@@ -34,12 +36,17 @@ export default defineConfig({
34
36
  models: [
35
37
  chatModelFrom({
36
38
  aliases: ['agent-mini', 'judge-mini'],
39
+ apiKey: config => requiredEnvFrom(config.env, {
40
+ name: 'OPENAI_API_KEY',
41
+ type: 'string',
42
+ }),
37
43
  inferenceExecutor: 'openai',
38
44
  model: 'gpt-4.1-mini',
39
45
  }),
40
46
  ],
41
47
  }),
42
48
  ],
49
+ env: loadEnv('test', cwd(), ''),
43
50
  projects: [
44
51
  {
45
52
  name: 'default',
@@ -97,10 +104,11 @@ Use `describeTask` for the common Vitest-like authoring path:
97
104
 
98
105
  ```ts
99
106
  import { caseOf, describeTask, expect } from 'vieval'
107
+ import { modelFromRun } from 'vieval/plugins/chat-models'
100
108
 
101
109
  describeTask('prompt-language-ablation', () => {
102
110
  caseOf('resolves matrix axes', async (context) => {
103
- const selectedModel = context.model()
111
+ const selectedModel = modelFromRun(context, { axis: 'model' })
104
112
  const language = context.task.matrix.run.promptLanguage
105
113
  const scenario = context.task.matrix.run.scenario
106
114
 
@@ -159,10 +167,27 @@ Each scheduled task receives stable matrix metadata:
159
167
  - `task.matrix.meta.evalRowId`
160
168
  - `task.matrix.inputs` for `caseOf(..., { input })` and `casesFromInputs(...)`
161
169
 
170
+ ## Orchestration Model
171
+
172
+ `vieval` separates benchmark management from run reliability:
173
+
174
+ - `comparison`: cross-project or cross-workspace benchmark. Use it for horizontal evaluation across multiple agent, memory, model, paper, or backend implementations. Methods do not need perfect project/case alignment; the compare artifacts report project and case coverage.
175
+ - `workspace`: a batch-management boundary for related eval projects. Use it when one benchmark family spans multiple task projects, roots, env settings, or model registrations.
176
+ - `project`: one eval task project with discovery rules, model registrations, optional executor, matrix layers, and scoring/reporting behavior.
177
+ - `experiment`: run metadata derived from explicit `--experiment` or, when omitted, stable matrix row metadata. It does not create an extra scheduler layer.
178
+ - `task`: one eval definition discovered from files and expanded across inference executors plus run/eval matrix rows.
179
+ - `case`: the scoring and evidence source inside a task. `context.score(...)` contributes normalized score evidence; `context.metric(...)` emits benchmark metadata for reports.
180
+ - `attempt`: a full task rerun used to estimate reliability. With `autoAttempt`, each full attempt contributes evidence, so a fail-then-pass pair is scored as a success rate across both attempts rather than the later pass replacing the earlier failure.
181
+ - `retry`: an in-case retry. With `autoRetry`, a case can recover inside one attempt; a retry pass still counts as that attempt passing.
182
+
183
+ Config inheritance follows the same outside-in model: top-level defaults apply first, workspace/project entries refine them, and project-local plugins can append or override project-local models/reporters/concurrency without leaking to sibling projects.
184
+
162
185
  ## Config Example
163
186
 
164
187
  ```ts
165
- import { defineConfig } from 'vieval'
188
+ import { cwd } from 'node:process'
189
+
190
+ import { defineConfig, loadEnv, requiredEnvFrom } from 'vieval'
166
191
  import { chatModelFrom, ChatModels } from 'vieval/plugins/chat-models'
167
192
 
168
193
  export default defineConfig({
@@ -171,22 +196,35 @@ export default defineConfig({
171
196
  models: [
172
197
  chatModelFrom({
173
198
  aliases: ['agent-mini', 'judge-mini'],
199
+ apiKey: config => requiredEnvFrom(config.env, {
200
+ name: 'OPENAI_API_KEY',
201
+ type: 'string',
202
+ }),
174
203
  inferenceExecutor: 'openai',
175
204
  model: 'gpt-4.1-mini',
176
205
  }),
177
206
  chatModelFrom({
178
207
  aliases: ['agent-large', 'judge-large'],
208
+ apiKey: config => requiredEnvFrom(config.env, {
209
+ name: 'OPENAI_API_KEY',
210
+ type: 'string',
211
+ }),
179
212
  inferenceExecutor: 'openai',
180
213
  model: 'gpt-4.1',
181
214
  }),
182
215
  chatModelFrom({
183
216
  aliases: ['agent-openrouter-mini'],
217
+ apiKey: config => requiredEnvFrom(config.env, {
218
+ name: 'OPENROUTER_API_KEY',
219
+ type: 'string',
220
+ }),
184
221
  inferenceExecutor: 'openrouter',
185
222
  model: 'openai/gpt-4.1-mini',
186
223
  }),
187
224
  ],
188
225
  }),
189
226
  ],
227
+ env: loadEnv('test', cwd(), ''),
190
228
  projects: [
191
229
  {
192
230
  name: 'chat-evals',
@@ -234,7 +272,16 @@ export default defineConfig({
234
272
  },
235
273
  ],
236
274
  async executor(task, context) {
237
- const model = context.model({ name: 'motion-default' })
275
+ const model = context.models.find(model =>
276
+ model.id === 'motion-default'
277
+ || model.model === 'motion-default'
278
+ || model.aliases.includes('motion-default'),
279
+ )
280
+
281
+ if (model == null) {
282
+ throw new Error('Missing configured model "motion-default".')
283
+ }
284
+
238
285
  const success = model.model === 'v2' && task.matrix.run.scenario === 'baseline'
239
286
 
240
287
  return {
package/dist/bin/vieval.mjs CHANGED
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env node
2
- import { n as runTopLevelCli } from "../cli-Dao25VxV.mjs";
2
+ import { n as runTopLevelCli } from "../cli-DTDgaqeI.mjs";
3
3
  import process from "node:process";
4
4
  import { errorMessageFrom } from "@moeru/std";
5
5
  //#region src/bin/vieval.ts
package/dist/cli/index.mjs CHANGED
@@ -1,2 +1,2 @@
1
- import { n as runTopLevelCli, t as parseTopLevelCliArguments } from "../cli-Dao25VxV.mjs";
1
+ import { n as runTopLevelCli, t as parseTopLevelCliArguments } from "../cli-DTDgaqeI.mjs";
2
2
  export { parseTopLevelCliArguments, runTopLevelCli };