vieval 0.0.9 → 0.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. package/README.md +27 -2
  2. package/dist/bin/vieval.mjs +1 -1
  3. package/dist/bin/vieval.mjs.map +1 -1
  4. package/dist/cli/index.mjs +1 -1
  5. package/dist/{cli-Dao25VxV.mjs → cli-CHFCF8UR.mjs} +670 -600
  6. package/dist/cli-CHFCF8UR.mjs.map +1 -0
  7. package/dist/config.d.mts +1 -1
  8. package/dist/config.mjs +1 -1
  9. package/dist/config.mjs.map +1 -1
  10. package/dist/core/assertions/index.d.mts +1 -1
  11. package/dist/core/assertions/index.mjs.map +1 -1
  12. package/dist/core/inference-executors/index.d.mts +1 -1
  13. package/dist/core/inference-executors/index.mjs +3 -3
  14. package/dist/core/inference-executors/index.mjs.map +1 -1
  15. package/dist/core/processors/results/index.d.mts +1 -1
  16. package/dist/core/processors/results/index.mjs.map +1 -1
  17. package/dist/core/runner/index.d.mts +2 -2
  18. package/dist/core/runner/index.mjs +4 -4
  19. package/dist/core/runner/index.mjs.map +1 -1
  20. package/dist/core/scheduler/index.d.mts +1 -1
  21. package/dist/core/scheduler/index.mjs +3 -3
  22. package/dist/core/scheduler/index.mjs.map +1 -1
  23. package/dist/{env-nV5rVErX.mjs → env-BVYeJhGA.mjs} +1 -1
  24. package/dist/{env-nV5rVErX.mjs.map → env-BVYeJhGA.mjs.map} +1 -1
  25. package/dist/{env-DfWZy_n4.d.mts → env-bRH0K6fU.d.mts} +1 -1
  26. package/dist/{expect-extensions-DCSqlneN.mjs → expect-extensions-Mf1sMNBv.mjs} +1 -1
  27. package/dist/{expect-extensions-DCSqlneN.mjs.map → expect-extensions-Mf1sMNBv.mjs.map} +1 -1
  28. package/dist/expect.d.mts +1 -3
  29. package/dist/expect.mjs +1 -1
  30. package/dist/expect.mjs.map +1 -1
  31. package/dist/{index-fakXoZEe.d.mts → index-Be5I1ZJL.d.mts} +4 -3
  32. package/dist/{index-BkjyCInx.d.mts → index-CwKBlCG9.d.mts} +2 -2
  33. package/dist/index.d.mts +3 -4
  34. package/dist/index.mjs +22 -27
  35. package/dist/index.mjs.map +1 -1
  36. package/dist/{models-pBSRUZhY.mjs → models-CaCOUPZw.mjs} +1 -1
  37. package/dist/{models-pBSRUZhY.mjs.map → models-CaCOUPZw.mjs.map} +1 -1
  38. package/dist/plugins/chat-models/index.d.mts +1 -1
  39. package/dist/plugins/chat-models/index.mjs +2 -2
  40. package/dist/plugins/chat-models/index.mjs.map +1 -1
  41. package/dist/{queue-DsZQkZO_.mjs → queue-BL86z2W_.mjs} +1 -1
  42. package/dist/{queue-DsZQkZO_.mjs.map → queue-BL86z2W_.mjs.map} +1 -1
  43. package/dist/{registry-BHGMxjpA.mjs → registry-BSyjwZFx.mjs} +55 -11
  44. package/dist/registry-BSyjwZFx.mjs.map +1 -0
  45. package/dist/testing/expect-extensions.mjs +1 -1
  46. package/package.json +10 -10
  47. package/dist/cli-Dao25VxV.mjs.map +0 -1
  48. package/dist/registry-BHGMxjpA.mjs.map +0 -1
package/README.md CHANGED
@@ -104,10 +104,11 @@ Use `describeTask` for the common Vitest-like authoring path:
104
104
 
105
105
  ```ts
106
106
  import { caseOf, describeTask, expect } from 'vieval'
107
+ import { modelFromRun } from 'vieval/plugins/chat-models'
107
108
 
108
109
  describeTask('prompt-language-ablation', () => {
109
110
  caseOf('resolves matrix axes', async (context) => {
110
- const selectedModel = context.model()
111
+ const selectedModel = modelFromRun(context, { axis: 'model' })
111
112
  const language = context.task.matrix.run.promptLanguage
112
113
  const scenario = context.task.matrix.run.scenario
113
114
 
@@ -166,6 +167,21 @@ Each scheduled task receives stable matrix metadata:
166
167
  - `task.matrix.meta.evalRowId`
167
168
  - `task.matrix.inputs` for `caseOf(..., { input })` and `casesFromInputs(...)`
168
169
 
170
+ ## Orchestration Model
171
+
172
+ `vieval` separates benchmark management from run reliability:
173
+
174
+ - `comparison`: a cross-project or cross-workspace benchmark. Use it for horizontal evaluation across multiple agent, memory, model, paper, or backend implementations. Methods do not need perfect project/case alignment; the comparison artifacts report project and case coverage.
175
+ - `workspace`: a batch-management boundary for related eval projects. Use it when one benchmark family spans multiple task projects, roots, env settings, or model registrations.
176
+ - `project`: one eval task project with discovery rules, model registrations, optional executor, matrix layers, and scoring/reporting behavior.
177
+ - `experiment`: run metadata derived from explicit `--experiment` or, when omitted, stable matrix row metadata. It does not create an extra scheduler layer.
178
+ - `task`: one eval definition discovered from files and expanded across inference executors plus run/eval matrix rows.
179
+ - `case`: the scoring and evidence source inside a task. `context.score(...)` contributes normalized score evidence; `context.metric(...)` emits benchmark metadata for reports.
180
+ - `attempt`: a full task rerun used to estimate reliability. With `autoAttempt`, each full attempt contributes evidence, so a fail-then-pass pair is scored as a success rate (one pass out of two attempts) rather than the pass replacing the earlier failure.
181
+ - `retry`: an in-case retry. With `autoRetry`, a case can recover inside one attempt; a retry pass still counts as that attempt passing.
182
+
183
+ Config inheritance follows the same outside-in model: top-level defaults apply first, workspace/project entries refine them, and project-local plugins can append or override project-local models/reporters/concurrency without leaking to sibling projects.
184
+
169
185
  ## Config Example
170
186
 
171
187
  ```ts
@@ -256,7 +272,16 @@ export default defineConfig({
256
272
  },
257
273
  ],
258
274
  async executor(task, context) {
259
- const model = context.model({ name: 'motion-default' })
275
+ const model = context.models.find(model =>
276
+ model.id === 'motion-default'
277
+ || model.model === 'motion-default'
278
+ || model.aliases.includes('motion-default'),
279
+ )
280
+
281
+ if (model == null) {
282
+ throw new Error('Missing configured model "motion-default".')
283
+ }
284
+
260
285
  const success = model.model === 'v2' && task.matrix.run.scenario === 'baseline'
261
286
 
262
287
  return {
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env node
2
- import { n as runTopLevelCli } from "../cli-Dao25VxV.mjs";
2
+ import { n as runTopLevelCli } from "../cli-CHFCF8UR.mjs";
3
3
  import process from "node:process";
4
4
  import { errorMessageFrom } from "@moeru/std";
5
5
  //#region src/bin/vieval.ts
@@ -1 +1 @@
1
- {"version":3,"file":"vieval.mjs","names":[],"sources":["../../src/bin/vieval.ts"],"sourcesContent":["#!/usr/bin/env node\n\nimport process from 'node:process'\n\nimport { errorMessageFrom } from '@moeru/std'\n\nimport { runTopLevelCli } from '../cli/index'\n\n/**\n * Bootstraps the published `vieval` executable.\n *\n * Call stack:\n *\n * package manager shim / direct node execution\n * -> {@link runTopLevelCli} (`../cli`)\n * -> subcommand orchestration modules\n *\n * Use when:\n * - the installed `vieval` binary starts from the command line\n * - process-bound startup must stay outside import-safe CLI modules\n *\n * Expects:\n * - `process.argv` contains the raw CLI arguments after the node executable path\n *\n * Returns:\n * - resolves after the selected subcommand finishes and updates `process.exitCode`\n */\nrunTopLevelCli(process.argv.slice(2)).catch((error) => {\n const errorMessage = errorMessageFrom(error) ?? 'Unknown CLI failure.'\n process.stderr.write(`[vieval] ${errorMessage}\\n`)\n process.exitCode = 1\n})\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;AA2BA,eAAe,QAAQ,KAAK,MAAM,EAAE,CAAC,CAAC,OAAO,UAAU;CACrD,MAAM,eAAe,iBAAiB,MAAM,IAAI;AAChD,SAAQ,OAAO,MAAM,YAAY,aAAa,IAAI;AAClD,SAAQ,WAAW;EACnB"}
1
+ {"version":3,"file":"vieval.mjs","names":[],"sources":["../../src/bin/vieval.ts"],"sourcesContent":["#!/usr/bin/env node\n\nimport process from 'node:process'\n\nimport { errorMessageFrom } from '@moeru/std'\n\nimport { runTopLevelCli } from '../cli/index'\n\n/**\n * Bootstraps the published `vieval` executable.\n *\n * Call stack:\n *\n * package manager shim / direct node execution\n * -> {@link runTopLevelCli} (`../cli`)\n * -> subcommand orchestration modules\n *\n * Use when:\n * - the installed `vieval` binary starts from the command line\n * - process-bound startup must stay outside import-safe CLI modules\n *\n * Expects:\n * - `process.argv` contains the raw CLI arguments after the node executable path\n *\n * Returns:\n * - resolves after the selected subcommand finishes and updates `process.exitCode`\n */\nrunTopLevelCli(process.argv.slice(2)).catch((error) => {\n const errorMessage = errorMessageFrom(error) ?? 'Unknown CLI failure.'\n process.stderr.write(`[vieval] ${errorMessage}\\n`)\n process.exitCode = 1\n})\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;AA2BA,eAAe,QAAQ,KAAK,MAAM,CAAC,CAAC,CAAC,CAAC,OAAO,UAAU;CACrD,MAAM,eAAe,iBAAiB,KAAK,KAAK;CAChD,QAAQ,OAAO,MAAM,YAAY,aAAa,GAAG;CACjD,QAAQ,WAAW;AACrB,CAAC"}
@@ -1,2 +1,2 @@
1
- import { n as runTopLevelCli, t as parseTopLevelCliArguments } from "../cli-Dao25VxV.mjs";
1
+ import { n as runTopLevelCli, t as parseTopLevelCliArguments } from "../cli-CHFCF8UR.mjs";
2
2
  export { parseTopLevelCliArguments, runTopLevelCli };