vieval 0.0.1 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -5
- package/dist/cli/index.d.mts +1 -1
- package/dist/cli/index.mjs +1204 -61
- package/dist/cli/index.mjs.map +1 -1
- package/dist/{config-D2fe1SnT.mjs → config-CHN24egi.mjs} +1 -1
- package/dist/{config-D2fe1SnT.mjs.map → config-CHN24egi.mjs.map} +1 -1
- package/dist/config.d.mts +2 -3
- package/dist/config.mjs +2 -2
- package/dist/core/assertions/index.d.mts +1 -1
- package/dist/core/inference-executors/index.d.mts +1 -45
- package/dist/core/inference-executors/index.mjs +1 -38
- package/dist/core/inference-executors/index.mjs.map +1 -1
- package/dist/core/processors/results/index.d.mts +1 -1
- package/dist/core/runner/index.d.mts +2 -2
- package/dist/core/runner/index.mjs +2 -2
- package/dist/env-C7X81PWa.mjs +41 -0
- package/dist/env-C7X81PWa.mjs.map +1 -0
- package/dist/env-DtpjACOW.d.mts +47 -0
- package/dist/expect-B2vaoRVZ.d.mts +10 -0
- package/dist/{expect-i9WZWGrA.mjs → expect-CaXiUkwY.mjs} +3 -3
- package/dist/expect-CaXiUkwY.mjs.map +1 -0
- package/dist/expect-extensions-BOzwV5EJ.mjs +197 -0
- package/dist/expect-extensions-BOzwV5EJ.mjs.map +1 -0
- package/dist/expect.d.mts +1 -1
- package/dist/expect.mjs +1 -1
- package/dist/{index-DP7jsORl.d.mts → index-BDMEAmf2.d.mts} +246 -3
- package/dist/{index-oSXhM1zx.d.mts → index-C3gPFmcR.d.mts} +2 -2
- package/dist/index.d.mts +326 -6
- package/dist/index.mjs +65 -23
- package/dist/index.mjs.map +1 -1
- package/dist/{models-D_MsBtYw.mjs → models-DIGdOUpJ.mjs} +1 -1
- package/dist/{models-D_MsBtYw.mjs.map → models-DIGdOUpJ.mjs.map} +1 -1
- package/dist/plugins/chat-models/index.d.mts +465 -6
- package/dist/plugins/chat-models/index.mjs +469 -6
- package/dist/plugins/chat-models/index.mjs.map +1 -1
- package/dist/{registry-ChOjjdEC.mjs → registry-CHJcTN2W.mjs} +75 -16
- package/dist/registry-CHJcTN2W.mjs.map +1 -0
- package/dist/{runner-4ZsOveoY.mjs → runner-Dpy-eivM.mjs} +177 -21
- package/dist/runner-Dpy-eivM.mjs.map +1 -0
- package/dist/testing/expect-extensions.d.mts +44 -38
- package/dist/testing/expect-extensions.mjs +1 -1
- package/package.json +11 -4
- package/dist/expect-0jPJ7Zio.d.mts +0 -2318
- package/dist/expect-extensions-CwPtgTz8.mjs +0 -13471
- package/dist/expect-extensions-CwPtgTz8.mjs.map +0 -1
- package/dist/expect-i9WZWGrA.mjs.map +0 -1
- package/dist/magic-string.es-CH1jwzMg.mjs +0 -1013
- package/dist/magic-string.es-CH1jwzMg.mjs.map +0 -1
- package/dist/plugin-DVaRZY2x.d.mts +0 -84
- package/dist/registry-ChOjjdEC.mjs.map +0 -1
- package/dist/runner-4ZsOveoY.mjs.map +0 -1
package/dist/index.mjs
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
|
-
import { a as defineConfig, i as registerEvalDefinition,
|
|
2
|
-
import { n as
|
|
3
|
-
import { t as
|
|
1
|
+
import { a as defineConfig, i as registerEvalDefinition, s as loadEnv } from "./registry-CHJcTN2W.mjs";
|
|
2
|
+
import { n as requiredEnvFrom } from "./env-C7X81PWa.mjs";
|
|
3
|
+
import { n as defineTask, t as defineEval } from "./config-CHN24egi.mjs";
|
|
4
|
+
import { t as expect } from "./expect-CaXiUkwY.mjs";
|
|
5
|
+
import { errorMessageFrom } from "@moeru/std";
|
|
4
6
|
//#region src/dsl/task.ts
|
|
5
7
|
function cloneCaseMatrix(matrix) {
|
|
6
8
|
return {
|
|
@@ -9,6 +11,12 @@ function cloneCaseMatrix(matrix) {
|
|
|
9
11
|
run: { ...matrix.run }
|
|
10
12
|
};
|
|
11
13
|
}
|
|
14
|
+
function createTaskCaseReporterId(index, name) {
|
|
15
|
+
return `${index}:${encodeURIComponent(name)}`;
|
|
16
|
+
}
|
|
17
|
+
function assertValidScore(score) {
|
|
18
|
+
if (!Number.isFinite(score) || score < 0 || score > 1) throw new Error(`Case score must be a finite number in range 0..1, got "${score}".`);
|
|
19
|
+
}
|
|
12
20
|
function emitCaseStart(hooks, payload) {
|
|
13
21
|
try {
|
|
14
22
|
hooks?.onCaseStart?.(payload);
|
|
@@ -20,14 +28,15 @@ function emitCaseEnd(hooks, payload) {
|
|
|
20
28
|
} catch {}
|
|
21
29
|
}
|
|
22
30
|
function createCaseBuilder(registeredCases) {
|
|
31
|
+
function registerCase(name, run, options) {
|
|
32
|
+
registeredCases.push({
|
|
33
|
+
input: options?.input,
|
|
34
|
+
name,
|
|
35
|
+
run
|
|
36
|
+
});
|
|
37
|
+
}
|
|
23
38
|
return {
|
|
24
|
-
caseOf
|
|
25
|
-
registeredCases.push({
|
|
26
|
-
input,
|
|
27
|
-
name,
|
|
28
|
-
run
|
|
29
|
-
});
|
|
30
|
-
},
|
|
39
|
+
caseOf: registerCase,
|
|
31
40
|
casesFromInputs(namePrefix, inputs, run) {
|
|
32
41
|
inputs.forEach((input, index) => {
|
|
33
42
|
registeredCases.push({
|
|
@@ -53,12 +62,9 @@ function getActiveCases() {
|
|
|
53
62
|
if (active == null) throw new Error("caseOf/casesFromInputs must be called inside describeTask/describeEval.");
|
|
54
63
|
return active;
|
|
55
64
|
}
|
|
56
|
-
|
|
57
|
-
* Registers one case in the currently active task scope.
|
|
58
|
-
*/
|
|
59
|
-
function caseOf(name, run, input) {
|
|
65
|
+
function caseOf(name, run, options) {
|
|
60
66
|
getActiveCases().push({
|
|
61
|
-
input,
|
|
67
|
+
input: options?.input,
|
|
62
68
|
name,
|
|
63
69
|
run
|
|
64
70
|
});
|
|
@@ -103,37 +109,73 @@ function describeTask(name, build, options = {}) {
|
|
|
103
109
|
score: 1
|
|
104
110
|
}] };
|
|
105
111
|
const totalCases = registeredCases.length;
|
|
106
|
-
const
|
|
112
|
+
const scoreBucketsByKind = {
|
|
113
|
+
exact: [],
|
|
114
|
+
judge: []
|
|
115
|
+
};
|
|
116
|
+
await Promise.all(registeredCases.map(async (taskCase, index) => {
|
|
107
117
|
emitCaseStart(context.reporterHooks, {
|
|
108
118
|
index,
|
|
109
119
|
name: taskCase.name,
|
|
110
120
|
total: totalCases
|
|
111
121
|
});
|
|
112
122
|
let state = "passed";
|
|
123
|
+
let errorMessage;
|
|
124
|
+
const caseId = createTaskCaseReporterId(index, taskCase.name);
|
|
125
|
+
const customScoresByKind = /* @__PURE__ */ new Map();
|
|
113
126
|
try {
|
|
114
127
|
await taskCase.run({
|
|
115
128
|
...context,
|
|
116
129
|
matrix: {
|
|
117
130
|
...cloneCaseMatrix(context.task.matrix),
|
|
118
131
|
inputs: taskCase.input
|
|
132
|
+
},
|
|
133
|
+
metric(name, value) {
|
|
134
|
+
context.reporterHooks?.onEvent?.({
|
|
135
|
+
caseId,
|
|
136
|
+
data: {
|
|
137
|
+
name,
|
|
138
|
+
value
|
|
139
|
+
},
|
|
140
|
+
event: "task.case.metric"
|
|
141
|
+
});
|
|
142
|
+
},
|
|
143
|
+
score(score, kind = "exact") {
|
|
144
|
+
assertValidScore(score);
|
|
145
|
+
customScoresByKind.set(kind, score);
|
|
119
146
|
}
|
|
120
147
|
});
|
|
121
|
-
} catch {
|
|
148
|
+
} catch (error) {
|
|
122
149
|
state = "failed";
|
|
150
|
+
errorMessage = errorMessageFrom(error) ?? "Unknown case failure.";
|
|
123
151
|
} finally {
|
|
124
152
|
emitCaseEnd(context.reporterHooks, {
|
|
153
|
+
...errorMessage == null ? {} : { errorMessage },
|
|
125
154
|
index,
|
|
126
155
|
state,
|
|
127
156
|
name: taskCase.name,
|
|
128
157
|
total: totalCases
|
|
129
158
|
});
|
|
130
159
|
}
|
|
131
|
-
|
|
160
|
+
if (state === "failed") {
|
|
161
|
+
scoreBucketsByKind.exact.push(0);
|
|
162
|
+
return;
|
|
163
|
+
}
|
|
164
|
+
if (customScoresByKind.size === 0) {
|
|
165
|
+
scoreBucketsByKind.exact.push(1);
|
|
166
|
+
return;
|
|
167
|
+
}
|
|
168
|
+
scoreBucketsByKind.exact.push(customScoresByKind.get("exact") ?? 1);
|
|
169
|
+
const judgeScore = customScoresByKind.get("judge");
|
|
170
|
+
if (judgeScore != null) scoreBucketsByKind.judge.push(judgeScore);
|
|
132
171
|
}));
|
|
133
|
-
return { scores: [{
|
|
134
|
-
kind
|
|
135
|
-
|
|
136
|
-
|
|
172
|
+
return { scores: Object.keys(scoreBucketsByKind).filter((kind) => scoreBucketsByKind[kind].length > 0).map((kind) => {
|
|
173
|
+
const values = scoreBucketsByKind[kind];
|
|
174
|
+
return {
|
|
175
|
+
kind,
|
|
176
|
+
score: values.reduce((sum, value) => sum + value, 0) / values.length
|
|
177
|
+
};
|
|
178
|
+
}) };
|
|
137
179
|
}
|
|
138
180
|
})
|
|
139
181
|
});
|
|
@@ -145,6 +187,6 @@ function describeTask(name, build, options = {}) {
|
|
|
145
187
|
*/
|
|
146
188
|
const describeEval = describeTask;
|
|
147
189
|
//#endregion
|
|
148
|
-
export { caseOf, casesFromInputs, defineConfig, describeEval, describeTask, expect, loadEnv };
|
|
190
|
+
export { caseOf, casesFromInputs, defineConfig, describeEval, describeTask, expect, loadEnv, requiredEnvFrom };
|
|
149
191
|
|
|
150
192
|
//# sourceMappingURL=index.mjs.map
|
package/dist/index.mjs.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.mjs","names":[],"sources":["../src/dsl/task.ts"],"sourcesContent":["import type { TaskRunContext, TaskRunOutput } from '../config'\n\nimport { defineEval, defineTask } from '../config'\nimport { registerEvalDefinition } from './registry'\n\n/**\n * Runtime context provided to a task case callback.\n */\nexport interface CaseRunContext<TInput> extends TaskRunContext {\n /**\n * Case-scoped matrix payload.\n */\n matrix: TaskRunContext['task']['matrix'] & { inputs: TInput }\n}\n\n/**\n * Callback for one task case.\n */\nexport type CaseRunner<TInput> = (context: CaseRunContext<TInput>) => Promise<void> | void\n\ninterface RegisteredCase<TInput> {\n input: TInput\n name: string\n run: CaseRunner<TInput>\n}\n\nfunction cloneCaseMatrix(matrix: TaskRunContext['task']['matrix']): TaskRunContext['task']['matrix'] {\n return {\n eval: {\n ...matrix.eval,\n },\n meta: {\n ...matrix.meta,\n },\n run: {\n ...matrix.run,\n },\n }\n}\n\nfunction emitCaseStart(\n hooks: TaskRunContext['reporterHooks'] | undefined,\n payload: {\n index: number\n name: string\n total: number\n },\n): void {\n try {\n hooks?.onCaseStart?.(payload)\n }\n catch {\n // Reporter hooks must never affect task scoring.\n }\n}\n\nfunction emitCaseEnd(\n hooks: TaskRunContext['reporterHooks'] | undefined,\n payload: {\n index: number\n state: 'passed' | 'failed'\n name: string\n total: number\n },\n): void {\n try {\n hooks?.onCaseEnd?.(payload)\n }\n catch {\n // Reporter hooks must never affect task scoring.\n }\n}\n\n/**\n * Builder callbacks passed into `describeTask`.\n */\nexport interface DescribeTaskBuilder {\n /**\n * Registers one explicit case.\n */\n caseOf: <TInput>(name: string, run: CaseRunner<TInput>, input: TInput) => void\n /**\n * Registers multiple cases from input list.\n */\n casesFromInputs: <TInput>(\n namePrefix: string,\n inputs: readonly TInput[],\n run: CaseRunner<TInput>,\n ) => void\n}\n\n/**\n * Options for `describeTask`.\n */\nexport interface DescribeTaskOptions {\n /**\n * Optional description override.\n */\n description?: string\n}\n\nfunction createCaseBuilder(registeredCases: RegisteredCase<unknown>[]): DescribeTaskBuilder {\n return {\n caseOf(name, run, input) {\n registeredCases.push({\n input,\n name,\n run: run as CaseRunner<unknown>,\n })\n },\n casesFromInputs(namePrefix, inputs, run) {\n inputs.forEach((input, index) => {\n registeredCases.push({\n input,\n name: `${namePrefix} #${index + 1}`,\n run: run as CaseRunner<unknown>,\n })\n })\n },\n }\n}\n\nlet activeCasesStack: RegisteredCase<unknown>[][] = []\n\nfunction withActiveCases<T>(cases: RegisteredCase<unknown>[], callback: () => T): T {\n activeCasesStack = [...activeCasesStack, cases]\n\n try {\n return callback()\n }\n finally {\n activeCasesStack = activeCasesStack.slice(0, -1)\n }\n}\n\nfunction getActiveCases(): RegisteredCase<unknown>[] {\n const active = activeCasesStack.at(-1)\n if (active == null) {\n throw new Error('caseOf/casesFromInputs must be called inside describeTask/describeEval.')\n }\n\n return active\n}\n\n/**\n * Registers one case in the currently active task scope.\n */\nexport function caseOf<TInput>(\n name: string,\n run: CaseRunner<TInput>,\n input: TInput,\n): void {\n getActiveCases().push({\n input,\n name,\n run: run as CaseRunner<unknown>,\n })\n}\n\n/**\n * Registers multiple cases in the currently active task scope.\n */\nexport function casesFromInputs<TInput>(\n namePrefix: string,\n inputs: readonly TInput[],\n run: CaseRunner<TInput>,\n): void {\n inputs.forEach((input, index) => {\n getActiveCases().push({\n input,\n name: `${namePrefix} #${index + 1}`,\n run: run as CaseRunner<unknown>,\n })\n })\n}\n\n/**\n * Defines one eval task with task/case semantics similar to Vitest.\n *\n * Use when:\n * - task behavior should be declared with `caseOf` and `casesFromInputs`\n * - business agent code should be imported and run from eval task files\n */\nexport function describeTask(\n name: string,\n build: ((builder: DescribeTaskBuilder) => void) | (() => void),\n options: DescribeTaskOptions = {},\n) {\n const registeredCases: RegisteredCase<unknown>[] = []\n const builder = createCaseBuilder(registeredCases)\n withActiveCases(registeredCases, () => {\n if (build.length > 0) {\n (build as (builder: DescribeTaskBuilder) => void)(builder)\n return\n }\n\n ;(build as () => void)()\n })\n\n const description = options.description ?? name\n\n const definition = defineEval({\n description,\n name,\n task: defineTask({\n id: name,\n async run(context): Promise<TaskRunOutput> {\n if (registeredCases.length === 0) {\n return {\n scores: [{ kind: 'exact', score: 1 }],\n }\n }\n\n const totalCases = registeredCases.length\n\n const caseScores: number[] = await Promise.all(\n registeredCases.map(async (taskCase, index) => {\n emitCaseStart(context.reporterHooks, {\n index,\n name: taskCase.name,\n total: totalCases,\n })\n\n let state: 'passed' | 'failed' = 'passed'\n\n try {\n await taskCase.run({\n ...context,\n matrix: {\n ...cloneCaseMatrix(context.task.matrix),\n inputs: taskCase.input,\n },\n })\n }\n catch {\n state = 'failed'\n }\n finally {\n emitCaseEnd(context.reporterHooks, {\n index,\n state,\n name: taskCase.name,\n total: totalCases,\n })\n }\n\n return state === 'passed' ? 1 : 0\n }),\n )\n\n const averageScore = caseScores.reduce((sum, score) => sum + score, 0) / caseScores.length\n\n return {\n scores: [{ kind: 'exact', score: averageScore }],\n }\n },\n }),\n })\n\n registerEvalDefinition(definition)\n\n return definition\n}\n\n/**\n * Alias of `describeTask` for eval-centric naming.\n */\nexport const describeEval = describeTask\n"],"mappings":";;;;AA0BA,SAAS,gBAAgB,QAA4E;AACnG,QAAO;EACL,MAAM,EACJ,GAAG,OAAO,MACX;EACD,MAAM,EACJ,GAAG,OAAO,MACX;EACD,KAAK,EACH,GAAG,OAAO,KACX;EACF;;AAGH,SAAS,cACP,OACA,SAKM;AACN,KAAI;AACF,SAAO,cAAc,QAAQ;SAEzB;;AAKR,SAAS,YACP,OACA,SAMM;AACN,KAAI;AACF,SAAO,YAAY,QAAQ;SAEvB;;AAiCR,SAAS,kBAAkB,iBAAiE;AAC1F,QAAO;EACL,OAAO,MAAM,KAAK,OAAO;AACvB,mBAAgB,KAAK;IACnB;IACA;IACK;IACN,CAAC;;EAEJ,gBAAgB,YAAY,QAAQ,KAAK;AACvC,UAAO,SAAS,OAAO,UAAU;AAC/B,oBAAgB,KAAK;KACnB;KACA,MAAM,GAAG,WAAW,IAAI,QAAQ;KAC3B;KACN,CAAC;KACF;;EAEL;;AAGH,IAAI,mBAAgD,EAAE;AAEtD,SAAS,gBAAmB,OAAkC,UAAsB;AAClF,oBAAmB,CAAC,GAAG,kBAAkB,MAAM;AAE/C,KAAI;AACF,SAAO,UAAU;WAEX;AACN,qBAAmB,iBAAiB,MAAM,GAAG,GAAG;;;AAIpD,SAAS,iBAA4C;CACnD,MAAM,SAAS,iBAAiB,GAAG,GAAG;AACtC,KAAI,UAAU,KACZ,OAAM,IAAI,MAAM,0EAA0E;AAG5F,QAAO;;;;;AAMT,SAAgB,OACd,MACA,KACA,OACM;AACN,iBAAgB,CAAC,KAAK;EACpB;EACA;EACK;EACN,CAAC;;;;;AAMJ,SAAgB,gBACd,YACA,QACA,KACM;AACN,QAAO,SAAS,OAAO,UAAU;AAC/B,kBAAgB,CAAC,KAAK;GACpB;GACA,MAAM,GAAG,WAAW,IAAI,QAAQ;GAC3B;GACN,CAAC;GACF;;;;;;;;;AAUJ,SAAgB,aACd,MACA,OACA,UAA+B,EAAE,EACjC;CACA,MAAM,kBAA6C,EAAE;CACrD,MAAM,UAAU,kBAAkB,gBAAgB;AAClD,iBAAgB,uBAAuB;AACrC,MAAI,MAAM,SAAS,GAAG;AACnB,SAAiD,QAAQ;AAC1D;;AAGA,SAAsB;GACxB;CAIF,MAAM,aAAa,WAAW;EAC5B,aAHkB,QAAQ,eAAe;EAIzC;EACA,MAAM,WAAW;GACf,IAAI;GACJ,MAAM,IAAI,SAAiC;AACzC,QAAI,gBAAgB,WAAW,EAC7B,QAAO,EACL,QAAQ,CAAC;KAAE,MAAM;KAAS,OAAO;KAAG,CAAC,EACtC;IAGH,MAAM,aAAa,gBAAgB;IAEnC,MAAM,aAAuB,MAAM,QAAQ,IACzC,gBAAgB,IAAI,OAAO,UAAU,UAAU;AAC7C,mBAAc,QAAQ,eAAe;MACnC;MACA,MAAM,SAAS;MACf,OAAO;MACR,CAAC;KAEF,IAAI,QAA6B;AAEjC,SAAI;AACF,YAAM,SAAS,IAAI;OACjB,GAAG;OACH,QAAQ;QACN,GAAG,gBAAgB,QAAQ,KAAK,OAAO;QACvC,QAAQ,SAAS;QAClB;OACF,CAAC;aAEE;AACJ,cAAQ;eAEF;AACN,kBAAY,QAAQ,eAAe;OACjC;OACA;OACA,MAAM,SAAS;OACf,OAAO;OACR,CAAC;;AAGJ,YAAO,UAAU,WAAW,IAAI;MAChC,CACH;AAID,WAAO,EACL,QAAQ,CAAC;KAAE,MAAM;KAAS,OAHP,WAAW,QAAQ,KAAK,UAAU,MAAM,OAAO,EAAE,GAAG,WAAW;KAGnC,CAAC,EACjD;;GAEJ,CAAC;EACH,CAAC;AAEF,wBAAuB,WAAW;AAElC,QAAO;;;;;AAMT,MAAa,eAAe"}
|
|
1
|
+
{"version":3,"file":"index.mjs","names":[],"sources":["../src/dsl/task.ts"],"sourcesContent":["import type { TaskRunContext, TaskRunOutput } from '../config'\nimport type { RunScoreKind } from '../core/runner'\n\nimport { errorMessageFrom } from '@moeru/std'\n\nimport { defineEval, defineTask } from '../config'\nimport { registerEvalDefinition } from './registry'\n\n/**\n * Runtime context provided to a task case callback.\n */\nexport interface CaseRunContext<TInput> extends TaskRunContext {\n /**\n * Case-scoped matrix payload.\n */\n matrix: TaskRunContext['task']['matrix'] & { inputs: TInput }\n /**\n * Overrides one case score family with a custom normalized value.\n *\n * Use when:\n * - one case computes a benchmark-native score that should flow into run aggregation\n *\n * Expects:\n * - `score` to stay in the `0..1` range\n */\n score: (score: number, kind?: RunScoreKind) => void\n /**\n * Emits one custom case metric into report events.\n *\n * Use when:\n * - tasks need structured benchmark metadata beyond exact/judge score families\n *\n * Expects:\n * - `name` to be a stable metric identifier\n * - `value` to be JSON-serializable\n */\n metric: (name: string, value: boolean | number | string | null) => void\n}\n\n/**\n * Callback for one task case.\n */\nexport type CaseRunner<TInput> = (context: CaseRunContext<TInput>) => Promise<void> | void\n\ninterface RegisteredCase<TInput> {\n input: TInput\n name: string\n run: CaseRunner<TInput>\n}\n\nfunction cloneCaseMatrix(matrix: TaskRunContext['task']['matrix']): TaskRunContext['task']['matrix'] {\n return {\n eval: {\n ...matrix.eval,\n },\n meta: {\n ...matrix.meta,\n },\n run: {\n ...matrix.run,\n },\n }\n}\n\nfunction createTaskCaseReporterId(index: number, name: string): string {\n return `${index}:${encodeURIComponent(name)}`\n}\n\nfunction assertValidScore(score: number): void {\n if (!Number.isFinite(score) || score < 0 || score > 1) {\n throw new Error(`Case score must be a finite number in range 0..1, got \"${score}\".`)\n }\n}\n\nfunction emitCaseStart(\n hooks: TaskRunContext['reporterHooks'] | undefined,\n payload: {\n index: number\n name: string\n total: number\n },\n): void {\n try {\n hooks?.onCaseStart?.(payload)\n }\n catch {\n // Reporter hooks must never affect task scoring.\n }\n}\n\nfunction emitCaseEnd(\n hooks: TaskRunContext['reporterHooks'] | undefined,\n payload: {\n index: number\n state: 'passed' | 'failed'\n name: string\n total: number\n errorMessage?: string\n },\n): void {\n try {\n hooks?.onCaseEnd?.(payload)\n }\n catch {\n // Reporter hooks must never affect task scoring.\n }\n}\n\n/**\n * Builder callbacks passed into `describeTask`.\n */\nexport interface DescribeTaskBuilder {\n /**\n * Registers one explicit case.\n */\n caseOf: {\n (name: string, run: CaseRunner<undefined>): void\n <TInput>(name: string, run: CaseRunner<TInput>, options: { input: TInput }): void\n }\n /**\n * Registers multiple cases from input list.\n */\n casesFromInputs: <TInput>(\n namePrefix: string,\n inputs: readonly TInput[],\n run: CaseRunner<TInput>,\n ) => void\n}\n\n/**\n * Options for `describeTask`.\n */\nexport interface DescribeTaskOptions {\n /**\n * Optional description override.\n */\n description?: string\n}\n\nfunction createCaseBuilder(registeredCases: RegisteredCase<unknown>[]): DescribeTaskBuilder {\n function registerCase(name: string, run: CaseRunner<undefined>): void\n function registerCase<TInput>(name: string, run: CaseRunner<TInput>, options: { input: TInput }): void\n function registerCase<TInput>(\n name: string,\n run: CaseRunner<TInput> | CaseRunner<undefined>,\n options?: { input: TInput },\n ): void {\n registeredCases.push({\n input: options?.input,\n name,\n run: run as CaseRunner<unknown>,\n })\n }\n\n return {\n caseOf: registerCase,\n casesFromInputs(namePrefix, inputs, run) {\n inputs.forEach((input, index) => {\n registeredCases.push({\n input,\n name: `${namePrefix} #${index + 1}`,\n run: run as CaseRunner<unknown>,\n })\n })\n },\n }\n}\n\nlet activeCasesStack: RegisteredCase<unknown>[][] = []\n\nfunction withActiveCases<T>(cases: RegisteredCase<unknown>[], callback: () => T): T {\n activeCasesStack = [...activeCasesStack, cases]\n\n try {\n return callback()\n }\n finally {\n activeCasesStack = activeCasesStack.slice(0, -1)\n }\n}\n\nfunction getActiveCases(): RegisteredCase<unknown>[] {\n const active = activeCasesStack.at(-1)\n if (active == null) {\n throw new Error('caseOf/casesFromInputs must be called inside describeTask/describeEval.')\n }\n\n return active\n}\n\n/**\n * Registers one case in the currently active task scope.\n */\nexport function caseOf(\n name: string,\n run: CaseRunner<undefined>,\n): void\n\nexport function caseOf<TInput>(\n name: string,\n run: CaseRunner<TInput>,\n options: { input: TInput },\n): void\n\nexport function caseOf<TInput>(\n name: string,\n run: CaseRunner<TInput> | CaseRunner<undefined>,\n options?: { input: TInput },\n): void {\n getActiveCases().push({\n input: options?.input,\n name,\n run: run as CaseRunner<unknown>,\n })\n}\n\n/**\n * Registers multiple cases in the currently active task scope.\n */\nexport function casesFromInputs<TInput>(\n namePrefix: string,\n inputs: readonly TInput[],\n run: CaseRunner<TInput>,\n): void {\n inputs.forEach((input, index) => {\n getActiveCases().push({\n input,\n name: `${namePrefix} #${index + 1}`,\n run: run as CaseRunner<unknown>,\n })\n })\n}\n\n/**\n * Defines one eval task with task/case semantics similar to Vitest.\n *\n * Use when:\n * - task behavior should be declared with `caseOf` and `casesFromInputs`\n * - business agent code should be imported and run from eval task files\n */\nexport function describeTask(\n name: string,\n build: ((builder: DescribeTaskBuilder) => void) | (() => void),\n options: DescribeTaskOptions = {},\n) {\n const registeredCases: RegisteredCase<unknown>[] = []\n const builder = createCaseBuilder(registeredCases)\n withActiveCases(registeredCases, () => {\n if (build.length > 0) {\n (build as (builder: DescribeTaskBuilder) => void)(builder)\n return\n }\n\n ;(build as () => void)()\n })\n\n const description = options.description ?? name\n\n const definition = defineEval({\n description,\n name,\n task: defineTask({\n id: name,\n async run(context): Promise<TaskRunOutput> {\n if (registeredCases.length === 0) {\n return {\n scores: [{ kind: 'exact', score: 1 }],\n }\n }\n\n const totalCases = registeredCases.length\n\n const scoreBucketsByKind: Record<RunScoreKind, number[]> = {\n exact: [],\n judge: [],\n }\n\n await Promise.all(\n registeredCases.map(async (taskCase, index) => {\n emitCaseStart(context.reporterHooks, {\n index,\n name: taskCase.name,\n total: totalCases,\n })\n\n let state: 'passed' | 'failed' = 'passed'\n let errorMessage: string | undefined\n const caseId = createTaskCaseReporterId(index, taskCase.name)\n const customScoresByKind = new Map<RunScoreKind, number>()\n\n try {\n await taskCase.run({\n ...context,\n matrix: {\n ...cloneCaseMatrix(context.task.matrix),\n inputs: taskCase.input,\n },\n metric(name, value) {\n context.reporterHooks?.onEvent?.({\n caseId,\n data: {\n name,\n value,\n },\n event: 'task.case.metric',\n })\n },\n score(score, kind = 'exact') {\n assertValidScore(score)\n customScoresByKind.set(kind, score)\n },\n })\n }\n catch (error) {\n state = 'failed'\n errorMessage = errorMessageFrom(error) ?? 'Unknown case failure.'\n }\n finally {\n emitCaseEnd(context.reporterHooks, {\n ...(errorMessage == null ? {} : { errorMessage }),\n index,\n state,\n name: taskCase.name,\n total: totalCases,\n })\n }\n\n if (state === 'failed') {\n scoreBucketsByKind.exact.push(0)\n return\n }\n\n if (customScoresByKind.size === 0) {\n scoreBucketsByKind.exact.push(1)\n return\n }\n\n scoreBucketsByKind.exact.push(customScoresByKind.get('exact') ?? 1)\n const judgeScore = customScoresByKind.get('judge')\n if (judgeScore != null) {\n scoreBucketsByKind.judge.push(judgeScore)\n }\n }),\n )\n\n const scores = (Object.keys(scoreBucketsByKind) as RunScoreKind[])\n .filter(kind => scoreBucketsByKind[kind].length > 0)\n .map((kind) => {\n const values = scoreBucketsByKind[kind]\n const total = values.reduce((sum, value) => sum + value, 0)\n return {\n kind,\n score: total / values.length,\n }\n })\n\n return {\n scores,\n }\n },\n }),\n })\n\n registerEvalDefinition(definition)\n\n return definition\n}\n\n/**\n * Alias of `describeTask` for eval-centric naming.\n */\nexport const describeEval = describeTask\n"],"mappings":";;;;;;AAkDA,SAAS,gBAAgB,QAA4E;AACnG,QAAO;EACL,MAAM,EACJ,GAAG,OAAO,MACX;EACD,MAAM,EACJ,GAAG,OAAO,MACX;EACD,KAAK,EACH,GAAG,OAAO,KACX;EACF;;AAGH,SAAS,yBAAyB,OAAe,MAAsB;AACrE,QAAO,GAAG,MAAM,GAAG,mBAAmB,KAAK;;AAG7C,SAAS,iBAAiB,OAAqB;AAC7C,KAAI,CAAC,OAAO,SAAS,MAAM,IAAI,QAAQ,KAAK,QAAQ,EAClD,OAAM,IAAI,MAAM,0DAA0D,MAAM,IAAI;;AAIxF,SAAS,cACP,OACA,SAKM;AACN,KAAI;AACF,SAAO,cAAc,QAAQ;SAEzB;;AAKR,SAAS,YACP,OACA,SAOM;AACN,KAAI;AACF,SAAO,YAAY,QAAQ;SAEvB;;AAoCR,SAAS,kBAAkB,iBAAiE;CAG1F,SAAS,aACP,MACA,KACA,SACM;AACN,kBAAgB,KAAK;GACnB,OAAO,SAAS;GAChB;GACK;GACN,CAAC;;AAGJ,QAAO;EACL,QAAQ;EACR,gBAAgB,YAAY,QAAQ,KAAK;AACvC,UAAO,SAAS,OAAO,UAAU;AAC/B,oBAAgB,KAAK;KACnB;KACA,MAAM,GAAG,WAAW,IAAI,QAAQ;KAC3B;KACN,CAAC;KACF;;EAEL;;AAGH,IAAI,mBAAgD,EAAE;AAEtD,SAAS,gBAAmB,OAAkC,UAAsB;AAClF,oBAAmB,CAAC,GAAG,kBAAkB,MAAM;AAE/C,KAAI;AACF,SAAO,UAAU;WAEX;AACN,qBAAmB,iBAAiB,MAAM,GAAG,GAAG;;;AAIpD,SAAS,iBAA4C;CACnD,MAAM,SAAS,iBAAiB,GAAG,GAAG;AACtC,KAAI,UAAU,KACZ,OAAM,IAAI,MAAM,0EAA0E;AAG5F,QAAO;;AAiBT,SAAgB,OACd,MACA,KACA,SACM;AACN,iBAAgB,CAAC,KAAK;EACpB,OAAO,SAAS;EAChB;EACK;EACN,CAAC;;;;;AAMJ,SAAgB,gBACd,YACA,QACA,KACM;AACN,QAAO,SAAS,OAAO,UAAU;AAC/B,kBAAgB,CAAC,KAAK;GACpB;GACA,MAAM,GAAG,WAAW,IAAI,QAAQ;GAC3B;GACN,CAAC;GACF;;;;;;;;;AAUJ,SAAgB,aACd,MACA,OACA,UAA+B,EAAE,EACjC;CACA,MAAM,kBAA6C,EAAE;CACrD,MAAM,UAAU,kBAAkB,gBAAgB;AAClD,iBAAgB,uBAAuB;AACrC,MAAI,MAAM,SAAS,GAAG;AACnB,SAAiD,QAAQ;AAC1D;;AAGA,SAAsB;GACxB;CAIF,MAAM,aAAa,WAAW;EAC5B,aAHkB,QAAQ,eAAe;EAIzC;EACA,MAAM,WAAW;GACf,IAAI;GACJ,MAAM,IAAI,SAAiC;AACzC,QAAI,gBAAgB,WAAW,EAC7B,QAAO,EACL,QAAQ,CAAC;KAAE,MAAM;KAAS,OAAO;KAAG,CAAC,EACtC;IAGH,MAAM,aAAa,gBAAgB;IAEnC,MAAM,qBAAqD;KACzD,OAAO,EAAE;KACT,OAAO,EAAE;KACV;AAED,UAAM,QAAQ,IACZ,gBAAgB,IAAI,OAAO,UAAU,UAAU;AAC7C,mBAAc,QAAQ,eAAe;MACnC;MACA,MAAM,SAAS;MACf,OAAO;MACR,CAAC;KAEF,IAAI,QAA6B;KACjC,IAAI;KACJ,MAAM,SAAS,yBAAyB,OAAO,SAAS,KAAK;KAC7D,MAAM,qCAAqB,IAAI,KAA2B;AAE1D,SAAI;AACF,YAAM,SAAS,IAAI;OACjB,GAAG;OACH,QAAQ;QACN,GAAG,gBAAgB,QAAQ,KAAK,OAAO;QACvC,QAAQ,SAAS;QAClB;OACD,OAAO,MAAM,OAAO;AAClB,gBAAQ,eAAe,UAAU;SAC/B;SACA,MAAM;UACJ;UACA;UACD;SACD,OAAO;SACR,CAAC;;OAEJ,MAAM,OAAO,OAAO,SAAS;AAC3B,yBAAiB,MAAM;AACvB,2BAAmB,IAAI,MAAM,MAAM;;OAEtC,CAAC;cAEG,OAAO;AACZ,cAAQ;AACR,qBAAe,iBAAiB,MAAM,IAAI;eAEpC;AACN,kBAAY,QAAQ,eAAe;OACjC,GAAI,gBAAgB,OAAO,EAAE,GAAG,EAAE,cAAc;OAChD;OACA;OACA,MAAM,SAAS;OACf,OAAO;OACR,CAAC;;AAGJ,SAAI,UAAU,UAAU;AACtB,yBAAmB,MAAM,KAAK,EAAE;AAChC;;AAGF,SAAI,mBAAmB,SAAS,GAAG;AACjC,yBAAmB,MAAM,KAAK,EAAE;AAChC;;AAGF,wBAAmB,MAAM,KAAK,mBAAmB,IAAI,QAAQ,IAAI,EAAE;KACnE,MAAM,aAAa,mBAAmB,IAAI,QAAQ;AAClD,SAAI,cAAc,KAChB,oBAAmB,MAAM,KAAK,WAAW;MAE3C,CACH;AAaD,WAAO,EACL,QAZc,OAAO,KAAK,mBAAmB,CAC5C,QAAO,SAAQ,mBAAmB,MAAM,SAAS,EAAE,CACnD,KAAK,SAAS;KACb,MAAM,SAAS,mBAAmB;AAElC,YAAO;MACL;MACA,OAHY,OAAO,QAAQ,KAAK,UAAU,MAAM,OAAO,EAAE,GAG1C,OAAO;MACvB;MACD,EAIH;;GAEJ,CAAC;EACH,CAAC;AAEF,wBAAuB,WAAW;AAElC,QAAO;;;;;AAMT,MAAa,eAAe"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"models-
|
|
1
|
+
{"version":3,"file":"models-DIGdOUpJ.mjs","names":[],"sources":["../src/config/models.ts"],"sourcesContent":["/**\n * Canonical model definition consumed by vieval runtime and config.\n *\n * Use when:\n * - declaring models in `vieval.config.*`\n * - resolving task runtime models by id, alias, or concrete model name\n *\n * Expects:\n * - `id` to be stable and unique within one config\n * - `inferenceExecutorId` to match scheduler/executor identifiers\n *\n * Returns:\n * - one normalized model registration record\n */\nexport interface ModelDefinition {\n /**\n * Stable model id.\n */\n id: string\n /**\n * Inference-executor id used for matching and reporting.\n */\n inferenceExecutorId: string\n /**\n * Executor reference passed through config.\n *\n * `vieval` core treats this as opaque runtime metadata. Builder plugins can\n * narrow this field with plugin-specific executor input types.\n */\n inferenceExecutor: unknown\n /**\n * Concrete model name passed to the inference executor.\n */\n model: string\n /**\n * Alias names that can resolve this model.\n */\n aliases: string[]\n /**\n * Optional model-level call parameters.\n */\n parameters?: Record<string, unknown>\n}\n\n/**\n * Resolves one model by id, model name, or alias in registration order.\n *\n * Returns:\n * - the first matching model, or `undefined` when no match exists\n */\nexport function resolveModelByName(\n models: readonly ModelDefinition[],\n name: string,\n): ModelDefinition | undefined {\n return models.find(model => model.id === name || model.model === name || model.aliases.includes(name))\n}\n"],"mappings":";;;;;;;AAkDA,SAAgB,mBACd,QACA,MAC6B;AAC7B,QAAO,OAAO,MAAK,UAAS,MAAM,OAAO,QAAQ,MAAM,UAAU,QAAQ,MAAM,QAAQ,SAAS,KAAK,CAAC"}
|