peerbench 0.0.1 → 0.0.2-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +308 -2
- package/dist/abstract-Dec9Sc5O.d.ts +12 -0
- package/dist/benchmarks/index.d.ts +1698 -0
- package/dist/benchmarks/index.js +915 -0
- package/dist/benchmarks/index.js.map +1 -0
- package/dist/catalogs/index.d.ts +75 -0
- package/dist/catalogs/index.js +88 -0
- package/dist/catalogs/index.js.map +1 -0
- package/dist/chunk-22HU24QF.js +8 -0
- package/dist/chunk-22HU24QF.js.map +1 -0
- package/dist/chunk-232PY7K3.js +50 -0
- package/dist/chunk-232PY7K3.js.map +1 -0
- package/dist/chunk-7TREBPSJ.js +26 -0
- package/dist/chunk-7TREBPSJ.js.map +1 -0
- package/dist/chunk-DUBKY73H.js +128 -0
- package/dist/chunk-DUBKY73H.js.map +1 -0
- package/dist/chunk-GVF4YZF3.js +15 -0
- package/dist/chunk-GVF4YZF3.js.map +1 -0
- package/dist/chunk-HJH3SW3L.js +103 -0
- package/dist/chunk-HJH3SW3L.js.map +1 -0
- package/dist/chunk-IUN2IUCS.js +58 -0
- package/dist/chunk-IUN2IUCS.js.map +1 -0
- package/dist/chunk-PZ5AY32C.js +10 -0
- package/dist/chunk-PZ5AY32C.js.map +1 -0
- package/dist/chunk-VBOM2YEG.js +47 -0
- package/dist/chunk-VBOM2YEG.js.map +1 -0
- package/dist/chunk-ZJWSK4VO.js +11 -0
- package/dist/chunk-ZJWSK4VO.js.map +1 -0
- package/dist/data-BmN5WjZ4.d.ts +57 -0
- package/dist/generic-array-DLHWSvf1.d.ts +22 -0
- package/dist/index-WiPjF2AL.d.ts +15 -0
- package/dist/index.d.ts +38 -3845
- package/dist/index.js +40 -3557
- package/dist/index.js.map +1 -1
- package/dist/llm-DNj_tp2T.d.ts +22 -0
- package/dist/llm-judge-DIG1f1Az.d.ts +67 -0
- package/dist/provider-BDjGp2y-.d.ts +10 -0
- package/dist/providers/index.d.ts +72 -0
- package/dist/providers/index.js +263 -0
- package/dist/providers/index.js.map +1 -0
- package/dist/rate-limiter-CSmVIRsM.d.ts +60 -0
- package/dist/schemas/extensions/index.d.ts +14 -0
- package/dist/schemas/extensions/index.js +13 -0
- package/dist/schemas/extensions/index.js.map +1 -0
- package/dist/schemas/index.d.ts +233 -0
- package/dist/schemas/index.js +27 -0
- package/dist/schemas/index.js.map +1 -0
- package/dist/schemas/llm/index.d.ts +98 -0
- package/dist/schemas/llm/index.js +37 -0
- package/dist/schemas/llm/index.js.map +1 -0
- package/dist/scorers/index.d.ts +63 -0
- package/dist/scorers/index.js +494 -0
- package/dist/scorers/index.js.map +1 -0
- package/dist/simple-system-prompt-CzPYuvo0.d.ts +49 -0
- package/dist/system-prompt--0FdPWqK.d.ts +58 -0
- package/dist/utilities-BrRH32rD.d.ts +30 -0
- package/package.json +39 -21
- package/LICENSE +0 -21
|
@@ -0,0 +1,915 @@
|
|
|
1
|
+
import {
|
|
2
|
+
AbstractDataLoader,
|
|
3
|
+
GenericJSONArrayDataLoader,
|
|
4
|
+
stableStringify
|
|
5
|
+
} from "../chunk-HJH3SW3L.js";
|
|
6
|
+
import {
|
|
7
|
+
AbstractScorer
|
|
8
|
+
} from "../chunk-22HU24QF.js";
|
|
9
|
+
import {
|
|
10
|
+
bufferToString,
|
|
11
|
+
idGeneratorUUIDv7,
|
|
12
|
+
parseResponseAsJSON
|
|
13
|
+
} from "../chunk-DUBKY73H.js";
|
|
14
|
+
import {
|
|
15
|
+
BaseBenchmarkSpecSchemaV1,
|
|
16
|
+
BaseScoreSchemaV1,
|
|
17
|
+
BaseTestCaseSchemaV1,
|
|
18
|
+
defineBenchmarkSpecSchema,
|
|
19
|
+
defineScoreSchema,
|
|
20
|
+
defineTestCaseSchema
|
|
21
|
+
} from "../chunk-232PY7K3.js";
|
|
22
|
+
import {
|
|
23
|
+
ScoringMethod
|
|
24
|
+
} from "../chunk-ZJWSK4VO.js";
|
|
25
|
+
import {
|
|
26
|
+
ExtensionLLMAsAJudgeScorerFieldsV1
|
|
27
|
+
} from "../chunk-GVF4YZF3.js";
|
|
28
|
+
import {
|
|
29
|
+
BaseLLMChatResponseSchemaV1
|
|
30
|
+
} from "../chunk-7TREBPSJ.js";
|
|
31
|
+
import {
|
|
32
|
+
defineResponseSchema
|
|
33
|
+
} from "../chunk-IUN2IUCS.js";
|
|
34
|
+
import {
|
|
35
|
+
__export
|
|
36
|
+
} from "../chunk-PZ5AY32C.js";
|
|
37
|
+
|
|
38
|
+
// src/benchmarks/peerbench/index.ts
|
|
39
|
+
// Namespace object for the Peerbench benchmark. Every member is described by a
// thunk, so the binding it names is only read when the thunk is invoked, not
// while this statement runs (several of the bindings are declared further down).
var peerbench_exports = {};
{
  const peerbenchNamespaceMembers = {
    PeerbenchJSONDataLoader: () => PeerbenchJSONDataLoader,
    PeerbenchMultipleChoiceResponseSchemaV1: () => PeerbenchMultipleChoiceResponseSchemaV1,
    PeerbenchMultipleChoiceScoreSchemaV1: () => PeerbenchMultipleChoiceScoreSchemaV1,
    PeerbenchMultipleChoiceTestCaseSchemaV1: () => PeerbenchMultipleChoiceTestCaseSchemaV1,
    PeerbenchOpenEndedResponseSchemaV1: () => PeerbenchOpenEndedResponseSchemaV1,
    PeerbenchOpenEndedScoreSchemaV1: () => PeerbenchOpenEndedScoreSchemaV1,
    PeerbenchOpenEndedTestCaseSchemaV1: () => PeerbenchOpenEndedTestCaseSchemaV1,
    runTestCase: () => runTestCase
  };
  // `__export` comes from a shared chunk; it receives the target and the thunk map.
  __export(peerbench_exports, peerbenchNamespaceMembers);
}
|
|
50
|
+
|
|
51
|
+
// src/benchmarks/peerbench/test-cases/mcq.v1.ts
|
|
52
|
+
import { z } from "zod";
|
|
53
|
+
|
|
54
|
+
// src/benchmarks/peerbench/score.ts
|
|
55
|
+
// Score schema that the Peerbench score kinds build on: the base score schema
// plus a copy of the LLM-as-a-judge extension fields.
var PeerbenchBaseScoreSchemaV1 = defineScoreSchema({
  fields: { ...ExtensionLLMAsAJudgeScorerFieldsV1 },
  baseSchema: BaseScoreSchemaV1
});
|
|
61
|
+
|
|
62
|
+
// src/benchmarks/peerbench/test-cases/mcq.v1.ts
|
|
63
|
+
// Multiple-choice test case, kind "pb.ts.mcq".
// NOTE(review): `options` presumably maps an option key to its text and
// `answerKey` names the correct key (runTestCase passes it as the single
// correct answer) — confirm against the data format.
var PeerbenchMultipleChoiceTestCaseSchemaV1 = defineTestCaseSchema({
  kind: "pb.ts.mcq",
  schemaVersion: 1,
  baseSchema: BaseTestCaseSchemaV1,
  fields: {
    question: z.string(),
    options: z.record(z.string(), z.string()),
    answer: z.string(),
    answerKey: z.string()
  }
});

// Response to a multiple-choice test case, kind "pb.rs.mcq". Adds no fields
// to the base LLM chat response schema.
var PeerbenchMultipleChoiceResponseSchemaV1 = defineResponseSchema({
  kind: "pb.rs.mcq",
  schemaVersion: 1,
  baseSchema: BaseLLMChatResponseSchemaV1,
  fields: {}
});

// Score for a multiple-choice response, kind "pb.sc.mcq". Records the list of
// answers the scorer extracted from the response.
var PeerbenchMultipleChoiceScoreSchemaV1 = defineScoreSchema({
  kind: "pb.sc.mcq",
  schemaVersion: 1,
  baseSchema: PeerbenchBaseScoreSchemaV1,
  fields: {
    extractedAnswers: z.array(z.string())
  }
});
|
|
88
|
+
|
|
89
|
+
// src/benchmarks/peerbench/test-cases/open-ended.v1.ts
|
|
90
|
+
import { z as z2 } from "zod";
|
|
91
|
+
// Open-ended test case, kind "pb.ts.open-ended": a question with an optional
// answer (runTestCase hands it to the judge as the reference answer).
var PeerbenchOpenEndedTestCaseSchemaV1 = defineTestCaseSchema({
  baseSchema: BaseTestCaseSchemaV1,
  kind: "pb.ts.open-ended",
  schemaVersion: 1,
  fields: {
    question: z2.string(),
    answer: z2.string().optional()
  }
});

// Response to an open-ended test case, kind "pb.rs.open-ended". No `fields`
// entry is supplied here.
var PeerbenchOpenEndedResponseSchemaV1 = defineResponseSchema({
  kind: "pb.rs.open-ended",
  schemaVersion: 1,
  baseSchema: BaseLLMChatResponseSchemaV1
});

// Score for an open-ended response, kind "pb.sc.open-ended". Adds no fields
// beyond those of PeerbenchBaseScoreSchemaV1.
var PeerbenchOpenEndedScoreSchemaV1 = defineScoreSchema({
  kind: "pb.sc.open-ended",
  schemaVersion: 1,
  baseSchema: PeerbenchBaseScoreSchemaV1,
  fields: {}
});
|
|
111
|
+
|
|
112
|
+
// src/benchmarks/peerbench/spec.ts
|
|
113
|
+
import z3 from "zod";
|
|
114
|
+
// Benchmark spec for Peerbench, kind "pb.benchmark.spec".
var PeerbenchBenchmarkSpecSchemaV1 = defineBenchmarkSpecSchema({
  kind: "pb.benchmark.spec",
  schemaVersion: 1,
  baseSchema: BaseBenchmarkSpecSchemaV1,
  fields: {
    /**
     * Optional map of large text contents, keyed by name. A prompt or system
     * prompt can refer to an entry as <text>{key}</text>.
     */
    blobTexts: z3.record(z3.string(), z3.string()).optional()
  }
});
|
|
125
|
+
|
|
126
|
+
// src/benchmarks/peerbench/loader.ts
|
|
127
|
+
import z4 from "zod";
|
|
128
|
+
/**
 * Data loader for Peerbench JSON data (kind "pb.load.json.data").
 *
 * Extends GenericJSONArrayDataLoader and supplies the per-item builders that
 * validate raw entries against the Peerbench test case, response and score
 * schemas. Each builder returns `undefined` for an entry that matches none of
 * the accepted schemas.
 */
var PeerbenchJSONDataLoader = class extends GenericJSONArrayDataLoader {
  kind = "pb.load.json.data";

  /**
   * Parses and validates a benchmark spec.
   *
   * @param params - `params.content` holds the raw spec, either as a string or
   *   as a buffer that `bufferToString` can decode.
   * @returns A promise for the spec as parsed by PeerbenchBenchmarkSpecSchemaV1.
   * @throws (as a rejection) SyntaxError when the content is not valid JSON, or
   *   whatever the schema's `parse` throws when validation fails.
   */
  async loadBenchmarkSpec(params) {
    const text = typeof params.content === "string" ? params.content : bufferToString(params.content);
    // The decoded content is JSON text; it has to be turned into a value
    // before schema validation, the same way MMLUProJSONDataLoader.loadData
    // treats its input. Validating the raw string could never match the spec.
    return PeerbenchBenchmarkSpecSchemaV1.parse(JSON.parse(text));
  }

  /**
   * Validates one raw entry as a multiple-choice or open-ended test case.
   * @returns The parsed test case, or `undefined` when neither schema matches.
   */
  testCaseBuilder(data) {
    const testCaseValidation = z4.union([
      PeerbenchMultipleChoiceTestCaseSchemaV1,
      PeerbenchOpenEndedTestCaseSchemaV1
    ]).safeParse(data);
    return testCaseValidation.success ? testCaseValidation.data : void 0;
  }

  /**
   * Validates one raw entry as a multiple-choice or open-ended response.
   * @returns A promise for the parsed response, or for `undefined` when
   *   neither schema matches.
   */
  async responseBuilder(data) {
    const responseValidation = z4.union([
      PeerbenchMultipleChoiceResponseSchemaV1,
      PeerbenchOpenEndedResponseSchemaV1
    ]).safeParse(data);
    return responseValidation.success ? responseValidation.data : void 0;
  }

  /**
   * Validates one raw entry as a multiple-choice or open-ended score.
   * @returns A promise for the parsed score, or for `undefined` when neither
   *   schema matches.
   */
  async scoreBuilder(data) {
    const scoreValidation = z4.union([
      PeerbenchMultipleChoiceScoreSchemaV1,
      PeerbenchOpenEndedScoreSchemaV1
    ]).safeParse(data);
    return scoreValidation.success ? scoreValidation.data : void 0;
  }
};
|
|
157
|
+
|
|
158
|
+
// src/benchmarks/peerbench/runner.ts
|
|
159
|
+
/**
 * Runs a single Peerbench test case against the provider and, when a matching
 * scorer is supplied, scores the response.
 *
 * @param params - Reads `testCase`, `provider`, `runConfig.model`, and the
 *   optional `systemPrompt`, `scorer`, `runConfig.llmJudgeModel` and
 *   `idGenerators` (`response` / `score`, both defaulting to idGeneratorUUIDv7).
 * @returns `{ response, score }` when a score was produced, else `{ response }`.
 * @throws Error("Unsupported test case kind") for any kind other than
 *   "pb.ts.mcq" and "pb.ts.open-ended".
 */
async function runTestCase(params) {
  const { testCase } = params;
  const makeResponseId = params.idGenerators?.response ?? idGeneratorUUIDv7;
  const makeScoreId = params.idGenerators?.score ?? idGeneratorUUIDv7;

  // The optional system prompt always opens the conversation.
  const systemMessages = params.systemPrompt
    ? [{ role: "system", content: params.systemPrompt.content }]
    : [];

  // Sends system prompt + user message to the provider and wraps the reply
  // with the given response schema.
  const requestResponse = async (responseSchema, userContent) => {
    const reply = await params.provider.forward({
      model: params.runConfig.model,
      messages: [...systemMessages, { role: "user", content: userContent }]
    });
    return responseSchema.newWithId(
      {
        data: reply.data,
        startedAt: reply.startedAt,
        completedAt: reply.completedAt,
        testCaseId: testCase.id,
        modelSlug: params.runConfig.model,
        provider: params.provider.kind,
        inputTokensUsed: reply.inputTokensUsed,
        outputTokensUsed: reply.outputTokensUsed,
        inputCost: reply.inputCost,
        outputCost: reply.outputCost
      },
      makeResponseId
    );
  };

  switch (testCase.kind) {
    case "pb.ts.mcq": {
      const response = await requestResponse(
        PeerbenchMultipleChoiceResponseSchemaV1,
        formatMCQPrompt(testCase)
      );
      // Only an "mcq" scorer can grade this kind; otherwise return unscored.
      if (params.scorer?.kind !== "mcq") {
        return { response };
      }
      const graded = await params.scorer.score({
        response: response.data,
        choices: testCase.options ?? {},
        correctAnswers: [testCase.answerKey]
      });
      if (graded === null) {
        return { response };
      }
      const score = await PeerbenchMultipleChoiceScoreSchemaV1.newWithId(
        {
          scoringMethod: ScoringMethod.algo,
          value: graded.value,
          responseId: response.id,
          extractedAnswers: graded.extractedAnswers,
          metadata: response.metadata
        },
        makeScoreId
      );
      return { response, score };
    }

    case "pb.ts.open-ended": {
      const response = await requestResponse(
        PeerbenchOpenEndedResponseSchemaV1,
        testCase.question
      );
      // Judging needs both an "llmJudge" scorer and a configured judge model.
      if (params.scorer?.kind !== "llmJudge" || !params.runConfig.llmJudgeModel) {
        return { response };
      }
      const judged = await params.scorer.score({
        task: testCase.question,
        candidateAnswer: response.data,
        referenceAnswer: testCase.answer,
        model: params.runConfig.llmJudgeModel
      });
      if (judged === null) {
        return { response };
      }
      const score = await PeerbenchOpenEndedScoreSchemaV1.newWithId(
        {
          scoringMethod: ScoringMethod.ai,
          value: judged.value,
          responseId: response.id,
          explanation: judged.explanation,
          metadata: judged.metadata,
          scorerAIProvider: judged.provider,
          scorerAIModelSlug: params.runConfig.llmJudgeModel,
          scorerAIInputTokensUsed: judged.inputTokensUsed,
          scorerAIOutputTokensUsed: judged.outputTokensUsed,
          scorerAIInputCost: judged.inputCost,
          scorerAIOutputCost: judged.outputCost
        },
        makeScoreId
      );
      return { response, score };
    }

    default:
      throw new Error("Unsupported test case kind");
  }
}
|
|
278
|
+
/**
 * Renders a multiple-choice test case as the user prompt text.
 *
 * @param testCase - Reads `question` and the optional `options` map.
 * @returns "Question: <question>", then "Options:", then one "<key>: <value>"
 *   line per option, all separated by "\n". With no options the text ends
 *   right after the newline that follows "Options:".
 */
function formatMCQPrompt(testCase) {
  const choices = testCase.options ?? {};
  const choiceLines = [];
  for (const label of Object.keys(choices)) {
    choiceLines.push(`${label}: ${choices[label]}`);
  }
  return [`Question: ${testCase.question}`, "Options:", choiceLines.join("\n")].join("\n");
}
|
|
285
|
+
|
|
286
|
+
// src/benchmarks/mmlu-pro/index.ts
|
|
287
|
+
// Namespace object for the MMLU-Pro benchmark. Members are given as thunks so
// each binding is read only when its thunk is invoked; `runTestCase` maps to
// the bundle-local `runTestCase2`.
var mmlu_pro_exports = {};
{
  const mmluProNamespaceMembers = {
    BaseMMLUProScoreSchemaV1: () => BaseMMLUProScoreSchemaV1,
    MMLUProBenchmarkSpecSchemaV1: () => MMLUProBenchmarkSpecSchemaV1,
    MMLUProJSONDataLoader: () => MMLUProJSONDataLoader,
    MMLUProMainResponseSchemaV1: () => MMLUProMainResponseSchemaV1,
    MMLUProMainScoreSchemaV1: () => MMLUProMainScoreSchemaV1,
    MMLUProMainTestCaseSchemaV1: () => MMLUProMainTestCaseSchemaV1,
    MMLUProParquetDataLoader: () => MMLUProParquetDataLoader,
    runTestCase: () => runTestCase2
  };
  __export(mmlu_pro_exports, mmluProNamespaceMembers);
}
|
|
298
|
+
|
|
299
|
+
// src/benchmarks/mmlu-pro/score.ts
|
|
300
|
+
// Score schema that the MMLU-Pro score kinds build on: the base score schema
// plus a copy of the LLM-as-a-judge extension fields.
var BaseMMLUProScoreSchemaV1 = defineScoreSchema({
  fields: { ...ExtensionLLMAsAJudgeScorerFieldsV1 },
  baseSchema: BaseScoreSchemaV1
});
|
|
306
|
+
|
|
307
|
+
// src/benchmarks/mmlu-pro/test-cases/main.v1.ts
|
|
308
|
+
import { z as z5 } from "zod";
|
|
309
|
+
// MMLU-Pro test case, kind "mmlu-pro.ts.main". `mapData` fills `options` with
// letter keys starting at "A", `answer` with the option text found at the
// dataset's answer index, and `answerKey` with the dataset's `answer` value.
var MMLUProMainTestCaseSchemaV1 = defineTestCaseSchema({
  kind: "mmlu-pro.ts.main",
  schemaVersion: 1,
  baseSchema: BaseTestCaseSchemaV1,
  fields: {
    question: z5.string(),
    options: z5.record(z5.string(), z5.string()),
    answer: z5.string(),
    answerKey: z5.string()
  }
});

// Response to an MMLU-Pro test case, kind "mmlu-pro.rs.main". No `fields`
// entry is supplied here.
var MMLUProMainResponseSchemaV1 = defineResponseSchema({
  kind: "mmlu-pro.rs.main",
  schemaVersion: 1,
  baseSchema: BaseLLMChatResponseSchemaV1
});

// Score for an MMLU-Pro response, kind "mmlu-pro.sc.main". Records the list
// of answers the scorer extracted from the response.
var MMLUProMainScoreSchemaV1 = defineScoreSchema({
  kind: "mmlu-pro.sc.main",
  schemaVersion: 1,
  baseSchema: BaseMMLUProScoreSchemaV1,
  fields: {
    extractedAnswers: z5.array(z5.string())
  }
});
|
|
333
|
+
|
|
334
|
+
// src/benchmarks/mmlu-pro/loader.ts
|
|
335
|
+
import z6 from "zod";
|
|
336
|
+
import { parquetReadObjects } from "hyparquet";
|
|
337
|
+
// Validation schema for the raw MMLU-Pro rows consumed by both loaders: an
// array of row objects. `question_id` and `answer_index` are coerced to
// numbers; every other column must be a string (or, for `options`, a list of
// strings).
var jsonSchema = z6.array(
  z6.object({
    question_id: z6.coerce.number(),
    question: z6.string(),
    options: z6.array(z6.string()),
    answer: z6.string(),
    answer_index: z6.coerce.number(),
    cot_content: z6.string(),
    category: z6.string(),
    src: z6.string()
  })
);
|
|
347
|
+
/**
 * Converts validated MMLU-Pro rows into the loader result shape.
 *
 * @param data - Rows as produced by `jsonSchema`.
 * @returns `{ responses: [], scores: [], testCases }`, where each test case is
 *   built with MMLUProMainTestCaseSchemaV1.new and has the id
 *   "<src>-<category>-<question_id>".
 */
function mapData(data) {
  const testCases = data.map((row) => {
    // Option keys are consecutive letters: index 0 -> "A", 1 -> "B", ...
    const letteredOptions = Object.fromEntries(
      row.options.map((text, position) => [String.fromCharCode(65 + position), text])
    );
    return MMLUProMainTestCaseSchemaV1.new({
      id: `${row.src}-${row.category}-${row.question_id}`,
      question: row.question,
      answerKey: row.answer,
      options: letteredOptions,
      answer: row.options[row.answer_index],
      metadata: {
        category: row.category,
        src: row.src,
        answer_index: row.answer_index
      }
    });
  });
  return { responses: [], scores: [], testCases };
}
|
|
373
|
+
/**
 * Loader for MMLU-Pro data supplied as JSON (kind "mmlu-pro.load.json.data").
 */
var MMLUProJSONDataLoader = class extends AbstractDataLoader {
  kind = "mmlu-pro.load.json.data";

  /**
   * Parses JSON content, validates it with `jsonSchema` and maps the rows to
   * test cases.
   *
   * @param params - `params.content` is a string, or a buffer decoded with
   *   `bufferToString`.
   * @returns The `mapData` result for the validated rows.
   */
  loadData(params) {
    let text = params.content;
    if (typeof text !== "string") {
      text = bufferToString(text);
    }
    const rows = jsonSchema.parse(JSON.parse(text));
    return mapData(rows);
  }

  /** Benchmark specs are not supported by this loader; always throws. */
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  loadBenchmarkSpec(params) {
    throw new Error("Not implemented");
  }
};
|
|
385
|
+
/**
 * Loader for MMLU-Pro data supplied as a Parquet file
 * (kind "mmlu-pro.load.parquet.data").
 */
var MMLUProParquetDataLoader = class extends AbstractDataLoader {
  kind = "mmlu-pro.load.parquet.data";

  /**
   * Reads the rows of a Parquet file, validates them with `jsonSchema` and
   * maps them to test cases.
   *
   * @param params - `params.content` holds the file bytes.
   *   NOTE(review): presumably a Uint8Array / Node Buffer — confirm with callers.
   * @returns A promise for the `mapData` result of the validated rows.
   * @throws Error("Invalid Parquet file") when the reader yields no data.
   */
  async loadData(params) {
    const { content } = params;
    let file = content.buffer;
    if (ArrayBuffer.isView(content)) {
      // A view can cover only part of its backing buffer (a pooled Node Buffer
      // or a `subarray`). Passing the whole backing buffer would expose
      // unrelated bytes and a wrong file length to the reader, so cut out
      // exactly the viewed range unless the view already spans everything.
      const viewEnd = content.byteOffset + content.byteLength;
      const spansWholeBuffer = content.byteOffset === 0 && content.byteLength === file.byteLength;
      if (!spansWholeBuffer) {
        file = file.slice(content.byteOffset, viewEnd);
      }
    }
    const data = await parquetReadObjects({ file });
    if (!data) {
      throw new Error("Invalid Parquet file");
    }
    return mapData(jsonSchema.parse(data));
  }

  /** Benchmark specs are not supported by this loader; always throws. */
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  loadBenchmarkSpec(params) {
    throw new Error("Not implemented");
  }
};
|
|
401
|
+
|
|
402
|
+
// src/benchmarks/mmlu-pro/runner.ts
|
|
403
|
+
/**
 * Runs a single MMLU-Pro test case against the provider and, when a matching
 * scorer is supplied, scores the response.
 *
 * @param params - Reads `testCase`, `provider`, `runConfig.model`, and the
 *   optional `systemPrompt`, `scorer`, `runConfig.llmJudgeModel` and
 *   `idGenerators` (`response` / `score`, both defaulting to idGeneratorUUIDv7).
 * @returns `{ response, score }` when a score was produced, else `{ response }`.
 * @throws Error("Unsupported test case kind") for any kind other than
 *   "mmlu-pro.ts.main" and "pb.ts.open-ended".
 */
async function runTestCase2(params) {
  const { testCase } = params;
  const makeResponseId = params.idGenerators?.response ?? idGeneratorUUIDv7;
  const makeScoreId = params.idGenerators?.score ?? idGeneratorUUIDv7;

  // The optional system prompt always opens the conversation.
  const systemMessages = params.systemPrompt
    ? [{ role: "system", content: params.systemPrompt.content }]
    : [];

  // Sends system prompt + user message to the provider and wraps the reply in
  // an MMLU-Pro response record (both branches use the same response schema).
  const requestResponse = async (userContent) => {
    const reply = await params.provider.forward({
      model: params.runConfig.model,
      messages: [...systemMessages, { role: "user", content: userContent }]
    });
    return MMLUProMainResponseSchemaV1.newWithId(
      {
        data: reply.data,
        startedAt: reply.startedAt,
        completedAt: reply.completedAt,
        testCaseId: testCase.id,
        modelSlug: params.runConfig.model,
        provider: params.provider.kind,
        inputTokensUsed: reply.inputTokensUsed,
        outputTokensUsed: reply.outputTokensUsed,
        inputCost: reply.inputCost,
        outputCost: reply.outputCost
      },
      makeResponseId
    );
  };

  if (testCase.kind === "mmlu-pro.ts.main") {
    const response = await requestResponse(formatMCQPrompt2(testCase));
    // Only an "mcq" scorer can grade this kind; otherwise return unscored.
    if (params.scorer?.kind !== "mcq") {
      return { response };
    }
    const graded = await params.scorer.score({
      response: response.data,
      choices: testCase.options ?? {},
      correctAnswers: [testCase.answerKey]
    });
    if (graded === null) {
      return { response };
    }
    const score = await MMLUProMainScoreSchemaV1.newWithId(
      {
        scoringMethod: ScoringMethod.algo,
        value: graded.value,
        responseId: response.id,
        extractedAnswers: graded.extractedAnswers,
        metadata: response.metadata
      },
      makeScoreId
    );
    return { response, score };
  }

  // NOTE(review): "pb.ts.open-ended" is a Peerbench test case kind; this branch
  // looks carried over from the Peerbench runner. Kept as is — confirm whether
  // MMLU-Pro callers ever pass such a test case.
  if (testCase.kind === "pb.ts.open-ended") {
    const response = await requestResponse(testCase.question);
    // Judging needs both an "llmJudge" scorer and a configured judge model.
    if (params.scorer?.kind !== "llmJudge" || !params.runConfig.llmJudgeModel) {
      return { response };
    }
    const judged = await params.scorer.score({
      task: testCase.question,
      candidateAnswer: response.data,
      referenceAnswer: testCase.answer,
      model: params.runConfig.llmJudgeModel
    });
    if (judged === null) {
      return { response };
    }
    const score = await MMLUProMainScoreSchemaV1.newWithId(
      {
        scoringMethod: ScoringMethod.ai,
        value: judged.value,
        responseId: response.id,
        explanation: judged.explanation,
        metadata: judged.metadata,
        // The judge path has no extracted choices, so an empty list is stored.
        extractedAnswers: [],
        scorerAIProvider: judged.provider,
        scorerAIModelSlug: params.runConfig.llmJudgeModel,
        scorerAIInputTokensUsed: judged.inputTokensUsed,
        scorerAIOutputTokensUsed: judged.outputTokensUsed,
        scorerAIInputCost: judged.inputCost,
        scorerAIOutputCost: judged.outputCost
      },
      makeScoreId
    );
    return { response, score };
  }

  throw new Error("Unsupported test case kind");
}
|
|
523
|
+
/**
 * Renders an MMLU-Pro test case as the user prompt text.
 *
 * @param testCase - Reads `question` and the optional `options` map.
 * @returns "Question: <question>", "Options:" and one "<key>: <value>" line
 *   per option, separated by "\n". With no options the text ends right after
 *   the newline that follows "Options:".
 */
function formatMCQPrompt2(testCase) {
  const header = `Question: ${testCase.question}\nOptions:\n`;
  const optionPairs = Object.entries(testCase.options ?? {});
  const body = optionPairs.map((pair) => `${pair[0]}: ${pair[1]}`).join("\n");
  return header + body;
}
|
|
530
|
+
|
|
531
|
+
// src/benchmarks/mmlu-pro/spec.ts
|
|
532
|
+
// Benchmark spec for MMLU-Pro, kind "mmlu-pro.benchmark.spec". Adds no fields
// to the base benchmark spec schema.
var MMLUProBenchmarkSpecSchemaV1 = defineBenchmarkSpecSchema({
  kind: "mmlu-pro.benchmark.spec",
  schemaVersion: 1,
  baseSchema: BaseBenchmarkSpecSchemaV1,
  fields: {}
});
|
|
538
|
+
|
|
539
|
+
// src/benchmarks/fnol/index.ts
|
|
540
|
+
// Namespace object for the FNOL benchmark. Members are given as thunks so each
// binding is read only when its thunk is invoked; `runTestCase` maps to the
// bundle-local `runTestCase3`.
var fnol_exports = {};
{
  const fnolNamespaceMembers = {
    FNOLBaseScoreSchemaV1: () => FNOLBaseScoreSchemaV1,
    FNOLConversationMessageSchemaV1: () => FNOLConversationMessageSchemaV1,
    FNOLDoneReason: () => FNOLDoneReason,
    FNOLFieldSchemaV1: () => FNOLFieldSchemaV1,
    FNOLFieldValueType: () => FNOLFieldValueType,
    FNOLFieldsScoreSchemaV1: () => FNOLFieldsScoreSchemaV1,
    FNOLFieldsScorer: () => FNOLFieldsScorer,
    FNOLLLMJudgeScoreSchemaV1: () => FNOLLLMJudgeScoreSchemaV1,
    FNOLResponseSchemaV1: () => FNOLResponseSchemaV1,
    FNOLTestCaseSchemaV1: () => FNOLTestCaseSchemaV1,
    runTestCase: () => runTestCase3
  };
  __export(fnol_exports, fnolNamespaceMembers);
}
|
|
554
|
+
|
|
555
|
+
// src/benchmarks/fnol/test-cases/fnol.v1.ts
|
|
556
|
+
import { z as z7 } from "zod";
|
|
557
|
+
|
|
558
|
+
// src/benchmarks/fnol/score.ts
|
|
559
|
+
// Score schema that the FNOL score kinds build on: the base score schema plus
// a copy of the LLM-as-a-judge extension fields.
var FNOLBaseScoreSchemaV1 = defineScoreSchema({
  fields: { ...ExtensionLLMAsAJudgeScorerFieldsV1 },
  baseSchema: BaseScoreSchemaV1
});
|
|
565
|
+
|
|
566
|
+
// src/benchmarks/fnol/types.ts
|
|
567
|
+
// Value-type hints a field definition may carry. Every key maps to itself.
var FNOLFieldValueType = Object.fromEntries(
  ["string", "number", "boolean", "object"].map((name) => [name, name])
);

// Reasons an FNOL conversation can end. Every key maps to itself.
var FNOLDoneReason = Object.fromEntries(
  ["modelProvidedJson", "reachedMaxTurns", "forcedFinalJson"].map((name) => [name, name])
);
|
|
578
|
+
|
|
579
|
+
// src/benchmarks/fnol/test-cases/fnol.v1.ts
|
|
580
|
+
// Definition of one field the target model has to collect.
var FNOLFieldSchemaV1 = z7.object({
  description: z7.string(),
  required: z7.boolean().optional(),
  // Expected value for the deterministic scorer (optional). Without it the
  // scorer only checks that the field is present.
  expected: z7.unknown().optional(),
  // Type hint for the model / user simulation (optional).
  valueType: z7.enum(FNOLFieldValueType).optional()
});
|
|
593
|
+
// FNOL test case, kind "fnol.ts.v1".
var FNOLTestCaseSchemaV1 = defineTestCaseSchema({
  kind: "fnol.ts.v1",
  schemaVersion: 1,
  baseSchema: BaseTestCaseSchemaV1,
  fields: {
    // Opening message of the scenario, i.e. what the "user" says first.
    initialUserMessage: z7.string(),

    // Private, structured information about the user and the incident. The
    // user simulator LLM answers the target model's questions from it.
    userProfile: z7.record(z7.string(), z7.unknown()),

    // Fields the target model must collect, keyed by canonical identifier
    // (e.g. "policyNumber", "dateOfLoss").
    fieldsToCollect: z7.record(z7.string(), FNOLFieldSchemaV1),

    // Upper bound on back-and-forth turns (target question + user answer):
    // an integer from 1 to 100, defaulting to 10.
    maxTurns: z7.number().int().min(1).max(100).default(10)
  }
});
|
|
618
|
+
// One chat message of an FNOL conversation: a role and its text content.
var FNOLConversationMessageSchemaV1 = z7.object({
  role: z7.enum(["system", "user", "assistant"]),
  content: z7.string()
});

// FNOL response, kind "fnol.rs.v1": the base LLM chat response plus the
// conversation record.
var FNOLResponseSchemaV1 = defineResponseSchema({
  kind: "fnol.rs.v1",
  schemaVersion: 1,
  baseSchema: BaseLLMChatResponseSchemaV1,
  fields: {
    // Complete conversation between the target model and the simulated user.
    conversation: z7.array(FNOLConversationMessageSchemaV1),
    turnsUsed: z7.number().int(),
    doneReason: z7.enum(FNOLDoneReason),
    // JSON object parsed from the target model's final answer, when available.
    extracted: z7.record(z7.string(), z7.unknown()).optional()
  }
});
|
|
639
|
+
// Score of the field-based scorer, kind "fnol.sc.fields.v1": four lists of
// field keys (required, present, missing, mismatched).
var FNOLFieldsScoreSchemaV1 = defineScoreSchema({
  kind: "fnol.sc.fields.v1",
  schemaVersion: 1,
  baseSchema: FNOLBaseScoreSchemaV1,
  fields: {
    requiredKeys: z7.array(z7.string()),
    presentKeys: z7.array(z7.string()),
    missingKeys: z7.array(z7.string()),
    mismatchedKeys: z7.array(z7.string())
  }
});

// Score of the LLM judge, kind "fnol.sc.llm-judge.v1", with an optional
// verdict of "pass", "borderline" or "fail".
var FNOLLLMJudgeScoreSchemaV1 = defineScoreSchema({
  kind: "fnol.sc.llm-judge.v1",
  schemaVersion: 1,
  baseSchema: FNOLBaseScoreSchemaV1,
  fields: {
    verdict: z7.enum(["pass", "borderline", "fail"]).optional()
  }
});
|
|
658
|
+
|
|
659
|
+
// src/benchmarks/fnol/runner.ts
|
|
660
|
+
/**
 * Renders the fields to collect as a bullet list for the system prompt.
 *
 * @param fieldsToCollect - Map of field key to field definition; reads each
 *   definition's `required` and `description`.
 * @returns One "- <key> (<required|optional>): <description>" line per field,
 *   joined with "\n". A field is "optional" only when `required` is exactly
 *   `false`.
 */
function formatFieldsToCollect(fieldsToCollect) {
  const bullets = [];
  for (const [name, spec] of Object.entries(fieldsToCollect)) {
    const requirement = spec.required === false ? "optional" : "required";
    bullets.push(`- ${name} (${requirement}): ${spec.description}`);
  }
  return bullets.join("\n");
}
|
|
666
|
+
/**
 * Tells whether the extracted object carries a value for every required field.
 *
 * @param params - `params.fieldsToCollect` maps field keys to definitions;
 *   `params.extracted` is the extracted object (treated as empty when nullish).
 * @returns `true` when no required field is `undefined`, `null` or the empty
 *   string. A field counts as required unless `required` is exactly `false`.
 */
function hasAllRequiredFields(params) {
  const collected = params.extracted ?? {};
  const isBlank = (candidate) => candidate === undefined || candidate === null || candidate === "";
  return Object.entries(params.fieldsToCollect).every(
    ([name, spec]) => spec.required === false || !isBlank(collected[name])
  );
}
|
|
675
|
+
async function runTestCase3(params) {
|
|
676
|
+
const responseIdGenerator = params.idGenerators?.response ?? idGeneratorUUIDv7;
|
|
677
|
+
const scoreIdGenerator = params.idGenerators?.score ?? idGeneratorUUIDv7;
|
|
678
|
+
const userSimulatorProvider = params.userSimulatorProvider ?? params.provider;
|
|
679
|
+
const userSimulatorModel = params.runConfig.userSimulatorModel ?? params.runConfig.model;
|
|
680
|
+
const fieldsToCollectText = formatFieldsToCollect(
|
|
681
|
+
params.testCase.fieldsToCollect
|
|
682
|
+
);
|
|
683
|
+
const conversation = [];
|
|
684
|
+
if (params.systemPrompt) {
|
|
685
|
+
conversation.push({
|
|
686
|
+
role: "system",
|
|
687
|
+
content: params.systemPrompt.content
|
|
688
|
+
});
|
|
689
|
+
}
|
|
690
|
+
conversation.push({
|
|
691
|
+
role: "system",
|
|
692
|
+
content: [
|
|
693
|
+
"You are an insurance FNOL intake assistant.",
|
|
694
|
+
"Your job is to ask the user questions to collect the required fields listed below.",
|
|
695
|
+
"Ask concise questions, one or a few at a time.",
|
|
696
|
+
"When you have enough information OR when you are told to finish, output ONLY a single JSON object with the collected fields.",
|
|
697
|
+
"Do not include markdown fences. Do not include additional text outside the JSON.",
|
|
698
|
+
"",
|
|
699
|
+
"Fields to collect:",
|
|
700
|
+
fieldsToCollectText
|
|
701
|
+
].join("\n")
|
|
702
|
+
});
|
|
703
|
+
conversation.push({
|
|
704
|
+
role: "user",
|
|
705
|
+
content: params.testCase.initialUserMessage
|
|
706
|
+
});
|
|
707
|
+
let doneReason;
|
|
708
|
+
let extracted;
|
|
709
|
+
const startedAt = Date.now();
|
|
710
|
+
for (let turn = 0; turn < params.testCase.maxTurns; turn++) {
|
|
711
|
+
const targetReply = await params.provider.forward({
|
|
712
|
+
model: params.runConfig.model,
|
|
713
|
+
temperature: params.runConfig.temperature,
|
|
714
|
+
messages: conversation
|
|
715
|
+
});
|
|
716
|
+
conversation.push({
|
|
717
|
+
role: "assistant",
|
|
718
|
+
content: targetReply.data
|
|
719
|
+
});
|
|
720
|
+
extracted = parseResponseAsJSON(targetReply.data);
|
|
721
|
+
if (extracted && hasAllRequiredFields({
|
|
722
|
+
extracted,
|
|
723
|
+
fieldsToCollect: params.testCase.fieldsToCollect
|
|
724
|
+
})) {
|
|
725
|
+
doneReason = FNOLDoneReason.modelProvidedJson;
|
|
726
|
+
break;
|
|
727
|
+
}
|
|
728
|
+
const lastAssistantMessage = targetReply.data;
|
|
729
|
+
const simulatedUser = await userSimulatorProvider.forward({
|
|
730
|
+
model: userSimulatorModel,
|
|
731
|
+
temperature: params.runConfig.userSimulatorTemperature,
|
|
732
|
+
messages: [
|
|
733
|
+
{
|
|
734
|
+
role: "system",
|
|
735
|
+
content: [
|
|
736
|
+
"You are simulating a real insurance customer (the user).",
|
|
737
|
+
"Answer the assistant's questions truthfully using ONLY the provided user profile and incident details.",
|
|
738
|
+
"If asked about something not present in the profile, say you don't know.",
|
|
739
|
+
"Be concise and natural. Do not invent new facts.",
|
|
740
|
+
"",
|
|
741
|
+
"User profile (JSON):",
|
|
742
|
+
JSON.stringify(params.testCase.userProfile)
|
|
743
|
+
].join("\n")
|
|
744
|
+
},
|
|
745
|
+
{
|
|
746
|
+
role: "user",
|
|
747
|
+
content: lastAssistantMessage
|
|
748
|
+
}
|
|
749
|
+
]
|
|
750
|
+
});
|
|
751
|
+
conversation.push({
|
|
752
|
+
role: "user",
|
|
753
|
+
content: simulatedUser.data
|
|
754
|
+
});
|
|
755
|
+
}
|
|
756
|
+
if (!doneReason) {
|
|
757
|
+
doneReason = FNOLDoneReason.reachedMaxTurns;
|
|
758
|
+
const forced = await params.provider.forward({
|
|
759
|
+
model: params.runConfig.model,
|
|
760
|
+
temperature: params.runConfig.temperature,
|
|
761
|
+
messages: [
|
|
762
|
+
...conversation,
|
|
763
|
+
{
|
|
764
|
+
role: "user",
|
|
765
|
+
content: "Stop the interview now and output ONLY the final JSON object with the collected fields. No extra text."
|
|
766
|
+
}
|
|
767
|
+
]
|
|
768
|
+
});
|
|
769
|
+
conversation.push({ role: "assistant", content: forced.data });
|
|
770
|
+
extracted = parseResponseAsJSON(forced.data);
|
|
771
|
+
if (extracted) {
|
|
772
|
+
doneReason = FNOLDoneReason.forcedFinalJson;
|
|
773
|
+
}
|
|
774
|
+
}
|
|
775
|
+
const completedAt = Date.now();
|
|
776
|
+
const lastAssistant = [...conversation].reverse().find((m) => m.role === "assistant");
|
|
777
|
+
const response = await FNOLResponseSchemaV1.newWithId(
|
|
778
|
+
{
|
|
779
|
+
data: typeof lastAssistant?.content === "string" ? lastAssistant.content : "",
|
|
780
|
+
startedAt,
|
|
781
|
+
completedAt,
|
|
782
|
+
testCaseId: params.testCase.id,
|
|
783
|
+
modelSlug: params.runConfig.model,
|
|
784
|
+
provider: params.provider.kind,
|
|
785
|
+
conversation: conversation.map((m) => ({
|
|
786
|
+
role: m.role === "user" ? "user" : "assistant",
|
|
787
|
+
content: String(m.content)
|
|
788
|
+
})),
|
|
789
|
+
turnsUsed: conversation.filter((m) => m.role === "assistant").length,
|
|
790
|
+
doneReason,
|
|
791
|
+
extracted
|
|
792
|
+
},
|
|
793
|
+
responseIdGenerator
|
|
794
|
+
);
|
|
795
|
+
if (params.scorer?.kind === "fnol.fields") {
|
|
796
|
+
const scorerResult = await params.scorer.score({
|
|
797
|
+
fieldsToCollect: params.testCase.fieldsToCollect,
|
|
798
|
+
extracted
|
|
799
|
+
});
|
|
800
|
+
const score = await FNOLFieldsScoreSchemaV1.newWithId(
|
|
801
|
+
{
|
|
802
|
+
responseId: response.id,
|
|
803
|
+
value: scorerResult.value,
|
|
804
|
+
explanation: scorerResult.explanation,
|
|
805
|
+
metadata: scorerResult.metadata,
|
|
806
|
+
scoringMethod: ScoringMethod.algo,
|
|
807
|
+
requiredKeys: scorerResult.requiredKeys,
|
|
808
|
+
presentKeys: scorerResult.presentKeys,
|
|
809
|
+
missingKeys: scorerResult.missingKeys,
|
|
810
|
+
mismatchedKeys: scorerResult.mismatchedKeys
|
|
811
|
+
},
|
|
812
|
+
scoreIdGenerator
|
|
813
|
+
);
|
|
814
|
+
return { response, score };
|
|
815
|
+
}
|
|
816
|
+
if (params.scorer?.kind === "llmJudge" && params.runConfig.llmJudgeModel) {
|
|
817
|
+
const scorerResult = await params.scorer.score({
|
|
818
|
+
task: "Evaluate whether the FNOL JSON contains the required fields and correct values.",
|
|
819
|
+
candidateAnswer: response.data,
|
|
820
|
+
referenceAnswer: JSON.stringify(
|
|
821
|
+
Object.fromEntries(
|
|
822
|
+
Object.entries(params.testCase.fieldsToCollect).map(([k, v]) => [
|
|
823
|
+
k,
|
|
824
|
+
v.expected
|
|
825
|
+
])
|
|
826
|
+
)
|
|
827
|
+
),
|
|
828
|
+
model: params.runConfig.llmJudgeModel,
|
|
829
|
+
meta: {
|
|
830
|
+
fieldsToCollect: params.testCase.fieldsToCollect,
|
|
831
|
+
doneReason
|
|
832
|
+
}
|
|
833
|
+
});
|
|
834
|
+
if (scorerResult !== null) {
|
|
835
|
+
const score = await FNOLLLMJudgeScoreSchemaV1.newWithId(
|
|
836
|
+
{
|
|
837
|
+
responseId: response.id,
|
|
838
|
+
value: scorerResult.value,
|
|
839
|
+
explanation: scorerResult.explanation,
|
|
840
|
+
metadata: scorerResult.metadata,
|
|
841
|
+
scoringMethod: ScoringMethod.ai,
|
|
842
|
+
verdict: scorerResult.verdict,
|
|
843
|
+
scorerAIProvider: scorerResult.provider,
|
|
844
|
+
scorerAIModelSlug: params.runConfig.llmJudgeModel,
|
|
845
|
+
scorerAIInputTokensUsed: scorerResult.inputTokensUsed,
|
|
846
|
+
scorerAIOutputTokensUsed: scorerResult.outputTokensUsed,
|
|
847
|
+
scorerAIInputCost: scorerResult.inputCost,
|
|
848
|
+
scorerAIOutputCost: scorerResult.outputCost
|
|
849
|
+
},
|
|
850
|
+
scoreIdGenerator
|
|
851
|
+
);
|
|
852
|
+
return { response, score };
|
|
853
|
+
}
|
|
854
|
+
}
|
|
855
|
+
return { response };
|
|
856
|
+
}
|
|
857
|
+
|
|
858
|
+
// src/benchmarks/fnol/scorer.ts
|
|
859
|
+
/**
 * Reports whether an extracted field value counts as "not collected".
 *
 * @param {unknown} value - Value read from the extracted JSON object.
 * @returns {boolean} `true` for `undefined`, `null`, and strings that are empty
 *   or consist only of whitespace; `false` for every other value (including
 *   `0`, `false`, arrays and objects).
 */
function isMissing(value) {
  if (value === undefined || value === null) {
    return true;
  }
  // A blank string carries no information. valuesEqual() compares strings
  // after trimming, so "   " must be treated the same way as "" here.
  return typeof value === "string" && value.trim() === "";
}
|
|
862
|
+
/**
 * Canonicalises a string so that two spellings of the same text compare equal.
 *
 * @param {string} value - String to canonicalise.
 * @returns {string} The input in Unicode NFC form with leading and trailing
 *   whitespace removed.
 * @throws {TypeError} If `value` is not a string (it has no `normalize` method).
 */
function normalizeString(value) {
  // NFC first: "e" + U+0301 and the precomposed "é" are the same text and
  // must not be reported as different by a strict === comparison.
  return value.normalize("NFC").trim();
}
|
|
865
|
+
/**
 * Decides whether a collected value matches the expected one.
 *
 * When both operands are strings they are compared after being passed through
 * normalizeString(); every other combination is compared by the text that
 * stableStringify() produces for each side.
 *
 * @param {unknown} expected - Reference value from the test case.
 * @param {unknown} actual - Value found in the extracted JSON.
 * @returns {boolean} `true` when the two values are considered equal.
 */
function valuesEqual(expected, actual) {
  const bothAreStrings = typeof expected === "string" && typeof actual === "string";
  if (!bothAreStrings) {
    // Non-string or mixed-type pairs: compare their serialized forms.
    const expectedText = stableStringify(expected);
    const actualText = stableStringify(actual);
    return expectedText === actualText;
  }
  const wanted = normalizeString(expected);
  const received = normalizeString(actual);
  return wanted === received;
}
|
|
871
|
+
/**
 * Algorithmic scorer for the FNOL benchmark.
 *
 * Scores an extracted JSON object by the share of required fields that were
 * collected and, where the test case supplies an expected value, matched it.
 */
var FNOLFieldsScorer = class extends AbstractScorer {
  kind = "fnol.fields";

  /**
   * Scores the extracted fields against the fields the test case asks for.
   *
   * @param {object} params
   * @param {Record<string, {required?: boolean, expected?: unknown}>} params.fieldsToCollect
   *   Field definitions keyed by field name. A field is required unless
   *   `required` is exactly `false`.
   * @param {unknown} [params.extracted] - Object extracted from the model
   *   output. Anything that is not a non-array object is treated as "nothing
   *   extracted".
   * @returns {Promise<{value: number, explanation: string, requiredKeys: string[],
   *   presentKeys: string[], missingKeys: string[], mismatchedKeys: string[],
   *   metadata: {requiredCount: number, presentCount: number, missingCount: number,
   *   mismatchedCount: number}}>} `value` is the fraction of required fields
   *   that are present and not mismatched (1 when nothing is required).
   */
  async score(params) {
    const candidate = params.extracted;
    // Only a plain (non-array) object can carry named fields. Arrays and
    // primitives would otherwise answer lookups such as "length".
    const extracted =
      candidate !== null && typeof candidate === "object" && !Array.isArray(candidate)
        ? candidate
        : {};

    const requiredKeys = Object.entries(params.fieldsToCollect)
      .filter(([, field]) => field.required !== false)
      .map(([key]) => key);

    const presentKeys = [];
    const missingKeys = [];
    const mismatchedKeys = [];

    for (const key of requiredKeys) {
      // Own properties only: a field called "toString" or "constructor" must
      // not be satisfied by what every object inherits from Object.prototype.
      const value = Object.hasOwn(extracted, key) ? extracted[key] : undefined;
      if (isMissing(value)) {
        missingKeys.push(key);
        continue;
      }
      presentKeys.push(key);
      const expected = params.fieldsToCollect[key]?.expected;
      // Fields without an expected value only need to be present.
      if (expected !== undefined && !valuesEqual(expected, value)) {
        mismatchedKeys.push(key);
      }
    }

    const requiredCount = requiredKeys.length;
    const correctCount = requiredCount - missingKeys.length - mismatchedKeys.length;
    // With nothing required there is nothing to get wrong, hence a full score.
    const score = requiredCount === 0 ? 1 : correctCount / requiredCount;
    const isPerfect = missingKeys.length === 0 && mismatchedKeys.length === 0;

    return {
      value: Math.max(0, Math.min(1, score)),
      explanation: isPerfect ? "All required fields collected" : "Missing or mismatched fields",
      requiredKeys,
      presentKeys,
      missingKeys,
      mismatchedKeys,
      metadata: {
        requiredCount,
        presentCount: presentKeys.length,
        missingCount: missingKeys.length,
        mismatchedCount: mismatchedKeys.length
      }
    };
  }
};
|
|
910
|
+
export {
|
|
911
|
+
fnol_exports as fnol,
|
|
912
|
+
mmlu_pro_exports as mmluPro,
|
|
913
|
+
peerbench_exports as peerbench
|
|
914
|
+
};
|
|
915
|
+
//# sourceMappingURL=index.js.map
|