peerbench 0.0.2-alpha.0 → 0.0.2-alpha.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. package/README.md +123 -99
  2. package/dist/aggregators/index.d.ts +67 -0
  3. package/dist/aggregators/index.js +46 -0
  4. package/dist/aggregators/index.js.map +1 -0
  5. package/dist/benchmarks/index.d.ts +615 -1271
  6. package/dist/benchmarks/index.js +358 -805
  7. package/dist/benchmarks/index.js.map +1 -1
  8. package/dist/{chunk-DUBKY73H.js → chunk-4UBK6452.js} +13 -13
  9. package/dist/chunk-4UBK6452.js.map +1 -0
  10. package/dist/chunk-ERALDEZY.js +112 -0
  11. package/dist/chunk-ERALDEZY.js.map +1 -0
  12. package/dist/{chunk-ZJWSK4VO.js → chunk-HMQYGCKI.js} +1 -1
  13. package/dist/chunk-HMQYGCKI.js.map +1 -0
  14. package/dist/chunk-NUEOE3K5.js +8 -0
  15. package/dist/chunk-NUEOE3K5.js.map +1 -0
  16. package/dist/chunk-OQE6TQXZ.js +42 -0
  17. package/dist/chunk-OQE6TQXZ.js.map +1 -0
  18. package/dist/chunk-QY5MPNNB.js +28 -0
  19. package/dist/chunk-QY5MPNNB.js.map +1 -0
  20. package/dist/chunk-R76XA2K6.js +229 -0
  21. package/dist/chunk-R76XA2K6.js.map +1 -0
  22. package/dist/chunk-TRNCF2BG.js +35 -0
  23. package/dist/chunk-TRNCF2BG.js.map +1 -0
  24. package/dist/chunk-UHHHSYVE.js +11 -0
  25. package/dist/chunk-UHHHSYVE.js.map +1 -0
  26. package/dist/{chunk-232PY7K3.js → chunk-YY33MNMV.js} +29 -14
  27. package/dist/chunk-YY33MNMV.js.map +1 -0
  28. package/dist/chunk-ZEWI24CV.js +365 -0
  29. package/dist/chunk-ZEWI24CV.js.map +1 -0
  30. package/dist/chunk-ZXTQJFGL.js +44 -0
  31. package/dist/chunk-ZXTQJFGL.js.map +1 -0
  32. package/dist/index-BAioQhp2.d.ts +27 -0
  33. package/dist/index.d.ts +51 -26
  34. package/dist/index.js +28 -25
  35. package/dist/index.js.map +1 -1
  36. package/dist/json-file-ZwzLUbje.d.ts +73 -0
  37. package/dist/llm-judge-QThCZ9TQ.d.ts +67 -0
  38. package/dist/providers/index.d.ts +16 -19
  39. package/dist/providers/index.js +8 -253
  40. package/dist/providers/index.js.map +1 -1
  41. package/dist/schemas/extensions/index.d.ts +16 -2
  42. package/dist/schemas/extensions/index.js +9 -3
  43. package/dist/schemas/extensions/index.js.map +1 -1
  44. package/dist/schemas/index.d.ts +108 -141
  45. package/dist/schemas/index.js +7 -10
  46. package/dist/schemas/llm/index.d.ts +100 -82
  47. package/dist/schemas/llm/index.js +7 -29
  48. package/dist/schemas/llm/index.js.map +1 -1
  49. package/dist/scorers/index.d.ts +3 -2
  50. package/dist/scorers/index.js +8 -486
  51. package/dist/scorers/index.js.map +1 -1
  52. package/dist/storages/index.d.ts +69 -0
  53. package/dist/storages/index.js +98 -0
  54. package/dist/storages/index.js.map +1 -0
  55. package/package.json +12 -6
  56. package/dist/catalogs/index.d.ts +0 -75
  57. package/dist/catalogs/index.js +0 -88
  58. package/dist/catalogs/index.js.map +0 -1
  59. package/dist/chunk-22HU24QF.js +0 -8
  60. package/dist/chunk-22HU24QF.js.map +0 -1
  61. package/dist/chunk-232PY7K3.js.map +0 -1
  62. package/dist/chunk-7TREBPSJ.js +0 -26
  63. package/dist/chunk-7TREBPSJ.js.map +0 -1
  64. package/dist/chunk-DUBKY73H.js.map +0 -1
  65. package/dist/chunk-GVF4YZF3.js +0 -15
  66. package/dist/chunk-GVF4YZF3.js.map +0 -1
  67. package/dist/chunk-HJH3SW3L.js +0 -103
  68. package/dist/chunk-HJH3SW3L.js.map +0 -1
  69. package/dist/chunk-IUN2IUCS.js +0 -58
  70. package/dist/chunk-IUN2IUCS.js.map +0 -1
  71. package/dist/chunk-VBOM2YEG.js +0 -47
  72. package/dist/chunk-VBOM2YEG.js.map +0 -1
  73. package/dist/chunk-ZJWSK4VO.js.map +0 -1
  74. package/dist/data-BmN5WjZ4.d.ts +0 -57
  75. package/dist/generic-array-DLHWSvf1.d.ts +0 -22
  76. package/dist/index-WiPjF2AL.d.ts +0 -15
  77. package/dist/llm-judge-DIG1f1Az.d.ts +0 -67
  78. package/dist/simple-system-prompt-CzPYuvo0.d.ts +0 -49
  79. package/dist/system-prompt--0FdPWqK.d.ts +0 -58
  80. package/dist/utilities-BrRH32rD.d.ts +0 -30
@@ -1,36 +1,42 @@
1
1
  import {
2
- AbstractDataLoader,
3
- GenericJSONArrayDataLoader,
4
- stableStringify
5
- } from "../chunk-HJH3SW3L.js";
2
+ SimpleSystemPromptSchemaV1
3
+ } from "../chunk-ZXTQJFGL.js";
6
4
  import {
7
- AbstractScorer
8
- } from "../chunk-22HU24QF.js";
5
+ defineRunner
6
+ } from "../chunk-QY5MPNNB.js";
9
7
  import {
10
- bufferToString,
11
- idGeneratorUUIDv7,
12
- parseResponseAsJSON
13
- } from "../chunk-DUBKY73H.js";
8
+ AbstractLLMProvider
9
+ } from "../chunk-R76XA2K6.js";
14
10
  import {
15
- BaseBenchmarkSpecSchemaV1,
11
+ LLMAsAJudgeScorer,
12
+ MCQScorer
13
+ } from "../chunk-ZEWI24CV.js";
14
+ import {
15
+ PEERBENCH_NAMESPACE
16
+ } from "../chunk-UHHHSYVE.js";
17
+ import {
18
+ BaseResponseSchemaV1,
16
19
  BaseScoreSchemaV1,
17
20
  BaseTestCaseSchemaV1,
18
- defineBenchmarkSpecSchema,
21
+ defineResponseSchema,
19
22
  defineScoreSchema,
20
23
  defineTestCaseSchema
21
- } from "../chunk-232PY7K3.js";
24
+ } from "../chunk-YY33MNMV.js";
25
+ import "../chunk-OQE6TQXZ.js";
22
26
  import {
23
27
  ScoringMethod
24
- } from "../chunk-ZJWSK4VO.js";
28
+ } from "../chunk-HMQYGCKI.js";
25
29
  import {
26
- ExtensionLLMAsAJudgeScorerFieldsV1
27
- } from "../chunk-GVF4YZF3.js";
30
+ JSONFileStorage
31
+ } from "../chunk-ERALDEZY.js";
28
32
  import {
29
- BaseLLMChatResponseSchemaV1
30
- } from "../chunk-7TREBPSJ.js";
33
+ idGeneratorUUIDv7
34
+ } from "../chunk-4UBK6452.js";
31
35
  import {
32
- defineResponseSchema
33
- } from "../chunk-IUN2IUCS.js";
36
+ ExtensionLLMAsAJudgeScoreFieldsV1,
37
+ ExtensionLLMResponseFieldsV1
38
+ } from "../chunk-TRNCF2BG.js";
39
+ import "../chunk-NUEOE3K5.js";
34
40
  import {
35
41
  __export
36
42
  } from "../chunk-PZ5AY32C.js";
@@ -38,878 +44,425 @@ import {
38
44
  // src/benchmarks/peerbench/index.ts
39
45
  var peerbench_exports = {};
40
46
  __export(peerbench_exports, {
41
- PeerbenchJSONDataLoader: () => PeerbenchJSONDataLoader,
42
- PeerbenchMultipleChoiceResponseSchemaV1: () => PeerbenchMultipleChoiceResponseSchemaV1,
43
- PeerbenchMultipleChoiceScoreSchemaV1: () => PeerbenchMultipleChoiceScoreSchemaV1,
44
- PeerbenchMultipleChoiceTestCaseSchemaV1: () => PeerbenchMultipleChoiceTestCaseSchemaV1,
45
- PeerbenchOpenEndedResponseSchemaV1: () => PeerbenchOpenEndedResponseSchemaV1,
46
- PeerbenchOpenEndedScoreSchemaV1: () => PeerbenchOpenEndedScoreSchemaV1,
47
- PeerbenchOpenEndedTestCaseSchemaV1: () => PeerbenchOpenEndedTestCaseSchemaV1,
48
- runTestCase: () => runTestCase
47
+ MCQKind: () => MCQKind,
48
+ MCQResponseSchemaV1: () => MCQResponseSchemaV1,
49
+ MCQScoreSchemaV1: () => MCQScoreSchemaV1,
50
+ MCQTestCaseSchemaV1: () => MCQTestCaseSchemaV1,
51
+ MultiTurnKind: () => MultiTurnKind,
52
+ MultiTurnResponseSchemaV1: () => MultiTurnResponseSchemaV1,
53
+ MultiTurnScoreSchemaV1: () => MultiTurnScoreSchemaV1,
54
+ MultiTurnTestCaseSchemaV1: () => MultiTurnTestCaseSchemaV1,
55
+ PeerbenchJSONStorage: () => PeerbenchJSONStorage,
56
+ QAKind: () => QAKind,
57
+ QAResponseSchemaV1: () => QAResponseSchemaV1,
58
+ QAScoreSchemaV1: () => QAScoreSchemaV1,
59
+ QATestCaseSchemaV1: () => QATestCaseSchemaV1,
60
+ peerbenchRunner: () => peerbenchRunner
49
61
  });
50
62
 
51
- // src/benchmarks/peerbench/test-cases/mcq.v1.ts
63
+ // src/benchmarks/peerbench/schema-sets/mcq.v1.ts
52
64
  import { z } from "zod";
53
-
54
- // src/benchmarks/peerbench/score.ts
55
- var PeerbenchBaseScoreSchemaV1 = defineScoreSchema({
56
- baseSchema: BaseScoreSchemaV1,
57
- fields: {
58
- ...ExtensionLLMAsAJudgeScorerFieldsV1
59
- }
60
- });
61
-
62
- // src/benchmarks/peerbench/test-cases/mcq.v1.ts
63
- var PeerbenchMultipleChoiceTestCaseSchemaV1 = defineTestCaseSchema({
65
+ var MCQKind = `llm/mcq`;
66
+ var MCQTestCaseSchemaV1 = defineTestCaseSchema({
64
67
  baseSchema: BaseTestCaseSchemaV1,
65
- kind: "pb.ts.mcq",
68
+ namespace: PEERBENCH_NAMESPACE,
69
+ kind: MCQKind,
66
70
  schemaVersion: 1,
67
71
  fields: {
68
72
  question: z.string(),
69
73
  options: z.record(z.string(), z.string()),
70
- answer: z.string(),
71
- answerKey: z.string()
74
+ correctAnswerKeys: z.string().array()
72
75
  }
73
76
  });
74
- var PeerbenchMultipleChoiceResponseSchemaV1 = defineResponseSchema({
75
- baseSchema: BaseLLMChatResponseSchemaV1,
76
- kind: "pb.rs.mcq",
77
+ var MCQResponseSchemaV1 = defineResponseSchema({
78
+ baseSchema: BaseResponseSchemaV1,
79
+ namespace: PEERBENCH_NAMESPACE,
80
+ kind: MCQKind,
77
81
  schemaVersion: 1,
78
- fields: {}
82
+ fields: {
83
+ ...ExtensionLLMResponseFieldsV1
84
+ }
79
85
  });
80
- var PeerbenchMultipleChoiceScoreSchemaV1 = defineScoreSchema({
81
- baseSchema: PeerbenchBaseScoreSchemaV1,
82
- kind: "pb.sc.mcq",
86
+ var MCQScoreSchemaV1 = defineScoreSchema({
87
+ baseSchema: BaseScoreSchemaV1,
88
+ namespace: PEERBENCH_NAMESPACE,
89
+ kind: MCQKind,
83
90
  schemaVersion: 1,
84
91
  fields: {
92
+ ...ExtensionLLMAsAJudgeScoreFieldsV1,
85
93
  extractedAnswers: z.array(z.string())
86
94
  }
87
95
  });
88
96
 
89
- // src/benchmarks/peerbench/test-cases/open-ended.v1.ts
97
+ // src/benchmarks/peerbench/schema-sets/multi-turn.v1.ts
90
98
  import { z as z2 } from "zod";
91
- var PeerbenchOpenEndedTestCaseSchemaV1 = defineTestCaseSchema({
92
- kind: "pb.ts.open-ended",
93
- schemaVersion: 1,
99
+ var MultiTurnKind = `llm/multi-turn`;
100
+ var MultiTurnTestCaseSchemaV1 = defineTestCaseSchema({
94
101
  baseSchema: BaseTestCaseSchemaV1,
102
+ namespace: PEERBENCH_NAMESPACE,
103
+ kind: MultiTurnKind,
104
+ schemaVersion: 1,
95
105
  fields: {
96
- question: z2.string(),
97
- answer: z2.string().optional()
106
+ messages: z2.object({
107
+ role: z2.string(),
108
+ content: z2.string(),
109
+ goodAnswers: z2.string().array().optional(),
110
+ badAnswers: z2.string().array().optional()
111
+ }).array(),
112
+ maxTurns: z2.number().optional(),
113
+ expectedOutcome: z2.string().optional()
98
114
  }
99
115
  });
100
- var PeerbenchOpenEndedResponseSchemaV1 = defineResponseSchema({
101
- baseSchema: BaseLLMChatResponseSchemaV1,
102
- kind: "pb.rs.open-ended",
103
- schemaVersion: 1
104
- });
105
- var PeerbenchOpenEndedScoreSchemaV1 = defineScoreSchema({
106
- baseSchema: PeerbenchBaseScoreSchemaV1,
107
- kind: "pb.sc.open-ended",
108
- schemaVersion: 1,
109
- fields: {}
110
- });
111
-
112
- // src/benchmarks/peerbench/spec.ts
113
- import z3 from "zod";
114
- var PeerbenchBenchmarkSpecSchemaV1 = defineBenchmarkSpecSchema({
115
- baseSchema: BaseBenchmarkSpecSchemaV1,
116
- kind: "pb.benchmark.spec",
116
+ var MultiTurnResponseSchemaV1 = defineResponseSchema({
117
+ baseSchema: BaseResponseSchemaV1,
118
+ namespace: PEERBENCH_NAMESPACE,
119
+ kind: MultiTurnKind,
117
120
  schemaVersion: 1,
118
121
  fields: {
119
- /**
120
- * Big text contents that can be referred as <text>{key}</text> in a prompt or system prompt.
121
- */
122
- blobTexts: z3.record(z3.string(), z3.string()).optional()
122
+ ...ExtensionLLMResponseFieldsV1,
123
+ replies: z2.object({
124
+ messageIndex: z2.number(),
125
+ startedAt: z2.number(),
126
+ completedAt: z2.number(),
127
+ data: z2.string(),
128
+ inputTokensUsed: z2.number().optional(),
129
+ outputTokensUsed: z2.number().optional(),
130
+ inputCost: z2.string().optional(),
131
+ outputCost: z2.string().optional()
132
+ }).array()
123
133
  }
124
134
  });
125
-
126
- // src/benchmarks/peerbench/loader.ts
127
- import z4 from "zod";
128
- var PeerbenchJSONDataLoader = class extends GenericJSONArrayDataLoader {
129
- kind = "pb.load.json.data";
130
- async loadBenchmarkSpec(params) {
131
- const content = bufferToString(params.content);
132
- const parsed = PeerbenchBenchmarkSpecSchemaV1.parse(content);
133
- return parsed;
134
- }
135
- testCaseBuilder(data) {
136
- const testCaseValidation = z4.union([
137
- PeerbenchMultipleChoiceTestCaseSchemaV1,
138
- PeerbenchOpenEndedTestCaseSchemaV1
139
- ]).safeParse(data);
140
- return testCaseValidation.success ? testCaseValidation.data : void 0;
141
- }
142
- async responseBuilder(data) {
143
- const responseValidation = z4.union([
144
- PeerbenchMultipleChoiceResponseSchemaV1,
145
- PeerbenchOpenEndedResponseSchemaV1
146
- ]).safeParse(data);
147
- return responseValidation.success ? responseValidation.data : void 0;
148
- }
149
- async scoreBuilder(data) {
150
- const scoreValidation = z4.union([
151
- PeerbenchMultipleChoiceScoreSchemaV1,
152
- PeerbenchOpenEndedScoreSchemaV1
153
- ]).safeParse(data);
154
- return scoreValidation.success ? scoreValidation.data : void 0;
155
- }
156
- };
157
-
158
- // src/benchmarks/peerbench/runner.ts
159
- async function runTestCase(params) {
160
- const { testCase } = params;
161
- const responseIdGenerator = params.idGenerators?.response ?? idGeneratorUUIDv7;
162
- const scoreIdGenerator = params.idGenerators?.score ?? idGeneratorUUIDv7;
163
- const messages = [];
164
- if (params.systemPrompt) {
165
- messages.push({
166
- role: "system",
167
- content: params.systemPrompt.content
168
- });
169
- }
170
- if (testCase.kind === "pb.ts.mcq") {
171
- const formattedPrompt = formatMCQPrompt(testCase);
172
- messages.push({
173
- role: "user",
174
- content: formattedPrompt
175
- });
176
- const providerResponse = await params.provider.forward({
177
- model: params.runConfig.model,
178
- messages
179
- });
180
- const response = await PeerbenchMultipleChoiceResponseSchemaV1.newWithId(
181
- {
182
- data: providerResponse.data,
183
- startedAt: providerResponse.startedAt,
184
- completedAt: providerResponse.completedAt,
185
- testCaseId: testCase.id,
186
- modelSlug: params.runConfig.model,
187
- provider: params.provider.kind,
188
- inputTokensUsed: providerResponse.inputTokensUsed,
189
- outputTokensUsed: providerResponse.outputTokensUsed,
190
- inputCost: providerResponse.inputCost,
191
- outputCost: providerResponse.outputCost
192
- },
193
- responseIdGenerator
194
- );
195
- if (params.scorer?.kind === "mcq") {
196
- const scorerResult = await params.scorer.score({
197
- response: response.data,
198
- choices: testCase.options ?? {},
199
- correctAnswers: [testCase.answerKey]
200
- });
201
- if (scorerResult !== null) {
202
- const score = await PeerbenchMultipleChoiceScoreSchemaV1.newWithId(
203
- {
204
- scoringMethod: ScoringMethod.algo,
205
- value: scorerResult.value,
206
- responseId: response.id,
207
- extractedAnswers: scorerResult.extractedAnswers,
208
- metadata: response.metadata
209
- },
210
- scoreIdGenerator
211
- );
212
- return { response, score };
213
- }
214
- }
215
- return { response };
216
- } else if (testCase.kind === "pb.ts.open-ended") {
217
- const messages2 = [];
218
- if (params.systemPrompt) {
219
- messages2.push({
220
- role: "system",
221
- content: params.systemPrompt.content
222
- });
223
- }
224
- messages2.push({
225
- role: "user",
226
- content: testCase.question
227
- });
228
- const providerResponse = await params.provider.forward({
229
- model: params.runConfig.model,
230
- messages: messages2
231
- });
232
- const response = await PeerbenchOpenEndedResponseSchemaV1.newWithId(
233
- {
234
- data: providerResponse.data,
235
- startedAt: providerResponse.startedAt,
236
- completedAt: providerResponse.completedAt,
237
- testCaseId: testCase.id,
238
- modelSlug: params.runConfig.model,
239
- provider: params.provider.kind,
240
- inputTokensUsed: providerResponse.inputTokensUsed,
241
- outputTokensUsed: providerResponse.outputTokensUsed,
242
- inputCost: providerResponse.inputCost,
243
- outputCost: providerResponse.outputCost
244
- },
245
- responseIdGenerator
246
- );
247
- if (params.scorer?.kind === "llmJudge" && params.runConfig.llmJudgeModel) {
248
- const scorerResult = await params.scorer.score({
249
- task: testCase.question,
250
- candidateAnswer: response.data,
251
- referenceAnswer: testCase.answer,
252
- model: params.runConfig.llmJudgeModel
253
- });
254
- if (scorerResult !== null) {
255
- const score = await PeerbenchOpenEndedScoreSchemaV1.newWithId(
256
- {
257
- scoringMethod: ScoringMethod.ai,
258
- value: scorerResult.value,
259
- responseId: response.id,
260
- explanation: scorerResult.explanation,
261
- metadata: scorerResult.metadata,
262
- scorerAIProvider: scorerResult.provider,
263
- scorerAIModelSlug: params.runConfig.llmJudgeModel,
264
- scorerAIInputTokensUsed: scorerResult.inputTokensUsed,
265
- scorerAIOutputTokensUsed: scorerResult.outputTokensUsed,
266
- scorerAIInputCost: scorerResult.inputCost,
267
- scorerAIOutputCost: scorerResult.outputCost
268
- },
269
- scoreIdGenerator
270
- );
271
- return { response, score };
272
- }
273
- }
274
- return { response };
275
- }
276
- throw new Error("Unsupported test case kind");
277
- }
278
- function formatMCQPrompt(testCase) {
279
- return `Question: ${testCase.question}
280
- Options:
281
- ${Object.entries(
282
- testCase.options ?? {}
283
- ).map(([key, value]) => `${key}: ${value}`).join("\n")}`;
284
- }
285
-
286
- // src/benchmarks/mmlu-pro/index.ts
287
- var mmlu_pro_exports = {};
288
- __export(mmlu_pro_exports, {
289
- BaseMMLUProScoreSchemaV1: () => BaseMMLUProScoreSchemaV1,
290
- MMLUProBenchmarkSpecSchemaV1: () => MMLUProBenchmarkSpecSchemaV1,
291
- MMLUProJSONDataLoader: () => MMLUProJSONDataLoader,
292
- MMLUProMainResponseSchemaV1: () => MMLUProMainResponseSchemaV1,
293
- MMLUProMainScoreSchemaV1: () => MMLUProMainScoreSchemaV1,
294
- MMLUProMainTestCaseSchemaV1: () => MMLUProMainTestCaseSchemaV1,
295
- MMLUProParquetDataLoader: () => MMLUProParquetDataLoader,
296
- runTestCase: () => runTestCase2
297
- });
298
-
299
- // src/benchmarks/mmlu-pro/score.ts
300
- var BaseMMLUProScoreSchemaV1 = defineScoreSchema({
135
+ var MultiTurnScoreSchemaV1 = defineScoreSchema({
301
136
  baseSchema: BaseScoreSchemaV1,
137
+ namespace: PEERBENCH_NAMESPACE,
138
+ kind: MultiTurnKind,
139
+ schemaVersion: 1,
302
140
  fields: {
303
- ...ExtensionLLMAsAJudgeScorerFieldsV1
141
+ ...ExtensionLLMAsAJudgeScoreFieldsV1,
142
+ individualScores: z2.object({
143
+ replyIndex: z2.number(),
144
+ value: z2.number()
145
+ }).array()
304
146
  }
305
147
  });
306
148
 
307
- // src/benchmarks/mmlu-pro/test-cases/main.v1.ts
308
- import { z as z5 } from "zod";
309
- var MMLUProMainTestCaseSchemaV1 = defineTestCaseSchema({
149
+ // src/benchmarks/peerbench/schema-sets/qa.v1.ts
150
+ import { z as z3 } from "zod";
151
+ var QAKind = `llm/qa`;
152
+ var QATestCaseSchemaV1 = defineTestCaseSchema({
310
153
  baseSchema: BaseTestCaseSchemaV1,
311
- kind: "mmlu-pro.ts.main",
154
+ namespace: PEERBENCH_NAMESPACE,
155
+ kind: QAKind,
312
156
  schemaVersion: 1,
313
157
  fields: {
314
- question: z5.string(),
315
- options: z5.record(z5.string(), z5.string()),
316
- answer: z5.string(),
317
- answerKey: z5.string()
158
+ question: z3.string(),
159
+ goodAnswers: z3.string().array(),
160
+ badAnswers: z3.string().array()
318
161
  }
319
162
  });
320
- var MMLUProMainResponseSchemaV1 = defineResponseSchema({
321
- baseSchema: BaseLLMChatResponseSchemaV1,
322
- kind: "mmlu-pro.rs.main",
323
- schemaVersion: 1
324
- });
325
- var MMLUProMainScoreSchemaV1 = defineScoreSchema({
326
- baseSchema: BaseMMLUProScoreSchemaV1,
327
- kind: "mmlu-pro.sc.main",
163
+ var QAResponseSchemaV1 = defineResponseSchema({
164
+ baseSchema: BaseResponseSchemaV1,
165
+ namespace: PEERBENCH_NAMESPACE,
166
+ kind: QAKind,
328
167
  schemaVersion: 1,
329
168
  fields: {
330
- extractedAnswers: z5.array(z5.string())
169
+ ...ExtensionLLMResponseFieldsV1
331
170
  }
332
171
  });
333
-
334
- // src/benchmarks/mmlu-pro/loader.ts
335
- import z6 from "zod";
336
- import { parquetReadObjects } from "hyparquet";
337
- var jsonSchema = z6.object({
338
- question_id: z6.coerce.number(),
339
- question: z6.string(),
340
- options: z6.array(z6.string()),
341
- answer: z6.string(),
342
- answer_index: z6.coerce.number(),
343
- cot_content: z6.string(),
344
- category: z6.string(),
345
- src: z6.string()
346
- }).array();
347
- function mapData(data) {
348
- return {
349
- responses: [],
350
- scores: [],
351
- testCases: data.map(
352
- (item) => MMLUProMainTestCaseSchemaV1.new({
353
- id: `${item.src}-${item.category}-${item.question_id}`,
354
- question: item.question,
355
- answerKey: item.answer,
356
- options: item.options.reduce(
357
- (acc, option, index) => {
358
- acc[String.fromCharCode(65 + index)] = option;
359
- return acc;
360
- },
361
- {}
362
- ),
363
- answer: item.options[item.answer_index],
364
- metadata: {
365
- category: item.category,
366
- src: item.src,
367
- answer_index: item.answer_index
368
- }
369
- })
370
- )
371
- };
372
- }
373
- var MMLUProJSONDataLoader = class extends AbstractDataLoader {
374
- kind = "mmlu-pro.load.json.data";
375
- loadData(params) {
376
- const content = typeof params.content === "string" ? params.content : bufferToString(params.content);
377
- const parsed = jsonSchema.parse(JSON.parse(content));
378
- return mapData(parsed);
379
- }
380
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
381
- loadBenchmarkSpec(params) {
382
- throw new Error("Not implemented");
383
- }
384
- };
385
- var MMLUProParquetDataLoader = class extends AbstractDataLoader {
386
- kind = "mmlu-pro.load.parquet.data";
387
- async loadData(params) {
388
- const data = await parquetReadObjects({
389
- file: params.content.buffer
390
- });
391
- if (!data) {
392
- throw new Error("Invalid Parquet file");
393
- }
394
- return mapData(jsonSchema.parse(data));
395
- }
396
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
397
- loadBenchmarkSpec(params) {
398
- throw new Error("Not implemented");
172
+ var QAScoreSchemaV1 = defineScoreSchema({
173
+ baseSchema: BaseScoreSchemaV1,
174
+ namespace: PEERBENCH_NAMESPACE,
175
+ kind: QAKind,
176
+ schemaVersion: 1,
177
+ fields: {
178
+ ...ExtensionLLMAsAJudgeScoreFieldsV1
399
179
  }
400
- };
180
+ });
401
181
 
402
- // src/benchmarks/mmlu-pro/runner.ts
403
- async function runTestCase2(params) {
404
- const { testCase } = params;
405
- const responseIdGenerator = params.idGenerators?.response ?? idGeneratorUUIDv7;
406
- const scoreIdGenerator = params.idGenerators?.score ?? idGeneratorUUIDv7;
407
- const messages = [];
408
- if (params.systemPrompt) {
409
- messages.push({
410
- role: "system",
411
- content: params.systemPrompt.content
412
- });
413
- }
414
- if (testCase.kind === "mmlu-pro.ts.main") {
415
- const formattedPrompt = formatMCQPrompt2(testCase);
416
- messages.push({
417
- role: "user",
418
- content: formattedPrompt
419
- });
420
- const providerResponse = await params.provider.forward({
421
- model: params.runConfig.model,
422
- messages
423
- });
424
- const response = await MMLUProMainResponseSchemaV1.newWithId(
182
+ // src/benchmarks/peerbench/runner.ts
183
+ import Handlebars from "handlebars";
184
+ import z4 from "zod";
185
+ var peerbenchRunner = defineRunner(
186
+ {
187
+ schemaSets: [
425
188
  {
426
- data: providerResponse.data,
427
- startedAt: providerResponse.startedAt,
428
- completedAt: providerResponse.completedAt,
429
- testCaseId: testCase.id,
430
- modelSlug: params.runConfig.model,
431
- provider: params.provider.kind,
432
- inputTokensUsed: providerResponse.inputTokensUsed,
433
- outputTokensUsed: providerResponse.outputTokensUsed,
434
- inputCost: providerResponse.inputCost,
435
- outputCost: providerResponse.outputCost
189
+ testCase: MCQTestCaseSchemaV1,
190
+ response: MCQResponseSchemaV1,
191
+ score: MCQScoreSchemaV1
436
192
  },
437
- responseIdGenerator
438
- );
439
- if (params.scorer?.kind === "mcq") {
440
- const scorerResult = await params.scorer.score({
441
- response: response.data,
442
- choices: testCase.options ?? {},
443
- correctAnswers: [testCase.answerKey]
444
- });
445
- if (scorerResult !== null) {
446
- const score = await MMLUProMainScoreSchemaV1.newWithId(
447
- {
448
- scoringMethod: ScoringMethod.algo,
449
- value: scorerResult.value,
450
- responseId: response.id,
451
- extractedAnswers: scorerResult.extractedAnswers,
452
- metadata: response.metadata
453
- },
454
- scoreIdGenerator
455
- );
456
- return { response, score };
193
+ {
194
+ testCase: QATestCaseSchemaV1,
195
+ response: QAResponseSchemaV1,
196
+ score: QAScoreSchemaV1
457
197
  }
198
+ ],
199
+ providers: [AbstractLLMProvider],
200
+ scorers: [LLMAsAJudgeScorer, MCQScorer],
201
+ runConfigSchema: {
202
+ model: z4.string(),
203
+ llmJudgeModel: z4.string().optional(),
204
+ llmJudgeSystemPrompt: SimpleSystemPromptSchemaV1.optional(),
205
+ llmJudgeFieldsToExtract: z4.record(z4.string(), z4.custom()).optional(),
206
+ systemPrompt: SimpleSystemPromptSchemaV1.optional(),
207
+ templateVariables: z4.record(z4.string(), z4.string()).optional()
458
208
  }
459
- return { response };
460
- } else if (testCase.kind === "pb.ts.open-ended") {
461
- const messages2 = [];
462
- if (params.systemPrompt) {
463
- messages2.push({
209
+ },
210
+ async (params) => {
211
+ const { testCase, provider, scorer, runConfig } = params;
212
+ const messages = [];
213
+ if (runConfig.systemPrompt) {
214
+ messages.push({
464
215
  role: "system",
465
- content: params.systemPrompt.content
216
+ content: runConfig.systemPrompt.content
466
217
  });
467
218
  }
468
- messages2.push({
469
- role: "user",
470
- content: testCase.question
471
- });
472
- const providerResponse = await params.provider.forward({
473
- model: params.runConfig.model,
474
- messages: messages2
475
- });
476
- const response = await MMLUProMainResponseSchemaV1.newWithId(
477
- {
478
- data: providerResponse.data,
479
- startedAt: providerResponse.startedAt,
480
- completedAt: providerResponse.completedAt,
481
- testCaseId: testCase.id,
482
- modelSlug: params.runConfig.model,
483
- provider: params.provider.kind,
484
- inputTokensUsed: providerResponse.inputTokensUsed,
485
- outputTokensUsed: providerResponse.outputTokensUsed,
486
- inputCost: providerResponse.inputCost,
487
- outputCost: providerResponse.outputCost
488
- },
489
- responseIdGenerator
490
- );
491
- if (params.scorer?.kind === "llmJudge" && params.runConfig.llmJudgeModel) {
492
- const scorerResult = await params.scorer.score({
493
- task: testCase.question,
494
- candidateAnswer: response.data,
495
- referenceAnswer: testCase.answer,
496
- model: params.runConfig.llmJudgeModel
219
+ if (testCase.kind === "llm/mcq.tc") {
220
+ messages.push({
221
+ role: "user",
222
+ content: formatMCQ(testCase)
223
+ });
224
+ templateMessages(messages, runConfig.templateVariables ?? {});
225
+ return runMCQ({
226
+ testCase,
227
+ messages,
228
+ provider,
229
+ scorer,
230
+ runConfig,
231
+ idGenerators: {
232
+ response: params.idGenerators?.response ?? idGeneratorUUIDv7,
233
+ score: params.idGenerators?.score ?? idGeneratorUUIDv7
234
+ }
497
235
  });
498
- if (scorerResult !== null) {
499
- const score = await MMLUProMainScoreSchemaV1.newWithId(
500
- {
501
- scoringMethod: ScoringMethod.ai,
502
- value: scorerResult.value,
503
- responseId: response.id,
504
- explanation: scorerResult.explanation,
505
- metadata: scorerResult.metadata,
506
- extractedAnswers: [],
507
- scorerAIProvider: scorerResult.provider,
508
- scorerAIModelSlug: params.runConfig.llmJudgeModel,
509
- scorerAIInputTokensUsed: scorerResult.inputTokensUsed,
510
- scorerAIOutputTokensUsed: scorerResult.outputTokensUsed,
511
- scorerAIInputCost: scorerResult.inputCost,
512
- scorerAIOutputCost: scorerResult.outputCost
513
- },
514
- scoreIdGenerator
236
+ }
237
+ if (testCase.kind === "llm/qa.tc") {
238
+ if (scorer && scorer?.kind !== `${PEERBENCH_NAMESPACE}/llm-as-a-judge`) {
239
+ throw new Error(
240
+ `QA test cases can only be scored with an LLM as a judge scorer, but ${scorer?.kind} was provided`
515
241
  );
516
- return { response, score };
517
242
  }
243
+ messages.push({
244
+ role: "user",
245
+ content: testCase.question
246
+ });
247
+ templateMessages(messages, runConfig.templateVariables ?? {});
248
+ return runQA({
249
+ testCase,
250
+ messages,
251
+ provider,
252
+ scorer,
253
+ runConfig,
254
+ idGenerators: {
255
+ response: params.idGenerators?.response ?? idGeneratorUUIDv7,
256
+ score: params.idGenerators?.score ?? idGeneratorUUIDv7
257
+ }
258
+ });
518
259
  }
519
- return { response };
520
- }
521
- throw new Error("Unsupported test case kind");
522
- }
523
- function formatMCQPrompt2(testCase) {
524
- return `Question: ${testCase.question}
525
- Options:
526
- ${Object.entries(
527
- testCase.options ?? {}
528
- ).map(([key, value]) => `${key}: ${value}`).join("\n")}`;
529
- }
530
-
531
- // src/benchmarks/mmlu-pro/spec.ts
532
- var MMLUProBenchmarkSpecSchemaV1 = defineBenchmarkSpecSchema({
533
- baseSchema: BaseBenchmarkSpecSchemaV1,
534
- kind: "mmlu-pro.benchmark.spec",
535
- schemaVersion: 1,
536
- fields: {}
537
- });
538
-
539
- // src/benchmarks/fnol/index.ts
540
- var fnol_exports = {};
541
- __export(fnol_exports, {
542
- FNOLBaseScoreSchemaV1: () => FNOLBaseScoreSchemaV1,
543
- FNOLConversationMessageSchemaV1: () => FNOLConversationMessageSchemaV1,
544
- FNOLDoneReason: () => FNOLDoneReason,
545
- FNOLFieldSchemaV1: () => FNOLFieldSchemaV1,
546
- FNOLFieldValueType: () => FNOLFieldValueType,
547
- FNOLFieldsScoreSchemaV1: () => FNOLFieldsScoreSchemaV1,
548
- FNOLFieldsScorer: () => FNOLFieldsScorer,
549
- FNOLLLMJudgeScoreSchemaV1: () => FNOLLLMJudgeScoreSchemaV1,
550
- FNOLResponseSchemaV1: () => FNOLResponseSchemaV1,
551
- FNOLTestCaseSchemaV1: () => FNOLTestCaseSchemaV1,
552
- runTestCase: () => runTestCase3
553
- });
554
-
555
- // src/benchmarks/fnol/test-cases/fnol.v1.ts
556
- import { z as z7 } from "zod";
557
-
558
- // src/benchmarks/fnol/score.ts
559
- var FNOLBaseScoreSchemaV1 = defineScoreSchema({
560
- baseSchema: BaseScoreSchemaV1,
561
- fields: {
562
- ...ExtensionLLMAsAJudgeScorerFieldsV1
563
- }
564
- });
565
-
566
- // src/benchmarks/fnol/types.ts
567
- var FNOLFieldValueType = {
568
- string: "string",
569
- number: "number",
570
- boolean: "boolean",
571
- object: "object"
572
- };
573
- var FNOLDoneReason = {
574
- modelProvidedJson: "modelProvidedJson",
575
- reachedMaxTurns: "reachedMaxTurns",
576
- forcedFinalJson: "forcedFinalJson"
577
- };
578
-
579
- // src/benchmarks/fnol/test-cases/fnol.v1.ts
580
- var FNOLFieldSchemaV1 = z7.object({
581
- description: z7.string(),
582
- required: z7.boolean().optional(),
583
- /**
584
- * Optional expected value used by the deterministic scorer.
585
- * If omitted, the scorer will only check presence.
586
- */
587
- expected: z7.unknown().optional(),
588
- /**
589
- * Optional type hint for the model/user simulation.
590
- */
591
- valueType: z7.enum(FNOLFieldValueType).optional()
592
- });
593
- var FNOLTestCaseSchemaV1 = defineTestCaseSchema({
594
- baseSchema: BaseTestCaseSchemaV1,
595
- kind: "fnol.ts.v1",
596
- schemaVersion: 1,
597
- fields: {
598
- /**
599
- * Scenario starter message. This is what the "user" would say initially.
600
- */
601
- initialUserMessage: z7.string(),
602
- /**
603
- * Private/structured information about the user and the incident.
604
- * This is used by the user simulator LLM to answer the target model questions.
605
- */
606
- userProfile: z7.record(z7.string(), z7.unknown()),
607
- /**
608
- * The fields the target model must collect.
609
- * Keys are canonical identifiers (e.g. "policyNumber", "dateOfLoss").
610
- */
611
- fieldsToCollect: z7.record(z7.string(), FNOLFieldSchemaV1),
612
- /**
613
- * Maximum number of back-and-forth turns (target question + user answer).
614
- */
615
- maxTurns: z7.number().int().min(1).max(100).default(10)
616
- }
617
- });
618
- var FNOLConversationMessageSchemaV1 = z7.object({
619
- role: z7.enum(["system", "user", "assistant"]),
620
- content: z7.string()
621
- });
622
- var FNOLResponseSchemaV1 = defineResponseSchema({
623
- baseSchema: BaseLLMChatResponseSchemaV1,
624
- kind: "fnol.rs.v1",
625
- schemaVersion: 1,
626
- fields: {
627
- /**
628
- * Full conversation between the target model and simulated user.
629
- */
630
- conversation: z7.array(FNOLConversationMessageSchemaV1),
631
- turnsUsed: z7.number().int(),
632
- doneReason: z7.enum(FNOLDoneReason),
633
- /**
634
- * Parsed JSON object from the target model's final answer, if available.
635
- */
636
- extracted: z7.record(z7.string(), z7.unknown()).optional()
637
- }
638
- });
639
- var FNOLFieldsScoreSchemaV1 = defineScoreSchema({
640
- baseSchema: FNOLBaseScoreSchemaV1,
641
- kind: "fnol.sc.fields.v1",
642
- schemaVersion: 1,
643
- fields: {
644
- requiredKeys: z7.array(z7.string()),
645
- presentKeys: z7.array(z7.string()),
646
- missingKeys: z7.array(z7.string()),
647
- mismatchedKeys: z7.array(z7.string())
648
- }
649
- });
650
- var FNOLLLMJudgeScoreSchemaV1 = defineScoreSchema({
651
- baseSchema: FNOLBaseScoreSchemaV1,
652
- kind: "fnol.sc.llm-judge.v1",
653
- schemaVersion: 1,
654
- fields: {
655
- verdict: z7.enum(["pass", "borderline", "fail"]).optional()
656
- }
657
- });
658
-
659
- // src/benchmarks/fnol/runner.ts
660
- function formatFieldsToCollect(fieldsToCollect) {
661
- return Object.entries(fieldsToCollect).map(([key, field]) => {
662
- const required = field.required === false ? "optional" : "required";
663
- return `- ${key} (${required}): ${field.description}`;
664
- }).join("\n");
665
- }
666
- function hasAllRequiredFields(params) {
667
- const extracted = params.extracted ?? {};
668
- for (const [key, field] of Object.entries(params.fieldsToCollect)) {
669
- if (field.required === false) continue;
670
- const value = extracted[key];
671
- if (value === void 0 || value === null || value === "") return false;
672
- }
673
- return true;
674
- }
675
- async function runTestCase3(params) {
676
- const responseIdGenerator = params.idGenerators?.response ?? idGeneratorUUIDv7;
677
- const scoreIdGenerator = params.idGenerators?.score ?? idGeneratorUUIDv7;
678
- const userSimulatorProvider = params.userSimulatorProvider ?? params.provider;
679
- const userSimulatorModel = params.runConfig.userSimulatorModel ?? params.runConfig.model;
680
- const fieldsToCollectText = formatFieldsToCollect(
681
- params.testCase.fieldsToCollect
682
- );
683
- const conversation = [];
684
- if (params.systemPrompt) {
685
- conversation.push({
686
- role: "system",
687
- content: params.systemPrompt.content
688
- });
689
- }
690
- conversation.push({
691
- role: "system",
692
- content: [
693
- "You are an insurance FNOL intake assistant.",
694
- "Your job is to ask the user questions to collect the required fields listed below.",
695
- "Ask concise questions, one or a few at a time.",
696
- "When you have enough information OR when you are told to finish, output ONLY a single JSON object with the collected fields.",
697
- "Do not include markdown fences. Do not include additional text outside the JSON.",
698
- "",
699
- "Fields to collect:",
700
- fieldsToCollectText
701
- ].join("\n")
702
- });
703
- conversation.push({
704
- role: "user",
705
- content: params.testCase.initialUserMessage
260
+ throw new Error("Unsupported test case kind");
261
+ }
262
+ );
263
+ async function runQA(params) {
264
+ const { messages, testCase, provider, scorer, runConfig } = params;
265
+ const providerResponse = await provider.forward({
266
+ model: runConfig.model,
267
+ messages
706
268
  });
707
- let doneReason;
708
- let extracted;
709
- const startedAt = Date.now();
710
- for (let turn = 0; turn < params.testCase.maxTurns; turn++) {
711
- const targetReply = await params.provider.forward({
712
- model: params.runConfig.model,
713
- temperature: params.runConfig.temperature,
714
- messages: conversation
715
- });
716
- conversation.push({
717
- role: "assistant",
718
- content: targetReply.data
719
- });
720
- extracted = parseResponseAsJSON(targetReply.data);
721
- if (extracted && hasAllRequiredFields({
722
- extracted,
723
- fieldsToCollect: params.testCase.fieldsToCollect
724
- })) {
725
- doneReason = FNOLDoneReason.modelProvidedJson;
726
- break;
269
+ const response = await QAResponseSchemaV1.newWithId(
270
+ {
271
+ data: providerResponse.data,
272
+ startedAt: providerResponse.startedAt,
273
+ completedAt: providerResponse.completedAt,
274
+ testCaseId: testCase.id,
275
+ modelSlug: runConfig.model,
276
+ provider: provider.kind,
277
+ systemPromptId: runConfig.systemPrompt?.id,
278
+ inputTokensUsed: providerResponse.inputTokensUsed,
279
+ outputTokensUsed: providerResponse.outputTokensUsed,
280
+ inputCost: providerResponse.inputCost,
281
+ outputCost: providerResponse.outputCost
282
+ },
283
+ params.idGenerators?.response ?? idGeneratorUUIDv7
284
+ );
285
+ if (scorer?.kind === `${PEERBENCH_NAMESPACE}/llm-as-a-judge`) {
286
+ if (!runConfig.llmJudgeModel) {
287
+ throw new Error(
288
+ "LLM judge model is required when using LLM as a judge scorer"
289
+ );
727
290
  }
728
- const lastAssistantMessage = targetReply.data;
729
- const simulatedUser = await userSimulatorProvider.forward({
730
- model: userSimulatorModel,
731
- temperature: params.runConfig.userSimulatorTemperature,
732
- messages: [
733
- {
734
- role: "system",
735
- content: [
736
- "You are simulating a real insurance customer (the user).",
737
- "Answer the assistant's questions truthfully using ONLY the provided user profile and incident details.",
738
- "If asked about something not present in the profile, say you don't know.",
739
- "Be concise and natural. Do not invent new facts.",
740
- "",
741
- "User profile (JSON):",
742
- JSON.stringify(params.testCase.userProfile)
743
- ].join("\n")
744
- },
291
+ const scorerResult = await scorer.score({
292
+ model: runConfig.llmJudgeModel,
293
+ response: response.data,
294
+ rubric: `Expected/Valid answers: ${testCase.goodAnswers.join("\n")}
295
+ Invalid answers: ${testCase.badAnswers.join("\n")}`,
296
+ systemPrompt: runConfig.llmJudgeSystemPrompt?.content,
297
+ criteria: [
745
298
  {
746
- role: "user",
747
- content: lastAssistantMessage
299
+ id: "correctness",
300
+ description: "Is the response matches with the expected/valid answers in terms of meaning?",
301
+ weight: 1
748
302
  }
749
- ]
303
+ ],
304
+ fieldsToExtract: runConfig.llmJudgeFieldsToExtract ?? {}
750
305
  });
751
- conversation.push({
752
- role: "user",
753
- content: simulatedUser.data
754
- });
755
- }
756
- if (!doneReason) {
757
- doneReason = FNOLDoneReason.reachedMaxTurns;
758
- const forced = await params.provider.forward({
759
- model: params.runConfig.model,
760
- temperature: params.runConfig.temperature,
761
- messages: [
762
- ...conversation,
306
+ if (scorerResult !== null) {
307
+ const score = await QAScoreSchemaV1.newWithId(
763
308
  {
764
- role: "user",
765
- content: "Stop the interview now and output ONLY the final JSON object with the collected fields. No extra text."
766
- }
767
- ]
768
- });
769
- conversation.push({ role: "assistant", content: forced.data });
770
- extracted = parseResponseAsJSON(forced.data);
771
- if (extracted) {
772
- doneReason = FNOLDoneReason.forcedFinalJson;
309
+ scoringMethod: ScoringMethod.ai,
310
+ value: scorerResult.value,
311
+ responseId: response.id,
312
+ explanation: scorerResult.explanation,
313
+ scorerAIInputCost: scorerResult.inputCost,
314
+ scorerAIOutputCost: scorerResult.outputCost,
315
+ scorerAIInputTokensUsed: scorerResult.inputTokensUsed,
316
+ scorerAIOutputTokensUsed: scorerResult.outputTokensUsed,
317
+ scorerAIProvider: scorerResult.provider,
318
+ scorerAIModelSlug: runConfig.llmJudgeModel,
319
+ scorerAISystemPromptId: runConfig.llmJudgeSystemPrompt?.id,
320
+ metadata: {
321
+ ...scorerResult.metadata,
322
+ extractedFields: scorerResult.extractedFields
323
+ }
324
+ },
325
+ params.idGenerators?.score ?? idGeneratorUUIDv7
326
+ );
327
+ return { response, score };
773
328
  }
774
329
  }
775
- const completedAt = Date.now();
776
- const lastAssistant = [...conversation].reverse().find((m) => m.role === "assistant");
777
- const response = await FNOLResponseSchemaV1.newWithId(
330
+ return { response };
331
+ }
332
+ async function runMCQ(params) {
333
+ const { messages, testCase, provider, scorer, runConfig } = params;
334
+ const providerResponse = await provider.forward({
335
+ model: runConfig.model,
336
+ messages
337
+ });
338
+ const response = await MCQResponseSchemaV1.newWithId(
778
339
  {
779
- data: typeof lastAssistant?.content === "string" ? lastAssistant.content : "",
780
- startedAt,
781
- completedAt,
782
- testCaseId: params.testCase.id,
783
- modelSlug: params.runConfig.model,
784
- provider: params.provider.kind,
785
- conversation: conversation.map((m) => ({
786
- role: m.role === "user" ? "user" : "assistant",
787
- content: String(m.content)
788
- })),
789
- turnsUsed: conversation.filter((m) => m.role === "assistant").length,
790
- doneReason,
791
- extracted
340
+ data: providerResponse.data,
341
+ startedAt: providerResponse.startedAt,
342
+ completedAt: providerResponse.completedAt,
343
+ testCaseId: testCase.id,
344
+ modelSlug: runConfig.model,
345
+ provider: provider.kind,
346
+ systemPromptId: runConfig.systemPrompt?.id,
347
+ inputTokensUsed: providerResponse.inputTokensUsed,
348
+ outputTokensUsed: providerResponse.outputTokensUsed,
349
+ inputCost: providerResponse.inputCost,
350
+ outputCost: providerResponse.outputCost
792
351
  },
793
- responseIdGenerator
352
+ params.idGenerators?.response ?? idGeneratorUUIDv7
794
353
  );
795
- if (params.scorer?.kind === "fnol.fields") {
796
- const scorerResult = await params.scorer.score({
797
- fieldsToCollect: params.testCase.fieldsToCollect,
798
- extracted
354
+ if (scorer?.kind === `${PEERBENCH_NAMESPACE}/mcq`) {
355
+ const scorerResult = await scorer.score({
356
+ response: response.data,
357
+ choices: testCase.options,
358
+ correctAnswers: testCase.correctAnswerKeys
799
359
  });
800
- const score = await FNOLFieldsScoreSchemaV1.newWithId(
801
- {
802
- responseId: response.id,
803
- value: scorerResult.value,
804
- explanation: scorerResult.explanation,
805
- metadata: scorerResult.metadata,
806
- scoringMethod: ScoringMethod.algo,
807
- requiredKeys: scorerResult.requiredKeys,
808
- presentKeys: scorerResult.presentKeys,
809
- missingKeys: scorerResult.missingKeys,
810
- mismatchedKeys: scorerResult.mismatchedKeys
811
- },
812
- scoreIdGenerator
813
- );
814
- return { response, score };
360
+ if (scorerResult !== null) {
361
+ const score = await MCQScoreSchemaV1.newWithId(
362
+ {
363
+ scoringMethod: ScoringMethod.algo,
364
+ value: scorerResult.value,
365
+ responseId: response.id,
366
+ extractedAnswers: scorerResult.extractedAnswers,
367
+ explanation: scorerResult.explanation,
368
+ metadata: scorerResult.metadata
369
+ },
370
+ params.idGenerators?.score ?? idGeneratorUUIDv7
371
+ );
372
+ return { response, score };
373
+ }
815
374
  }
816
- if (params.scorer?.kind === "llmJudge" && params.runConfig.llmJudgeModel) {
817
- const scorerResult = await params.scorer.score({
818
- task: "Evaluate whether the FNOL JSON contains the required fields and correct values.",
819
- candidateAnswer: response.data,
820
- referenceAnswer: JSON.stringify(
821
- Object.fromEntries(
822
- Object.entries(params.testCase.fieldsToCollect).map(([k, v]) => [
823
- k,
824
- v.expected
825
- ])
826
- )
827
- ),
828
- model: params.runConfig.llmJudgeModel,
829
- meta: {
830
- fieldsToCollect: params.testCase.fieldsToCollect,
831
- doneReason
832
- }
375
+ if (scorer?.kind === `${PEERBENCH_NAMESPACE}/llm-as-a-judge`) {
376
+ if (!runConfig.llmJudgeModel) {
377
+ throw new Error(
378
+ "LLM judge model is required when using LLM as a judge scorer"
379
+ );
380
+ }
381
+ const scorerResult = await scorer.score({
382
+ model: runConfig.llmJudgeModel,
383
+ criteria: [
384
+ {
385
+ id: "correctness",
386
+ description: "Is the given answer key matches with one of the correct answer keys?",
387
+ weight: 1
388
+ }
389
+ ],
390
+ rubric: `Answer text itself or the key (A, B, C) is accepted
391
+ Valid answer keys: ${testCase.correctAnswerKeys.map((key) => `- ${key}`).join("\n")}
392
+ Valid Answer texts: ${testCase.correctAnswerKeys.map((key) => `- ${testCase.options?.[key] ?? ""}`).join("\n")}`,
393
+ fieldsToExtract: {
394
+ extractedAnswers: z4.string().array().describe(
395
+ "The extracted answer keys, valid or invalid (even if the answer text is provided rather than the key)"
396
+ ),
397
+ ...runConfig.llmJudgeFieldsToExtract ?? {}
398
+ },
399
+ response: response.data,
400
+ systemPrompt: runConfig.llmJudgeSystemPrompt?.content
833
401
  });
834
402
  if (scorerResult !== null) {
835
- const score = await FNOLLLMJudgeScoreSchemaV1.newWithId(
403
+ const { extractedAnswers, ...extractedFields } = scorerResult.extractedFields;
404
+ const score = await MCQScoreSchemaV1.newWithId(
836
405
  {
837
- responseId: response.id,
406
+ scoringMethod: ScoringMethod.ai,
838
407
  value: scorerResult.value,
408
+ extractedAnswers,
409
+ responseId: response.id,
839
410
  explanation: scorerResult.explanation,
840
- metadata: scorerResult.metadata,
841
- scoringMethod: ScoringMethod.ai,
842
- verdict: scorerResult.verdict,
843
- scorerAIProvider: scorerResult.provider,
844
- scorerAIModelSlug: params.runConfig.llmJudgeModel,
411
+ scorerAIInputCost: scorerResult.inputCost,
412
+ scorerAIOutputCost: scorerResult.outputCost,
845
413
  scorerAIInputTokensUsed: scorerResult.inputTokensUsed,
846
414
  scorerAIOutputTokensUsed: scorerResult.outputTokensUsed,
847
- scorerAIInputCost: scorerResult.inputCost,
848
- scorerAIOutputCost: scorerResult.outputCost
415
+ scorerAIProvider: scorerResult.provider,
416
+ scorerAIModelSlug: runConfig.llmJudgeModel,
417
+ scorerAISystemPromptId: runConfig.llmJudgeSystemPrompt?.id,
418
+ metadata: {
419
+ ...scorerResult.metadata,
420
+ extractedFields
421
+ }
849
422
  },
850
- scoreIdGenerator
423
+ params.idGenerators?.score ?? idGeneratorUUIDv7
851
424
  );
852
425
  return { response, score };
853
426
  }
854
427
  }
855
428
  return { response };
856
429
  }
857
-
858
- // src/benchmarks/fnol/scorer.ts
859
- function isMissing(value) {
860
- return value === void 0 || value === null || value === "";
861
- }
862
- function normalizeString(value) {
863
- return value.trim();
430
+ function formatMCQ(testCase) {
431
+ return `Question: ${testCase.question}
432
+ Options:
433
+ ${Object.entries(
434
+ testCase.options ?? {}
435
+ ).map(([key, value]) => `${key}: ${value}`).join("\n")}`;
864
436
  }
865
- function valuesEqual(expected, actual) {
866
- if (typeof expected === "string" && typeof actual === "string") {
867
- return normalizeString(expected) === normalizeString(actual);
437
+ function templateMessages(messages, templateVariables) {
438
+ for (let i = 0; i < messages.length; i++) {
439
+ const template = Handlebars.compile(messages[i].content);
440
+ messages[i].content = template(templateVariables);
868
441
  }
869
- return stableStringify(expected) === stableStringify(actual);
870
442
  }
871
- var FNOLFieldsScorer = class extends AbstractScorer {
872
- kind = "fnol.fields";
873
- async score(params) {
874
- const extracted = params.extracted ?? {};
875
- const requiredKeys = Object.entries(params.fieldsToCollect).filter(([, field]) => field.required !== false).map(([key]) => key);
876
- const presentKeys = [];
877
- const missingKeys = [];
878
- const mismatchedKeys = [];
879
- for (const key of requiredKeys) {
880
- const value = extracted[key];
881
- if (isMissing(value)) {
882
- missingKeys.push(key);
883
- continue;
884
- }
885
- presentKeys.push(key);
886
- const expected = params.fieldsToCollect[key]?.expected;
887
- if (expected !== void 0 && !valuesEqual(expected, value)) {
888
- mismatchedKeys.push(key);
889
- }
890
- }
891
- const requiredCount = requiredKeys.length;
892
- const correctCount = requiredCount - missingKeys.length - mismatchedKeys.length;
893
- const score = requiredCount === 0 ? 1 : correctCount / requiredCount;
894
- return {
895
- value: Math.max(0, Math.min(1, score)),
896
- explanation: missingKeys.length === 0 && mismatchedKeys.length === 0 ? "All required fields collected" : "Missing or mismatched fields",
897
- requiredKeys,
898
- presentKeys,
899
- missingKeys,
900
- mismatchedKeys,
901
- metadata: {
902
- requiredCount,
903
- presentCount: presentKeys.length,
904
- missingCount: missingKeys.length,
905
- mismatchedCount: mismatchedKeys.length
906
- }
907
- };
443
+
444
+ // src/benchmarks/peerbench/storages/json.ts
445
+ import z5 from "zod";
446
+ var PeerbenchJSONStorage = class extends JSONFileStorage {
447
+ constructor(config) {
448
+ super({
449
+ path: config.path,
450
+ chunkSize: config.chunkSize,
451
+ schema: z5.union([
452
+ MCQTestCaseSchemaV1,
453
+ MCQResponseSchemaV1,
454
+ MCQScoreSchemaV1,
455
+ QATestCaseSchemaV1,
456
+ QAResponseSchemaV1,
457
+ QAScoreSchemaV1,
458
+ MultiTurnTestCaseSchemaV1,
459
+ MultiTurnResponseSchemaV1,
460
+ MultiTurnScoreSchemaV1
461
+ ])
462
+ });
908
463
  }
909
464
  };
910
465
  export {
911
- fnol_exports as fnol,
912
- mmlu_pro_exports as mmluPro,
913
466
  peerbench_exports as peerbench
914
467
  };
915
468
  //# sourceMappingURL=index.js.map