peerbench 0.0.2-alpha.0 → 0.0.2-alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +123 -99
- package/dist/aggregators/index.d.ts +67 -0
- package/dist/aggregators/index.js +46 -0
- package/dist/aggregators/index.js.map +1 -0
- package/dist/benchmarks/index.d.ts +614 -1271
- package/dist/benchmarks/index.js +346 -803
- package/dist/benchmarks/index.js.map +1 -1
- package/dist/{chunk-DUBKY73H.js → chunk-4UBK6452.js} +13 -13
- package/dist/chunk-4UBK6452.js.map +1 -0
- package/dist/chunk-ERALDEZY.js +112 -0
- package/dist/chunk-ERALDEZY.js.map +1 -0
- package/dist/{chunk-ZJWSK4VO.js → chunk-HMQYGCKI.js} +1 -1
- package/dist/chunk-HMQYGCKI.js.map +1 -0
- package/dist/chunk-NUEOE3K5.js +8 -0
- package/dist/chunk-NUEOE3K5.js.map +1 -0
- package/dist/chunk-OQE6TQXZ.js +42 -0
- package/dist/chunk-OQE6TQXZ.js.map +1 -0
- package/dist/chunk-Q6GSOHOP.js +44 -0
- package/dist/chunk-Q6GSOHOP.js.map +1 -0
- package/dist/chunk-QY5MPNNB.js +28 -0
- package/dist/chunk-QY5MPNNB.js.map +1 -0
- package/dist/chunk-R76XA2K6.js +229 -0
- package/dist/chunk-R76XA2K6.js.map +1 -0
- package/dist/chunk-TRNCF2BG.js +35 -0
- package/dist/chunk-TRNCF2BG.js.map +1 -0
- package/dist/chunk-UHHHSYVE.js +11 -0
- package/dist/chunk-UHHHSYVE.js.map +1 -0
- package/dist/{chunk-232PY7K3.js → chunk-YY33MNMV.js} +29 -14
- package/dist/chunk-YY33MNMV.js.map +1 -0
- package/dist/chunk-ZEWI24CV.js +365 -0
- package/dist/chunk-ZEWI24CV.js.map +1 -0
- package/dist/index-BAioQhp2.d.ts +27 -0
- package/dist/index.d.ts +51 -26
- package/dist/index.js +28 -25
- package/dist/index.js.map +1 -1
- package/dist/json-file-ZwzLUbje.d.ts +73 -0
- package/dist/llm-judge-QThCZ9TQ.d.ts +67 -0
- package/dist/providers/index.d.ts +16 -19
- package/dist/providers/index.js +8 -253
- package/dist/providers/index.js.map +1 -1
- package/dist/schemas/extensions/index.d.ts +16 -2
- package/dist/schemas/extensions/index.js +9 -3
- package/dist/schemas/extensions/index.js.map +1 -1
- package/dist/schemas/index.d.ts +108 -141
- package/dist/schemas/index.js +7 -10
- package/dist/schemas/llm/index.d.ts +100 -82
- package/dist/schemas/llm/index.js +7 -29
- package/dist/schemas/llm/index.js.map +1 -1
- package/dist/scorers/index.d.ts +3 -2
- package/dist/scorers/index.js +8 -486
- package/dist/scorers/index.js.map +1 -1
- package/dist/storages/index.d.ts +69 -0
- package/dist/storages/index.js +98 -0
- package/dist/storages/index.js.map +1 -0
- package/package.json +12 -6
- package/dist/catalogs/index.d.ts +0 -75
- package/dist/catalogs/index.js +0 -88
- package/dist/catalogs/index.js.map +0 -1
- package/dist/chunk-22HU24QF.js +0 -8
- package/dist/chunk-22HU24QF.js.map +0 -1
- package/dist/chunk-232PY7K3.js.map +0 -1
- package/dist/chunk-7TREBPSJ.js +0 -26
- package/dist/chunk-7TREBPSJ.js.map +0 -1
- package/dist/chunk-DUBKY73H.js.map +0 -1
- package/dist/chunk-GVF4YZF3.js +0 -15
- package/dist/chunk-GVF4YZF3.js.map +0 -1
- package/dist/chunk-HJH3SW3L.js +0 -103
- package/dist/chunk-HJH3SW3L.js.map +0 -1
- package/dist/chunk-IUN2IUCS.js +0 -58
- package/dist/chunk-IUN2IUCS.js.map +0 -1
- package/dist/chunk-VBOM2YEG.js +0 -47
- package/dist/chunk-VBOM2YEG.js.map +0 -1
- package/dist/chunk-ZJWSK4VO.js.map +0 -1
- package/dist/data-BmN5WjZ4.d.ts +0 -57
- package/dist/generic-array-DLHWSvf1.d.ts +0 -22
- package/dist/index-WiPjF2AL.d.ts +0 -15
- package/dist/llm-judge-DIG1f1Az.d.ts +0 -67
- package/dist/simple-system-prompt-CzPYuvo0.d.ts +0 -49
- package/dist/system-prompt--0FdPWqK.d.ts +0 -58
- package/dist/utilities-BrRH32rD.d.ts +0 -30
package/dist/benchmarks/index.js
CHANGED
|
@@ -1,36 +1,42 @@
|
|
|
1
1
|
import {
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
stableStringify
|
|
5
|
-
} from "../chunk-HJH3SW3L.js";
|
|
2
|
+
SimpleSystemPromptSchemaV1
|
|
3
|
+
} from "../chunk-Q6GSOHOP.js";
|
|
6
4
|
import {
|
|
7
|
-
|
|
8
|
-
} from "../chunk-
|
|
5
|
+
defineRunner
|
|
6
|
+
} from "../chunk-QY5MPNNB.js";
|
|
9
7
|
import {
|
|
10
|
-
|
|
11
|
-
idGeneratorUUIDv7,
|
|
12
|
-
parseResponseAsJSON
|
|
13
|
-
} from "../chunk-DUBKY73H.js";
|
|
14
|
-
import {
|
|
15
|
-
BaseBenchmarkSpecSchemaV1,
|
|
8
|
+
BaseResponseSchemaV1,
|
|
16
9
|
BaseScoreSchemaV1,
|
|
17
10
|
BaseTestCaseSchemaV1,
|
|
18
|
-
|
|
11
|
+
defineResponseSchema,
|
|
19
12
|
defineScoreSchema,
|
|
20
13
|
defineTestCaseSchema
|
|
21
|
-
} from "../chunk-
|
|
14
|
+
} from "../chunk-YY33MNMV.js";
|
|
15
|
+
import "../chunk-OQE6TQXZ.js";
|
|
22
16
|
import {
|
|
23
17
|
ScoringMethod
|
|
24
|
-
} from "../chunk-
|
|
18
|
+
} from "../chunk-HMQYGCKI.js";
|
|
19
|
+
import {
|
|
20
|
+
JSONFileStorage
|
|
21
|
+
} from "../chunk-ERALDEZY.js";
|
|
22
|
+
import {
|
|
23
|
+
LLMAsAJudgeScorer,
|
|
24
|
+
MCQScorer
|
|
25
|
+
} from "../chunk-ZEWI24CV.js";
|
|
26
|
+
import {
|
|
27
|
+
AbstractLLMProvider
|
|
28
|
+
} from "../chunk-R76XA2K6.js";
|
|
25
29
|
import {
|
|
26
|
-
|
|
27
|
-
} from "../chunk-
|
|
30
|
+
PEERBENCH_NAMESPACE
|
|
31
|
+
} from "../chunk-UHHHSYVE.js";
|
|
28
32
|
import {
|
|
29
|
-
|
|
30
|
-
} from "../chunk-
|
|
33
|
+
idGeneratorUUIDv7
|
|
34
|
+
} from "../chunk-4UBK6452.js";
|
|
31
35
|
import {
|
|
32
|
-
|
|
33
|
-
|
|
36
|
+
ExtensionLLMAsAJudgeScoreFieldsV1,
|
|
37
|
+
ExtensionLLMResponseFieldsV1
|
|
38
|
+
} from "../chunk-TRNCF2BG.js";
|
|
39
|
+
import "../chunk-NUEOE3K5.js";
|
|
34
40
|
import {
|
|
35
41
|
__export
|
|
36
42
|
} from "../chunk-PZ5AY32C.js";
|
|
@@ -38,878 +44,415 @@ import {
|
|
|
38
44
|
// src/benchmarks/peerbench/index.ts
|
|
39
45
|
var peerbench_exports = {};
|
|
40
46
|
__export(peerbench_exports, {
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
47
|
+
MCQKind: () => MCQKind,
|
|
48
|
+
MCQResponseSchemaV1: () => MCQResponseSchemaV1,
|
|
49
|
+
MCQScoreSchemaV1: () => MCQScoreSchemaV1,
|
|
50
|
+
MCQTestCaseSchemaV1: () => MCQTestCaseSchemaV1,
|
|
51
|
+
MultiTurnKind: () => MultiTurnKind,
|
|
52
|
+
MultiTurnResponseSchemaV1: () => MultiTurnResponseSchemaV1,
|
|
53
|
+
MultiTurnScoreSchemaV1: () => MultiTurnScoreSchemaV1,
|
|
54
|
+
MultiTurnTestCaseSchemaV1: () => MultiTurnTestCaseSchemaV1,
|
|
55
|
+
PeerbenchJSONStorage: () => PeerbenchJSONStorage,
|
|
56
|
+
QAKind: () => QAKind,
|
|
57
|
+
QAResponseSchemaV1: () => QAResponseSchemaV1,
|
|
58
|
+
QAScoreSchemaV1: () => QAScoreSchemaV1,
|
|
59
|
+
QATestCaseSchemaV1: () => QATestCaseSchemaV1,
|
|
60
|
+
peerbenchRunner: () => peerbenchRunner
|
|
49
61
|
});
|
|
50
62
|
|
|
51
|
-
// src/benchmarks/peerbench/
|
|
63
|
+
// src/benchmarks/peerbench/schema-sets/mcq.v1.ts
|
|
52
64
|
import { z } from "zod";
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
var PeerbenchBaseScoreSchemaV1 = defineScoreSchema({
|
|
56
|
-
baseSchema: BaseScoreSchemaV1,
|
|
57
|
-
fields: {
|
|
58
|
-
...ExtensionLLMAsAJudgeScorerFieldsV1
|
|
59
|
-
}
|
|
60
|
-
});
|
|
61
|
-
|
|
62
|
-
// src/benchmarks/peerbench/test-cases/mcq.v1.ts
|
|
63
|
-
var PeerbenchMultipleChoiceTestCaseSchemaV1 = defineTestCaseSchema({
|
|
65
|
+
var MCQKind = `llm/mcq`;
|
|
66
|
+
var MCQTestCaseSchemaV1 = defineTestCaseSchema({
|
|
64
67
|
baseSchema: BaseTestCaseSchemaV1,
|
|
65
|
-
|
|
68
|
+
namespace: PEERBENCH_NAMESPACE,
|
|
69
|
+
kind: MCQKind,
|
|
66
70
|
schemaVersion: 1,
|
|
67
71
|
fields: {
|
|
68
72
|
question: z.string(),
|
|
69
73
|
options: z.record(z.string(), z.string()),
|
|
70
|
-
|
|
71
|
-
answerKey: z.string()
|
|
74
|
+
correctAnswerKeys: z.string().array()
|
|
72
75
|
}
|
|
73
76
|
});
|
|
74
|
-
var
|
|
75
|
-
baseSchema:
|
|
76
|
-
|
|
77
|
+
var MCQResponseSchemaV1 = defineResponseSchema({
|
|
78
|
+
baseSchema: BaseResponseSchemaV1,
|
|
79
|
+
namespace: PEERBENCH_NAMESPACE,
|
|
80
|
+
kind: MCQKind,
|
|
77
81
|
schemaVersion: 1,
|
|
78
|
-
fields: {
|
|
82
|
+
fields: {
|
|
83
|
+
...ExtensionLLMResponseFieldsV1
|
|
84
|
+
}
|
|
79
85
|
});
|
|
80
|
-
var
|
|
81
|
-
baseSchema:
|
|
82
|
-
|
|
86
|
+
var MCQScoreSchemaV1 = defineScoreSchema({
|
|
87
|
+
baseSchema: BaseScoreSchemaV1,
|
|
88
|
+
namespace: PEERBENCH_NAMESPACE,
|
|
89
|
+
kind: MCQKind,
|
|
83
90
|
schemaVersion: 1,
|
|
84
91
|
fields: {
|
|
92
|
+
...ExtensionLLMAsAJudgeScoreFieldsV1,
|
|
85
93
|
extractedAnswers: z.array(z.string())
|
|
86
94
|
}
|
|
87
95
|
});
|
|
88
96
|
|
|
89
|
-
// src/benchmarks/peerbench/
|
|
97
|
+
// src/benchmarks/peerbench/schema-sets/multi-turn.v1.ts
|
|
90
98
|
import { z as z2 } from "zod";
|
|
91
|
-
var
|
|
92
|
-
|
|
93
|
-
schemaVersion: 1,
|
|
99
|
+
var MultiTurnKind = `llm/multi-turn`;
|
|
100
|
+
var MultiTurnTestCaseSchemaV1 = defineTestCaseSchema({
|
|
94
101
|
baseSchema: BaseTestCaseSchemaV1,
|
|
102
|
+
namespace: PEERBENCH_NAMESPACE,
|
|
103
|
+
kind: MultiTurnKind,
|
|
104
|
+
schemaVersion: 1,
|
|
95
105
|
fields: {
|
|
96
|
-
|
|
97
|
-
|
|
106
|
+
messages: z2.object({
|
|
107
|
+
role: z2.string(),
|
|
108
|
+
content: z2.string(),
|
|
109
|
+
goodAnswers: z2.string().array().optional(),
|
|
110
|
+
badAnswers: z2.string().array().optional()
|
|
111
|
+
}).array(),
|
|
112
|
+
maxTurns: z2.number().optional(),
|
|
113
|
+
expectedOutcome: z2.string().optional()
|
|
98
114
|
}
|
|
99
115
|
});
|
|
100
|
-
var
|
|
101
|
-
baseSchema:
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
});
|
|
105
|
-
var PeerbenchOpenEndedScoreSchemaV1 = defineScoreSchema({
|
|
106
|
-
baseSchema: PeerbenchBaseScoreSchemaV1,
|
|
107
|
-
kind: "pb.sc.open-ended",
|
|
108
|
-
schemaVersion: 1,
|
|
109
|
-
fields: {}
|
|
110
|
-
});
|
|
111
|
-
|
|
112
|
-
// src/benchmarks/peerbench/spec.ts
|
|
113
|
-
import z3 from "zod";
|
|
114
|
-
var PeerbenchBenchmarkSpecSchemaV1 = defineBenchmarkSpecSchema({
|
|
115
|
-
baseSchema: BaseBenchmarkSpecSchemaV1,
|
|
116
|
-
kind: "pb.benchmark.spec",
|
|
116
|
+
var MultiTurnResponseSchemaV1 = defineResponseSchema({
|
|
117
|
+
baseSchema: BaseResponseSchemaV1,
|
|
118
|
+
namespace: PEERBENCH_NAMESPACE,
|
|
119
|
+
kind: MultiTurnKind,
|
|
117
120
|
schemaVersion: 1,
|
|
118
121
|
fields: {
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
122
|
+
...ExtensionLLMResponseFieldsV1,
|
|
123
|
+
replies: z2.object({
|
|
124
|
+
messageIndex: z2.number(),
|
|
125
|
+
startedAt: z2.number(),
|
|
126
|
+
completedAt: z2.number(),
|
|
127
|
+
data: z2.string(),
|
|
128
|
+
inputTokensUsed: z2.number().optional(),
|
|
129
|
+
outputTokensUsed: z2.number().optional(),
|
|
130
|
+
inputCost: z2.string().optional(),
|
|
131
|
+
outputCost: z2.string().optional()
|
|
132
|
+
}).array()
|
|
123
133
|
}
|
|
124
134
|
});
|
|
125
|
-
|
|
126
|
-
// src/benchmarks/peerbench/loader.ts
|
|
127
|
-
import z4 from "zod";
|
|
128
|
-
var PeerbenchJSONDataLoader = class extends GenericJSONArrayDataLoader {
|
|
129
|
-
kind = "pb.load.json.data";
|
|
130
|
-
async loadBenchmarkSpec(params) {
|
|
131
|
-
const content = bufferToString(params.content);
|
|
132
|
-
const parsed = PeerbenchBenchmarkSpecSchemaV1.parse(content);
|
|
133
|
-
return parsed;
|
|
134
|
-
}
|
|
135
|
-
testCaseBuilder(data) {
|
|
136
|
-
const testCaseValidation = z4.union([
|
|
137
|
-
PeerbenchMultipleChoiceTestCaseSchemaV1,
|
|
138
|
-
PeerbenchOpenEndedTestCaseSchemaV1
|
|
139
|
-
]).safeParse(data);
|
|
140
|
-
return testCaseValidation.success ? testCaseValidation.data : void 0;
|
|
141
|
-
}
|
|
142
|
-
async responseBuilder(data) {
|
|
143
|
-
const responseValidation = z4.union([
|
|
144
|
-
PeerbenchMultipleChoiceResponseSchemaV1,
|
|
145
|
-
PeerbenchOpenEndedResponseSchemaV1
|
|
146
|
-
]).safeParse(data);
|
|
147
|
-
return responseValidation.success ? responseValidation.data : void 0;
|
|
148
|
-
}
|
|
149
|
-
async scoreBuilder(data) {
|
|
150
|
-
const scoreValidation = z4.union([
|
|
151
|
-
PeerbenchMultipleChoiceScoreSchemaV1,
|
|
152
|
-
PeerbenchOpenEndedScoreSchemaV1
|
|
153
|
-
]).safeParse(data);
|
|
154
|
-
return scoreValidation.success ? scoreValidation.data : void 0;
|
|
155
|
-
}
|
|
156
|
-
};
|
|
157
|
-
|
|
158
|
-
// src/benchmarks/peerbench/runner.ts
|
|
159
|
-
async function runTestCase(params) {
|
|
160
|
-
const { testCase } = params;
|
|
161
|
-
const responseIdGenerator = params.idGenerators?.response ?? idGeneratorUUIDv7;
|
|
162
|
-
const scoreIdGenerator = params.idGenerators?.score ?? idGeneratorUUIDv7;
|
|
163
|
-
const messages = [];
|
|
164
|
-
if (params.systemPrompt) {
|
|
165
|
-
messages.push({
|
|
166
|
-
role: "system",
|
|
167
|
-
content: params.systemPrompt.content
|
|
168
|
-
});
|
|
169
|
-
}
|
|
170
|
-
if (testCase.kind === "pb.ts.mcq") {
|
|
171
|
-
const formattedPrompt = formatMCQPrompt(testCase);
|
|
172
|
-
messages.push({
|
|
173
|
-
role: "user",
|
|
174
|
-
content: formattedPrompt
|
|
175
|
-
});
|
|
176
|
-
const providerResponse = await params.provider.forward({
|
|
177
|
-
model: params.runConfig.model,
|
|
178
|
-
messages
|
|
179
|
-
});
|
|
180
|
-
const response = await PeerbenchMultipleChoiceResponseSchemaV1.newWithId(
|
|
181
|
-
{
|
|
182
|
-
data: providerResponse.data,
|
|
183
|
-
startedAt: providerResponse.startedAt,
|
|
184
|
-
completedAt: providerResponse.completedAt,
|
|
185
|
-
testCaseId: testCase.id,
|
|
186
|
-
modelSlug: params.runConfig.model,
|
|
187
|
-
provider: params.provider.kind,
|
|
188
|
-
inputTokensUsed: providerResponse.inputTokensUsed,
|
|
189
|
-
outputTokensUsed: providerResponse.outputTokensUsed,
|
|
190
|
-
inputCost: providerResponse.inputCost,
|
|
191
|
-
outputCost: providerResponse.outputCost
|
|
192
|
-
},
|
|
193
|
-
responseIdGenerator
|
|
194
|
-
);
|
|
195
|
-
if (params.scorer?.kind === "mcq") {
|
|
196
|
-
const scorerResult = await params.scorer.score({
|
|
197
|
-
response: response.data,
|
|
198
|
-
choices: testCase.options ?? {},
|
|
199
|
-
correctAnswers: [testCase.answerKey]
|
|
200
|
-
});
|
|
201
|
-
if (scorerResult !== null) {
|
|
202
|
-
const score = await PeerbenchMultipleChoiceScoreSchemaV1.newWithId(
|
|
203
|
-
{
|
|
204
|
-
scoringMethod: ScoringMethod.algo,
|
|
205
|
-
value: scorerResult.value,
|
|
206
|
-
responseId: response.id,
|
|
207
|
-
extractedAnswers: scorerResult.extractedAnswers,
|
|
208
|
-
metadata: response.metadata
|
|
209
|
-
},
|
|
210
|
-
scoreIdGenerator
|
|
211
|
-
);
|
|
212
|
-
return { response, score };
|
|
213
|
-
}
|
|
214
|
-
}
|
|
215
|
-
return { response };
|
|
216
|
-
} else if (testCase.kind === "pb.ts.open-ended") {
|
|
217
|
-
const messages2 = [];
|
|
218
|
-
if (params.systemPrompt) {
|
|
219
|
-
messages2.push({
|
|
220
|
-
role: "system",
|
|
221
|
-
content: params.systemPrompt.content
|
|
222
|
-
});
|
|
223
|
-
}
|
|
224
|
-
messages2.push({
|
|
225
|
-
role: "user",
|
|
226
|
-
content: testCase.question
|
|
227
|
-
});
|
|
228
|
-
const providerResponse = await params.provider.forward({
|
|
229
|
-
model: params.runConfig.model,
|
|
230
|
-
messages: messages2
|
|
231
|
-
});
|
|
232
|
-
const response = await PeerbenchOpenEndedResponseSchemaV1.newWithId(
|
|
233
|
-
{
|
|
234
|
-
data: providerResponse.data,
|
|
235
|
-
startedAt: providerResponse.startedAt,
|
|
236
|
-
completedAt: providerResponse.completedAt,
|
|
237
|
-
testCaseId: testCase.id,
|
|
238
|
-
modelSlug: params.runConfig.model,
|
|
239
|
-
provider: params.provider.kind,
|
|
240
|
-
inputTokensUsed: providerResponse.inputTokensUsed,
|
|
241
|
-
outputTokensUsed: providerResponse.outputTokensUsed,
|
|
242
|
-
inputCost: providerResponse.inputCost,
|
|
243
|
-
outputCost: providerResponse.outputCost
|
|
244
|
-
},
|
|
245
|
-
responseIdGenerator
|
|
246
|
-
);
|
|
247
|
-
if (params.scorer?.kind === "llmJudge" && params.runConfig.llmJudgeModel) {
|
|
248
|
-
const scorerResult = await params.scorer.score({
|
|
249
|
-
task: testCase.question,
|
|
250
|
-
candidateAnswer: response.data,
|
|
251
|
-
referenceAnswer: testCase.answer,
|
|
252
|
-
model: params.runConfig.llmJudgeModel
|
|
253
|
-
});
|
|
254
|
-
if (scorerResult !== null) {
|
|
255
|
-
const score = await PeerbenchOpenEndedScoreSchemaV1.newWithId(
|
|
256
|
-
{
|
|
257
|
-
scoringMethod: ScoringMethod.ai,
|
|
258
|
-
value: scorerResult.value,
|
|
259
|
-
responseId: response.id,
|
|
260
|
-
explanation: scorerResult.explanation,
|
|
261
|
-
metadata: scorerResult.metadata,
|
|
262
|
-
scorerAIProvider: scorerResult.provider,
|
|
263
|
-
scorerAIModelSlug: params.runConfig.llmJudgeModel,
|
|
264
|
-
scorerAIInputTokensUsed: scorerResult.inputTokensUsed,
|
|
265
|
-
scorerAIOutputTokensUsed: scorerResult.outputTokensUsed,
|
|
266
|
-
scorerAIInputCost: scorerResult.inputCost,
|
|
267
|
-
scorerAIOutputCost: scorerResult.outputCost
|
|
268
|
-
},
|
|
269
|
-
scoreIdGenerator
|
|
270
|
-
);
|
|
271
|
-
return { response, score };
|
|
272
|
-
}
|
|
273
|
-
}
|
|
274
|
-
return { response };
|
|
275
|
-
}
|
|
276
|
-
throw new Error("Unsupported test case kind");
|
|
277
|
-
}
|
|
278
|
-
function formatMCQPrompt(testCase) {
|
|
279
|
-
return `Question: ${testCase.question}
|
|
280
|
-
Options:
|
|
281
|
-
${Object.entries(
|
|
282
|
-
testCase.options ?? {}
|
|
283
|
-
).map(([key, value]) => `${key}: ${value}`).join("\n")}`;
|
|
284
|
-
}
|
|
285
|
-
|
|
286
|
-
// src/benchmarks/mmlu-pro/index.ts
|
|
287
|
-
var mmlu_pro_exports = {};
|
|
288
|
-
__export(mmlu_pro_exports, {
|
|
289
|
-
BaseMMLUProScoreSchemaV1: () => BaseMMLUProScoreSchemaV1,
|
|
290
|
-
MMLUProBenchmarkSpecSchemaV1: () => MMLUProBenchmarkSpecSchemaV1,
|
|
291
|
-
MMLUProJSONDataLoader: () => MMLUProJSONDataLoader,
|
|
292
|
-
MMLUProMainResponseSchemaV1: () => MMLUProMainResponseSchemaV1,
|
|
293
|
-
MMLUProMainScoreSchemaV1: () => MMLUProMainScoreSchemaV1,
|
|
294
|
-
MMLUProMainTestCaseSchemaV1: () => MMLUProMainTestCaseSchemaV1,
|
|
295
|
-
MMLUProParquetDataLoader: () => MMLUProParquetDataLoader,
|
|
296
|
-
runTestCase: () => runTestCase2
|
|
297
|
-
});
|
|
298
|
-
|
|
299
|
-
// src/benchmarks/mmlu-pro/score.ts
|
|
300
|
-
var BaseMMLUProScoreSchemaV1 = defineScoreSchema({
|
|
135
|
+
var MultiTurnScoreSchemaV1 = defineScoreSchema({
|
|
301
136
|
baseSchema: BaseScoreSchemaV1,
|
|
137
|
+
namespace: PEERBENCH_NAMESPACE,
|
|
138
|
+
kind: MultiTurnKind,
|
|
139
|
+
schemaVersion: 1,
|
|
302
140
|
fields: {
|
|
303
|
-
...
|
|
141
|
+
...ExtensionLLMAsAJudgeScoreFieldsV1,
|
|
142
|
+
individualScores: z2.object({
|
|
143
|
+
replyIndex: z2.number(),
|
|
144
|
+
value: z2.number()
|
|
145
|
+
}).array()
|
|
304
146
|
}
|
|
305
147
|
});
|
|
306
148
|
|
|
307
|
-
// src/benchmarks/
|
|
308
|
-
import { z as
|
|
309
|
-
var
|
|
149
|
+
// src/benchmarks/peerbench/schema-sets/qa.v1.ts
|
|
150
|
+
import { z as z3 } from "zod";
|
|
151
|
+
var QAKind = `llm/qa`;
|
|
152
|
+
var QATestCaseSchemaV1 = defineTestCaseSchema({
|
|
310
153
|
baseSchema: BaseTestCaseSchemaV1,
|
|
311
|
-
|
|
154
|
+
namespace: PEERBENCH_NAMESPACE,
|
|
155
|
+
kind: QAKind,
|
|
312
156
|
schemaVersion: 1,
|
|
313
157
|
fields: {
|
|
314
|
-
question:
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
answerKey: z5.string()
|
|
158
|
+
question: z3.string(),
|
|
159
|
+
goodAnswers: z3.string().array(),
|
|
160
|
+
badAnswers: z3.string().array()
|
|
318
161
|
}
|
|
319
162
|
});
|
|
320
|
-
var
|
|
321
|
-
baseSchema:
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
});
|
|
325
|
-
var MMLUProMainScoreSchemaV1 = defineScoreSchema({
|
|
326
|
-
baseSchema: BaseMMLUProScoreSchemaV1,
|
|
327
|
-
kind: "mmlu-pro.sc.main",
|
|
163
|
+
var QAResponseSchemaV1 = defineResponseSchema({
|
|
164
|
+
baseSchema: BaseResponseSchemaV1,
|
|
165
|
+
namespace: PEERBENCH_NAMESPACE,
|
|
166
|
+
kind: QAKind,
|
|
328
167
|
schemaVersion: 1,
|
|
329
168
|
fields: {
|
|
330
|
-
|
|
169
|
+
...ExtensionLLMResponseFieldsV1
|
|
331
170
|
}
|
|
332
171
|
});
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
options: z6.array(z6.string()),
|
|
341
|
-
answer: z6.string(),
|
|
342
|
-
answer_index: z6.coerce.number(),
|
|
343
|
-
cot_content: z6.string(),
|
|
344
|
-
category: z6.string(),
|
|
345
|
-
src: z6.string()
|
|
346
|
-
}).array();
|
|
347
|
-
function mapData(data) {
|
|
348
|
-
return {
|
|
349
|
-
responses: [],
|
|
350
|
-
scores: [],
|
|
351
|
-
testCases: data.map(
|
|
352
|
-
(item) => MMLUProMainTestCaseSchemaV1.new({
|
|
353
|
-
id: `${item.src}-${item.category}-${item.question_id}`,
|
|
354
|
-
question: item.question,
|
|
355
|
-
answerKey: item.answer,
|
|
356
|
-
options: item.options.reduce(
|
|
357
|
-
(acc, option, index) => {
|
|
358
|
-
acc[String.fromCharCode(65 + index)] = option;
|
|
359
|
-
return acc;
|
|
360
|
-
},
|
|
361
|
-
{}
|
|
362
|
-
),
|
|
363
|
-
answer: item.options[item.answer_index],
|
|
364
|
-
metadata: {
|
|
365
|
-
category: item.category,
|
|
366
|
-
src: item.src,
|
|
367
|
-
answer_index: item.answer_index
|
|
368
|
-
}
|
|
369
|
-
})
|
|
370
|
-
)
|
|
371
|
-
};
|
|
372
|
-
}
|
|
373
|
-
var MMLUProJSONDataLoader = class extends AbstractDataLoader {
|
|
374
|
-
kind = "mmlu-pro.load.json.data";
|
|
375
|
-
loadData(params) {
|
|
376
|
-
const content = typeof params.content === "string" ? params.content : bufferToString(params.content);
|
|
377
|
-
const parsed = jsonSchema.parse(JSON.parse(content));
|
|
378
|
-
return mapData(parsed);
|
|
379
|
-
}
|
|
380
|
-
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
381
|
-
loadBenchmarkSpec(params) {
|
|
382
|
-
throw new Error("Not implemented");
|
|
383
|
-
}
|
|
384
|
-
};
|
|
385
|
-
var MMLUProParquetDataLoader = class extends AbstractDataLoader {
|
|
386
|
-
kind = "mmlu-pro.load.parquet.data";
|
|
387
|
-
async loadData(params) {
|
|
388
|
-
const data = await parquetReadObjects({
|
|
389
|
-
file: params.content.buffer
|
|
390
|
-
});
|
|
391
|
-
if (!data) {
|
|
392
|
-
throw new Error("Invalid Parquet file");
|
|
393
|
-
}
|
|
394
|
-
return mapData(jsonSchema.parse(data));
|
|
395
|
-
}
|
|
396
|
-
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
397
|
-
loadBenchmarkSpec(params) {
|
|
398
|
-
throw new Error("Not implemented");
|
|
172
|
+
var QAScoreSchemaV1 = defineScoreSchema({
|
|
173
|
+
baseSchema: BaseScoreSchemaV1,
|
|
174
|
+
namespace: PEERBENCH_NAMESPACE,
|
|
175
|
+
kind: QAKind,
|
|
176
|
+
schemaVersion: 1,
|
|
177
|
+
fields: {
|
|
178
|
+
...ExtensionLLMAsAJudgeScoreFieldsV1
|
|
399
179
|
}
|
|
400
|
-
};
|
|
180
|
+
});
|
|
401
181
|
|
|
402
|
-
// src/benchmarks/
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
if (params.systemPrompt) {
|
|
409
|
-
messages.push({
|
|
410
|
-
role: "system",
|
|
411
|
-
content: params.systemPrompt.content
|
|
412
|
-
});
|
|
413
|
-
}
|
|
414
|
-
if (testCase.kind === "mmlu-pro.ts.main") {
|
|
415
|
-
const formattedPrompt = formatMCQPrompt2(testCase);
|
|
416
|
-
messages.push({
|
|
417
|
-
role: "user",
|
|
418
|
-
content: formattedPrompt
|
|
419
|
-
});
|
|
420
|
-
const providerResponse = await params.provider.forward({
|
|
421
|
-
model: params.runConfig.model,
|
|
422
|
-
messages
|
|
423
|
-
});
|
|
424
|
-
const response = await MMLUProMainResponseSchemaV1.newWithId(
|
|
182
|
+
// src/benchmarks/peerbench/runner.ts
|
|
183
|
+
import Handlebars from "handlebars";
|
|
184
|
+
import z4 from "zod";
|
|
185
|
+
var peerbenchRunner = defineRunner(
|
|
186
|
+
{
|
|
187
|
+
schemaSets: [
|
|
425
188
|
{
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
testCaseId: testCase.id,
|
|
430
|
-
modelSlug: params.runConfig.model,
|
|
431
|
-
provider: params.provider.kind,
|
|
432
|
-
inputTokensUsed: providerResponse.inputTokensUsed,
|
|
433
|
-
outputTokensUsed: providerResponse.outputTokensUsed,
|
|
434
|
-
inputCost: providerResponse.inputCost,
|
|
435
|
-
outputCost: providerResponse.outputCost
|
|
189
|
+
testCase: MCQTestCaseSchemaV1,
|
|
190
|
+
response: MCQResponseSchemaV1,
|
|
191
|
+
score: MCQScoreSchemaV1
|
|
436
192
|
},
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
response: response.data,
|
|
442
|
-
choices: testCase.options ?? {},
|
|
443
|
-
correctAnswers: [testCase.answerKey]
|
|
444
|
-
});
|
|
445
|
-
if (scorerResult !== null) {
|
|
446
|
-
const score = await MMLUProMainScoreSchemaV1.newWithId(
|
|
447
|
-
{
|
|
448
|
-
scoringMethod: ScoringMethod.algo,
|
|
449
|
-
value: scorerResult.value,
|
|
450
|
-
responseId: response.id,
|
|
451
|
-
extractedAnswers: scorerResult.extractedAnswers,
|
|
452
|
-
metadata: response.metadata
|
|
453
|
-
},
|
|
454
|
-
scoreIdGenerator
|
|
455
|
-
);
|
|
456
|
-
return { response, score };
|
|
193
|
+
{
|
|
194
|
+
testCase: QATestCaseSchemaV1,
|
|
195
|
+
response: QAResponseSchemaV1,
|
|
196
|
+
score: QAScoreSchemaV1
|
|
457
197
|
}
|
|
198
|
+
],
|
|
199
|
+
providers: [AbstractLLMProvider],
|
|
200
|
+
scorers: [LLMAsAJudgeScorer, MCQScorer],
|
|
201
|
+
runConfigSchema: {
|
|
202
|
+
model: z4.string(),
|
|
203
|
+
llmJudgeModel: z4.string().optional(),
|
|
204
|
+
llmJudgeSystemPrompt: SimpleSystemPromptSchemaV1.optional(),
|
|
205
|
+
systemPrompt: SimpleSystemPromptSchemaV1.optional(),
|
|
206
|
+
templateVariables: z4.record(z4.string(), z4.string()).optional()
|
|
458
207
|
}
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
const
|
|
462
|
-
|
|
463
|
-
|
|
208
|
+
},
|
|
209
|
+
async (params) => {
|
|
210
|
+
const { testCase, provider, scorer, runConfig } = params;
|
|
211
|
+
const messages = [];
|
|
212
|
+
if (runConfig.systemPrompt) {
|
|
213
|
+
messages.push({
|
|
464
214
|
role: "system",
|
|
465
|
-
content:
|
|
215
|
+
content: runConfig.systemPrompt.content
|
|
466
216
|
});
|
|
467
217
|
}
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
inputTokensUsed: providerResponse.inputTokensUsed,
|
|
485
|
-
outputTokensUsed: providerResponse.outputTokensUsed,
|
|
486
|
-
inputCost: providerResponse.inputCost,
|
|
487
|
-
outputCost: providerResponse.outputCost
|
|
488
|
-
},
|
|
489
|
-
responseIdGenerator
|
|
490
|
-
);
|
|
491
|
-
if (params.scorer?.kind === "llmJudge" && params.runConfig.llmJudgeModel) {
|
|
492
|
-
const scorerResult = await params.scorer.score({
|
|
493
|
-
task: testCase.question,
|
|
494
|
-
candidateAnswer: response.data,
|
|
495
|
-
referenceAnswer: testCase.answer,
|
|
496
|
-
model: params.runConfig.llmJudgeModel
|
|
218
|
+
if (testCase.kind === "llm/mcq.tc") {
|
|
219
|
+
messages.push({
|
|
220
|
+
role: "user",
|
|
221
|
+
content: formatMCQ(testCase)
|
|
222
|
+
});
|
|
223
|
+
templateMessages(messages, runConfig.templateVariables ?? {});
|
|
224
|
+
return runMCQ({
|
|
225
|
+
testCase,
|
|
226
|
+
messages,
|
|
227
|
+
provider,
|
|
228
|
+
scorer,
|
|
229
|
+
runConfig,
|
|
230
|
+
idGenerators: {
|
|
231
|
+
response: params.idGenerators?.response ?? idGeneratorUUIDv7,
|
|
232
|
+
score: params.idGenerators?.score ?? idGeneratorUUIDv7
|
|
233
|
+
}
|
|
497
234
|
});
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
responseId: response.id,
|
|
504
|
-
explanation: scorerResult.explanation,
|
|
505
|
-
metadata: scorerResult.metadata,
|
|
506
|
-
extractedAnswers: [],
|
|
507
|
-
scorerAIProvider: scorerResult.provider,
|
|
508
|
-
scorerAIModelSlug: params.runConfig.llmJudgeModel,
|
|
509
|
-
scorerAIInputTokensUsed: scorerResult.inputTokensUsed,
|
|
510
|
-
scorerAIOutputTokensUsed: scorerResult.outputTokensUsed,
|
|
511
|
-
scorerAIInputCost: scorerResult.inputCost,
|
|
512
|
-
scorerAIOutputCost: scorerResult.outputCost
|
|
513
|
-
},
|
|
514
|
-
scoreIdGenerator
|
|
235
|
+
}
|
|
236
|
+
if (testCase.kind === "llm/qa.tc") {
|
|
237
|
+
if (scorer && scorer?.kind !== `${PEERBENCH_NAMESPACE}/llm-as-a-judge`) {
|
|
238
|
+
throw new Error(
|
|
239
|
+
`QA test cases can only be scored with an LLM as a judge scorer, but ${scorer?.kind} was provided`
|
|
515
240
|
);
|
|
516
|
-
return { response, score };
|
|
517
241
|
}
|
|
242
|
+
messages.push({
|
|
243
|
+
role: "user",
|
|
244
|
+
content: testCase.question
|
|
245
|
+
});
|
|
246
|
+
templateMessages(messages, runConfig.templateVariables ?? {});
|
|
247
|
+
return runQA({
|
|
248
|
+
testCase,
|
|
249
|
+
messages,
|
|
250
|
+
provider,
|
|
251
|
+
scorer,
|
|
252
|
+
runConfig,
|
|
253
|
+
idGenerators: {
|
|
254
|
+
response: params.idGenerators?.response ?? idGeneratorUUIDv7,
|
|
255
|
+
score: params.idGenerators?.score ?? idGeneratorUUIDv7
|
|
256
|
+
}
|
|
257
|
+
});
|
|
518
258
|
}
|
|
519
|
-
|
|
520
|
-
}
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
testCase.options ?? {}
|
|
528
|
-
).map(([key, value]) => `${key}: ${value}`).join("\n")}`;
|
|
529
|
-
}
|
|
530
|
-
|
|
531
|
-
// src/benchmarks/mmlu-pro/spec.ts
|
|
532
|
-
var MMLUProBenchmarkSpecSchemaV1 = defineBenchmarkSpecSchema({
|
|
533
|
-
baseSchema: BaseBenchmarkSpecSchemaV1,
|
|
534
|
-
kind: "mmlu-pro.benchmark.spec",
|
|
535
|
-
schemaVersion: 1,
|
|
536
|
-
fields: {}
|
|
537
|
-
});
|
|
538
|
-
|
|
539
|
-
// src/benchmarks/fnol/index.ts
|
|
540
|
-
var fnol_exports = {};
|
|
541
|
-
__export(fnol_exports, {
|
|
542
|
-
FNOLBaseScoreSchemaV1: () => FNOLBaseScoreSchemaV1,
|
|
543
|
-
FNOLConversationMessageSchemaV1: () => FNOLConversationMessageSchemaV1,
|
|
544
|
-
FNOLDoneReason: () => FNOLDoneReason,
|
|
545
|
-
FNOLFieldSchemaV1: () => FNOLFieldSchemaV1,
|
|
546
|
-
FNOLFieldValueType: () => FNOLFieldValueType,
|
|
547
|
-
FNOLFieldsScoreSchemaV1: () => FNOLFieldsScoreSchemaV1,
|
|
548
|
-
FNOLFieldsScorer: () => FNOLFieldsScorer,
|
|
549
|
-
FNOLLLMJudgeScoreSchemaV1: () => FNOLLLMJudgeScoreSchemaV1,
|
|
550
|
-
FNOLResponseSchemaV1: () => FNOLResponseSchemaV1,
|
|
551
|
-
FNOLTestCaseSchemaV1: () => FNOLTestCaseSchemaV1,
|
|
552
|
-
runTestCase: () => runTestCase3
|
|
553
|
-
});
|
|
554
|
-
|
|
555
|
-
// src/benchmarks/fnol/test-cases/fnol.v1.ts
|
|
556
|
-
import { z as z7 } from "zod";
|
|
557
|
-
|
|
558
|
-
// src/benchmarks/fnol/score.ts
|
|
559
|
-
var FNOLBaseScoreSchemaV1 = defineScoreSchema({
|
|
560
|
-
baseSchema: BaseScoreSchemaV1,
|
|
561
|
-
fields: {
|
|
562
|
-
...ExtensionLLMAsAJudgeScorerFieldsV1
|
|
563
|
-
}
|
|
564
|
-
});
|
|
565
|
-
|
|
566
|
-
// src/benchmarks/fnol/types.ts
|
|
567
|
-
var FNOLFieldValueType = {
|
|
568
|
-
string: "string",
|
|
569
|
-
number: "number",
|
|
570
|
-
boolean: "boolean",
|
|
571
|
-
object: "object"
|
|
572
|
-
};
|
|
573
|
-
var FNOLDoneReason = {
|
|
574
|
-
modelProvidedJson: "modelProvidedJson",
|
|
575
|
-
reachedMaxTurns: "reachedMaxTurns",
|
|
576
|
-
forcedFinalJson: "forcedFinalJson"
|
|
577
|
-
};
|
|
578
|
-
|
|
579
|
-
// src/benchmarks/fnol/test-cases/fnol.v1.ts
|
|
580
|
-
var FNOLFieldSchemaV1 = z7.object({
|
|
581
|
-
description: z7.string(),
|
|
582
|
-
required: z7.boolean().optional(),
|
|
583
|
-
/**
|
|
584
|
-
* Optional expected value used by the deterministic scorer.
|
|
585
|
-
* If omitted, the scorer will only check presence.
|
|
586
|
-
*/
|
|
587
|
-
expected: z7.unknown().optional(),
|
|
588
|
-
/**
|
|
589
|
-
* Optional type hint for the model/user simulation.
|
|
590
|
-
*/
|
|
591
|
-
valueType: z7.enum(FNOLFieldValueType).optional()
|
|
592
|
-
});
|
|
593
|
-
var FNOLTestCaseSchemaV1 = defineTestCaseSchema({
|
|
594
|
-
baseSchema: BaseTestCaseSchemaV1,
|
|
595
|
-
kind: "fnol.ts.v1",
|
|
596
|
-
schemaVersion: 1,
|
|
597
|
-
fields: {
|
|
598
|
-
/**
|
|
599
|
-
* Scenario starter message. This is what the "user" would say initially.
|
|
600
|
-
*/
|
|
601
|
-
initialUserMessage: z7.string(),
|
|
602
|
-
/**
|
|
603
|
-
* Private/structured information about the user and the incident.
|
|
604
|
-
* This is used by the user simulator LLM to answer the target model questions.
|
|
605
|
-
*/
|
|
606
|
-
userProfile: z7.record(z7.string(), z7.unknown()),
|
|
607
|
-
/**
|
|
608
|
-
* The fields the target model must collect.
|
|
609
|
-
* Keys are canonical identifiers (e.g. "policyNumber", "dateOfLoss").
|
|
610
|
-
*/
|
|
611
|
-
fieldsToCollect: z7.record(z7.string(), FNOLFieldSchemaV1),
|
|
612
|
-
/**
|
|
613
|
-
* Maximum number of back-and-forth turns (target question + user answer).
|
|
614
|
-
*/
|
|
615
|
-
maxTurns: z7.number().int().min(1).max(100).default(10)
|
|
616
|
-
}
|
|
617
|
-
});
|
|
618
|
-
var FNOLConversationMessageSchemaV1 = z7.object({
|
|
619
|
-
role: z7.enum(["system", "user", "assistant"]),
|
|
620
|
-
content: z7.string()
|
|
621
|
-
});
|
|
622
|
-
var FNOLResponseSchemaV1 = defineResponseSchema({
|
|
623
|
-
baseSchema: BaseLLMChatResponseSchemaV1,
|
|
624
|
-
kind: "fnol.rs.v1",
|
|
625
|
-
schemaVersion: 1,
|
|
626
|
-
fields: {
|
|
627
|
-
/**
|
|
628
|
-
* Full conversation between the target model and simulated user.
|
|
629
|
-
*/
|
|
630
|
-
conversation: z7.array(FNOLConversationMessageSchemaV1),
|
|
631
|
-
turnsUsed: z7.number().int(),
|
|
632
|
-
doneReason: z7.enum(FNOLDoneReason),
|
|
633
|
-
/**
|
|
634
|
-
* Parsed JSON object from the target model's final answer, if available.
|
|
635
|
-
*/
|
|
636
|
-
extracted: z7.record(z7.string(), z7.unknown()).optional()
|
|
637
|
-
}
|
|
638
|
-
});
|
|
639
|
-
var FNOLFieldsScoreSchemaV1 = defineScoreSchema({
|
|
640
|
-
baseSchema: FNOLBaseScoreSchemaV1,
|
|
641
|
-
kind: "fnol.sc.fields.v1",
|
|
642
|
-
schemaVersion: 1,
|
|
643
|
-
fields: {
|
|
644
|
-
requiredKeys: z7.array(z7.string()),
|
|
645
|
-
presentKeys: z7.array(z7.string()),
|
|
646
|
-
missingKeys: z7.array(z7.string()),
|
|
647
|
-
mismatchedKeys: z7.array(z7.string())
|
|
648
|
-
}
|
|
649
|
-
});
|
|
650
|
-
var FNOLLLMJudgeScoreSchemaV1 = defineScoreSchema({
|
|
651
|
-
baseSchema: FNOLBaseScoreSchemaV1,
|
|
652
|
-
kind: "fnol.sc.llm-judge.v1",
|
|
653
|
-
schemaVersion: 1,
|
|
654
|
-
fields: {
|
|
655
|
-
verdict: z7.enum(["pass", "borderline", "fail"]).optional()
|
|
656
|
-
}
|
|
657
|
-
});
|
|
658
|
-
|
|
659
|
-
// src/benchmarks/fnol/runner.ts
|
|
660
|
-
function formatFieldsToCollect(fieldsToCollect) {
|
|
661
|
-
return Object.entries(fieldsToCollect).map(([key, field]) => {
|
|
662
|
-
const required = field.required === false ? "optional" : "required";
|
|
663
|
-
return `- ${key} (${required}): ${field.description}`;
|
|
664
|
-
}).join("\n");
|
|
665
|
-
}
|
|
666
|
-
function hasAllRequiredFields(params) {
|
|
667
|
-
const extracted = params.extracted ?? {};
|
|
668
|
-
for (const [key, field] of Object.entries(params.fieldsToCollect)) {
|
|
669
|
-
if (field.required === false) continue;
|
|
670
|
-
const value = extracted[key];
|
|
671
|
-
if (value === void 0 || value === null || value === "") return false;
|
|
672
|
-
}
|
|
673
|
-
return true;
|
|
674
|
-
}
|
|
675
|
-
async function runTestCase3(params) {
|
|
676
|
-
const responseIdGenerator = params.idGenerators?.response ?? idGeneratorUUIDv7;
|
|
677
|
-
const scoreIdGenerator = params.idGenerators?.score ?? idGeneratorUUIDv7;
|
|
678
|
-
const userSimulatorProvider = params.userSimulatorProvider ?? params.provider;
|
|
679
|
-
const userSimulatorModel = params.runConfig.userSimulatorModel ?? params.runConfig.model;
|
|
680
|
-
const fieldsToCollectText = formatFieldsToCollect(
|
|
681
|
-
params.testCase.fieldsToCollect
|
|
682
|
-
);
|
|
683
|
-
const conversation = [];
|
|
684
|
-
if (params.systemPrompt) {
|
|
685
|
-
conversation.push({
|
|
686
|
-
role: "system",
|
|
687
|
-
content: params.systemPrompt.content
|
|
688
|
-
});
|
|
689
|
-
}
|
|
690
|
-
conversation.push({
|
|
691
|
-
role: "system",
|
|
692
|
-
content: [
|
|
693
|
-
"You are an insurance FNOL intake assistant.",
|
|
694
|
-
"Your job is to ask the user questions to collect the required fields listed below.",
|
|
695
|
-
"Ask concise questions, one or a few at a time.",
|
|
696
|
-
"When you have enough information OR when you are told to finish, output ONLY a single JSON object with the collected fields.",
|
|
697
|
-
"Do not include markdown fences. Do not include additional text outside the JSON.",
|
|
698
|
-
"",
|
|
699
|
-
"Fields to collect:",
|
|
700
|
-
fieldsToCollectText
|
|
701
|
-
].join("\n")
|
|
702
|
-
});
|
|
703
|
-
conversation.push({
|
|
704
|
-
role: "user",
|
|
705
|
-
content: params.testCase.initialUserMessage
|
|
259
|
+
throw new Error("Unsupported test case kind");
|
|
260
|
+
}
|
|
261
|
+
);
|
|
262
|
+
async function runQA(params) {
|
|
263
|
+
const { messages, testCase, provider, scorer, runConfig } = params;
|
|
264
|
+
const providerResponse = await provider.forward({
|
|
265
|
+
model: runConfig.model,
|
|
266
|
+
messages
|
|
706
267
|
});
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
268
|
+
const response = await QAResponseSchemaV1.newWithId(
|
|
269
|
+
{
|
|
270
|
+
data: providerResponse.data,
|
|
271
|
+
startedAt: providerResponse.startedAt,
|
|
272
|
+
completedAt: providerResponse.completedAt,
|
|
273
|
+
testCaseId: testCase.id,
|
|
274
|
+
modelSlug: runConfig.model,
|
|
275
|
+
provider: provider.kind,
|
|
276
|
+
systemPromptId: runConfig.systemPrompt?.id,
|
|
277
|
+
inputTokensUsed: providerResponse.inputTokensUsed,
|
|
278
|
+
outputTokensUsed: providerResponse.outputTokensUsed,
|
|
279
|
+
inputCost: providerResponse.inputCost,
|
|
280
|
+
outputCost: providerResponse.outputCost
|
|
281
|
+
},
|
|
282
|
+
params.idGenerators?.response ?? idGeneratorUUIDv7
|
|
283
|
+
);
|
|
284
|
+
if (scorer?.kind === `${PEERBENCH_NAMESPACE}/llm-as-a-judge`) {
|
|
285
|
+
if (!runConfig.llmJudgeModel) {
|
|
286
|
+
throw new Error(
|
|
287
|
+
"LLM judge model is required when using LLM as a judge scorer"
|
|
288
|
+
);
|
|
727
289
|
}
|
|
728
|
-
const
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
content: [
|
|
736
|
-
"You are simulating a real insurance customer (the user).",
|
|
737
|
-
"Answer the assistant's questions truthfully using ONLY the provided user profile and incident details.",
|
|
738
|
-
"If asked about something not present in the profile, say you don't know.",
|
|
739
|
-
"Be concise and natural. Do not invent new facts.",
|
|
740
|
-
"",
|
|
741
|
-
"User profile (JSON):",
|
|
742
|
-
JSON.stringify(params.testCase.userProfile)
|
|
743
|
-
].join("\n")
|
|
744
|
-
},
|
|
290
|
+
const scorerResult = await scorer.score({
|
|
291
|
+
model: runConfig.llmJudgeModel,
|
|
292
|
+
response: response.data,
|
|
293
|
+
rubric: `Expected/Valid answers: ${testCase.goodAnswers.join("\n")}
|
|
294
|
+
Invalid answers: ${testCase.badAnswers.join("\n")}`,
|
|
295
|
+
systemPrompt: runConfig.llmJudgeSystemPrompt?.content,
|
|
296
|
+
criteria: [
|
|
745
297
|
{
|
|
746
|
-
|
|
747
|
-
|
|
298
|
+
id: "correctness",
|
|
299
|
+
description: "Is the response matches with the expected/valid answers in terms of meaning?",
|
|
300
|
+
weight: 1
|
|
748
301
|
}
|
|
749
302
|
]
|
|
750
303
|
});
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
content: simulatedUser.data
|
|
754
|
-
});
|
|
755
|
-
}
|
|
756
|
-
if (!doneReason) {
|
|
757
|
-
doneReason = FNOLDoneReason.reachedMaxTurns;
|
|
758
|
-
const forced = await params.provider.forward({
|
|
759
|
-
model: params.runConfig.model,
|
|
760
|
-
temperature: params.runConfig.temperature,
|
|
761
|
-
messages: [
|
|
762
|
-
...conversation,
|
|
304
|
+
if (scorerResult !== null) {
|
|
305
|
+
const score = await QAScoreSchemaV1.newWithId(
|
|
763
306
|
{
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
307
|
+
scoringMethod: ScoringMethod.ai,
|
|
308
|
+
value: scorerResult.value,
|
|
309
|
+
responseId: response.id,
|
|
310
|
+
explanation: scorerResult.explanation,
|
|
311
|
+
metadata: scorerResult.metadata,
|
|
312
|
+
scorerAIInputCost: scorerResult.inputCost,
|
|
313
|
+
scorerAIOutputCost: scorerResult.outputCost,
|
|
314
|
+
scorerAIInputTokensUsed: scorerResult.inputTokensUsed,
|
|
315
|
+
scorerAIOutputTokensUsed: scorerResult.outputTokensUsed,
|
|
316
|
+
scorerAIProvider: scorerResult.provider,
|
|
317
|
+
scorerAIModelSlug: runConfig.llmJudgeModel,
|
|
318
|
+
scorerAISystemPromptId: runConfig.llmJudgeSystemPrompt?.id
|
|
319
|
+
},
|
|
320
|
+
params.idGenerators?.score ?? idGeneratorUUIDv7
|
|
321
|
+
);
|
|
322
|
+
return { response, score };
|
|
773
323
|
}
|
|
774
324
|
}
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
325
|
+
return { response };
|
|
326
|
+
}
|
|
327
|
+
async function runMCQ(params) {
|
|
328
|
+
const { messages, testCase, provider, scorer, runConfig } = params;
|
|
329
|
+
const providerResponse = await provider.forward({
|
|
330
|
+
model: runConfig.model,
|
|
331
|
+
messages
|
|
332
|
+
});
|
|
333
|
+
const response = await MCQResponseSchemaV1.newWithId(
|
|
778
334
|
{
|
|
779
|
-
data:
|
|
780
|
-
startedAt,
|
|
781
|
-
completedAt,
|
|
782
|
-
testCaseId:
|
|
783
|
-
modelSlug:
|
|
784
|
-
provider:
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
doneReason,
|
|
791
|
-
extracted
|
|
335
|
+
data: providerResponse.data,
|
|
336
|
+
startedAt: providerResponse.startedAt,
|
|
337
|
+
completedAt: providerResponse.completedAt,
|
|
338
|
+
testCaseId: testCase.id,
|
|
339
|
+
modelSlug: runConfig.model,
|
|
340
|
+
provider: provider.kind,
|
|
341
|
+
systemPromptId: runConfig.systemPrompt?.id,
|
|
342
|
+
inputTokensUsed: providerResponse.inputTokensUsed,
|
|
343
|
+
outputTokensUsed: providerResponse.outputTokensUsed,
|
|
344
|
+
inputCost: providerResponse.inputCost,
|
|
345
|
+
outputCost: providerResponse.outputCost
|
|
792
346
|
},
|
|
793
|
-
|
|
347
|
+
params.idGenerators?.response ?? idGeneratorUUIDv7
|
|
794
348
|
);
|
|
795
|
-
if (
|
|
796
|
-
const scorerResult = await
|
|
797
|
-
|
|
798
|
-
|
|
349
|
+
if (scorer?.kind === `${PEERBENCH_NAMESPACE}/mcq`) {
|
|
350
|
+
const scorerResult = await scorer.score({
|
|
351
|
+
response: response.data,
|
|
352
|
+
choices: testCase.options,
|
|
353
|
+
correctAnswers: testCase.correctAnswerKeys
|
|
799
354
|
});
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
return { response, score };
|
|
355
|
+
if (scorerResult !== null) {
|
|
356
|
+
const score = await MCQScoreSchemaV1.newWithId(
|
|
357
|
+
{
|
|
358
|
+
scoringMethod: ScoringMethod.algo,
|
|
359
|
+
value: scorerResult.value,
|
|
360
|
+
responseId: response.id,
|
|
361
|
+
extractedAnswers: scorerResult.extractedAnswers,
|
|
362
|
+
explanation: scorerResult.explanation,
|
|
363
|
+
metadata: scorerResult.metadata
|
|
364
|
+
},
|
|
365
|
+
params.idGenerators?.score ?? idGeneratorUUIDv7
|
|
366
|
+
);
|
|
367
|
+
return { response, score };
|
|
368
|
+
}
|
|
815
369
|
}
|
|
816
|
-
if (
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
370
|
+
if (scorer?.kind === `${PEERBENCH_NAMESPACE}/llm-as-a-judge`) {
|
|
371
|
+
if (!runConfig.llmJudgeModel) {
|
|
372
|
+
throw new Error(
|
|
373
|
+
"LLM judge model is required when using LLM as a judge scorer"
|
|
374
|
+
);
|
|
375
|
+
}
|
|
376
|
+
const scorerResult = await scorer.score({
|
|
377
|
+
model: runConfig.llmJudgeModel,
|
|
378
|
+
criteria: [
|
|
379
|
+
{
|
|
380
|
+
id: "correctness",
|
|
381
|
+
description: "Is the given answer key matches with one of the correct answer keys?",
|
|
382
|
+
weight: 1
|
|
383
|
+
}
|
|
384
|
+
],
|
|
385
|
+
rubric: `Answer text itself or the key (A, B, C) is accepted
|
|
386
|
+
Valid answer keys: ${testCase.correctAnswerKeys.map((key) => `- ${key}`).join("\n")}
|
|
387
|
+
Valid Answer texts: ${testCase.correctAnswerKeys.map((key) => `- ${testCase.options?.[key] ?? ""}`).join("\n")}`,
|
|
388
|
+
fieldsToExtract: {
|
|
389
|
+
extractedAnswers: z4.string().array().describe(
|
|
390
|
+
"The extracted answer keys, valid or invalid (even if the answer text is provided rather than the key)"
|
|
826
391
|
)
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
fieldsToCollect: params.testCase.fieldsToCollect,
|
|
831
|
-
doneReason
|
|
832
|
-
}
|
|
392
|
+
},
|
|
393
|
+
response: response.data,
|
|
394
|
+
systemPrompt: runConfig.llmJudgeSystemPrompt?.content
|
|
833
395
|
});
|
|
834
396
|
if (scorerResult !== null) {
|
|
835
|
-
const score = await
|
|
397
|
+
const score = await MCQScoreSchemaV1.newWithId(
|
|
836
398
|
{
|
|
837
|
-
|
|
399
|
+
scoringMethod: ScoringMethod.ai,
|
|
838
400
|
value: scorerResult.value,
|
|
401
|
+
extractedAnswers: scorerResult.extractedFields.extractedAnswers,
|
|
402
|
+
responseId: response.id,
|
|
839
403
|
explanation: scorerResult.explanation,
|
|
840
404
|
metadata: scorerResult.metadata,
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
scorerAIProvider: scorerResult.provider,
|
|
844
|
-
scorerAIModelSlug: params.runConfig.llmJudgeModel,
|
|
405
|
+
scorerAIInputCost: scorerResult.inputCost,
|
|
406
|
+
scorerAIOutputCost: scorerResult.outputCost,
|
|
845
407
|
scorerAIInputTokensUsed: scorerResult.inputTokensUsed,
|
|
846
408
|
scorerAIOutputTokensUsed: scorerResult.outputTokensUsed,
|
|
847
|
-
|
|
848
|
-
|
|
409
|
+
scorerAIProvider: scorerResult.provider,
|
|
410
|
+
scorerAIModelSlug: runConfig.llmJudgeModel,
|
|
411
|
+
scorerAISystemPromptId: runConfig.llmJudgeSystemPrompt?.id
|
|
849
412
|
},
|
|
850
|
-
|
|
413
|
+
params.idGenerators?.score ?? idGeneratorUUIDv7
|
|
851
414
|
);
|
|
852
415
|
return { response, score };
|
|
853
416
|
}
|
|
854
417
|
}
|
|
855
418
|
return { response };
|
|
856
419
|
}
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
}
|
|
862
|
-
|
|
863
|
-
return value.trim();
|
|
420
|
+
function formatMCQ(testCase) {
|
|
421
|
+
return `Question: ${testCase.question}
|
|
422
|
+
Options:
|
|
423
|
+
${Object.entries(
|
|
424
|
+
testCase.options ?? {}
|
|
425
|
+
).map(([key, value]) => `${key}: ${value}`).join("\n")}`;
|
|
864
426
|
}
|
|
865
|
-
function
|
|
866
|
-
|
|
867
|
-
|
|
427
|
+
function templateMessages(messages, templateVariables) {
|
|
428
|
+
for (let i = 0; i < messages.length; i++) {
|
|
429
|
+
const template = Handlebars.compile(messages[i].content);
|
|
430
|
+
messages[i].content = template(templateVariables);
|
|
868
431
|
}
|
|
869
|
-
return stableStringify(expected) === stableStringify(actual);
|
|
870
432
|
}
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
}
|
|
891
|
-
const requiredCount = requiredKeys.length;
|
|
892
|
-
const correctCount = requiredCount - missingKeys.length - mismatchedKeys.length;
|
|
893
|
-
const score = requiredCount === 0 ? 1 : correctCount / requiredCount;
|
|
894
|
-
return {
|
|
895
|
-
value: Math.max(0, Math.min(1, score)),
|
|
896
|
-
explanation: missingKeys.length === 0 && mismatchedKeys.length === 0 ? "All required fields collected" : "Missing or mismatched fields",
|
|
897
|
-
requiredKeys,
|
|
898
|
-
presentKeys,
|
|
899
|
-
missingKeys,
|
|
900
|
-
mismatchedKeys,
|
|
901
|
-
metadata: {
|
|
902
|
-
requiredCount,
|
|
903
|
-
presentCount: presentKeys.length,
|
|
904
|
-
missingCount: missingKeys.length,
|
|
905
|
-
mismatchedCount: mismatchedKeys.length
|
|
906
|
-
}
|
|
907
|
-
};
|
|
433
|
+
|
|
434
|
+
// src/benchmarks/peerbench/storages/json.ts
|
|
435
|
+
import z5 from "zod";
|
|
436
|
+
var PeerbenchJSONStorage = class extends JSONFileStorage {
|
|
437
|
+
constructor(config) {
|
|
438
|
+
super({
|
|
439
|
+
path: config.path,
|
|
440
|
+
chunkSize: config.chunkSize,
|
|
441
|
+
schema: z5.union([
|
|
442
|
+
MCQTestCaseSchemaV1,
|
|
443
|
+
MCQResponseSchemaV1,
|
|
444
|
+
MCQScoreSchemaV1,
|
|
445
|
+
QATestCaseSchemaV1,
|
|
446
|
+
QAResponseSchemaV1,
|
|
447
|
+
QAScoreSchemaV1,
|
|
448
|
+
MultiTurnTestCaseSchemaV1,
|
|
449
|
+
MultiTurnResponseSchemaV1,
|
|
450
|
+
MultiTurnScoreSchemaV1
|
|
451
|
+
])
|
|
452
|
+
});
|
|
908
453
|
}
|
|
909
454
|
};
|
|
910
455
|
export {
|
|
911
|
-
fnol_exports as fnol,
|
|
912
|
-
mmlu_pro_exports as mmluPro,
|
|
913
456
|
peerbench_exports as peerbench
|
|
914
457
|
};
|
|
915
458
|
//# sourceMappingURL=index.js.map
|