peerbench 0.0.10 → 0.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. package/README.md +71 -58
  2. package/dist/benchmarks/examples/echo-basic/runner.d.ts +11 -254
  3. package/dist/benchmarks/examples/echo-basic/schema-sets/echo.v1.d.ts +25 -25
  4. package/dist/benchmarks/examples/exact-match-scorer/runner.d.ts +38 -386
  5. package/dist/benchmarks/examples/exact-match-scorer/schema-sets/exact-match.v1.d.ts +25 -25
  6. package/dist/benchmarks/examples/text-transform/runner.d.ts +32 -480
  7. package/dist/benchmarks/examples/text-transform/schema-sets/echo.v1.d.ts +25 -25
  8. package/dist/benchmarks/examples/text-transform/schema-sets/reverse.v1.d.ts +25 -25
  9. package/dist/benchmarks/index.js +180 -248
  10. package/dist/benchmarks/index.js.map +1 -1
  11. package/dist/benchmarks/peerbench/index.d.ts +2 -1
  12. package/dist/benchmarks/peerbench/mcq-runner.d.ts +78 -0
  13. package/dist/benchmarks/peerbench/qa-runner.d.ts +77 -0
  14. package/dist/benchmarks/peerbench/schema-sets/mcq.v1.d.ts +25 -25
  15. package/dist/benchmarks/peerbench/schema-sets/multi-turn.v1.d.ts +25 -25
  16. package/dist/benchmarks/peerbench/schema-sets/qa.v1.d.ts +25 -25
  17. package/dist/chunk-6WDCU5BP.js +9 -0
  18. package/dist/chunk-6WDCU5BP.js.map +1 -0
  19. package/dist/{chunk-YY33MNMV.js → chunk-7KMGLEYP.js} +2 -2
  20. package/dist/{chunk-TRNCF2BG.js → chunk-HBGC6BDW.js} +1 -1
  21. package/dist/chunk-HBGC6BDW.js.map +1 -0
  22. package/dist/{chunk-HMQYGCKI.js → chunk-ZJWSK4VO.js} +1 -1
  23. package/dist/chunk-ZJWSK4VO.js.map +1 -0
  24. package/dist/dev.d.ts +22 -0
  25. package/dist/helpers/define-runner.d.ts +2 -45
  26. package/dist/index.js +2 -2
  27. package/dist/providers/ai-sdk.d.ts +24 -0
  28. package/dist/providers/callables/callable.d.ts +4 -0
  29. package/dist/providers/callables/llm.d.ts +41 -0
  30. package/dist/providers/example/echo.d.ts +12 -11
  31. package/dist/providers/example/restapi.d.ts +11 -18
  32. package/dist/providers/index.d.ts +4 -2
  33. package/dist/providers/index.js +380 -9
  34. package/dist/providers/index.js.map +1 -1
  35. package/dist/providers/mastra.d.ts +16 -21
  36. package/dist/providers/openai.d.ts +25 -10
  37. package/dist/providers/openrouter.d.ts +6 -8
  38. package/dist/schemas/extensions/index.js +1 -1
  39. package/dist/schemas/extensions/response/llm.d.ts +17 -0
  40. package/dist/schemas/index.js +2 -2
  41. package/dist/schemas/llm/index.js +36 -7
  42. package/dist/schemas/llm/index.js.map +1 -1
  43. package/dist/schemas/llm/simple-system-prompt.d.ts +3 -3
  44. package/dist/schemas/llm/system-prompt.d.ts +7 -7
  45. package/dist/schemas/response.d.ts +7 -7
  46. package/dist/schemas/schema-definer.d.ts +5 -5
  47. package/dist/schemas/score.d.ts +7 -7
  48. package/dist/schemas/test-case.d.ts +7 -7
  49. package/dist/scorers/abstract.d.ts +1 -1
  50. package/dist/scorers/index.js +377 -7
  51. package/dist/scorers/index.js.map +1 -1
  52. package/dist/scorers/llm-judge.d.ts +6 -6
  53. package/dist/types/index.d.ts +0 -5
  54. package/dist/types/runner.d.ts +13 -17
  55. package/package.json +8 -7
  56. package/dist/benchmarks/peerbench/runner.d.ts +0 -754
  57. package/dist/chunk-3JHDJEY3.js +0 -374
  58. package/dist/chunk-3JHDJEY3.js.map +0 -1
  59. package/dist/chunk-HMQYGCKI.js.map +0 -1
  60. package/dist/chunk-Q6GSOHOP.js +0 -44
  61. package/dist/chunk-Q6GSOHOP.js.map +0 -1
  62. package/dist/chunk-RTEAK4II.js +0 -37
  63. package/dist/chunk-RTEAK4II.js.map +0 -1
  64. package/dist/chunk-SMLNDQFX.js +0 -244
  65. package/dist/chunk-SMLNDQFX.js.map +0 -1
  66. package/dist/chunk-TRNCF2BG.js.map +0 -1
  67. package/dist/providers/abstract/llm.d.ts +0 -20
  68. /package/dist/{chunk-YY33MNMV.js.map → chunk-7KMGLEYP.js.map} +0 -0
  69. /package/dist/providers/{abstract/provider.d.ts → abstract.d.ts} +0 -0
@@ -1,12 +1,6 @@
1
- import {
2
- SimpleSystemPromptSchemaV1
3
- } from "../chunk-Q6GSOHOP.js";
4
1
  import {
5
2
  defineRunner
6
- } from "../chunk-RTEAK4II.js";
7
- import {
8
- AbstractLLMProvider
9
- } from "../chunk-SMLNDQFX.js";
3
+ } from "../chunk-6WDCU5BP.js";
10
4
  import {
11
5
  BaseResponseSchemaV1,
12
6
  BaseScoreSchemaV1,
@@ -14,28 +8,24 @@ import {
14
8
  defineResponseSchema,
15
9
  defineScoreSchema,
16
10
  defineTestCaseSchema
17
- } from "../chunk-YY33MNMV.js";
11
+ } from "../chunk-7KMGLEYP.js";
18
12
  import "../chunk-OQE6TQXZ.js";
19
13
  import {
20
14
  ScoringMethod
21
- } from "../chunk-HMQYGCKI.js";
22
- import {
23
- JSONFileStorage
24
- } from "../chunk-WBCMV445.js";
25
- import {
26
- LLMAsAJudgeScorer,
27
- MCQScorer
28
- } from "../chunk-3JHDJEY3.js";
15
+ } from "../chunk-ZJWSK4VO.js";
29
16
  import {
30
17
  PEERBENCH_NAMESPACE
31
18
  } from "../chunk-UHHHSYVE.js";
19
+ import {
20
+ JSONFileStorage
21
+ } from "../chunk-WBCMV445.js";
32
22
  import {
33
23
  idGeneratorUUIDv7
34
24
  } from "../chunk-4UBK6452.js";
35
25
  import {
36
26
  ExtensionLLMAsAJudgeScoreFieldsV1,
37
27
  ExtensionLLMResponseFieldsV1
38
- } from "../chunk-TRNCF2BG.js";
28
+ } from "../chunk-HBGC6BDW.js";
39
29
  import "../chunk-NUEOE3K5.js";
40
30
  import {
41
31
  __export
@@ -57,7 +47,8 @@ __export(peerbench_exports, {
57
47
  QAResponseSchemaV1: () => QAResponseSchemaV1,
58
48
  QAScoreSchemaV1: () => QAScoreSchemaV1,
59
49
  QATestCaseSchemaV1: () => QATestCaseSchemaV1,
60
- peerbenchRunner: () => peerbenchRunner
50
+ mcqRunner: () => mcqRunner,
51
+ qaRunner: () => qaRunner
61
52
  });
62
53
 
63
54
  // src/benchmarks/peerbench/schema-sets/mcq.v1.ts
@@ -179,254 +170,112 @@ var QAScoreSchemaV1 = defineScoreSchema({
179
170
  }
180
171
  });
181
172
 
182
- // src/benchmarks/peerbench/runner.ts
173
+ // src/benchmarks/peerbench/mcq-runner.ts
183
174
  import Handlebars from "handlebars";
184
175
  import z4 from "zod";
185
- var peerbenchRunner = defineRunner(
186
- {
187
- schemaSets: [
188
- {
189
- testCase: MCQTestCaseSchemaV1,
190
- response: MCQResponseSchemaV1,
191
- score: MCQScoreSchemaV1
192
- },
193
- {
194
- testCase: QATestCaseSchemaV1,
195
- response: QAResponseSchemaV1,
196
- score: QAScoreSchemaV1
197
- }
198
- ],
199
- providers: [AbstractLLMProvider],
200
- scorers: [LLMAsAJudgeScorer, MCQScorer],
201
- runConfigSchema: {
202
- model: z4.string(),
203
- llmJudgeModel: z4.string().optional(),
204
- llmJudgeSystemPrompt: SimpleSystemPromptSchemaV1.optional(),
205
- llmJudgeFieldsToExtract: z4.record(z4.string(), z4.custom()).optional(),
206
- systemPrompt: SimpleSystemPromptSchemaV1.optional(),
207
- templateVariables: z4.record(z4.string(), z4.string()).optional()
208
- }
209
- },
176
+ var mcqRunner = defineRunner(
210
177
  async (params) => {
211
- const { testCase, provider, scorer, runConfig } = params;
178
+ const { testCase, target, scorer } = params;
212
179
  const messages = [];
213
- if (runConfig.systemPrompt) {
180
+ if (params.systemPrompt) {
214
181
  messages.push({
215
182
  role: "system",
216
- content: runConfig.systemPrompt.content
183
+ content: params.systemPrompt.content
217
184
  });
218
185
  }
219
- if (testCase.kind === "llm/mcq.tc") {
220
- messages.push({
221
- role: "user",
222
- content: formatMCQ(testCase)
223
- });
224
- templateMessages(messages, runConfig.templateVariables ?? {});
225
- return runMCQ({
226
- testCase,
227
- messages,
228
- provider,
229
- scorer,
230
- runConfig,
231
- idGenerators: {
232
- response: params.idGenerators?.response ?? idGeneratorUUIDv7,
233
- score: params.idGenerators?.score ?? idGeneratorUUIDv7
234
- }
186
+ messages.push({
187
+ role: "user",
188
+ content: formatMCQ(testCase)
189
+ });
190
+ templateMessages(messages, params.templateVariables ?? {});
191
+ const providerResponse = await target.forward({ messages });
192
+ const response = await MCQResponseSchemaV1.newWithId(
193
+ {
194
+ data: providerResponse.data,
195
+ startedAt: providerResponse.startedAt,
196
+ completedAt: providerResponse.completedAt,
197
+ testCaseId: testCase.id,
198
+ modelSlug: target.slug,
199
+ provider: target.provider.kind,
200
+ systemPromptId: params.systemPrompt?.id,
201
+ inputTokensUsed: providerResponse.inputTokensUsed,
202
+ outputTokensUsed: providerResponse.outputTokensUsed,
203
+ inputCost: providerResponse.inputCost,
204
+ outputCost: providerResponse.outputCost
205
+ },
206
+ params.idGenerators?.response ?? idGeneratorUUIDv7
207
+ );
208
+ if (scorer?.kind === `${PEERBENCH_NAMESPACE}/mcq`) {
209
+ const scorerResult = await scorer.score({
210
+ response: response.data,
211
+ choices: testCase.options,
212
+ correctAnswers: testCase.correctAnswerKeys
235
213
  });
236
- }
237
- if (testCase.kind === "llm/qa.tc") {
238
- if (scorer && scorer?.kind !== `${PEERBENCH_NAMESPACE}/llm-as-a-judge`) {
239
- throw new Error(
240
- `QA test cases can only be scored with an LLM as a judge scorer, but ${scorer?.kind} was provided`
214
+ if (scorerResult !== null) {
215
+ const score = await MCQScoreSchemaV1.newWithId(
216
+ {
217
+ scoringMethod: ScoringMethod.algo,
218
+ value: scorerResult.value,
219
+ responseId: response.id,
220
+ extractedAnswers: scorerResult.extractedAnswers,
221
+ explanation: scorerResult.explanation,
222
+ metadata: scorerResult.metadata
223
+ },
224
+ params.idGenerators?.score ?? idGeneratorUUIDv7
241
225
  );
226
+ return { response, score };
242
227
  }
243
- messages.push({
244
- role: "user",
245
- content: testCase.question
246
- });
247
- templateMessages(messages, runConfig.templateVariables ?? {});
248
- return runQA({
249
- testCase,
250
- messages,
251
- provider,
252
- scorer,
253
- runConfig,
254
- idGenerators: {
255
- response: params.idGenerators?.response ?? idGeneratorUUIDv7,
256
- score: params.idGenerators?.score ?? idGeneratorUUIDv7
257
- }
258
- });
259
- }
260
- throw new Error("Unsupported test case kind");
261
- }
262
- );
263
- async function runQA(params) {
264
- const { messages, testCase, provider, scorer, runConfig } = params;
265
- const providerResponse = await provider.forward({
266
- model: runConfig.model,
267
- messages
268
- });
269
- const response = await QAResponseSchemaV1.newWithId(
270
- {
271
- data: providerResponse.data,
272
- startedAt: providerResponse.startedAt,
273
- completedAt: providerResponse.completedAt,
274
- testCaseId: testCase.id,
275
- modelSlug: runConfig.model,
276
- provider: provider.kind,
277
- systemPromptId: runConfig.systemPrompt?.id,
278
- inputTokensUsed: providerResponse.inputTokensUsed,
279
- outputTokensUsed: providerResponse.outputTokensUsed,
280
- inputCost: providerResponse.inputCost,
281
- outputCost: providerResponse.outputCost
282
- },
283
- params.idGenerators?.response ?? idGeneratorUUIDv7
284
- );
285
- if (scorer?.kind === `${PEERBENCH_NAMESPACE}/llm-as-a-judge`) {
286
- if (!runConfig.llmJudgeModel) {
287
- throw new Error(
288
- "LLM judge model is required when using LLM as a judge scorer"
289
- );
290
228
  }
291
- const scorerResult = await scorer.score({
292
- model: runConfig.llmJudgeModel,
293
- response: response.data,
294
- rubric: `Expected/Valid answers: ${testCase.goodAnswers.join("\n")}
295
- Invalid answers: ${testCase.badAnswers.join("\n")}`,
296
- systemPrompt: runConfig.llmJudgeSystemPrompt?.content,
297
- criteria: [
298
- {
299
- id: "correctness",
300
- description: "Is the response matches with the expected/valid answers in terms of meaning?",
301
- weight: 1
302
- }
303
- ],
304
- fieldsToExtract: runConfig.llmJudgeFieldsToExtract ?? {}
305
- });
306
- if (scorerResult !== null) {
307
- const score = await QAScoreSchemaV1.newWithId(
308
- {
309
- scoringMethod: ScoringMethod.ai,
310
- value: scorerResult.value,
311
- responseId: response.id,
312
- explanation: scorerResult.explanation,
313
- scorerAIInputCost: scorerResult.inputCost,
314
- scorerAIOutputCost: scorerResult.outputCost,
315
- scorerAIInputTokensUsed: scorerResult.inputTokensUsed,
316
- scorerAIOutputTokensUsed: scorerResult.outputTokensUsed,
317
- scorerAIProvider: scorerResult.provider,
318
- scorerAIModelSlug: runConfig.llmJudgeModel,
319
- scorerAISystemPromptId: runConfig.llmJudgeSystemPrompt?.id,
320
- metadata: {
321
- ...scorerResult.metadata,
322
- extractedFields: scorerResult.extractedFields
229
+ if (scorer?.kind === `${PEERBENCH_NAMESPACE}/llm-as-a-judge`) {
230
+ const scorerResult = await scorer.score({
231
+ criteria: [
232
+ {
233
+ id: "correctness",
234
+ description: "Is the given answer key matches with one of the correct answer keys?",
235
+ weight: 1
323
236
  }
324
- },
325
- params.idGenerators?.score ?? idGeneratorUUIDv7
326
- );
327
- return { response, score };
328
- }
329
- }
330
- return { response };
331
- }
332
- async function runMCQ(params) {
333
- const { messages, testCase, provider, scorer, runConfig } = params;
334
- const providerResponse = await provider.forward({
335
- model: runConfig.model,
336
- messages
337
- });
338
- const response = await MCQResponseSchemaV1.newWithId(
339
- {
340
- data: providerResponse.data,
341
- startedAt: providerResponse.startedAt,
342
- completedAt: providerResponse.completedAt,
343
- testCaseId: testCase.id,
344
- modelSlug: runConfig.model,
345
- provider: provider.kind,
346
- systemPromptId: runConfig.systemPrompt?.id,
347
- inputTokensUsed: providerResponse.inputTokensUsed,
348
- outputTokensUsed: providerResponse.outputTokensUsed,
349
- inputCost: providerResponse.inputCost,
350
- outputCost: providerResponse.outputCost
351
- },
352
- params.idGenerators?.response ?? idGeneratorUUIDv7
353
- );
354
- if (scorer?.kind === `${PEERBENCH_NAMESPACE}/mcq`) {
355
- const scorerResult = await scorer.score({
356
- response: response.data,
357
- choices: testCase.options,
358
- correctAnswers: testCase.correctAnswerKeys
359
- });
360
- if (scorerResult !== null) {
361
- const score = await MCQScoreSchemaV1.newWithId(
362
- {
363
- scoringMethod: ScoringMethod.algo,
364
- value: scorerResult.value,
365
- responseId: response.id,
366
- extractedAnswers: scorerResult.extractedAnswers,
367
- explanation: scorerResult.explanation,
368
- metadata: scorerResult.metadata
369
- },
370
- params.idGenerators?.score ?? idGeneratorUUIDv7
371
- );
372
- return { response, score };
373
- }
374
- }
375
- if (scorer?.kind === `${PEERBENCH_NAMESPACE}/llm-as-a-judge`) {
376
- if (!runConfig.llmJudgeModel) {
377
- throw new Error(
378
- "LLM judge model is required when using LLM as a judge scorer"
379
- );
380
- }
381
- const scorerResult = await scorer.score({
382
- model: runConfig.llmJudgeModel,
383
- criteria: [
384
- {
385
- id: "correctness",
386
- description: "Is the given answer key matches with one of the correct answer keys?",
387
- weight: 1
388
- }
389
- ],
390
- rubric: `Answer text itself or the key (A, B, C) is accepted
237
+ ],
238
+ rubric: `Answer text itself or the key (A, B, C) is accepted
391
239
  Valid answer keys: ${testCase.correctAnswerKeys.map((key) => `- ${key}`).join("\n")}
392
240
  Valid Answer texts: ${testCase.correctAnswerKeys.map((key) => `- ${testCase.options?.[key] ?? ""}`).join("\n")}`,
393
- fieldsToExtract: {
394
- extractedAnswers: z4.string().array().describe(
395
- "The extracted answer keys, valid or invalid (even if the answer text is provided rather than the key)"
396
- ),
397
- ...runConfig.llmJudgeFieldsToExtract ?? {}
398
- },
399
- response: response.data,
400
- systemPrompt: runConfig.llmJudgeSystemPrompt?.content
401
- });
402
- if (scorerResult !== null) {
403
- const { extractedAnswers, ...extractedFields } = scorerResult.extractedFields;
404
- const score = await MCQScoreSchemaV1.newWithId(
405
- {
406
- scoringMethod: ScoringMethod.ai,
407
- value: scorerResult.value,
408
- extractedAnswers,
409
- responseId: response.id,
410
- explanation: scorerResult.explanation,
411
- scorerAIInputCost: scorerResult.inputCost,
412
- scorerAIOutputCost: scorerResult.outputCost,
413
- scorerAIInputTokensUsed: scorerResult.inputTokensUsed,
414
- scorerAIOutputTokensUsed: scorerResult.outputTokensUsed,
415
- scorerAIProvider: scorerResult.provider,
416
- scorerAIModelSlug: runConfig.llmJudgeModel,
417
- scorerAISystemPromptId: runConfig.llmJudgeSystemPrompt?.id,
418
- metadata: {
419
- ...scorerResult.metadata,
420
- extractedFields
421
- }
241
+ fieldsToExtract: {
242
+ extractedAnswers: z4.string().array().describe(
243
+ "The extracted answer keys, valid or invalid (even if the answer text is provided rather than the key)"
244
+ ),
245
+ ...params.llmJudgeFieldsToExtract ?? {}
422
246
  },
423
- params.idGenerators?.score ?? idGeneratorUUIDv7
424
- );
425
- return { response, score };
247
+ response: response.data,
248
+ systemPrompt: params.llmJudgeSystemPrompt?.content
249
+ });
250
+ if (scorerResult !== null) {
251
+ const { extractedAnswers, ...extractedFields } = scorerResult.extractedFields;
252
+ const score = await MCQScoreSchemaV1.newWithId(
253
+ {
254
+ scoringMethod: ScoringMethod.ai,
255
+ value: scorerResult.value,
256
+ extractedAnswers,
257
+ responseId: response.id,
258
+ explanation: scorerResult.explanation,
259
+ scorerAIInputCost: scorerResult.inputCost,
260
+ scorerAIOutputCost: scorerResult.outputCost,
261
+ scorerAIInputTokensUsed: scorerResult.inputTokensUsed,
262
+ scorerAIOutputTokensUsed: scorerResult.outputTokensUsed,
263
+ scorerAIProvider: scorerResult.provider,
264
+ scorerAIModelSlug: scorerResult.modelSlug,
265
+ scorerAISystemPromptId: params.llmJudgeSystemPrompt?.id,
266
+ metadata: {
267
+ ...scorerResult.metadata,
268
+ extractedFields
269
+ }
270
+ },
271
+ params.idGenerators?.score ?? idGeneratorUUIDv7
272
+ );
273
+ return { response, score };
274
+ }
426
275
  }
276
+ return { response };
427
277
  }
428
- return { response };
429
- }
278
+ );
430
279
  function formatMCQ(testCase) {
431
280
  return `Question: ${testCase.question}
432
281
  Options:
@@ -441,6 +290,89 @@ function templateMessages(messages, templateVariables) {
441
290
  }
442
291
  }
443
292
 
293
+ // src/benchmarks/peerbench/qa-runner.ts
294
+ import Handlebars2 from "handlebars";
295
+ var qaRunner = defineRunner(
296
+ async (params) => {
297
+ const { testCase, target, scorer } = params;
298
+ const messages = [];
299
+ if (params.systemPrompt) {
300
+ messages.push({
301
+ role: "system",
302
+ content: params.systemPrompt.content
303
+ });
304
+ }
305
+ messages.push({
306
+ role: "user",
307
+ content: testCase.question
308
+ });
309
+ templateMessages2(messages, params.templateVariables ?? {});
310
+ const providerResponse = await target.forward({ messages });
311
+ const response = await QAResponseSchemaV1.newWithId(
312
+ {
313
+ data: providerResponse.data,
314
+ startedAt: providerResponse.startedAt,
315
+ completedAt: providerResponse.completedAt,
316
+ testCaseId: testCase.id,
317
+ modelSlug: target.slug,
318
+ provider: target.provider.kind,
319
+ systemPromptId: params.systemPrompt?.id,
320
+ inputTokensUsed: providerResponse.inputTokensUsed,
321
+ outputTokensUsed: providerResponse.outputTokensUsed,
322
+ inputCost: providerResponse.inputCost,
323
+ outputCost: providerResponse.outputCost
324
+ },
325
+ params.idGenerators?.response ?? idGeneratorUUIDv7
326
+ );
327
+ if (scorer?.kind === `${PEERBENCH_NAMESPACE}/llm-as-a-judge`) {
328
+ const scorerResult = await scorer.score({
329
+ response: response.data,
330
+ rubric: `Expected/Valid answers: ${testCase.goodAnswers.join("\n")}
331
+ Invalid answers: ${testCase.badAnswers.join("\n")}`,
332
+ systemPrompt: params.llmJudgeSystemPrompt?.content,
333
+ criteria: [
334
+ {
335
+ id: "correctness",
336
+ description: "Is the response matches with the expected/valid answers in terms of meaning?",
337
+ weight: 1
338
+ }
339
+ ],
340
+ fieldsToExtract: params.llmJudgeFieldsToExtract ?? {}
341
+ });
342
+ if (scorerResult !== null) {
343
+ const score = await QAScoreSchemaV1.newWithId(
344
+ {
345
+ scoringMethod: ScoringMethod.ai,
346
+ value: scorerResult.value,
347
+ responseId: response.id,
348
+ explanation: scorerResult.explanation,
349
+ scorerAIInputCost: scorerResult.inputCost,
350
+ scorerAIOutputCost: scorerResult.outputCost,
351
+ scorerAIInputTokensUsed: scorerResult.inputTokensUsed,
352
+ scorerAIOutputTokensUsed: scorerResult.outputTokensUsed,
353
+ scorerAIProvider: scorerResult.provider,
354
+ scorerAIModelSlug: scorerResult.modelSlug,
355
+ scorerAISystemPromptId: params.llmJudgeSystemPrompt?.id,
356
+ metadata: {
357
+ ...scorerResult.metadata,
358
+ extractedFields: scorerResult.extractedFields
359
+ }
360
+ },
361
+ params.idGenerators?.score ?? idGeneratorUUIDv7
362
+ );
363
+ return { response, score };
364
+ }
365
+ }
366
+ return { response };
367
+ }
368
+ );
369
+ function templateMessages2(messages, templateVariables) {
370
+ for (let i = 0; i < messages.length; i++) {
371
+ const template = Handlebars2.compile(messages[i].content);
372
+ messages[i].content = template(templateVariables);
373
+ }
374
+ }
375
+
444
376
  // src/benchmarks/peerbench/storages/json.ts
445
377
  import z5 from "zod";
446
378
  var PeerbenchJSONStorage = class extends JSONFileStorage {
@@ -1 +1 @@
1
- {"version":3,"sources":["../../src/benchmarks/peerbench/index.ts","../../src/benchmarks/peerbench/schema-sets/mcq.v1.ts","../../src/benchmarks/peerbench/schema-sets/multi-turn.v1.ts","../../src/benchmarks/peerbench/schema-sets/qa.v1.ts","../../src/benchmarks/peerbench/runner.ts","../../src/benchmarks/peerbench/storages/json.ts"],"sourcesContent":["export * from \"./schema-sets/mcq.v1\";\nexport * from \"./schema-sets/multi-turn.v1\";\nexport * from \"./schema-sets/qa.v1\";\n\nexport * from \"./runner\";\n\nexport * from \"./storages/json\";\n","import { PEERBENCH_NAMESPACE } from \"@/constants\";\nimport {\n BaseResponseSchemaV1,\n BaseScoreSchemaV1,\n BaseTestCaseSchemaV1,\n defineResponseSchema,\n defineScoreSchema,\n defineTestCaseSchema,\n} from \"@/schemas\";\nimport { ExtensionLLMResponseFieldsV1 } from \"@/schemas/extensions/response/llm\";\nimport { ExtensionLLMAsAJudgeScoreFieldsV1 } from \"@/schemas/extensions/score/llm-as-a-judge-scorer\";\nimport { z } from \"zod\";\n\nexport const MCQKind = `llm/mcq` as const;\n\nexport const MCQTestCaseSchemaV1 = defineTestCaseSchema({\n baseSchema: BaseTestCaseSchemaV1,\n namespace: PEERBENCH_NAMESPACE,\n kind: MCQKind,\n schemaVersion: 1,\n fields: {\n question: z.string(),\n options: z.record(z.string(), z.string()),\n correctAnswerKeys: z.string().array(),\n },\n});\nexport type MCQTestCaseV1 = z.infer<typeof MCQTestCaseSchemaV1>;\n\nexport const MCQResponseSchemaV1 = defineResponseSchema({\n baseSchema: BaseResponseSchemaV1,\n namespace: PEERBENCH_NAMESPACE,\n kind: MCQKind,\n schemaVersion: 1,\n fields: {\n ...ExtensionLLMResponseFieldsV1,\n },\n});\nexport type MCQResponseV1 = z.infer<typeof MCQResponseSchemaV1>;\n\nexport const MCQScoreSchemaV1 = defineScoreSchema({\n baseSchema: BaseScoreSchemaV1,\n namespace: PEERBENCH_NAMESPACE,\n kind: MCQKind,\n schemaVersion: 1,\n fields: {\n ...ExtensionLLMAsAJudgeScoreFieldsV1,\n extractedAnswers: z.array(z.string()),\n },\n});\nexport type MCQScoreV1 = 
z.infer<typeof MCQScoreSchemaV1>;\n","import { PEERBENCH_NAMESPACE } from \"@/constants\";\nimport {\n BaseResponseSchemaV1,\n BaseScoreSchemaV1,\n BaseTestCaseSchemaV1,\n defineResponseSchema,\n defineScoreSchema,\n defineTestCaseSchema,\n} from \"@/schemas\";\nimport { ExtensionLLMResponseFieldsV1 } from \"@/schemas/extensions/response/llm\";\nimport { ExtensionLLMAsAJudgeScoreFieldsV1 } from \"@/schemas/extensions/score/llm-as-a-judge-scorer\";\nimport { z } from \"zod\";\n\nexport const MultiTurnKind = `llm/multi-turn` as const;\n\nexport const MultiTurnTestCaseSchemaV1 = defineTestCaseSchema({\n baseSchema: BaseTestCaseSchemaV1,\n namespace: PEERBENCH_NAMESPACE,\n kind: MultiTurnKind,\n schemaVersion: 1,\n fields: {\n messages: z\n .object({\n role: z.string(),\n content: z.string(),\n goodAnswers: z.string().array().optional(),\n badAnswers: z.string().array().optional(),\n })\n .array(),\n\n maxTurns: z.number().optional(),\n expectedOutcome: z.string().optional(),\n },\n});\nexport type MultiTurnTestCaseV1 = z.infer<typeof MultiTurnTestCaseSchemaV1>;\n\nexport const MultiTurnResponseSchemaV1 = defineResponseSchema({\n baseSchema: BaseResponseSchemaV1,\n namespace: PEERBENCH_NAMESPACE,\n kind: MultiTurnKind,\n schemaVersion: 1,\n fields: {\n ...ExtensionLLMResponseFieldsV1,\n replies: z\n .object({\n messageIndex: z.number(),\n startedAt: z.number(),\n completedAt: z.number(),\n data: z.string(),\n\n inputTokensUsed: z.number().optional(),\n outputTokensUsed: z.number().optional(),\n inputCost: z.string().optional(),\n outputCost: z.string().optional(),\n })\n .array(),\n },\n});\nexport type MultiTurnResponseV1 = z.infer<typeof MultiTurnResponseSchemaV1>;\n\nexport const MultiTurnScoreSchemaV1 = defineScoreSchema({\n baseSchema: BaseScoreSchemaV1,\n namespace: PEERBENCH_NAMESPACE,\n kind: MultiTurnKind,\n schemaVersion: 1,\n fields: {\n ...ExtensionLLMAsAJudgeScoreFieldsV1,\n individualScores: z\n .object({\n replyIndex: z.number(),\n value: z.number(),\n 
})\n .array(),\n },\n});\nexport type MultiTurnScoreV1 = z.infer<typeof MultiTurnScoreSchemaV1>;\n","import { PEERBENCH_NAMESPACE } from \"@/constants\";\nimport {\n BaseResponseSchemaV1,\n BaseScoreSchemaV1,\n BaseTestCaseSchemaV1,\n defineResponseSchema,\n defineScoreSchema,\n defineTestCaseSchema,\n} from \"@/schemas\";\nimport { ExtensionLLMResponseFieldsV1 } from \"@/schemas/extensions/response/llm\";\nimport { ExtensionLLMAsAJudgeScoreFieldsV1 } from \"@/schemas/extensions/score/llm-as-a-judge-scorer\";\nimport { z } from \"zod\";\n\nexport const QAKind = `llm/qa` as const;\n\nexport const QATestCaseSchemaV1 = defineTestCaseSchema({\n baseSchema: BaseTestCaseSchemaV1,\n namespace: PEERBENCH_NAMESPACE,\n kind: QAKind,\n schemaVersion: 1,\n fields: {\n question: z.string(),\n goodAnswers: z.string().array(),\n badAnswers: z.string().array(),\n },\n});\nexport type QATestCaseV1 = z.infer<typeof QATestCaseSchemaV1>;\n\nexport const QAResponseSchemaV1 = defineResponseSchema({\n baseSchema: BaseResponseSchemaV1,\n namespace: PEERBENCH_NAMESPACE,\n kind: QAKind,\n schemaVersion: 1,\n fields: {\n ...ExtensionLLMResponseFieldsV1,\n },\n});\nexport type QAResponseV1 = z.infer<typeof QAResponseSchemaV1>;\n\nexport const QAScoreSchemaV1 = defineScoreSchema({\n baseSchema: BaseScoreSchemaV1,\n namespace: PEERBENCH_NAMESPACE,\n kind: QAKind,\n schemaVersion: 1,\n fields: {\n ...ExtensionLLMAsAJudgeScoreFieldsV1,\n },\n});\nexport type QAScoreV1 = z.infer<typeof QAScoreSchemaV1>;\n","import { defineRunner } from \"@/helpers/define-runner\";\nimport { AbstractLLMProvider } from \"@/providers\";\nimport {\n SimpleSystemPromptSchemaV1,\n SimpleSystemPromptV1,\n} from \"@/schemas/llm\";\nimport { LLMAsAJudgeScorer, MCQScorer } from \"@/scorers\";\nimport { IdGenerator, ScoringMethod } from \"@/types\";\nimport { idGeneratorUUIDv7 } from \"@/utils\";\nimport { ChatCompletionMessageParam } from \"openai/resources/index\";\nimport Handlebars from \"handlebars\";\nimport z from 
\"zod\";\nimport {\n MCQResponseSchemaV1,\n MCQScoreSchemaV1,\n MCQTestCaseSchemaV1,\n MCQTestCaseV1,\n} from \"./schema-sets/mcq.v1\";\nimport {\n QAResponseSchemaV1,\n QAScoreSchemaV1,\n QATestCaseSchemaV1,\n QATestCaseV1,\n} from \"./schema-sets/qa.v1\";\nimport { PEERBENCH_NAMESPACE } from \"@/constants\";\n\nexport const peerbenchRunner = defineRunner(\n {\n schemaSets: [\n {\n testCase: MCQTestCaseSchemaV1,\n response: MCQResponseSchemaV1,\n score: MCQScoreSchemaV1,\n },\n {\n testCase: QATestCaseSchemaV1,\n response: QAResponseSchemaV1,\n score: QAScoreSchemaV1,\n },\n ],\n providers: [AbstractLLMProvider],\n scorers: [LLMAsAJudgeScorer, MCQScorer],\n\n runConfigSchema: {\n model: z.string(),\n llmJudgeModel: z.string().optional(),\n llmJudgeSystemPrompt: SimpleSystemPromptSchemaV1.optional(),\n llmJudgeFieldsToExtract: z\n .record(z.string(), z.custom<z.ZodType>())\n .optional(),\n systemPrompt: SimpleSystemPromptSchemaV1.optional(),\n templateVariables: z.record(z.string(), z.string()).optional(),\n },\n },\n async (params) => {\n const { testCase, provider, scorer, runConfig } = params;\n const messages: ChatCompletionMessageParam[] = [];\n\n if (runConfig.systemPrompt) {\n messages.push({\n role: \"system\",\n content: runConfig.systemPrompt.content,\n });\n }\n\n if (testCase.kind === \"llm/mcq.tc\") {\n messages.push({\n role: \"user\",\n content: formatMCQ(testCase),\n });\n templateMessages(messages, runConfig.templateVariables ?? {});\n\n return runMCQ({\n testCase,\n messages,\n provider,\n scorer,\n runConfig,\n idGenerators: {\n response: params.idGenerators?.response ?? idGeneratorUUIDv7,\n score: params.idGenerators?.score ?? 
idGeneratorUUIDv7,\n },\n });\n }\n\n if (testCase.kind === \"llm/qa.tc\") {\n if (\n scorer &&\n scorer?.kind !== (`${PEERBENCH_NAMESPACE}/llm-as-a-judge` as const)\n ) {\n throw new Error(\n `QA test cases can only be scored with an LLM as a judge scorer, but ${scorer?.kind} was provided`\n );\n }\n\n messages.push({\n role: \"user\",\n content: testCase.question,\n });\n templateMessages(messages, runConfig.templateVariables ?? {});\n\n return runQA({\n testCase,\n messages,\n provider,\n scorer,\n runConfig,\n idGenerators: {\n response: params.idGenerators?.response ?? idGeneratorUUIDv7,\n score: params.idGenerators?.score ?? idGeneratorUUIDv7,\n },\n });\n }\n\n throw new Error(\"Unsupported test case kind\");\n }\n);\n\nasync function runQA(params: {\n messages: ChatCompletionMessageParam[];\n testCase: QATestCaseV1;\n provider: AbstractLLMProvider;\n scorer?: LLMAsAJudgeScorer;\n runConfig: {\n model: string;\n llmJudgeModel?: string;\n llmJudgeSystemPrompt?: SimpleSystemPromptV1;\n llmJudgeFieldsToExtract?: Record<string, z.ZodType>;\n systemPrompt?: SimpleSystemPromptV1;\n };\n idGenerators: {\n response: IdGenerator;\n score: IdGenerator;\n };\n}) {\n const { messages, testCase, provider, scorer, runConfig } = params;\n\n const providerResponse = await provider.forward({\n model: runConfig.model,\n messages,\n });\n\n const response = await QAResponseSchemaV1.newWithId(\n {\n data: providerResponse.data,\n startedAt: providerResponse.startedAt,\n completedAt: providerResponse.completedAt,\n testCaseId: testCase.id,\n modelSlug: runConfig.model,\n provider: provider.kind,\n systemPromptId: runConfig.systemPrompt?.id,\n\n inputTokensUsed: providerResponse.inputTokensUsed,\n outputTokensUsed: providerResponse.outputTokensUsed,\n inputCost: providerResponse.inputCost,\n outputCost: providerResponse.outputCost,\n },\n params.idGenerators?.response ?? 
idGeneratorUUIDv7\n );\n\n if (scorer?.kind === (`${PEERBENCH_NAMESPACE}/llm-as-a-judge` as const)) {\n if (!runConfig.llmJudgeModel) {\n throw new Error(\n \"LLM judge model is required when using LLM as a judge scorer\"\n );\n }\n\n const scorerResult = await scorer.score({\n model: runConfig.llmJudgeModel,\n response: response.data,\n rubric: `Expected/Valid answers: ${testCase.goodAnswers.join(\"\\n\")}\\nInvalid answers: ${testCase.badAnswers.join(\"\\n\")}`,\n systemPrompt: runConfig.llmJudgeSystemPrompt?.content,\n criteria: [\n {\n id: \"correctness\",\n description:\n \"Is the response matches with the expected/valid answers in terms of meaning?\",\n weight: 1,\n },\n ],\n fieldsToExtract: runConfig.llmJudgeFieldsToExtract ?? {},\n });\n\n if (scorerResult !== null) {\n const score = await QAScoreSchemaV1.newWithId(\n {\n scoringMethod: ScoringMethod.ai,\n value: scorerResult.value,\n responseId: response.id,\n explanation: scorerResult.explanation,\n scorerAIInputCost: scorerResult.inputCost,\n scorerAIOutputCost: scorerResult.outputCost,\n scorerAIInputTokensUsed: scorerResult.inputTokensUsed,\n scorerAIOutputTokensUsed: scorerResult.outputTokensUsed,\n scorerAIProvider: scorerResult.provider,\n scorerAIModelSlug: runConfig.llmJudgeModel,\n scorerAISystemPromptId: runConfig.llmJudgeSystemPrompt?.id,\n metadata: {\n ...scorerResult.metadata,\n extractedFields: scorerResult.extractedFields,\n },\n },\n params.idGenerators?.score ?? 
idGeneratorUUIDv7\n );\n\n return { response, score };\n }\n }\n\n return { response };\n}\n\nasync function runMCQ(params: {\n messages: ChatCompletionMessageParam[];\n testCase: MCQTestCaseV1;\n provider: AbstractLLMProvider;\n scorer?: MCQScorer | LLMAsAJudgeScorer;\n runConfig: {\n model: string;\n llmJudgeModel?: string;\n llmJudgeSystemPrompt?: SimpleSystemPromptV1;\n llmJudgeFieldsToExtract?: Record<string, z.ZodType>;\n systemPrompt?: SimpleSystemPromptV1;\n };\n idGenerators: {\n response: IdGenerator;\n score: IdGenerator;\n };\n}) {\n const { messages, testCase, provider, scorer, runConfig } = params;\n\n const providerResponse = await provider.forward({\n model: runConfig.model,\n messages,\n });\n\n const response = await MCQResponseSchemaV1.newWithId(\n {\n data: providerResponse.data,\n startedAt: providerResponse.startedAt,\n completedAt: providerResponse.completedAt,\n testCaseId: testCase.id,\n modelSlug: runConfig.model,\n provider: provider.kind,\n systemPromptId: runConfig.systemPrompt?.id,\n\n inputTokensUsed: providerResponse.inputTokensUsed,\n outputTokensUsed: providerResponse.outputTokensUsed,\n inputCost: providerResponse.inputCost,\n outputCost: providerResponse.outputCost,\n },\n params.idGenerators?.response ?? idGeneratorUUIDv7\n );\n\n if (scorer?.kind === (`${PEERBENCH_NAMESPACE}/mcq` as const)) {\n const scorerResult = await scorer.score({\n response: response.data,\n choices: testCase.options,\n correctAnswers: testCase.correctAnswerKeys,\n });\n\n if (scorerResult !== null) {\n const score = await MCQScoreSchemaV1.newWithId(\n {\n scoringMethod: ScoringMethod.algo,\n value: scorerResult.value,\n responseId: response.id,\n extractedAnswers: scorerResult.extractedAnswers,\n explanation: scorerResult.explanation,\n metadata: scorerResult.metadata,\n },\n params.idGenerators?.score ?? 
idGeneratorUUIDv7\n );\n\n return { response, score };\n }\n }\n\n if (scorer?.kind === (`${PEERBENCH_NAMESPACE}/llm-as-a-judge` as const)) {\n if (!runConfig.llmJudgeModel) {\n throw new Error(\n \"LLM judge model is required when using LLM as a judge scorer\"\n );\n }\n\n const scorerResult = await scorer.score({\n model: runConfig.llmJudgeModel,\n criteria: [\n {\n id: \"correctness\",\n description:\n \"Is the given answer key matches with one of the correct answer keys?\",\n weight: 1,\n },\n ],\n rubric: `Answer text itself or the key (A, B, C) is accepted\nValid answer keys: ${testCase.correctAnswerKeys.map((key) => `- ${key}`).join(\"\\n\")}\nValid Answer texts: ${testCase.correctAnswerKeys.map((key) => `- ${testCase.options?.[key] ?? \"\"}`).join(\"\\n\")}`,\n fieldsToExtract: {\n extractedAnswers: z\n .string()\n .array()\n .describe(\n \"The extracted answer keys, valid or invalid (even if the answer text is provided rather than the key)\"\n ),\n ...(runConfig.llmJudgeFieldsToExtract ?? {}),\n },\n response: response.data,\n systemPrompt: runConfig.llmJudgeSystemPrompt?.content,\n });\n\n if (scorerResult !== null) {\n const { extractedAnswers, ...extractedFields } =\n scorerResult.extractedFields;\n const score = await MCQScoreSchemaV1.newWithId(\n {\n scoringMethod: ScoringMethod.ai,\n value: scorerResult.value,\n extractedAnswers,\n responseId: response.id,\n explanation: scorerResult.explanation,\n scorerAIInputCost: scorerResult.inputCost,\n scorerAIOutputCost: scorerResult.outputCost,\n scorerAIInputTokensUsed: scorerResult.inputTokensUsed,\n scorerAIOutputTokensUsed: scorerResult.outputTokensUsed,\n scorerAIProvider: scorerResult.provider,\n scorerAIModelSlug: runConfig.llmJudgeModel,\n scorerAISystemPromptId: runConfig.llmJudgeSystemPrompt?.id,\n metadata: {\n ...scorerResult.metadata,\n extractedFields,\n },\n },\n params.idGenerators?.score ?? 
idGeneratorUUIDv7\n );\n\n return { response, score };\n }\n }\n\n return { response };\n}\n\nfunction formatMCQ(testCase: MCQTestCaseV1) {\n return `Question: ${testCase.question}\\nOptions:\\n${Object.entries(\n testCase.options ?? {}\n )\n .map(([key, value]) => `${key}: ${value}`)\n .join(\"\\n\")}`;\n}\n\nfunction templateMessages(\n messages: ChatCompletionMessageParam[],\n templateVariables: Record<string, string>\n) {\n for (let i = 0; i < messages.length; i++) {\n const template = Handlebars.compile(messages[i]!.content);\n messages[i]!.content = template(templateVariables);\n }\n}\n","import { JSONFileStorage } from \"@/storages/json-file\";\nimport {\n MCQResponseSchemaV1,\n MCQResponseV1,\n MCQScoreSchemaV1,\n MCQScoreV1,\n MCQTestCaseSchemaV1,\n MCQTestCaseV1,\n} from \"../schema-sets/mcq.v1\";\nimport {\n QAResponseSchemaV1,\n QAResponseV1,\n QAScoreSchemaV1,\n QAScoreV1,\n QATestCaseSchemaV1,\n QATestCaseV1,\n} from \"../schema-sets/qa.v1\";\nimport {\n MultiTurnResponseSchemaV1,\n MultiTurnResponseV1,\n MultiTurnScoreSchemaV1,\n MultiTurnScoreV1,\n MultiTurnTestCaseSchemaV1,\n MultiTurnTestCaseV1,\n} from \"../schema-sets/multi-turn.v1\";\nimport z from \"zod\";\n\nexport class PeerbenchJSONStorage extends JSONFileStorage<\n | MCQTestCaseV1\n | MCQResponseV1\n | MCQScoreV1\n | QATestCaseV1\n | QAResponseV1\n | QAScoreV1\n | MultiTurnTestCaseV1\n | MultiTurnResponseV1\n | MultiTurnScoreV1\n> {\n constructor(config: { path: string; chunkSize?: number }) {\n super({\n path: config.path,\n chunkSize: config.chunkSize,\n\n schema: z.union([\n MCQTestCaseSchemaV1,\n MCQResponseSchemaV1,\n MCQScoreSchemaV1,\n QATestCaseSchemaV1,\n QAResponseSchemaV1,\n QAScoreSchemaV1,\n MultiTurnTestCaseSchemaV1,\n MultiTurnResponseSchemaV1,\n MultiTurnScoreSchemaV1,\n ]),\n });\n 
}\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACWA,SAAS,SAAS;AAEX,IAAM,UAAU;AAEhB,IAAM,sBAAsB,qBAAqB;AAAA,EACtD,YAAY;AAAA,EACZ,WAAW;AAAA,EACX,MAAM;AAAA,EACN,eAAe;AAAA,EACf,QAAQ;AAAA,IACN,UAAU,EAAE,OAAO;AAAA,IACnB,SAAS,EAAE,OAAO,EAAE,OAAO,GAAG,EAAE,OAAO,CAAC;AAAA,IACxC,mBAAmB,EAAE,OAAO,EAAE,MAAM;AAAA,EACtC;AACF,CAAC;AAGM,IAAM,sBAAsB,qBAAqB;AAAA,EACtD,YAAY;AAAA,EACZ,WAAW;AAAA,EACX,MAAM;AAAA,EACN,eAAe;AAAA,EACf,QAAQ;AAAA,IACN,GAAG;AAAA,EACL;AACF,CAAC;AAGM,IAAM,mBAAmB,kBAAkB;AAAA,EAChD,YAAY;AAAA,EACZ,WAAW;AAAA,EACX,MAAM;AAAA,EACN,eAAe;AAAA,EACf,QAAQ;AAAA,IACN,GAAG;AAAA,IACH,kBAAkB,EAAE,MAAM,EAAE,OAAO,CAAC;AAAA,EACtC;AACF,CAAC;;;ACrCD,SAAS,KAAAA,UAAS;AAEX,IAAM,gBAAgB;AAEtB,IAAM,4BAA4B,qBAAqB;AAAA,EAC5D,YAAY;AAAA,EACZ,WAAW;AAAA,EACX,MAAM;AAAA,EACN,eAAe;AAAA,EACf,QAAQ;AAAA,IACN,UAAUA,GACP,OAAO;AAAA,MACN,MAAMA,GAAE,OAAO;AAAA,MACf,SAASA,GAAE,OAAO;AAAA,MAClB,aAAaA,GAAE,OAAO,EAAE,MAAM,EAAE,SAAS;AAAA,MACzC,YAAYA,GAAE,OAAO,EAAE,MAAM,EAAE,SAAS;AAAA,IAC1C,CAAC,EACA,MAAM;AAAA,IAET,UAAUA,GAAE,OAAO,EAAE,SAAS;AAAA,IAC9B,iBAAiBA,GAAE,OAAO,EAAE,SAAS;AAAA,EACvC;AACF,CAAC;AAGM,IAAM,4BAA4B,qBAAqB;AAAA,EAC5D,YAAY;AAAA,EACZ,WAAW;AAAA,EACX,MAAM;AAAA,EACN,eAAe;AAAA,EACf,QAAQ;AAAA,IACN,GAAG;AAAA,IACH,SAASA,GACN,OAAO;AAAA,MACN,cAAcA,GAAE,OAAO;AAAA,MACvB,WAAWA,GAAE,OAAO;AAAA,MACpB,aAAaA,GAAE,OAAO;AAAA,MACtB,MAAMA,GAAE,OAAO;AAAA,MAEf,iBAAiBA,GAAE,OAAO,EAAE,SAAS;AAAA,MACrC,kBAAkBA,GAAE,OAAO,EAAE,SAAS;AAAA,MACtC,WAAWA,GAAE,OAAO,EAAE,SAAS;AAAA,MAC/B,YAAYA,GAAE,OAAO,EAAE,SAAS;AAAA,IAClC,CAAC,EACA,MAAM;AAAA,EACX;AACF,CAAC;AAGM,IAAM,yBAAyB,kBAAkB;AAAA,EACtD,YAAY;AAAA,EACZ,WAAW;AAAA,EACX,MAAM;AAAA,EACN,eAAe;AAAA,EACf,QAAQ;AAAA,IACN,GAAG;AAAA,IACH,kBAAkBA,GACf,OAAO;AAAA,MACN,YAAYA,GAAE,OAAO;AAAA,MACrB,OAAOA,GAAE,OAAO;AAAA,IAClB,CAAC,EACA,MAAM;AAAA,EACX;AACF,CAAC;;;AC/DD,SAAS,KAAAC,UAAS;AAEX,IAAM,SAAS;AAEf,IAAM,qBAAqB,qBAAqB;AAAA,EACrD,YAAY;AAAA,EACZ,WAAW;AAAA,EACX,MAAM;AAAA,EACN,eAAe;AAAA,EACf,QAAQ;AA
AA,IACN,UAAUA,GAAE,OAAO;AAAA,IACnB,aAAaA,GAAE,OAAO,EAAE,MAAM;AAAA,IAC9B,YAAYA,GAAE,OAAO,EAAE,MAAM;AAAA,EAC/B;AACF,CAAC;AAGM,IAAM,qBAAqB,qBAAqB;AAAA,EACrD,YAAY;AAAA,EACZ,WAAW;AAAA,EACX,MAAM;AAAA,EACN,eAAe;AAAA,EACf,QAAQ;AAAA,IACN,GAAG;AAAA,EACL;AACF,CAAC;AAGM,IAAM,kBAAkB,kBAAkB;AAAA,EAC/C,YAAY;AAAA,EACZ,WAAW;AAAA,EACX,MAAM;AAAA,EACN,eAAe;AAAA,EACf,QAAQ;AAAA,IACN,GAAG;AAAA,EACL;AACF,CAAC;;;ACrCD,OAAO,gBAAgB;AACvB,OAAOC,QAAO;AAeP,IAAM,kBAAkB;AAAA,EAC7B;AAAA,IACE,YAAY;AAAA,MACV;AAAA,QACE,UAAU;AAAA,QACV,UAAU;AAAA,QACV,OAAO;AAAA,MACT;AAAA,MACA;AAAA,QACE,UAAU;AAAA,QACV,UAAU;AAAA,QACV,OAAO;AAAA,MACT;AAAA,IACF;AAAA,IACA,WAAW,CAAC,mBAAmB;AAAA,IAC/B,SAAS,CAAC,mBAAmB,SAAS;AAAA,IAEtC,iBAAiB;AAAA,MACf,OAAOC,GAAE,OAAO;AAAA,MAChB,eAAeA,GAAE,OAAO,EAAE,SAAS;AAAA,MACnC,sBAAsB,2BAA2B,SAAS;AAAA,MAC1D,yBAAyBA,GACtB,OAAOA,GAAE,OAAO,GAAGA,GAAE,OAAkB,CAAC,EACxC,SAAS;AAAA,MACZ,cAAc,2BAA2B,SAAS;AAAA,MAClD,mBAAmBA,GAAE,OAAOA,GAAE,OAAO,GAAGA,GAAE,OAAO,CAAC,EAAE,SAAS;AAAA,IAC/D;AAAA,EACF;AAAA,EACA,OAAO,WAAW;AAChB,UAAM,EAAE,UAAU,UAAU,QAAQ,UAAU,IAAI;AAClD,UAAM,WAAyC,CAAC;AAEhD,QAAI,UAAU,cAAc;AAC1B,eAAS,KAAK;AAAA,QACZ,MAAM;AAAA,QACN,SAAS,UAAU,aAAa;AAAA,MAClC,CAAC;AAAA,IACH;AAEA,QAAI,SAAS,SAAS,cAAc;AAClC,eAAS,KAAK;AAAA,QACZ,MAAM;AAAA,QACN,SAAS,UAAU,QAAQ;AAAA,MAC7B,CAAC;AACD,uBAAiB,UAAU,UAAU,qBAAqB,CAAC,CAAC;AAE5D,aAAO,OAAO;AAAA,QACZ;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA,cAAc;AAAA,UACZ,UAAU,OAAO,cAAc,YAAY;AAAA,UAC3C,OAAO,OAAO,cAAc,SAAS;AAAA,QACvC;AAAA,MACF,CAAC;AAAA,IACH;AAEA,QAAI,SAAS,SAAS,aAAa;AACjC,UACE,UACA,QAAQ,SAAU,GAAG,mBAAmB,mBACxC;AACA,cAAM,IAAI;AAAA,UACR,uEAAuE,QAAQ,IAAI;AAAA,QACrF;AAAA,MACF;AAEA,eAAS,KAAK;AAAA,QACZ,MAAM;AAAA,QACN,SAAS,SAAS;AAAA,MACpB,CAAC;AACD,uBAAiB,UAAU,UAAU,qBAAqB,CAAC,CAAC;AAE5D,aAAO,MAAM;AAAA,QACX;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA,cAAc;AAAA,UACZ,UAAU,OAAO,cAAc,YAAY;AAAA,UAC3C,OAAO,OAAO,cAAc,SAAS;AAAA,QACvC;AAAA,MACF,CAAC;AAAA,IACH;AAEA,UAAM,IAAI,MAAM,4BAA4B;AAAA,EAC9C;AACF;AAEA,eAAe,MAAM,QAgBlB;AACD,QAAM,EAAE,UAAU,UAAU,UAAU,QAAQ,UAAU,IAAI;A
AE5D,QAAM,mBAAmB,MAAM,SAAS,QAAQ;AAAA,IAC9C,OAAO,UAAU;AAAA,IACjB;AAAA,EACF,CAAC;AAED,QAAM,WAAW,MAAM,mBAAmB;AAAA,IACxC;AAAA,MACE,MAAM,iBAAiB;AAAA,MACvB,WAAW,iBAAiB;AAAA,MAC5B,aAAa,iBAAiB;AAAA,MAC9B,YAAY,SAAS;AAAA,MACrB,WAAW,UAAU;AAAA,MACrB,UAAU,SAAS;AAAA,MACnB,gBAAgB,UAAU,cAAc;AAAA,MAExC,iBAAiB,iBAAiB;AAAA,MAClC,kBAAkB,iBAAiB;AAAA,MACnC,WAAW,iBAAiB;AAAA,MAC5B,YAAY,iBAAiB;AAAA,IAC/B;AAAA,IACA,OAAO,cAAc,YAAY;AAAA,EACnC;AAEA,MAAI,QAAQ,SAAU,GAAG,mBAAmB,mBAA6B;AACvE,QAAI,CAAC,UAAU,eAAe;AAC5B,YAAM,IAAI;AAAA,QACR;AAAA,MACF;AAAA,IACF;AAEA,UAAM,eAAe,MAAM,OAAO,MAAM;AAAA,MACtC,OAAO,UAAU;AAAA,MACjB,UAAU,SAAS;AAAA,MACnB,QAAQ,2BAA2B,SAAS,YAAY,KAAK,IAAI,CAAC;AAAA,mBAAsB,SAAS,WAAW,KAAK,IAAI,CAAC;AAAA,MACtH,cAAc,UAAU,sBAAsB;AAAA,MAC9C,UAAU;AAAA,QACR;AAAA,UACE,IAAI;AAAA,UACJ,aACE;AAAA,UACF,QAAQ;AAAA,QACV;AAAA,MACF;AAAA,MACA,iBAAiB,UAAU,2BAA2B,CAAC;AAAA,IACzD,CAAC;AAED,QAAI,iBAAiB,MAAM;AACzB,YAAM,QAAQ,MAAM,gBAAgB;AAAA,QAClC;AAAA,UACE,eAAe,cAAc;AAAA,UAC7B,OAAO,aAAa;AAAA,UACpB,YAAY,SAAS;AAAA,UACrB,aAAa,aAAa;AAAA,UAC1B,mBAAmB,aAAa;AAAA,UAChC,oBAAoB,aAAa;AAAA,UACjC,yBAAyB,aAAa;AAAA,UACtC,0BAA0B,aAAa;AAAA,UACvC,kBAAkB,aAAa;AAAA,UAC/B,mBAAmB,UAAU;AAAA,UAC7B,wBAAwB,UAAU,sBAAsB;AAAA,UACxD,UAAU;AAAA,YACR,GAAG,aAAa;AAAA,YAChB,iBAAiB,aAAa;AAAA,UAChC;AAAA,QACF;AAAA,QACA,OAAO,cAAc,SAAS;AAAA,MAChC;AAEA,aAAO,EAAE,UAAU,MAAM;AAAA,IAC3B;AAAA,EACF;AAEA,SAAO,EAAE,SAAS;AACpB;AAEA,eAAe,OAAO,QAgBnB;AACD,QAAM,EAAE,UAAU,UAAU,UAAU,QAAQ,UAAU,IAAI;AAE5D,QAAM,mBAAmB,MAAM,SAAS,QAAQ;AAAA,IAC9C,OAAO,UAAU;AAAA,IACjB;AAAA,EACF,CAAC;AAED,QAAM,WAAW,MAAM,oBAAoB;AAAA,IACzC;AAAA,MACE,MAAM,iBAAiB;AAAA,MACvB,WAAW,iBAAiB;AAAA,MAC5B,aAAa,iBAAiB;AAAA,MAC9B,YAAY,SAAS;AAAA,MACrB,WAAW,UAAU;AAAA,MACrB,UAAU,SAAS;AAAA,MACnB,gBAAgB,UAAU,cAAc;AAAA,MAExC,iBAAiB,iBAAiB;AAAA,MAClC,kBAAkB,iBAAiB;AAAA,MACnC,WAAW,iBAAiB;AAAA,MAC5B,YAAY,iBAAiB;AAAA,IAC/B;AAAA,IACA,OAAO,cAAc,YAAY;AAAA,EACnC;AAEA,MAAI,QAAQ,SAAU,GAAG,mBAAmB,QAAkB;AAC5D,UAAM,eAAe,MAAM,OAAO,MAAM;AAAA,MACtC,UAAU,SAAS;AAAA,MACnB,SAAS,SAAS;AAAA,MAClB,gBAAgB,SAAS;AAAA,IAC3B,CAAC;AAED,QA
AI,iBAAiB,MAAM;AACzB,YAAM,QAAQ,MAAM,iBAAiB;AAAA,QACnC;AAAA,UACE,eAAe,cAAc;AAAA,UAC7B,OAAO,aAAa;AAAA,UACpB,YAAY,SAAS;AAAA,UACrB,kBAAkB,aAAa;AAAA,UAC/B,aAAa,aAAa;AAAA,UAC1B,UAAU,aAAa;AAAA,QACzB;AAAA,QACA,OAAO,cAAc,SAAS;AAAA,MAChC;AAEA,aAAO,EAAE,UAAU,MAAM;AAAA,IAC3B;AAAA,EACF;AAEA,MAAI,QAAQ,SAAU,GAAG,mBAAmB,mBAA6B;AACvE,QAAI,CAAC,UAAU,eAAe;AAC5B,YAAM,IAAI;AAAA,QACR;AAAA,MACF;AAAA,IACF;AAEA,UAAM,eAAe,MAAM,OAAO,MAAM;AAAA,MACtC,OAAO,UAAU;AAAA,MACjB,UAAU;AAAA,QACR;AAAA,UACE,IAAI;AAAA,UACJ,aACE;AAAA,UACF,QAAQ;AAAA,QACV;AAAA,MACF;AAAA,MACA,QAAQ;AAAA,qBACO,SAAS,kBAAkB,IAAI,CAAC,QAAQ,KAAK,GAAG,EAAE,EAAE,KAAK,IAAI,CAAC;AAAA,sBAC7D,SAAS,kBAAkB,IAAI,CAAC,QAAQ,KAAK,SAAS,UAAU,GAAG,KAAK,EAAE,EAAE,EAAE,KAAK,IAAI,CAAC;AAAA,MACxG,iBAAiB;AAAA,QACf,kBAAkBA,GACf,OAAO,EACP,MAAM,EACN;AAAA,UACC;AAAA,QACF;AAAA,QACF,GAAI,UAAU,2BAA2B,CAAC;AAAA,MAC5C;AAAA,MACA,UAAU,SAAS;AAAA,MACnB,cAAc,UAAU,sBAAsB;AAAA,IAChD,CAAC;AAED,QAAI,iBAAiB,MAAM;AACzB,YAAM,EAAE,kBAAkB,GAAG,gBAAgB,IAC3C,aAAa;AACf,YAAM,QAAQ,MAAM,iBAAiB;AAAA,QACnC;AAAA,UACE,eAAe,cAAc;AAAA,UAC7B,OAAO,aAAa;AAAA,UACpB;AAAA,UACA,YAAY,SAAS;AAAA,UACrB,aAAa,aAAa;AAAA,UAC1B,mBAAmB,aAAa;AAAA,UAChC,oBAAoB,aAAa;AAAA,UACjC,yBAAyB,aAAa;AAAA,UACtC,0BAA0B,aAAa;AAAA,UACvC,kBAAkB,aAAa;AAAA,UAC/B,mBAAmB,UAAU;AAAA,UAC7B,wBAAwB,UAAU,sBAAsB;AAAA,UACxD,UAAU;AAAA,YACR,GAAG,aAAa;AAAA,YAChB;AAAA,UACF;AAAA,QACF;AAAA,QACA,OAAO,cAAc,SAAS;AAAA,MAChC;AAEA,aAAO,EAAE,UAAU,MAAM;AAAA,IAC3B;AAAA,EACF;AAEA,SAAO,EAAE,SAAS;AACpB;AAEA,SAAS,UAAU,UAAyB;AAC1C,SAAO,aAAa,SAAS,QAAQ;AAAA;AAAA,EAAe,OAAO;AAAA,IACzD,SAAS,WAAW,CAAC;AAAA,EACvB,EACG,IAAI,CAAC,CAAC,KAAK,KAAK,MAAM,GAAG,GAAG,KAAK,KAAK,EAAE,EACxC,KAAK,IAAI,CAAC;AACf;AAEA,SAAS,iBACP,UACA,mBACA;AACA,WAAS,IAAI,GAAG,IAAI,SAAS,QAAQ,KAAK;AACxC,UAAM,WAAW,WAAW,QAAQ,SAAS,CAAC,EAAG,OAAO;AACxD,aAAS,CAAC,EAAG,UAAU,SAAS,iBAAiB;AAAA,EACnD;AACF;;;AC9UA,OAAOC,QAAO;AAEP,IAAM,uBAAN,cAAmC,gBAUxC;AAAA,EACA,YAAY,QAA8C;AACxD,UAAM;AAAA,MACJ,MAAM,OAAO;AAAA,MACb,WAAW,OAAO;AAAA,MAElB,QAAQA,GAAE,MAAM;AAAA,QACd;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QA
CA;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,MACF,CAAC;AAAA,IACH,CAAC;AAAA,EACH;AACF;","names":["z","z","z","z","z"]}
1
+ {"version":3,"sources":["../../src/benchmarks/peerbench/index.ts","../../src/benchmarks/peerbench/schema-sets/mcq.v1.ts","../../src/benchmarks/peerbench/schema-sets/multi-turn.v1.ts","../../src/benchmarks/peerbench/schema-sets/qa.v1.ts","../../src/benchmarks/peerbench/mcq-runner.ts","../../src/benchmarks/peerbench/qa-runner.ts","../../src/benchmarks/peerbench/storages/json.ts"],"sourcesContent":["export * from \"./schema-sets/mcq.v1\";\nexport * from \"./schema-sets/multi-turn.v1\";\nexport * from \"./schema-sets/qa.v1\";\n\nexport * from \"./mcq-runner\";\nexport * from \"./qa-runner\";\n\nexport * from \"./storages/json\";\n","import { PEERBENCH_NAMESPACE } from \"@/constants\";\nimport {\n BaseResponseSchemaV1,\n BaseScoreSchemaV1,\n BaseTestCaseSchemaV1,\n defineResponseSchema,\n defineScoreSchema,\n defineTestCaseSchema,\n} from \"@/schemas\";\nimport { ExtensionLLMResponseFieldsV1 } from \"@/schemas/extensions/response/llm\";\nimport { ExtensionLLMAsAJudgeScoreFieldsV1 } from \"@/schemas/extensions/score/llm-as-a-judge-scorer\";\nimport { z } from \"zod\";\n\nexport const MCQKind = `llm/mcq` as const;\n\nexport const MCQTestCaseSchemaV1 = defineTestCaseSchema({\n baseSchema: BaseTestCaseSchemaV1,\n namespace: PEERBENCH_NAMESPACE,\n kind: MCQKind,\n schemaVersion: 1,\n fields: {\n question: z.string(),\n options: z.record(z.string(), z.string()),\n correctAnswerKeys: z.string().array(),\n },\n});\nexport type MCQTestCaseV1 = z.infer<typeof MCQTestCaseSchemaV1>;\n\nexport const MCQResponseSchemaV1 = defineResponseSchema({\n baseSchema: BaseResponseSchemaV1,\n namespace: PEERBENCH_NAMESPACE,\n kind: MCQKind,\n schemaVersion: 1,\n fields: {\n ...ExtensionLLMResponseFieldsV1,\n },\n});\nexport type MCQResponseV1 = z.infer<typeof MCQResponseSchemaV1>;\n\nexport const MCQScoreSchemaV1 = defineScoreSchema({\n baseSchema: BaseScoreSchemaV1,\n namespace: PEERBENCH_NAMESPACE,\n kind: MCQKind,\n schemaVersion: 1,\n fields: {\n ...ExtensionLLMAsAJudgeScoreFieldsV1,\n 
extractedAnswers: z.array(z.string()),\n },\n});\nexport type MCQScoreV1 = z.infer<typeof MCQScoreSchemaV1>;\n","import { PEERBENCH_NAMESPACE } from \"@/constants\";\nimport {\n BaseResponseSchemaV1,\n BaseScoreSchemaV1,\n BaseTestCaseSchemaV1,\n defineResponseSchema,\n defineScoreSchema,\n defineTestCaseSchema,\n} from \"@/schemas\";\nimport { ExtensionLLMResponseFieldsV1 } from \"@/schemas/extensions/response/llm\";\nimport { ExtensionLLMAsAJudgeScoreFieldsV1 } from \"@/schemas/extensions/score/llm-as-a-judge-scorer\";\nimport { z } from \"zod\";\n\nexport const MultiTurnKind = `llm/multi-turn` as const;\n\nexport const MultiTurnTestCaseSchemaV1 = defineTestCaseSchema({\n baseSchema: BaseTestCaseSchemaV1,\n namespace: PEERBENCH_NAMESPACE,\n kind: MultiTurnKind,\n schemaVersion: 1,\n fields: {\n messages: z\n .object({\n role: z.string(),\n content: z.string(),\n goodAnswers: z.string().array().optional(),\n badAnswers: z.string().array().optional(),\n })\n .array(),\n\n maxTurns: z.number().optional(),\n expectedOutcome: z.string().optional(),\n },\n});\nexport type MultiTurnTestCaseV1 = z.infer<typeof MultiTurnTestCaseSchemaV1>;\n\nexport const MultiTurnResponseSchemaV1 = defineResponseSchema({\n baseSchema: BaseResponseSchemaV1,\n namespace: PEERBENCH_NAMESPACE,\n kind: MultiTurnKind,\n schemaVersion: 1,\n fields: {\n ...ExtensionLLMResponseFieldsV1,\n replies: z\n .object({\n messageIndex: z.number(),\n startedAt: z.number(),\n completedAt: z.number(),\n data: z.string(),\n\n inputTokensUsed: z.number().optional(),\n outputTokensUsed: z.number().optional(),\n inputCost: z.string().optional(),\n outputCost: z.string().optional(),\n })\n .array(),\n },\n});\nexport type MultiTurnResponseV1 = z.infer<typeof MultiTurnResponseSchemaV1>;\n\nexport const MultiTurnScoreSchemaV1 = defineScoreSchema({\n baseSchema: BaseScoreSchemaV1,\n namespace: PEERBENCH_NAMESPACE,\n kind: MultiTurnKind,\n schemaVersion: 1,\n fields: {\n ...ExtensionLLMAsAJudgeScoreFieldsV1,\n 
individualScores: z\n .object({\n replyIndex: z.number(),\n value: z.number(),\n })\n .array(),\n },\n});\nexport type MultiTurnScoreV1 = z.infer<typeof MultiTurnScoreSchemaV1>;\n","import { PEERBENCH_NAMESPACE } from \"@/constants\";\nimport {\n BaseResponseSchemaV1,\n BaseScoreSchemaV1,\n BaseTestCaseSchemaV1,\n defineResponseSchema,\n defineScoreSchema,\n defineTestCaseSchema,\n} from \"@/schemas\";\nimport { ExtensionLLMResponseFieldsV1 } from \"@/schemas/extensions/response/llm\";\nimport { ExtensionLLMAsAJudgeScoreFieldsV1 } from \"@/schemas/extensions/score/llm-as-a-judge-scorer\";\nimport { z } from \"zod\";\n\nexport const QAKind = `llm/qa` as const;\n\nexport const QATestCaseSchemaV1 = defineTestCaseSchema({\n baseSchema: BaseTestCaseSchemaV1,\n namespace: PEERBENCH_NAMESPACE,\n kind: QAKind,\n schemaVersion: 1,\n fields: {\n question: z.string(),\n goodAnswers: z.string().array(),\n badAnswers: z.string().array(),\n },\n});\nexport type QATestCaseV1 = z.infer<typeof QATestCaseSchemaV1>;\n\nexport const QAResponseSchemaV1 = defineResponseSchema({\n baseSchema: BaseResponseSchemaV1,\n namespace: PEERBENCH_NAMESPACE,\n kind: QAKind,\n schemaVersion: 1,\n fields: {\n ...ExtensionLLMResponseFieldsV1,\n },\n});\nexport type QAResponseV1 = z.infer<typeof QAResponseSchemaV1>;\n\nexport const QAScoreSchemaV1 = defineScoreSchema({\n baseSchema: BaseScoreSchemaV1,\n namespace: PEERBENCH_NAMESPACE,\n kind: QAKind,\n schemaVersion: 1,\n fields: {\n ...ExtensionLLMAsAJudgeScoreFieldsV1,\n },\n});\nexport type QAScoreV1 = z.infer<typeof QAScoreSchemaV1>;\n","import { defineRunner } from \"@/helpers/define-runner\";\nimport { CallableLLM } from \"@/providers\";\nimport { SimpleSystemPromptV1 } from \"@/schemas/llm\";\nimport { LLMAsAJudgeScorer, MCQScorer } from \"@/scorers\";\nimport { IdGenerator, ScoringMethod } from \"@/types\";\nimport { idGeneratorUUIDv7 } from \"@/utils\";\nimport { ChatCompletionMessageParam } from \"openai/resources/index\";\nimport Handlebars 
from \"handlebars\";\nimport z from \"zod\";\nimport {\n MCQResponseSchemaV1,\n MCQScoreSchemaV1,\n MCQTestCaseV1,\n} from \"./schema-sets/mcq.v1\";\nimport { PEERBENCH_NAMESPACE } from \"@/constants\";\n\nexport const mcqRunner = defineRunner(\n async (params: {\n testCase: MCQTestCaseV1;\n target: CallableLLM;\n scorer?: MCQScorer | LLMAsAJudgeScorer;\n systemPrompt?: SimpleSystemPromptV1;\n llmJudgeSystemPrompt?: SimpleSystemPromptV1;\n llmJudgeFieldsToExtract?: Record<string, z.ZodType>;\n templateVariables?: Record<string, string>;\n idGenerators?: {\n response?: IdGenerator;\n score?: IdGenerator;\n };\n }) => {\n const { testCase, target, scorer } = params;\n const messages: ChatCompletionMessageParam[] = [];\n\n if (params.systemPrompt) {\n messages.push({\n role: \"system\",\n content: params.systemPrompt.content,\n });\n }\n\n messages.push({\n role: \"user\",\n content: formatMCQ(testCase),\n });\n templateMessages(messages, params.templateVariables ?? {});\n\n const providerResponse = await target.forward({ messages });\n\n const response = await MCQResponseSchemaV1.newWithId(\n {\n data: providerResponse.data,\n startedAt: providerResponse.startedAt,\n completedAt: providerResponse.completedAt,\n testCaseId: testCase.id,\n modelSlug: target.slug,\n provider: target.provider.kind,\n systemPromptId: params.systemPrompt?.id,\n inputTokensUsed: providerResponse.inputTokensUsed,\n outputTokensUsed: providerResponse.outputTokensUsed,\n inputCost: providerResponse.inputCost,\n outputCost: providerResponse.outputCost,\n },\n params.idGenerators?.response ?? 
idGeneratorUUIDv7\n );\n\n if (scorer?.kind === (`${PEERBENCH_NAMESPACE}/mcq` as const)) {\n const scorerResult = await scorer.score({\n response: response.data,\n choices: testCase.options,\n correctAnswers: testCase.correctAnswerKeys,\n });\n\n if (scorerResult !== null) {\n const score = await MCQScoreSchemaV1.newWithId(\n {\n scoringMethod: ScoringMethod.algo,\n value: scorerResult.value,\n responseId: response.id,\n extractedAnswers: scorerResult.extractedAnswers,\n explanation: scorerResult.explanation,\n metadata: scorerResult.metadata,\n },\n params.idGenerators?.score ?? idGeneratorUUIDv7\n );\n\n return { response, score };\n }\n }\n\n if (scorer?.kind === (`${PEERBENCH_NAMESPACE}/llm-as-a-judge` as const)) {\n const scorerResult = await scorer.score({\n criteria: [\n {\n id: \"correctness\",\n description:\n \"Is the given answer key matches with one of the correct answer keys?\",\n weight: 1,\n },\n ],\n rubric: `Answer text itself or the key (A, B, C) is accepted\nValid answer keys: ${testCase.correctAnswerKeys.map((key) => `- ${key}`).join(\"\\n\")}\nValid Answer texts: ${testCase.correctAnswerKeys.map((key) => `- ${testCase.options?.[key] ?? \"\"}`).join(\"\\n\")}`,\n fieldsToExtract: {\n extractedAnswers: z\n .string()\n .array()\n .describe(\n \"The extracted answer keys, valid or invalid (even if the answer text is provided rather than the key)\"\n ),\n ...(params.llmJudgeFieldsToExtract ?? 
{}),\n },\n response: response.data,\n systemPrompt: params.llmJudgeSystemPrompt?.content,\n });\n\n if (scorerResult !== null) {\n const { extractedAnswers, ...extractedFields } =\n scorerResult.extractedFields;\n const score = await MCQScoreSchemaV1.newWithId(\n {\n scoringMethod: ScoringMethod.ai,\n value: scorerResult.value,\n extractedAnswers,\n responseId: response.id,\n explanation: scorerResult.explanation,\n scorerAIInputCost: scorerResult.inputCost,\n scorerAIOutputCost: scorerResult.outputCost,\n scorerAIInputTokensUsed: scorerResult.inputTokensUsed,\n scorerAIOutputTokensUsed: scorerResult.outputTokensUsed,\n scorerAIProvider: scorerResult.provider,\n scorerAIModelSlug: scorerResult.modelSlug,\n scorerAISystemPromptId: params.llmJudgeSystemPrompt?.id,\n metadata: {\n ...scorerResult.metadata,\n extractedFields,\n },\n },\n params.idGenerators?.score ?? idGeneratorUUIDv7\n );\n\n return { response, score };\n }\n }\n\n return { response };\n }\n);\n\nfunction formatMCQ(testCase: MCQTestCaseV1) {\n return `Question: ${testCase.question}\\nOptions:\\n${Object.entries(\n testCase.options ?? 
{}\n )\n .map(([key, value]) => `${key}: ${value}`)\n .join(\"\\n\")}`;\n}\n\nfunction templateMessages(\n messages: ChatCompletionMessageParam[],\n templateVariables: Record<string, string>\n) {\n for (let i = 0; i < messages.length; i++) {\n const template = Handlebars.compile(messages[i]!.content);\n messages[i]!.content = template(templateVariables);\n }\n}\n","import { defineRunner } from \"@/helpers/define-runner\";\nimport { CallableLLM } from \"@/providers\";\nimport { SimpleSystemPromptV1 } from \"@/schemas/llm\";\nimport { LLMAsAJudgeScorer } from \"@/scorers\";\nimport { IdGenerator, ScoringMethod } from \"@/types\";\nimport { idGeneratorUUIDv7 } from \"@/utils\";\nimport { ChatCompletionMessageParam } from \"openai/resources/index\";\nimport Handlebars from \"handlebars\";\nimport z from \"zod\";\nimport {\n QAResponseSchemaV1,\n QAScoreSchemaV1,\n QATestCaseV1,\n} from \"./schema-sets/qa.v1\";\nimport { PEERBENCH_NAMESPACE } from \"@/constants\";\n\nexport const qaRunner = defineRunner(\n async (params: {\n testCase: QATestCaseV1;\n target: CallableLLM;\n scorer?: LLMAsAJudgeScorer;\n systemPrompt?: SimpleSystemPromptV1;\n llmJudgeSystemPrompt?: SimpleSystemPromptV1;\n llmJudgeFieldsToExtract?: Record<string, z.ZodType>;\n templateVariables?: Record<string, string>;\n idGenerators?: {\n response?: IdGenerator;\n score?: IdGenerator;\n };\n }) => {\n const { testCase, target, scorer } = params;\n const messages: ChatCompletionMessageParam[] = [];\n\n if (params.systemPrompt) {\n messages.push({\n role: \"system\",\n content: params.systemPrompt.content,\n });\n }\n\n messages.push({\n role: \"user\",\n content: testCase.question,\n });\n templateMessages(messages, params.templateVariables ?? 
{});\n\n const providerResponse = await target.forward({ messages });\n\n const response = await QAResponseSchemaV1.newWithId(\n {\n data: providerResponse.data,\n startedAt: providerResponse.startedAt,\n completedAt: providerResponse.completedAt,\n testCaseId: testCase.id,\n modelSlug: target.slug,\n provider: target.provider.kind,\n systemPromptId: params.systemPrompt?.id,\n inputTokensUsed: providerResponse.inputTokensUsed,\n outputTokensUsed: providerResponse.outputTokensUsed,\n inputCost: providerResponse.inputCost,\n outputCost: providerResponse.outputCost,\n },\n params.idGenerators?.response ?? idGeneratorUUIDv7\n );\n\n if (scorer?.kind === (`${PEERBENCH_NAMESPACE}/llm-as-a-judge` as const)) {\n const scorerResult = await scorer.score({\n response: response.data,\n rubric: `Expected/Valid answers: ${testCase.goodAnswers.join(\"\\n\")}\\nInvalid answers: ${testCase.badAnswers.join(\"\\n\")}`,\n systemPrompt: params.llmJudgeSystemPrompt?.content,\n criteria: [\n {\n id: \"correctness\",\n description:\n \"Is the response matches with the expected/valid answers in terms of meaning?\",\n weight: 1,\n },\n ],\n fieldsToExtract: params.llmJudgeFieldsToExtract ?? {},\n });\n\n if (scorerResult !== null) {\n const score = await QAScoreSchemaV1.newWithId(\n {\n scoringMethod: ScoringMethod.ai,\n value: scorerResult.value,\n responseId: response.id,\n explanation: scorerResult.explanation,\n scorerAIInputCost: scorerResult.inputCost,\n scorerAIOutputCost: scorerResult.outputCost,\n scorerAIInputTokensUsed: scorerResult.inputTokensUsed,\n scorerAIOutputTokensUsed: scorerResult.outputTokensUsed,\n scorerAIProvider: scorerResult.provider,\n scorerAIModelSlug: scorerResult.modelSlug,\n scorerAISystemPromptId: params.llmJudgeSystemPrompt?.id,\n metadata: {\n ...scorerResult.metadata,\n extractedFields: scorerResult.extractedFields,\n },\n },\n params.idGenerators?.score ?? 
idGeneratorUUIDv7\n );\n\n return { response, score };\n }\n }\n\n return { response };\n }\n);\n\nfunction templateMessages(\n messages: ChatCompletionMessageParam[],\n templateVariables: Record<string, string>\n) {\n for (let i = 0; i < messages.length; i++) {\n const template = Handlebars.compile(messages[i]!.content);\n messages[i]!.content = template(templateVariables);\n }\n}\n","import { JSONFileStorage } from \"@/storages/json-file\";\nimport {\n MCQResponseSchemaV1,\n MCQResponseV1,\n MCQScoreSchemaV1,\n MCQScoreV1,\n MCQTestCaseSchemaV1,\n MCQTestCaseV1,\n} from \"../schema-sets/mcq.v1\";\nimport {\n QAResponseSchemaV1,\n QAResponseV1,\n QAScoreSchemaV1,\n QAScoreV1,\n QATestCaseSchemaV1,\n QATestCaseV1,\n} from \"../schema-sets/qa.v1\";\nimport {\n MultiTurnResponseSchemaV1,\n MultiTurnResponseV1,\n MultiTurnScoreSchemaV1,\n MultiTurnScoreV1,\n MultiTurnTestCaseSchemaV1,\n MultiTurnTestCaseV1,\n} from \"../schema-sets/multi-turn.v1\";\nimport z from \"zod\";\n\nexport class PeerbenchJSONStorage extends JSONFileStorage<\n | MCQTestCaseV1\n | MCQResponseV1\n | MCQScoreV1\n | QATestCaseV1\n | QAResponseV1\n | QAScoreV1\n | MultiTurnTestCaseV1\n | MultiTurnResponseV1\n | MultiTurnScoreV1\n> {\n constructor(config: { path: string; chunkSize?: number }) {\n super({\n path: config.path,\n chunkSize: config.chunkSize,\n\n schema: z.union([\n MCQTestCaseSchemaV1,\n MCQResponseSchemaV1,\n MCQScoreSchemaV1,\n QATestCaseSchemaV1,\n QAResponseSchemaV1,\n QAScoreSchemaV1,\n MultiTurnTestCaseSchemaV1,\n MultiTurnResponseSchemaV1,\n MultiTurnScoreSchemaV1,\n ]),\n });\n 
}\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACWA,SAAS,SAAS;AAEX,IAAM,UAAU;AAEhB,IAAM,sBAAsB,qBAAqB;AAAA,EACtD,YAAY;AAAA,EACZ,WAAW;AAAA,EACX,MAAM;AAAA,EACN,eAAe;AAAA,EACf,QAAQ;AAAA,IACN,UAAU,EAAE,OAAO;AAAA,IACnB,SAAS,EAAE,OAAO,EAAE,OAAO,GAAG,EAAE,OAAO,CAAC;AAAA,IACxC,mBAAmB,EAAE,OAAO,EAAE,MAAM;AAAA,EACtC;AACF,CAAC;AAGM,IAAM,sBAAsB,qBAAqB;AAAA,EACtD,YAAY;AAAA,EACZ,WAAW;AAAA,EACX,MAAM;AAAA,EACN,eAAe;AAAA,EACf,QAAQ;AAAA,IACN,GAAG;AAAA,EACL;AACF,CAAC;AAGM,IAAM,mBAAmB,kBAAkB;AAAA,EAChD,YAAY;AAAA,EACZ,WAAW;AAAA,EACX,MAAM;AAAA,EACN,eAAe;AAAA,EACf,QAAQ;AAAA,IACN,GAAG;AAAA,IACH,kBAAkB,EAAE,MAAM,EAAE,OAAO,CAAC;AAAA,EACtC;AACF,CAAC;;;ACrCD,SAAS,KAAAA,UAAS;AAEX,IAAM,gBAAgB;AAEtB,IAAM,4BAA4B,qBAAqB;AAAA,EAC5D,YAAY;AAAA,EACZ,WAAW;AAAA,EACX,MAAM;AAAA,EACN,eAAe;AAAA,EACf,QAAQ;AAAA,IACN,UAAUA,GACP,OAAO;AAAA,MACN,MAAMA,GAAE,OAAO;AAAA,MACf,SAASA,GAAE,OAAO;AAAA,MAClB,aAAaA,GAAE,OAAO,EAAE,MAAM,EAAE,SAAS;AAAA,MACzC,YAAYA,GAAE,OAAO,EAAE,MAAM,EAAE,SAAS;AAAA,IAC1C,CAAC,EACA,MAAM;AAAA,IAET,UAAUA,GAAE,OAAO,EAAE,SAAS;AAAA,IAC9B,iBAAiBA,GAAE,OAAO,EAAE,SAAS;AAAA,EACvC;AACF,CAAC;AAGM,IAAM,4BAA4B,qBAAqB;AAAA,EAC5D,YAAY;AAAA,EACZ,WAAW;AAAA,EACX,MAAM;AAAA,EACN,eAAe;AAAA,EACf,QAAQ;AAAA,IACN,GAAG;AAAA,IACH,SAASA,GACN,OAAO;AAAA,MACN,cAAcA,GAAE,OAAO;AAAA,MACvB,WAAWA,GAAE,OAAO;AAAA,MACpB,aAAaA,GAAE,OAAO;AAAA,MACtB,MAAMA,GAAE,OAAO;AAAA,MAEf,iBAAiBA,GAAE,OAAO,EAAE,SAAS;AAAA,MACrC,kBAAkBA,GAAE,OAAO,EAAE,SAAS;AAAA,MACtC,WAAWA,GAAE,OAAO,EAAE,SAAS;AAAA,MAC/B,YAAYA,GAAE,OAAO,EAAE,SAAS;AAAA,IAClC,CAAC,EACA,MAAM;AAAA,EACX;AACF,CAAC;AAGM,IAAM,yBAAyB,kBAAkB;AAAA,EACtD,YAAY;AAAA,EACZ,WAAW;AAAA,EACX,MAAM;AAAA,EACN,eAAe;AAAA,EACf,QAAQ;AAAA,IACN,GAAG;AAAA,IACH,kBAAkBA,GACf,OAAO;AAAA,MACN,YAAYA,GAAE,OAAO;AAAA,MACrB,OAAOA,GAAE,OAAO;AAAA,IAClB,CAAC,EACA,MAAM;AAAA,EACX;AACF,CAAC;;;AC/DD,SAAS,KAAAC,UAAS;AAEX,IAAM,SAAS;AAEf,IAAM,qBAAqB,qBAAqB;AAAA,EACrD,YAAY;AAAA,EACZ,WAAW;AAAA,EACX,MAAM;AAAA,EACN,eAAe;AAAA,EACf,QAAQ;AAAA,IA
CN,UAAUA,GAAE,OAAO;AAAA,IACnB,aAAaA,GAAE,OAAO,EAAE,MAAM;AAAA,IAC9B,YAAYA,GAAE,OAAO,EAAE,MAAM;AAAA,EAC/B;AACF,CAAC;AAGM,IAAM,qBAAqB,qBAAqB;AAAA,EACrD,YAAY;AAAA,EACZ,WAAW;AAAA,EACX,MAAM;AAAA,EACN,eAAe;AAAA,EACf,QAAQ;AAAA,IACN,GAAG;AAAA,EACL;AACF,CAAC;AAGM,IAAM,kBAAkB,kBAAkB;AAAA,EAC/C,YAAY;AAAA,EACZ,WAAW;AAAA,EACX,MAAM;AAAA,EACN,eAAe;AAAA,EACf,QAAQ;AAAA,IACN,GAAG;AAAA,EACL;AACF,CAAC;;;ACxCD,OAAO,gBAAgB;AACvB,OAAOC,QAAO;AAQP,IAAM,YAAY;AAAA,EACvB,OAAO,WAYD;AACJ,UAAM,EAAE,UAAU,QAAQ,OAAO,IAAI;AACrC,UAAM,WAAyC,CAAC;AAEhD,QAAI,OAAO,cAAc;AACvB,eAAS,KAAK;AAAA,QACZ,MAAM;AAAA,QACN,SAAS,OAAO,aAAa;AAAA,MAC/B,CAAC;AAAA,IACH;AAEA,aAAS,KAAK;AAAA,MACZ,MAAM;AAAA,MACN,SAAS,UAAU,QAAQ;AAAA,IAC7B,CAAC;AACD,qBAAiB,UAAU,OAAO,qBAAqB,CAAC,CAAC;AAEzD,UAAM,mBAAmB,MAAM,OAAO,QAAQ,EAAE,SAAS,CAAC;AAE1D,UAAM,WAAW,MAAM,oBAAoB;AAAA,MACzC;AAAA,QACE,MAAM,iBAAiB;AAAA,QACvB,WAAW,iBAAiB;AAAA,QAC5B,aAAa,iBAAiB;AAAA,QAC9B,YAAY,SAAS;AAAA,QACrB,WAAW,OAAO;AAAA,QAClB,UAAU,OAAO,SAAS;AAAA,QAC1B,gBAAgB,OAAO,cAAc;AAAA,QACrC,iBAAiB,iBAAiB;AAAA,QAClC,kBAAkB,iBAAiB;AAAA,QACnC,WAAW,iBAAiB;AAAA,QAC5B,YAAY,iBAAiB;AAAA,MAC/B;AAAA,MACA,OAAO,cAAc,YAAY;AAAA,IACnC;AAEA,QAAI,QAAQ,SAAU,GAAG,mBAAmB,QAAkB;AAC5D,YAAM,eAAe,MAAM,OAAO,MAAM;AAAA,QACtC,UAAU,SAAS;AAAA,QACnB,SAAS,SAAS;AAAA,QAClB,gBAAgB,SAAS;AAAA,MAC3B,CAAC;AAED,UAAI,iBAAiB,MAAM;AACzB,cAAM,QAAQ,MAAM,iBAAiB;AAAA,UACnC;AAAA,YACE,eAAe,cAAc;AAAA,YAC7B,OAAO,aAAa;AAAA,YACpB,YAAY,SAAS;AAAA,YACrB,kBAAkB,aAAa;AAAA,YAC/B,aAAa,aAAa;AAAA,YAC1B,UAAU,aAAa;AAAA,UACzB;AAAA,UACA,OAAO,cAAc,SAAS;AAAA,QAChC;AAEA,eAAO,EAAE,UAAU,MAAM;AAAA,MAC3B;AAAA,IACF;AAEA,QAAI,QAAQ,SAAU,GAAG,mBAAmB,mBAA6B;AACvE,YAAM,eAAe,MAAM,OAAO,MAAM;AAAA,QACtC,UAAU;AAAA,UACR;AAAA,YACE,IAAI;AAAA,YACJ,aACE;AAAA,YACF,QAAQ;AAAA,UACV;AAAA,QACF;AAAA,QACA,QAAQ;AAAA,qBACK,SAAS,kBAAkB,IAAI,CAAC,QAAQ,KAAK,GAAG,EAAE,EAAE,KAAK,IAAI,CAAC;AAAA,sBAC7D,SAAS,kBAAkB,IAAI,CAAC,QAAQ,KAAK,SAAS,UAAU,GAAG,KAAK,EAAE,EAAE,EAAE,KAAK,IAAI,CAAC;AAAA,QACtG,iBAAiB;AAAA,UACf,kBAAkBC,GACf,OAAO,EACP,MAAM,EACN;AAAA,YACC;AAAA,UACF;AAAA,UACF,GAAI,OAAO,2BAA
2B,CAAC;AAAA,QACzC;AAAA,QACA,UAAU,SAAS;AAAA,QACnB,cAAc,OAAO,sBAAsB;AAAA,MAC7C,CAAC;AAED,UAAI,iBAAiB,MAAM;AACzB,cAAM,EAAE,kBAAkB,GAAG,gBAAgB,IAC3C,aAAa;AACf,cAAM,QAAQ,MAAM,iBAAiB;AAAA,UACnC;AAAA,YACE,eAAe,cAAc;AAAA,YAC7B,OAAO,aAAa;AAAA,YACpB;AAAA,YACA,YAAY,SAAS;AAAA,YACrB,aAAa,aAAa;AAAA,YAC1B,mBAAmB,aAAa;AAAA,YAChC,oBAAoB,aAAa;AAAA,YACjC,yBAAyB,aAAa;AAAA,YACtC,0BAA0B,aAAa;AAAA,YACvC,kBAAkB,aAAa;AAAA,YAC/B,mBAAmB,aAAa;AAAA,YAChC,wBAAwB,OAAO,sBAAsB;AAAA,YACrD,UAAU;AAAA,cACR,GAAG,aAAa;AAAA,cAChB;AAAA,YACF;AAAA,UACF;AAAA,UACA,OAAO,cAAc,SAAS;AAAA,QAChC;AAEA,eAAO,EAAE,UAAU,MAAM;AAAA,MAC3B;AAAA,IACF;AAEA,WAAO,EAAE,SAAS;AAAA,EACpB;AACF;AAEA,SAAS,UAAU,UAAyB;AAC1C,SAAO,aAAa,SAAS,QAAQ;AAAA;AAAA,EAAe,OAAO;AAAA,IACzD,SAAS,WAAW,CAAC;AAAA,EACvB,EACG,IAAI,CAAC,CAAC,KAAK,KAAK,MAAM,GAAG,GAAG,KAAK,KAAK,EAAE,EACxC,KAAK,IAAI,CAAC;AACf;AAEA,SAAS,iBACP,UACA,mBACA;AACA,WAAS,IAAI,GAAG,IAAI,SAAS,QAAQ,KAAK;AACxC,UAAM,WAAW,WAAW,QAAQ,SAAS,CAAC,EAAG,OAAO;AACxD,aAAS,CAAC,EAAG,UAAU,SAAS,iBAAiB;AAAA,EACnD;AACF;;;AC7JA,OAAOC,iBAAgB;AAShB,IAAM,WAAW;AAAA,EACtB,OAAO,WAYD;AACJ,UAAM,EAAE,UAAU,QAAQ,OAAO,IAAI;AACrC,UAAM,WAAyC,CAAC;AAEhD,QAAI,OAAO,cAAc;AACvB,eAAS,KAAK;AAAA,QACZ,MAAM;AAAA,QACN,SAAS,OAAO,aAAa;AAAA,MAC/B,CAAC;AAAA,IACH;AAEA,aAAS,KAAK;AAAA,MACZ,MAAM;AAAA,MACN,SAAS,SAAS;AAAA,IACpB,CAAC;AACD,IAAAC,kBAAiB,UAAU,OAAO,qBAAqB,CAAC,CAAC;AAEzD,UAAM,mBAAmB,MAAM,OAAO,QAAQ,EAAE,SAAS,CAAC;AAE1D,UAAM,WAAW,MAAM,mBAAmB;AAAA,MACxC;AAAA,QACE,MAAM,iBAAiB;AAAA,QACvB,WAAW,iBAAiB;AAAA,QAC5B,aAAa,iBAAiB;AAAA,QAC9B,YAAY,SAAS;AAAA,QACrB,WAAW,OAAO;AAAA,QAClB,UAAU,OAAO,SAAS;AAAA,QAC1B,gBAAgB,OAAO,cAAc;AAAA,QACrC,iBAAiB,iBAAiB;AAAA,QAClC,kBAAkB,iBAAiB;AAAA,QACnC,WAAW,iBAAiB;AAAA,QAC5B,YAAY,iBAAiB;AAAA,MAC/B;AAAA,MACA,OAAO,cAAc,YAAY;AAAA,IACnC;AAEA,QAAI,QAAQ,SAAU,GAAG,mBAAmB,mBAA6B;AACvE,YAAM,eAAe,MAAM,OAAO,MAAM;AAAA,QACtC,UAAU,SAAS;AAAA,QACnB,QAAQ,2BAA2B,SAAS,YAAY,KAAK,IAAI,CAAC;AAAA,mBAAsB,SAAS,WAAW,KAAK,IAAI,CAAC;AAAA,QACtH,cAAc,OAAO,sBAAsB;AAAA,QAC3C,UAAU;AAAA,UACR;AAAA,YACE,IAAI;AAAA,YACJ,aACE;AAAA,YACF,QAAQ;AAAA,UACV;
AAAA,QACF;AAAA,QACA,iBAAiB,OAAO,2BAA2B,CAAC;AAAA,MACtD,CAAC;AAED,UAAI,iBAAiB,MAAM;AACzB,cAAM,QAAQ,MAAM,gBAAgB;AAAA,UAClC;AAAA,YACE,eAAe,cAAc;AAAA,YAC7B,OAAO,aAAa;AAAA,YACpB,YAAY,SAAS;AAAA,YACrB,aAAa,aAAa;AAAA,YAC1B,mBAAmB,aAAa;AAAA,YAChC,oBAAoB,aAAa;AAAA,YACjC,yBAAyB,aAAa;AAAA,YACtC,0BAA0B,aAAa;AAAA,YACvC,kBAAkB,aAAa;AAAA,YAC/B,mBAAmB,aAAa;AAAA,YAChC,wBAAwB,OAAO,sBAAsB;AAAA,YACrD,UAAU;AAAA,cACR,GAAG,aAAa;AAAA,cAChB,iBAAiB,aAAa;AAAA,YAChC;AAAA,UACF;AAAA,UACA,OAAO,cAAc,SAAS;AAAA,QAChC;AAEA,eAAO,EAAE,UAAU,MAAM;AAAA,MAC3B;AAAA,IACF;AAEA,WAAO,EAAE,SAAS;AAAA,EACpB;AACF;AAEA,SAASA,kBACP,UACA,mBACA;AACA,WAAS,IAAI,GAAG,IAAI,SAAS,QAAQ,KAAK;AACxC,UAAM,WAAWC,YAAW,QAAQ,SAAS,CAAC,EAAG,OAAO;AACxD,aAAS,CAAC,EAAG,UAAU,SAAS,iBAAiB;AAAA,EACnD;AACF;;;AC9FA,OAAOC,QAAO;AAEP,IAAM,uBAAN,cAAmC,gBAUxC;AAAA,EACA,YAAY,QAA8C;AACxD,UAAM;AAAA,MACJ,MAAM,OAAO;AAAA,MACb,WAAW,OAAO;AAAA,MAElB,QAAQA,GAAE,MAAM;AAAA,QACd;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,MACF,CAAC;AAAA,IACH,CAAC;AAAA,EACH;AACF;","names":["z","z","z","z","Handlebars","templateMessages","Handlebars","z"]}
@@ -1,5 +1,6 @@
1
1
  export * from "./schema-sets/mcq.v1";
2
2
  export * from "./schema-sets/multi-turn.v1";
3
3
  export * from "./schema-sets/qa.v1";
4
- export * from "./runner";
4
+ export * from "./mcq-runner";
5
+ export * from "./qa-runner";
5
6
  export * from "./storages/json";