peerbench 0.0.1 → 0.0.2-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. package/README.md +308 -2
  2. package/dist/abstract-Dec9Sc5O.d.ts +12 -0
  3. package/dist/benchmarks/index.d.ts +1698 -0
  4. package/dist/benchmarks/index.js +915 -0
  5. package/dist/benchmarks/index.js.map +1 -0
  6. package/dist/catalogs/index.d.ts +75 -0
  7. package/dist/catalogs/index.js +88 -0
  8. package/dist/catalogs/index.js.map +1 -0
  9. package/dist/chunk-22HU24QF.js +8 -0
  10. package/dist/chunk-22HU24QF.js.map +1 -0
  11. package/dist/chunk-232PY7K3.js +50 -0
  12. package/dist/chunk-232PY7K3.js.map +1 -0
  13. package/dist/chunk-7TREBPSJ.js +26 -0
  14. package/dist/chunk-7TREBPSJ.js.map +1 -0
  15. package/dist/chunk-DUBKY73H.js +128 -0
  16. package/dist/chunk-DUBKY73H.js.map +1 -0
  17. package/dist/chunk-GVF4YZF3.js +15 -0
  18. package/dist/chunk-GVF4YZF3.js.map +1 -0
  19. package/dist/chunk-HJH3SW3L.js +103 -0
  20. package/dist/chunk-HJH3SW3L.js.map +1 -0
  21. package/dist/chunk-IUN2IUCS.js +58 -0
  22. package/dist/chunk-IUN2IUCS.js.map +1 -0
  23. package/dist/chunk-PZ5AY32C.js +10 -0
  24. package/dist/chunk-PZ5AY32C.js.map +1 -0
  25. package/dist/chunk-VBOM2YEG.js +47 -0
  26. package/dist/chunk-VBOM2YEG.js.map +1 -0
  27. package/dist/chunk-ZJWSK4VO.js +11 -0
  28. package/dist/chunk-ZJWSK4VO.js.map +1 -0
  29. package/dist/data-BmN5WjZ4.d.ts +57 -0
  30. package/dist/generic-array-DLHWSvf1.d.ts +22 -0
  31. package/dist/index-WiPjF2AL.d.ts +15 -0
  32. package/dist/index.d.ts +38 -3845
  33. package/dist/index.js +40 -3557
  34. package/dist/index.js.map +1 -1
  35. package/dist/llm-DNj_tp2T.d.ts +22 -0
  36. package/dist/llm-judge-DIG1f1Az.d.ts +67 -0
  37. package/dist/provider-BDjGp2y-.d.ts +10 -0
  38. package/dist/providers/index.d.ts +72 -0
  39. package/dist/providers/index.js +263 -0
  40. package/dist/providers/index.js.map +1 -0
  41. package/dist/rate-limiter-CSmVIRsM.d.ts +60 -0
  42. package/dist/schemas/extensions/index.d.ts +14 -0
  43. package/dist/schemas/extensions/index.js +13 -0
  44. package/dist/schemas/extensions/index.js.map +1 -0
  45. package/dist/schemas/index.d.ts +233 -0
  46. package/dist/schemas/index.js +27 -0
  47. package/dist/schemas/index.js.map +1 -0
  48. package/dist/schemas/llm/index.d.ts +98 -0
  49. package/dist/schemas/llm/index.js +37 -0
  50. package/dist/schemas/llm/index.js.map +1 -0
  51. package/dist/scorers/index.d.ts +63 -0
  52. package/dist/scorers/index.js +494 -0
  53. package/dist/scorers/index.js.map +1 -0
  54. package/dist/simple-system-prompt-CzPYuvo0.d.ts +49 -0
  55. package/dist/system-prompt--0FdPWqK.d.ts +58 -0
  56. package/dist/utilities-BrRH32rD.d.ts +30 -0
  57. package/package.json +39 -21
  58. package/LICENSE +0 -21
@@ -0,0 +1,1698 @@
1
+ import { a as GenericJSONArrayDataLoader } from '../generic-array-DLHWSvf1.js';
2
+ import { I as IdGenerator } from '../index-WiPjF2AL.js';
3
+ import * as z from 'zod';
4
+ import z__default, { z as z$1 } from 'zod';
5
+ import { A as AbstractLLMProvider } from '../llm-DNj_tp2T.js';
6
+ import { a as MCQScorer, L as LLMJudgeScorer } from '../llm-judge-DIG1f1Az.js';
7
+ import { a as SimpleSystemPromptV1 } from '../simple-system-prompt-CzPYuvo0.js';
8
+ import { c as RunnerResult, A as AbstractDataLoader, L as LoaderResult } from '../data-BmN5WjZ4.js';
9
+ import * as zod_v4_core from 'zod/v4/core';
10
+ import { A as AbstractScorer, B as BaseScorerResult } from '../abstract-Dec9Sc5O.js';
11
+ import '../schemas/index.js';
12
+ import '../provider-BDjGp2y-.js';
13
+ import 'openai/resources/shared';
14
+ import 'openai/resources/chat/completions';
15
+ import '../rate-limiter-CSmVIRsM.js';
16
+ import '../system-prompt--0FdPWqK.js';
17
+
18
+ declare const PeerbenchMultipleChoiceTestCaseSchemaV1: z$1.ZodObject<Omit<{
19
+ id: z$1.ZodString;
20
+ kind: z$1.ZodString;
21
+ schemaVersion: z$1.ZodNumber;
22
+ metadata: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>>;
23
+ }, "kind" | "schemaVersion"> & {
24
+ question: z$1.ZodString;
25
+ options: z$1.ZodRecord<z$1.ZodString, z$1.ZodString>;
26
+ answer: z$1.ZodString;
27
+ answerKey: z$1.ZodString;
28
+ } & {
29
+ kind: z$1.ZodLiteral<"pb.ts.mcq">;
30
+ schemaVersion: z$1.ZodLiteral<1>;
31
+ }, z$1.core.$strip> & {
32
+ new: (input: Omit<{
33
+ id: string;
34
+ question: string;
35
+ options: Record<string, string>;
36
+ answer: string;
37
+ answerKey: string;
38
+ kind: "pb.ts.mcq";
39
+ schemaVersion: 1;
40
+ metadata?: Record<string, unknown> | undefined;
41
+ }, "kind" | "schemaVersion">) => {
42
+ id: string;
43
+ question: string;
44
+ options: Record<string, string>;
45
+ answer: string;
46
+ answerKey: string;
47
+ kind: "pb.ts.mcq";
48
+ schemaVersion: 1;
49
+ metadata?: Record<string, unknown> | undefined;
50
+ };
51
+ newWithId(input: Omit<{
52
+ id: string;
53
+ question: string;
54
+ options: Record<string, string>;
55
+ answer: string;
56
+ answerKey: string;
57
+ kind: "pb.ts.mcq";
58
+ schemaVersion: 1;
59
+ metadata?: Record<string, unknown> | undefined;
60
+ }, "id" | "kind" | "schemaVersion">, generator: IdGenerator): Promise<{
61
+ id: string;
62
+ question: string;
63
+ options: Record<string, string>;
64
+ answer: string;
65
+ answerKey: string;
66
+ kind: "pb.ts.mcq";
67
+ schemaVersion: 1;
68
+ metadata?: Record<string, unknown> | undefined;
69
+ }>;
70
+ };
71
+ type PeerbenchMultipleChoiceTestCaseV1 = z$1.infer<typeof PeerbenchMultipleChoiceTestCaseSchemaV1>;
72
+ declare const PeerbenchMultipleChoiceResponseSchemaV1: z$1.ZodObject<Omit<Omit<{
73
+ id: z$1.ZodString;
74
+ kind: z$1.ZodString;
75
+ schemaVersion: z$1.ZodNumber;
76
+ startedAt: z$1.ZodNumber;
77
+ completedAt: z$1.ZodNumber;
78
+ testCaseId: z$1.ZodString;
79
+ metadata: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>>;
80
+ }, "kind" | "schemaVersion"> & {
81
+ data: z$1.ZodString;
82
+ modelSlug: z$1.ZodString;
83
+ provider: z$1.ZodString;
84
+ systemPromptId: z$1.ZodOptional<z$1.ZodString>;
85
+ inputTokensUsed: z$1.ZodOptional<z$1.ZodNumber>;
86
+ outputTokensUsed: z$1.ZodOptional<z$1.ZodNumber>;
87
+ inputCost: z$1.ZodOptional<z$1.ZodString>;
88
+ outputCost: z$1.ZodOptional<z$1.ZodString>;
89
+ } & {
90
+ kind: z$1.ZodString;
91
+ schemaVersion: z$1.ZodNumber;
92
+ }, "kind" | "schemaVersion"> & {
93
+ kind: z$1.ZodLiteral<"pb.rs.mcq">;
94
+ schemaVersion: z$1.ZodLiteral<1>;
95
+ }, z$1.core.$strip> & {
96
+ new: (input: Omit<{
97
+ id: string;
98
+ testCaseId: string;
99
+ startedAt: number;
100
+ completedAt: number;
101
+ data: string;
102
+ provider: string;
103
+ modelSlug: string;
104
+ kind: "pb.rs.mcq";
105
+ schemaVersion: 1;
106
+ metadata?: Record<string, unknown> | undefined;
107
+ inputTokensUsed?: number | undefined;
108
+ outputTokensUsed?: number | undefined;
109
+ inputCost?: string | undefined;
110
+ outputCost?: string | undefined;
111
+ systemPromptId?: string | undefined;
112
+ }, "kind" | "schemaVersion">) => {
113
+ id: string;
114
+ testCaseId: string;
115
+ startedAt: number;
116
+ completedAt: number;
117
+ data: string;
118
+ provider: string;
119
+ modelSlug: string;
120
+ kind: "pb.rs.mcq";
121
+ schemaVersion: 1;
122
+ metadata?: Record<string, unknown> | undefined;
123
+ inputTokensUsed?: number | undefined;
124
+ outputTokensUsed?: number | undefined;
125
+ inputCost?: string | undefined;
126
+ outputCost?: string | undefined;
127
+ systemPromptId?: string | undefined;
128
+ };
129
+ newWithId(input: Omit<{
130
+ id: string;
131
+ testCaseId: string;
132
+ startedAt: number;
133
+ completedAt: number;
134
+ data: string;
135
+ provider: string;
136
+ modelSlug: string;
137
+ kind: "pb.rs.mcq";
138
+ schemaVersion: 1;
139
+ metadata?: Record<string, unknown> | undefined;
140
+ inputTokensUsed?: number | undefined;
141
+ outputTokensUsed?: number | undefined;
142
+ inputCost?: string | undefined;
143
+ outputCost?: string | undefined;
144
+ systemPromptId?: string | undefined;
145
+ }, "id" | "kind" | "schemaVersion">, generator: IdGenerator): Promise<{
146
+ id: string;
147
+ testCaseId: string;
148
+ startedAt: number;
149
+ completedAt: number;
150
+ data: string;
151
+ provider: string;
152
+ modelSlug: string;
153
+ kind: "pb.rs.mcq";
154
+ schemaVersion: 1;
155
+ metadata?: Record<string, unknown> | undefined;
156
+ inputTokensUsed?: number | undefined;
157
+ outputTokensUsed?: number | undefined;
158
+ inputCost?: string | undefined;
159
+ outputCost?: string | undefined;
160
+ systemPromptId?: string | undefined;
161
+ }>;
162
+ };
163
+ type PeerbenchMultipleChoiceResponseV1 = z$1.infer<typeof PeerbenchMultipleChoiceResponseSchemaV1>;
164
+ declare const PeerbenchMultipleChoiceScoreSchemaV1: z$1.ZodObject<Omit<Omit<{
165
+ id: z$1.ZodString;
166
+ kind: z$1.ZodString;
167
+ schemaVersion: z$1.ZodNumber;
168
+ value: z$1.ZodNumber;
169
+ responseId: z$1.ZodString;
170
+ explanation: z$1.ZodOptional<z$1.ZodString>;
171
+ metadata: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>>;
172
+ scoringMethod: z$1.ZodEnum<{
173
+ readonly ai: "ai";
174
+ readonly human: "human";
175
+ readonly algo: "algo";
176
+ }>;
177
+ }, "kind" | "schemaVersion"> & {
178
+ scorerAIProvider: z$1.ZodOptional<z$1.ZodString>;
179
+ scorerAIModelSlug: z$1.ZodOptional<z$1.ZodString>;
180
+ scorerAIInputTokensUsed: z$1.ZodOptional<z$1.ZodNumber>;
181
+ scorerAIOutputTokensUsed: z$1.ZodOptional<z$1.ZodNumber>;
182
+ scorerAIInputCost: z$1.ZodOptional<z$1.ZodString>;
183
+ scorerAIOutputCost: z$1.ZodOptional<z$1.ZodString>;
184
+ } & {
185
+ kind: z$1.ZodString;
186
+ schemaVersion: z$1.ZodNumber;
187
+ }, "kind" | "schemaVersion"> & {
188
+ extractedAnswers: z$1.ZodArray<z$1.ZodString>;
189
+ } & {
190
+ kind: z$1.ZodLiteral<"pb.sc.mcq">;
191
+ schemaVersion: z$1.ZodLiteral<1>;
192
+ }, z$1.core.$strip> & {
193
+ new: (input: Omit<{
194
+ id: string;
195
+ value: number;
196
+ responseId: string;
197
+ scoringMethod: "ai" | "human" | "algo";
198
+ extractedAnswers: string[];
199
+ kind: "pb.sc.mcq";
200
+ schemaVersion: 1;
201
+ metadata?: Record<string, unknown> | undefined;
202
+ explanation?: string | undefined;
203
+ scorerAIProvider?: string | undefined;
204
+ scorerAIModelSlug?: string | undefined;
205
+ scorerAIInputTokensUsed?: number | undefined;
206
+ scorerAIOutputTokensUsed?: number | undefined;
207
+ scorerAIInputCost?: string | undefined;
208
+ scorerAIOutputCost?: string | undefined;
209
+ }, "kind" | "schemaVersion">) => {
210
+ id: string;
211
+ value: number;
212
+ responseId: string;
213
+ scoringMethod: "ai" | "human" | "algo";
214
+ extractedAnswers: string[];
215
+ kind: "pb.sc.mcq";
216
+ schemaVersion: 1;
217
+ metadata?: Record<string, unknown> | undefined;
218
+ explanation?: string | undefined;
219
+ scorerAIProvider?: string | undefined;
220
+ scorerAIModelSlug?: string | undefined;
221
+ scorerAIInputTokensUsed?: number | undefined;
222
+ scorerAIOutputTokensUsed?: number | undefined;
223
+ scorerAIInputCost?: string | undefined;
224
+ scorerAIOutputCost?: string | undefined;
225
+ };
226
+ newWithId(input: Omit<{
227
+ id: string;
228
+ value: number;
229
+ responseId: string;
230
+ scoringMethod: "ai" | "human" | "algo";
231
+ extractedAnswers: string[];
232
+ kind: "pb.sc.mcq";
233
+ schemaVersion: 1;
234
+ metadata?: Record<string, unknown> | undefined;
235
+ explanation?: string | undefined;
236
+ scorerAIProvider?: string | undefined;
237
+ scorerAIModelSlug?: string | undefined;
238
+ scorerAIInputTokensUsed?: number | undefined;
239
+ scorerAIOutputTokensUsed?: number | undefined;
240
+ scorerAIInputCost?: string | undefined;
241
+ scorerAIOutputCost?: string | undefined;
242
+ }, "id" | "kind" | "schemaVersion">, generator: IdGenerator): Promise<{
243
+ id: string;
244
+ value: number;
245
+ responseId: string;
246
+ scoringMethod: "ai" | "human" | "algo";
247
+ extractedAnswers: string[];
248
+ kind: "pb.sc.mcq";
249
+ schemaVersion: 1;
250
+ metadata?: Record<string, unknown> | undefined;
251
+ explanation?: string | undefined;
252
+ scorerAIProvider?: string | undefined;
253
+ scorerAIModelSlug?: string | undefined;
254
+ scorerAIInputTokensUsed?: number | undefined;
255
+ scorerAIOutputTokensUsed?: number | undefined;
256
+ scorerAIInputCost?: string | undefined;
257
+ scorerAIOutputCost?: string | undefined;
258
+ }>;
259
+ };
260
+ type PeerbenchMultipleChoiceScoreV1 = z$1.infer<typeof PeerbenchMultipleChoiceScoreSchemaV1>;
261
+
262
+ declare const PeerbenchOpenEndedTestCaseSchemaV1: z$1.ZodObject<Omit<{
263
+ id: z$1.ZodString;
264
+ kind: z$1.ZodString;
265
+ schemaVersion: z$1.ZodNumber;
266
+ metadata: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>>;
267
+ }, "kind" | "schemaVersion"> & {
268
+ question: z$1.ZodString;
269
+ answer: z$1.ZodOptional<z$1.ZodString>;
270
+ } & {
271
+ kind: z$1.ZodLiteral<"pb.ts.open-ended">;
272
+ schemaVersion: z$1.ZodLiteral<1>;
273
+ }, z$1.core.$strip> & {
274
+ new: (input: Omit<{
275
+ id: string;
276
+ question: string;
277
+ kind: "pb.ts.open-ended";
278
+ schemaVersion: 1;
279
+ metadata?: Record<string, unknown> | undefined;
280
+ answer?: string | undefined;
281
+ }, "kind" | "schemaVersion">) => {
282
+ id: string;
283
+ question: string;
284
+ kind: "pb.ts.open-ended";
285
+ schemaVersion: 1;
286
+ metadata?: Record<string, unknown> | undefined;
287
+ answer?: string | undefined;
288
+ };
289
+ newWithId(input: Omit<{
290
+ id: string;
291
+ question: string;
292
+ kind: "pb.ts.open-ended";
293
+ schemaVersion: 1;
294
+ metadata?: Record<string, unknown> | undefined;
295
+ answer?: string | undefined;
296
+ }, "id" | "kind" | "schemaVersion">, generator: IdGenerator): Promise<{
297
+ id: string;
298
+ question: string;
299
+ kind: "pb.ts.open-ended";
300
+ schemaVersion: 1;
301
+ metadata?: Record<string, unknown> | undefined;
302
+ answer?: string | undefined;
303
+ }>;
304
+ };
305
+ type PeerbenchOpenEndedTestCaseV1 = z$1.infer<typeof PeerbenchOpenEndedTestCaseSchemaV1>;
306
+ declare const PeerbenchOpenEndedResponseSchemaV1: z$1.ZodObject<Omit<Omit<{
307
+ id: z$1.ZodString;
308
+ kind: z$1.ZodString;
309
+ schemaVersion: z$1.ZodNumber;
310
+ startedAt: z$1.ZodNumber;
311
+ completedAt: z$1.ZodNumber;
312
+ testCaseId: z$1.ZodString;
313
+ metadata: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>>;
314
+ }, "kind" | "schemaVersion"> & {
315
+ data: z$1.ZodString;
316
+ modelSlug: z$1.ZodString;
317
+ provider: z$1.ZodString;
318
+ systemPromptId: z$1.ZodOptional<z$1.ZodString>;
319
+ inputTokensUsed: z$1.ZodOptional<z$1.ZodNumber>;
320
+ outputTokensUsed: z$1.ZodOptional<z$1.ZodNumber>;
321
+ inputCost: z$1.ZodOptional<z$1.ZodString>;
322
+ outputCost: z$1.ZodOptional<z$1.ZodString>;
323
+ } & {
324
+ kind: z$1.ZodString;
325
+ schemaVersion: z$1.ZodNumber;
326
+ }, "kind" | "schemaVersion"> & {
327
+ kind: z$1.ZodLiteral<"pb.rs.open-ended">;
328
+ schemaVersion: z$1.ZodLiteral<1>;
329
+ }, z$1.core.$strip> & {
330
+ new: (input: Omit<{
331
+ id: string;
332
+ testCaseId: string;
333
+ startedAt: number;
334
+ completedAt: number;
335
+ data: string;
336
+ provider: string;
337
+ modelSlug: string;
338
+ kind: "pb.rs.open-ended";
339
+ schemaVersion: 1;
340
+ metadata?: Record<string, unknown> | undefined;
341
+ inputTokensUsed?: number | undefined;
342
+ outputTokensUsed?: number | undefined;
343
+ inputCost?: string | undefined;
344
+ outputCost?: string | undefined;
345
+ systemPromptId?: string | undefined;
346
+ }, "kind" | "schemaVersion">) => {
347
+ id: string;
348
+ testCaseId: string;
349
+ startedAt: number;
350
+ completedAt: number;
351
+ data: string;
352
+ provider: string;
353
+ modelSlug: string;
354
+ kind: "pb.rs.open-ended";
355
+ schemaVersion: 1;
356
+ metadata?: Record<string, unknown> | undefined;
357
+ inputTokensUsed?: number | undefined;
358
+ outputTokensUsed?: number | undefined;
359
+ inputCost?: string | undefined;
360
+ outputCost?: string | undefined;
361
+ systemPromptId?: string | undefined;
362
+ };
363
+ newWithId(input: Omit<{
364
+ id: string;
365
+ testCaseId: string;
366
+ startedAt: number;
367
+ completedAt: number;
368
+ data: string;
369
+ provider: string;
370
+ modelSlug: string;
371
+ kind: "pb.rs.open-ended";
372
+ schemaVersion: 1;
373
+ metadata?: Record<string, unknown> | undefined;
374
+ inputTokensUsed?: number | undefined;
375
+ outputTokensUsed?: number | undefined;
376
+ inputCost?: string | undefined;
377
+ outputCost?: string | undefined;
378
+ systemPromptId?: string | undefined;
379
+ }, "id" | "kind" | "schemaVersion">, generator: IdGenerator): Promise<{
380
+ id: string;
381
+ testCaseId: string;
382
+ startedAt: number;
383
+ completedAt: number;
384
+ data: string;
385
+ provider: string;
386
+ modelSlug: string;
387
+ kind: "pb.rs.open-ended";
388
+ schemaVersion: 1;
389
+ metadata?: Record<string, unknown> | undefined;
390
+ inputTokensUsed?: number | undefined;
391
+ outputTokensUsed?: number | undefined;
392
+ inputCost?: string | undefined;
393
+ outputCost?: string | undefined;
394
+ systemPromptId?: string | undefined;
395
+ }>;
396
+ };
397
+ type PeerbenchOpenEndedResponseV1 = z$1.infer<typeof PeerbenchOpenEndedResponseSchemaV1>;
398
+ declare const PeerbenchOpenEndedScoreSchemaV1: z$1.ZodObject<Omit<Omit<{
399
+ id: z$1.ZodString;
400
+ kind: z$1.ZodString;
401
+ schemaVersion: z$1.ZodNumber;
402
+ value: z$1.ZodNumber;
403
+ responseId: z$1.ZodString;
404
+ explanation: z$1.ZodOptional<z$1.ZodString>;
405
+ metadata: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>>;
406
+ scoringMethod: z$1.ZodEnum<{
407
+ readonly ai: "ai";
408
+ readonly human: "human";
409
+ readonly algo: "algo";
410
+ }>;
411
+ }, "kind" | "schemaVersion"> & {
412
+ scorerAIProvider: z$1.ZodOptional<z$1.ZodString>;
413
+ scorerAIModelSlug: z$1.ZodOptional<z$1.ZodString>;
414
+ scorerAIInputTokensUsed: z$1.ZodOptional<z$1.ZodNumber>;
415
+ scorerAIOutputTokensUsed: z$1.ZodOptional<z$1.ZodNumber>;
416
+ scorerAIInputCost: z$1.ZodOptional<z$1.ZodString>;
417
+ scorerAIOutputCost: z$1.ZodOptional<z$1.ZodString>;
418
+ } & {
419
+ kind: z$1.ZodString;
420
+ schemaVersion: z$1.ZodNumber;
421
+ }, "kind" | "schemaVersion"> & {
422
+ kind: z$1.ZodLiteral<"pb.sc.open-ended">;
423
+ schemaVersion: z$1.ZodLiteral<1>;
424
+ }, z$1.core.$strip> & {
425
+ new: (input: Omit<{
426
+ id: string;
427
+ value: number;
428
+ responseId: string;
429
+ scoringMethod: "ai" | "human" | "algo";
430
+ kind: "pb.sc.open-ended";
431
+ schemaVersion: 1;
432
+ metadata?: Record<string, unknown> | undefined;
433
+ explanation?: string | undefined;
434
+ scorerAIProvider?: string | undefined;
435
+ scorerAIModelSlug?: string | undefined;
436
+ scorerAIInputTokensUsed?: number | undefined;
437
+ scorerAIOutputTokensUsed?: number | undefined;
438
+ scorerAIInputCost?: string | undefined;
439
+ scorerAIOutputCost?: string | undefined;
440
+ }, "kind" | "schemaVersion">) => {
441
+ id: string;
442
+ value: number;
443
+ responseId: string;
444
+ scoringMethod: "ai" | "human" | "algo";
445
+ kind: "pb.sc.open-ended";
446
+ schemaVersion: 1;
447
+ metadata?: Record<string, unknown> | undefined;
448
+ explanation?: string | undefined;
449
+ scorerAIProvider?: string | undefined;
450
+ scorerAIModelSlug?: string | undefined;
451
+ scorerAIInputTokensUsed?: number | undefined;
452
+ scorerAIOutputTokensUsed?: number | undefined;
453
+ scorerAIInputCost?: string | undefined;
454
+ scorerAIOutputCost?: string | undefined;
455
+ };
456
+ newWithId(input: Omit<{
457
+ id: string;
458
+ value: number;
459
+ responseId: string;
460
+ scoringMethod: "ai" | "human" | "algo";
461
+ kind: "pb.sc.open-ended";
462
+ schemaVersion: 1;
463
+ metadata?: Record<string, unknown> | undefined;
464
+ explanation?: string | undefined;
465
+ scorerAIProvider?: string | undefined;
466
+ scorerAIModelSlug?: string | undefined;
467
+ scorerAIInputTokensUsed?: number | undefined;
468
+ scorerAIOutputTokensUsed?: number | undefined;
469
+ scorerAIInputCost?: string | undefined;
470
+ scorerAIOutputCost?: string | undefined;
471
+ }, "id" | "kind" | "schemaVersion">, generator: IdGenerator): Promise<{
472
+ id: string;
473
+ value: number;
474
+ responseId: string;
475
+ scoringMethod: "ai" | "human" | "algo";
476
+ kind: "pb.sc.open-ended";
477
+ schemaVersion: 1;
478
+ metadata?: Record<string, unknown> | undefined;
479
+ explanation?: string | undefined;
480
+ scorerAIProvider?: string | undefined;
481
+ scorerAIModelSlug?: string | undefined;
482
+ scorerAIInputTokensUsed?: number | undefined;
483
+ scorerAIOutputTokensUsed?: number | undefined;
484
+ scorerAIInputCost?: string | undefined;
485
+ scorerAIOutputCost?: string | undefined;
486
+ }>;
487
+ };
488
+ type PeerbenchOpenEndedScoreV1 = z$1.infer<typeof PeerbenchOpenEndedScoreSchemaV1>;
489
+
490
+ declare const PeerbenchBenchmarkSpecSchemaV1: z__default.ZodObject<Omit<{
491
+ kind: z__default.ZodString;
492
+ schemaVersion: z__default.ZodNumber;
493
+ metadata: z__default.ZodOptional<z__default.ZodRecord<z__default.ZodString, z__default.ZodUnknown>>;
494
+ }, "kind" | "schemaVersion"> & {
495
+ /**
496
+ * Large text contents that can be referenced as <text>{key}</text> in a prompt or system prompt.
497
+ */
498
+ blobTexts: z__default.ZodOptional<z__default.ZodRecord<z__default.ZodString, z__default.ZodString>>;
499
+ } & {
500
+ kind: z__default.ZodLiteral<"pb.benchmark.spec">;
501
+ schemaVersion: z__default.ZodLiteral<1>;
502
+ }, z__default.core.$strip> & {
503
+ new: (input: Omit<{
504
+ kind: "pb.benchmark.spec";
505
+ schemaVersion: 1;
506
+ metadata?: Record<string, unknown> | undefined;
507
+ blobTexts?: Record<string, string> | undefined;
508
+ }, "kind" | "schemaVersion">) => {
509
+ kind: "pb.benchmark.spec";
510
+ schemaVersion: 1;
511
+ metadata?: Record<string, unknown> | undefined;
512
+ blobTexts?: Record<string, string> | undefined;
513
+ };
514
+ newWithId(input: Omit<{
515
+ kind: "pb.benchmark.spec";
516
+ schemaVersion: 1;
517
+ metadata?: Record<string, unknown> | undefined;
518
+ blobTexts?: Record<string, string> | undefined;
519
+ }, "id" | "kind" | "schemaVersion">, generator: IdGenerator): Promise<{
520
+ kind: "pb.benchmark.spec";
521
+ schemaVersion: 1;
522
+ metadata?: Record<string, unknown> | undefined;
523
+ blobTexts?: Record<string, string> | undefined;
524
+ }>;
525
+ };
526
+ type PeerbenchBenchmarkSpecV1 = z__default.infer<typeof PeerbenchBenchmarkSpecSchemaV1>;
527
+
528
+ declare class PeerbenchJSONDataLoader extends GenericJSONArrayDataLoader<PeerbenchMultipleChoiceTestCaseV1 | PeerbenchOpenEndedTestCaseV1, PeerbenchMultipleChoiceResponseV1 | PeerbenchOpenEndedResponseV1, PeerbenchMultipleChoiceScoreV1 | PeerbenchOpenEndedScoreV1> {
529
+ readonly kind = "pb.load.json.data";
530
+ loadBenchmarkSpec(params: {
531
+ content: Uint8Array;
532
+ }): Promise<PeerbenchBenchmarkSpecV1>;
533
+ protected testCaseBuilder(data: any): {
534
+ id: string;
535
+ question: string;
536
+ options: Record<string, string>;
537
+ answer: string;
538
+ answerKey: string;
539
+ kind: "pb.ts.mcq";
540
+ schemaVersion: 1;
541
+ metadata?: Record<string, unknown> | undefined;
542
+ } | {
543
+ id: string;
544
+ question: string;
545
+ kind: "pb.ts.open-ended";
546
+ schemaVersion: 1;
547
+ metadata?: Record<string, unknown> | undefined;
548
+ answer?: string | undefined;
549
+ } | undefined;
550
+ protected responseBuilder(data: any): Promise<{
551
+ id: string;
552
+ testCaseId: string;
553
+ startedAt: number;
554
+ completedAt: number;
555
+ data: string;
556
+ provider: string;
557
+ modelSlug: string;
558
+ kind: "pb.rs.mcq";
559
+ schemaVersion: 1;
560
+ metadata?: Record<string, unknown> | undefined;
561
+ inputTokensUsed?: number | undefined;
562
+ outputTokensUsed?: number | undefined;
563
+ inputCost?: string | undefined;
564
+ outputCost?: string | undefined;
565
+ systemPromptId?: string | undefined;
566
+ } | {
567
+ id: string;
568
+ testCaseId: string;
569
+ startedAt: number;
570
+ completedAt: number;
571
+ data: string;
572
+ provider: string;
573
+ modelSlug: string;
574
+ kind: "pb.rs.open-ended";
575
+ schemaVersion: 1;
576
+ metadata?: Record<string, unknown> | undefined;
577
+ inputTokensUsed?: number | undefined;
578
+ outputTokensUsed?: number | undefined;
579
+ inputCost?: string | undefined;
580
+ outputCost?: string | undefined;
581
+ systemPromptId?: string | undefined;
582
+ } | undefined>;
583
+ protected scoreBuilder(data: any): Promise<{
584
+ id: string;
585
+ value: number;
586
+ responseId: string;
587
+ scoringMethod: "ai" | "human" | "algo";
588
+ extractedAnswers: string[];
589
+ kind: "pb.sc.mcq";
590
+ schemaVersion: 1;
591
+ metadata?: Record<string, unknown> | undefined;
592
+ explanation?: string | undefined;
593
+ scorerAIProvider?: string | undefined;
594
+ scorerAIModelSlug?: string | undefined;
595
+ scorerAIInputTokensUsed?: number | undefined;
596
+ scorerAIOutputTokensUsed?: number | undefined;
597
+ scorerAIInputCost?: string | undefined;
598
+ scorerAIOutputCost?: string | undefined;
599
+ } | {
600
+ id: string;
601
+ value: number;
602
+ responseId: string;
603
+ scoringMethod: "ai" | "human" | "algo";
604
+ kind: "pb.sc.open-ended";
605
+ schemaVersion: 1;
606
+ metadata?: Record<string, unknown> | undefined;
607
+ explanation?: string | undefined;
608
+ scorerAIProvider?: string | undefined;
609
+ scorerAIModelSlug?: string | undefined;
610
+ scorerAIInputTokensUsed?: number | undefined;
611
+ scorerAIOutputTokensUsed?: number | undefined;
612
+ scorerAIInputCost?: string | undefined;
613
+ scorerAIOutputCost?: string | undefined;
614
+ } | undefined>;
615
+ }
616
+
617
+ type ResponseTypes = PeerbenchMultipleChoiceResponseV1 | PeerbenchOpenEndedResponseV1;
618
+ type ScoreTypes = PeerbenchMultipleChoiceScoreV1 | PeerbenchOpenEndedScoreV1;
619
+ type TestCaseTypes = PeerbenchMultipleChoiceTestCaseV1 | PeerbenchOpenEndedTestCaseV1;
620
+ declare function runTestCase$2(params: {
621
+ testCase: TestCaseTypes;
622
+ provider: AbstractLLMProvider;
623
+ scorer?: MCQScorer | LLMJudgeScorer;
624
+ spec?: PeerbenchBenchmarkSpecV1;
625
+ runConfig: {
626
+ model: string;
627
+ llmJudgeModel?: string;
628
+ };
629
+ systemPrompt?: SimpleSystemPromptV1;
630
+ idGenerators?: {
631
+ response?: IdGenerator;
632
+ score?: IdGenerator;
633
+ };
634
+ }): Promise<RunnerResult<ResponseTypes, ScoreTypes>>;
635
+
636
+ type index$2_PeerbenchJSONDataLoader = PeerbenchJSONDataLoader;
637
+ declare const index$2_PeerbenchJSONDataLoader: typeof PeerbenchJSONDataLoader;
638
+ declare const index$2_PeerbenchMultipleChoiceResponseSchemaV1: typeof PeerbenchMultipleChoiceResponseSchemaV1;
639
+ type index$2_PeerbenchMultipleChoiceResponseV1 = PeerbenchMultipleChoiceResponseV1;
640
+ declare const index$2_PeerbenchMultipleChoiceScoreSchemaV1: typeof PeerbenchMultipleChoiceScoreSchemaV1;
641
+ type index$2_PeerbenchMultipleChoiceScoreV1 = PeerbenchMultipleChoiceScoreV1;
642
+ declare const index$2_PeerbenchMultipleChoiceTestCaseSchemaV1: typeof PeerbenchMultipleChoiceTestCaseSchemaV1;
643
+ type index$2_PeerbenchMultipleChoiceTestCaseV1 = PeerbenchMultipleChoiceTestCaseV1;
644
+ declare const index$2_PeerbenchOpenEndedResponseSchemaV1: typeof PeerbenchOpenEndedResponseSchemaV1;
645
+ type index$2_PeerbenchOpenEndedResponseV1 = PeerbenchOpenEndedResponseV1;
646
+ declare const index$2_PeerbenchOpenEndedScoreSchemaV1: typeof PeerbenchOpenEndedScoreSchemaV1;
647
+ type index$2_PeerbenchOpenEndedScoreV1 = PeerbenchOpenEndedScoreV1;
648
+ declare const index$2_PeerbenchOpenEndedTestCaseSchemaV1: typeof PeerbenchOpenEndedTestCaseSchemaV1;
649
+ type index$2_PeerbenchOpenEndedTestCaseV1 = PeerbenchOpenEndedTestCaseV1;
650
+ declare namespace index$2 {
651
+ export { index$2_PeerbenchJSONDataLoader as PeerbenchJSONDataLoader, index$2_PeerbenchMultipleChoiceResponseSchemaV1 as PeerbenchMultipleChoiceResponseSchemaV1, type index$2_PeerbenchMultipleChoiceResponseV1 as PeerbenchMultipleChoiceResponseV1, index$2_PeerbenchMultipleChoiceScoreSchemaV1 as PeerbenchMultipleChoiceScoreSchemaV1, type index$2_PeerbenchMultipleChoiceScoreV1 as PeerbenchMultipleChoiceScoreV1, index$2_PeerbenchMultipleChoiceTestCaseSchemaV1 as PeerbenchMultipleChoiceTestCaseSchemaV1, type index$2_PeerbenchMultipleChoiceTestCaseV1 as PeerbenchMultipleChoiceTestCaseV1, index$2_PeerbenchOpenEndedResponseSchemaV1 as PeerbenchOpenEndedResponseSchemaV1, type index$2_PeerbenchOpenEndedResponseV1 as PeerbenchOpenEndedResponseV1, index$2_PeerbenchOpenEndedScoreSchemaV1 as PeerbenchOpenEndedScoreSchemaV1, type index$2_PeerbenchOpenEndedScoreV1 as PeerbenchOpenEndedScoreV1, index$2_PeerbenchOpenEndedTestCaseSchemaV1 as PeerbenchOpenEndedTestCaseSchemaV1, type index$2_PeerbenchOpenEndedTestCaseV1 as PeerbenchOpenEndedTestCaseV1, runTestCase$2 as runTestCase };
652
+ }
653
+
654
+ declare const MMLUProMainTestCaseSchemaV1: z$1.ZodObject<Omit<{
655
+ id: z$1.ZodString;
656
+ kind: z$1.ZodString;
657
+ schemaVersion: z$1.ZodNumber;
658
+ metadata: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>>;
659
+ }, "kind" | "schemaVersion"> & {
660
+ question: z$1.ZodString;
661
+ options: z$1.ZodRecord<z$1.ZodString, z$1.ZodString>;
662
+ answer: z$1.ZodString;
663
+ answerKey: z$1.ZodString;
664
+ } & {
665
+ kind: z$1.ZodLiteral<"mmlu-pro.ts.main">;
666
+ schemaVersion: z$1.ZodLiteral<1>;
667
+ }, z$1.core.$strip> & {
668
+ new: (input: Omit<{
669
+ id: string;
670
+ question: string;
671
+ options: Record<string, string>;
672
+ answer: string;
673
+ answerKey: string;
674
+ kind: "mmlu-pro.ts.main";
675
+ schemaVersion: 1;
676
+ metadata?: Record<string, unknown> | undefined;
677
+ }, "kind" | "schemaVersion">) => {
678
+ id: string;
679
+ question: string;
680
+ options: Record<string, string>;
681
+ answer: string;
682
+ answerKey: string;
683
+ kind: "mmlu-pro.ts.main";
684
+ schemaVersion: 1;
685
+ metadata?: Record<string, unknown> | undefined;
686
+ };
687
+ newWithId(input: Omit<{
688
+ id: string;
689
+ question: string;
690
+ options: Record<string, string>;
691
+ answer: string;
692
+ answerKey: string;
693
+ kind: "mmlu-pro.ts.main";
694
+ schemaVersion: 1;
695
+ metadata?: Record<string, unknown> | undefined;
696
+ }, "id" | "kind" | "schemaVersion">, generator: IdGenerator): Promise<{
697
+ id: string;
698
+ question: string;
699
+ options: Record<string, string>;
700
+ answer: string;
701
+ answerKey: string;
702
+ kind: "mmlu-pro.ts.main";
703
+ schemaVersion: 1;
704
+ metadata?: Record<string, unknown> | undefined;
705
+ }>;
706
+ };
707
+ type MMLUProMainTestCaseV1 = z$1.infer<typeof MMLUProMainTestCaseSchemaV1>;
708
+ declare const MMLUProMainResponseSchemaV1: z$1.ZodObject<Omit<Omit<{
709
+ id: z$1.ZodString;
710
+ kind: z$1.ZodString;
711
+ schemaVersion: z$1.ZodNumber;
712
+ startedAt: z$1.ZodNumber;
713
+ completedAt: z$1.ZodNumber;
714
+ testCaseId: z$1.ZodString;
715
+ metadata: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>>;
716
+ }, "kind" | "schemaVersion"> & {
717
+ data: z$1.ZodString;
718
+ modelSlug: z$1.ZodString;
719
+ provider: z$1.ZodString;
720
+ systemPromptId: z$1.ZodOptional<z$1.ZodString>;
721
+ inputTokensUsed: z$1.ZodOptional<z$1.ZodNumber>;
722
+ outputTokensUsed: z$1.ZodOptional<z$1.ZodNumber>;
723
+ inputCost: z$1.ZodOptional<z$1.ZodString>;
724
+ outputCost: z$1.ZodOptional<z$1.ZodString>;
725
+ } & {
726
+ kind: z$1.ZodString;
727
+ schemaVersion: z$1.ZodNumber;
728
+ }, "kind" | "schemaVersion"> & {
729
+ kind: z$1.ZodLiteral<"mmlu-pro.rs.main">;
730
+ schemaVersion: z$1.ZodLiteral<1>;
731
+ }, z$1.core.$strip> & {
732
+ new: (input: Omit<{
733
+ id: string;
734
+ testCaseId: string;
735
+ startedAt: number;
736
+ completedAt: number;
737
+ data: string;
738
+ provider: string;
739
+ modelSlug: string;
740
+ kind: "mmlu-pro.rs.main";
741
+ schemaVersion: 1;
742
+ metadata?: Record<string, unknown> | undefined;
743
+ inputTokensUsed?: number | undefined;
744
+ outputTokensUsed?: number | undefined;
745
+ inputCost?: string | undefined;
746
+ outputCost?: string | undefined;
747
+ systemPromptId?: string | undefined;
748
+ }, "kind" | "schemaVersion">) => {
749
+ id: string;
750
+ testCaseId: string;
751
+ startedAt: number;
752
+ completedAt: number;
753
+ data: string;
754
+ provider: string;
755
+ modelSlug: string;
756
+ kind: "mmlu-pro.rs.main";
757
+ schemaVersion: 1;
758
+ metadata?: Record<string, unknown> | undefined;
759
+ inputTokensUsed?: number | undefined;
760
+ outputTokensUsed?: number | undefined;
761
+ inputCost?: string | undefined;
762
+ outputCost?: string | undefined;
763
+ systemPromptId?: string | undefined;
764
+ };
765
+ newWithId(input: Omit<{
766
+ id: string;
767
+ testCaseId: string;
768
+ startedAt: number;
769
+ completedAt: number;
770
+ data: string;
771
+ provider: string;
772
+ modelSlug: string;
773
+ kind: "mmlu-pro.rs.main";
774
+ schemaVersion: 1;
775
+ metadata?: Record<string, unknown> | undefined;
776
+ inputTokensUsed?: number | undefined;
777
+ outputTokensUsed?: number | undefined;
778
+ inputCost?: string | undefined;
779
+ outputCost?: string | undefined;
780
+ systemPromptId?: string | undefined;
781
+ }, "id" | "kind" | "schemaVersion">, generator: IdGenerator): Promise<{
782
+ id: string;
783
+ testCaseId: string;
784
+ startedAt: number;
785
+ completedAt: number;
786
+ data: string;
787
+ provider: string;
788
+ modelSlug: string;
789
+ kind: "mmlu-pro.rs.main";
790
+ schemaVersion: 1;
791
+ metadata?: Record<string, unknown> | undefined;
792
+ inputTokensUsed?: number | undefined;
793
+ outputTokensUsed?: number | undefined;
794
+ inputCost?: string | undefined;
795
+ outputCost?: string | undefined;
796
+ systemPromptId?: string | undefined;
797
+ }>;
798
+ };
799
+ type MMLUProMainResponseV1 = z$1.infer<typeof MMLUProMainResponseSchemaV1>;
800
+ declare const MMLUProMainScoreSchemaV1: z$1.ZodObject<Omit<Omit<{
801
+ id: z$1.ZodString;
802
+ kind: z$1.ZodString;
803
+ schemaVersion: z$1.ZodNumber;
804
+ value: z$1.ZodNumber;
805
+ responseId: z$1.ZodString;
806
+ explanation: z$1.ZodOptional<z$1.ZodString>;
807
+ metadata: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>>;
808
+ scoringMethod: z$1.ZodEnum<{
809
+ readonly ai: "ai";
810
+ readonly human: "human";
811
+ readonly algo: "algo";
812
+ }>;
813
+ }, "kind" | "schemaVersion"> & {
814
+ scorerAIProvider: z$1.ZodOptional<z$1.ZodString>;
815
+ scorerAIModelSlug: z$1.ZodOptional<z$1.ZodString>;
816
+ scorerAIInputTokensUsed: z$1.ZodOptional<z$1.ZodNumber>;
817
+ scorerAIOutputTokensUsed: z$1.ZodOptional<z$1.ZodNumber>;
818
+ scorerAIInputCost: z$1.ZodOptional<z$1.ZodString>;
819
+ scorerAIOutputCost: z$1.ZodOptional<z$1.ZodString>;
820
+ } & {
821
+ kind: z$1.ZodString;
822
+ schemaVersion: z$1.ZodNumber;
823
+ }, "kind" | "schemaVersion"> & {
824
+ extractedAnswers: z$1.ZodArray<z$1.ZodString>;
825
+ } & {
826
+ kind: z$1.ZodLiteral<"mmlu-pro.sc.main">;
827
+ schemaVersion: z$1.ZodLiteral<1>;
828
+ }, z$1.core.$strip> & {
829
+ new: (input: Omit<{
830
+ id: string;
831
+ value: number;
832
+ responseId: string;
833
+ scoringMethod: "ai" | "human" | "algo";
834
+ extractedAnswers: string[];
835
+ kind: "mmlu-pro.sc.main";
836
+ schemaVersion: 1;
837
+ metadata?: Record<string, unknown> | undefined;
838
+ explanation?: string | undefined;
839
+ scorerAIProvider?: string | undefined;
840
+ scorerAIModelSlug?: string | undefined;
841
+ scorerAIInputTokensUsed?: number | undefined;
842
+ scorerAIOutputTokensUsed?: number | undefined;
843
+ scorerAIInputCost?: string | undefined;
844
+ scorerAIOutputCost?: string | undefined;
845
+ }, "kind" | "schemaVersion">) => {
846
+ id: string;
847
+ value: number;
848
+ responseId: string;
849
+ scoringMethod: "ai" | "human" | "algo";
850
+ extractedAnswers: string[];
851
+ kind: "mmlu-pro.sc.main";
852
+ schemaVersion: 1;
853
+ metadata?: Record<string, unknown> | undefined;
854
+ explanation?: string | undefined;
855
+ scorerAIProvider?: string | undefined;
856
+ scorerAIModelSlug?: string | undefined;
857
+ scorerAIInputTokensUsed?: number | undefined;
858
+ scorerAIOutputTokensUsed?: number | undefined;
859
+ scorerAIInputCost?: string | undefined;
860
+ scorerAIOutputCost?: string | undefined;
861
+ };
862
+ newWithId(input: Omit<{
863
+ id: string;
864
+ value: number;
865
+ responseId: string;
866
+ scoringMethod: "ai" | "human" | "algo";
867
+ extractedAnswers: string[];
868
+ kind: "mmlu-pro.sc.main";
869
+ schemaVersion: 1;
870
+ metadata?: Record<string, unknown> | undefined;
871
+ explanation?: string | undefined;
872
+ scorerAIProvider?: string | undefined;
873
+ scorerAIModelSlug?: string | undefined;
874
+ scorerAIInputTokensUsed?: number | undefined;
875
+ scorerAIOutputTokensUsed?: number | undefined;
876
+ scorerAIInputCost?: string | undefined;
877
+ scorerAIOutputCost?: string | undefined;
878
+ }, "id" | "kind" | "schemaVersion">, generator: IdGenerator): Promise<{
879
+ id: string;
880
+ value: number;
881
+ responseId: string;
882
+ scoringMethod: "ai" | "human" | "algo";
883
+ extractedAnswers: string[];
884
+ kind: "mmlu-pro.sc.main";
885
+ schemaVersion: 1;
886
+ metadata?: Record<string, unknown> | undefined;
887
+ explanation?: string | undefined;
888
+ scorerAIProvider?: string | undefined;
889
+ scorerAIModelSlug?: string | undefined;
890
+ scorerAIInputTokensUsed?: number | undefined;
891
+ scorerAIOutputTokensUsed?: number | undefined;
892
+ scorerAIInputCost?: string | undefined;
893
+ scorerAIOutputCost?: string | undefined;
894
+ }>;
895
+ };
896
+ type MMLUProMainScoreV1 = z$1.infer<typeof MMLUProMainScoreSchemaV1>;
897
+
898
+ declare const MMLUProBenchmarkSpecSchemaV1: z__default.ZodObject<Omit<{
899
+ kind: z__default.ZodString;
900
+ schemaVersion: z__default.ZodNumber;
901
+ metadata: z__default.ZodOptional<z__default.ZodRecord<z__default.ZodString, z__default.ZodUnknown>>;
902
+ }, "kind" | "schemaVersion"> & {
903
+ kind: z__default.ZodLiteral<"mmlu-pro.benchmark.spec">;
904
+ schemaVersion: z__default.ZodLiteral<1>;
905
+ }, z__default.core.$strip> & {
906
+ new: (input: Omit<{
907
+ kind: "mmlu-pro.benchmark.spec";
908
+ schemaVersion: 1;
909
+ metadata?: Record<string, unknown> | undefined;
910
+ }, "kind" | "schemaVersion">) => {
911
+ kind: "mmlu-pro.benchmark.spec";
912
+ schemaVersion: 1;
913
+ metadata?: Record<string, unknown> | undefined;
914
+ };
915
+ newWithId(input: Omit<{
916
+ kind: "mmlu-pro.benchmark.spec";
917
+ schemaVersion: 1;
918
+ metadata?: Record<string, unknown> | undefined;
919
+ }, "id" | "kind" | "schemaVersion">, generator: IdGenerator): Promise<{
920
+ kind: "mmlu-pro.benchmark.spec";
921
+ schemaVersion: 1;
922
+ metadata?: Record<string, unknown> | undefined;
923
+ }>;
924
+ };
925
+ type MMLUProBenchmarkSpecV1 = z__default.infer<typeof MMLUProBenchmarkSpecSchemaV1>;
926
+
927
+ declare class MMLUProJSONDataLoader extends AbstractDataLoader {
928
+ readonly kind = "mmlu-pro.load.json.data";
929
+ loadData(params: {
930
+ content: Uint8Array;
931
+ }): LoaderResult<MMLUProMainTestCaseV1, MMLUProMainResponseV1, MMLUProMainScoreV1>;
932
+ loadBenchmarkSpec(params: {
933
+ content: Uint8Array;
934
+ }): Promise<MMLUProBenchmarkSpecV1>;
935
+ }
936
+ declare class MMLUProParquetDataLoader extends AbstractDataLoader {
937
+ readonly kind = "mmlu-pro.load.parquet.data";
938
+ loadData(params: {
939
+ content: Uint8Array;
940
+ }): Promise<LoaderResult<MMLUProMainTestCaseV1, MMLUProMainResponseV1, MMLUProMainScoreV1>>;
941
+ loadBenchmarkSpec(params: {
942
+ content: Uint8Array;
943
+ }): Promise<MMLUProBenchmarkSpecV1>;
944
+ }
945
+
946
+ declare function runTestCase$1(params: {
947
+ testCase: MMLUProMainTestCaseV1;
948
+ provider: AbstractLLMProvider;
949
+ scorer?: MCQScorer | LLMJudgeScorer;
950
+ spec?: MMLUProBenchmarkSpecV1;
951
+ runConfig: {
952
+ model: string;
953
+ llmJudgeModel?: string;
954
+ };
955
+ systemPrompt?: SimpleSystemPromptV1;
956
+ idGenerators?: {
957
+ response?: IdGenerator;
958
+ score?: IdGenerator;
959
+ };
960
+ }): Promise<RunnerResult<MMLUProMainResponseV1, MMLUProMainScoreV1>>;
961
+
962
+ declare const BaseMMLUProScoreSchemaV1: z.ZodObject<Omit<{
963
+ id: z.ZodString;
964
+ kind: z.ZodString;
965
+ schemaVersion: z.ZodNumber;
966
+ value: z.ZodNumber;
967
+ responseId: z.ZodString;
968
+ explanation: z.ZodOptional<z.ZodString>;
969
+ metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
970
+ scoringMethod: z.ZodEnum<{
971
+ readonly ai: "ai";
972
+ readonly human: "human";
973
+ readonly algo: "algo";
974
+ }>;
975
+ }, "kind" | "schemaVersion"> & {
976
+ scorerAIProvider: z.ZodOptional<z.ZodString>;
977
+ scorerAIModelSlug: z.ZodOptional<z.ZodString>;
978
+ scorerAIInputTokensUsed: z.ZodOptional<z.ZodNumber>;
979
+ scorerAIOutputTokensUsed: z.ZodOptional<z.ZodNumber>;
980
+ scorerAIInputCost: z.ZodOptional<z.ZodString>;
981
+ scorerAIOutputCost: z.ZodOptional<z.ZodString>;
982
+ } & {
983
+ kind: z.ZodString;
984
+ schemaVersion: z.ZodNumber;
985
+ }, zod_v4_core.$strip> & {
986
+ new: (input: Omit<{
987
+ id: string;
988
+ value: number;
989
+ responseId: string;
990
+ scoringMethod: "ai" | "human" | "algo";
991
+ kind: string;
992
+ schemaVersion: number;
993
+ metadata?: Record<string, unknown> | undefined;
994
+ explanation?: string | undefined;
995
+ scorerAIProvider?: string | undefined;
996
+ scorerAIModelSlug?: string | undefined;
997
+ scorerAIInputTokensUsed?: number | undefined;
998
+ scorerAIOutputTokensUsed?: number | undefined;
999
+ scorerAIInputCost?: string | undefined;
1000
+ scorerAIOutputCost?: string | undefined;
1001
+ }, "kind" | "schemaVersion">) => {
1002
+ id: string;
1003
+ value: number;
1004
+ responseId: string;
1005
+ scoringMethod: "ai" | "human" | "algo";
1006
+ kind: string;
1007
+ schemaVersion: number;
1008
+ metadata?: Record<string, unknown> | undefined;
1009
+ explanation?: string | undefined;
1010
+ scorerAIProvider?: string | undefined;
1011
+ scorerAIModelSlug?: string | undefined;
1012
+ scorerAIInputTokensUsed?: number | undefined;
1013
+ scorerAIOutputTokensUsed?: number | undefined;
1014
+ scorerAIInputCost?: string | undefined;
1015
+ scorerAIOutputCost?: string | undefined;
1016
+ };
1017
+ newWithId(input: Omit<{
1018
+ id: string;
1019
+ value: number;
1020
+ responseId: string;
1021
+ scoringMethod: "ai" | "human" | "algo";
1022
+ kind: string;
1023
+ schemaVersion: number;
1024
+ metadata?: Record<string, unknown> | undefined;
1025
+ explanation?: string | undefined;
1026
+ scorerAIProvider?: string | undefined;
1027
+ scorerAIModelSlug?: string | undefined;
1028
+ scorerAIInputTokensUsed?: number | undefined;
1029
+ scorerAIOutputTokensUsed?: number | undefined;
1030
+ scorerAIInputCost?: string | undefined;
1031
+ scorerAIOutputCost?: string | undefined;
1032
+ }, "id" | "kind" | "schemaVersion">, generator: IdGenerator): Promise<{
1033
+ id: string;
1034
+ value: number;
1035
+ responseId: string;
1036
+ scoringMethod: "ai" | "human" | "algo";
1037
+ kind: string;
1038
+ schemaVersion: number;
1039
+ metadata?: Record<string, unknown> | undefined;
1040
+ explanation?: string | undefined;
1041
+ scorerAIProvider?: string | undefined;
1042
+ scorerAIModelSlug?: string | undefined;
1043
+ scorerAIInputTokensUsed?: number | undefined;
1044
+ scorerAIOutputTokensUsed?: number | undefined;
1045
+ scorerAIInputCost?: string | undefined;
1046
+ scorerAIOutputCost?: string | undefined;
1047
+ }>;
1048
+ };
1049
+
1050
+ declare const index$1_BaseMMLUProScoreSchemaV1: typeof BaseMMLUProScoreSchemaV1;
1051
+ declare const index$1_MMLUProBenchmarkSpecSchemaV1: typeof MMLUProBenchmarkSpecSchemaV1;
1052
+ type index$1_MMLUProBenchmarkSpecV1 = MMLUProBenchmarkSpecV1;
1053
+ type index$1_MMLUProJSONDataLoader = MMLUProJSONDataLoader;
1054
+ declare const index$1_MMLUProJSONDataLoader: typeof MMLUProJSONDataLoader;
1055
+ declare const index$1_MMLUProMainResponseSchemaV1: typeof MMLUProMainResponseSchemaV1;
1056
+ type index$1_MMLUProMainResponseV1 = MMLUProMainResponseV1;
1057
+ declare const index$1_MMLUProMainScoreSchemaV1: typeof MMLUProMainScoreSchemaV1;
1058
+ type index$1_MMLUProMainScoreV1 = MMLUProMainScoreV1;
1059
+ declare const index$1_MMLUProMainTestCaseSchemaV1: typeof MMLUProMainTestCaseSchemaV1;
1060
+ type index$1_MMLUProMainTestCaseV1 = MMLUProMainTestCaseV1;
1061
+ type index$1_MMLUProParquetDataLoader = MMLUProParquetDataLoader;
1062
+ declare const index$1_MMLUProParquetDataLoader: typeof MMLUProParquetDataLoader;
1063
+ declare namespace index$1 {
1064
+ export { index$1_BaseMMLUProScoreSchemaV1 as BaseMMLUProScoreSchemaV1, index$1_MMLUProBenchmarkSpecSchemaV1 as MMLUProBenchmarkSpecSchemaV1, type index$1_MMLUProBenchmarkSpecV1 as MMLUProBenchmarkSpecV1, index$1_MMLUProJSONDataLoader as MMLUProJSONDataLoader, index$1_MMLUProMainResponseSchemaV1 as MMLUProMainResponseSchemaV1, type index$1_MMLUProMainResponseV1 as MMLUProMainResponseV1, index$1_MMLUProMainScoreSchemaV1 as MMLUProMainScoreSchemaV1, type index$1_MMLUProMainScoreV1 as MMLUProMainScoreV1, index$1_MMLUProMainTestCaseSchemaV1 as MMLUProMainTestCaseSchemaV1, type index$1_MMLUProMainTestCaseV1 as MMLUProMainTestCaseV1, index$1_MMLUProParquetDataLoader as MMLUProParquetDataLoader, runTestCase$1 as runTestCase };
1065
+ }
1066
+
1067
+ declare class FNOLFieldsScorer extends AbstractScorer {
1068
+ readonly kind = "fnol.fields";
1069
+ score(params: {
1070
+ fieldsToCollect: Record<string, {
1071
+ required?: boolean;
1072
+ expected?: unknown;
1073
+ description?: string;
1074
+ }>;
1075
+ extracted?: Record<string, unknown>;
1076
+ }): Promise<BaseScorerResult & {
1077
+ requiredKeys: string[];
1078
+ presentKeys: string[];
1079
+ missingKeys: string[];
1080
+ mismatchedKeys: string[];
1081
+ }>;
1082
+ }
1083
+
1084
+ declare const FNOLFieldSchemaV1: z$1.ZodObject<{
1085
+ description: z$1.ZodString;
1086
+ required: z$1.ZodOptional<z$1.ZodBoolean>;
1087
+ expected: z$1.ZodOptional<z$1.ZodUnknown>;
1088
+ valueType: z$1.ZodOptional<z$1.ZodEnum<{
1089
+ readonly string: "string";
1090
+ readonly number: "number";
1091
+ readonly boolean: "boolean";
1092
+ readonly object: "object";
1093
+ }>>;
1094
+ }, z$1.core.$strip>;
1095
+ declare const FNOLTestCaseSchemaV1: z$1.ZodObject<Omit<{
1096
+ id: z$1.ZodString;
1097
+ kind: z$1.ZodString;
1098
+ schemaVersion: z$1.ZodNumber;
1099
+ metadata: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>>;
1100
+ }, "kind" | "schemaVersion"> & {
1101
+ /**
1102
+ * Scenario starter message. This is what the "user" would say initially.
1103
+ */
1104
+ initialUserMessage: z$1.ZodString;
1105
+ /**
1106
+ * Private/structured information about the user and the incident.
1107
+ * This is used by the user simulator LLM to answer the target model questions.
1108
+ */
1109
+ userProfile: z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>;
1110
+ /**
1111
+ * The fields the target model must collect.
1112
+ * Keys are canonical identifiers (e.g. "policyNumber", "dateOfLoss").
1113
+ */
1114
+ fieldsToCollect: z$1.ZodRecord<z$1.ZodString, z$1.ZodObject<{
1115
+ description: z$1.ZodString;
1116
+ required: z$1.ZodOptional<z$1.ZodBoolean>;
1117
+ expected: z$1.ZodOptional<z$1.ZodUnknown>;
1118
+ valueType: z$1.ZodOptional<z$1.ZodEnum<{
1119
+ readonly string: "string";
1120
+ readonly number: "number";
1121
+ readonly boolean: "boolean";
1122
+ readonly object: "object";
1123
+ }>>;
1124
+ }, z$1.core.$strip>>;
1125
+ /**
1126
+ * Maximum number of back-and-forth turns (target question + user answer).
1127
+ */
1128
+ maxTurns: z$1.ZodDefault<z$1.ZodNumber>;
1129
+ } & {
1130
+ kind: z$1.ZodLiteral<"fnol.ts.v1">;
1131
+ schemaVersion: z$1.ZodLiteral<1>;
1132
+ }, z$1.core.$strip> & {
1133
+ new: (input: Omit<{
1134
+ id: string;
1135
+ initialUserMessage: string;
1136
+ userProfile: Record<string, unknown>;
1137
+ fieldsToCollect: Record<string, {
1138
+ description: string;
1139
+ required?: boolean | undefined;
1140
+ expected?: unknown;
1141
+ valueType?: "string" | "number" | "boolean" | "object" | undefined;
1142
+ }>;
1143
+ maxTurns: number;
1144
+ kind: "fnol.ts.v1";
1145
+ schemaVersion: 1;
1146
+ metadata?: Record<string, unknown> | undefined;
1147
+ }, "kind" | "schemaVersion">) => {
1148
+ id: string;
1149
+ initialUserMessage: string;
1150
+ userProfile: Record<string, unknown>;
1151
+ fieldsToCollect: Record<string, {
1152
+ description: string;
1153
+ required?: boolean | undefined;
1154
+ expected?: unknown;
1155
+ valueType?: "string" | "number" | "boolean" | "object" | undefined;
1156
+ }>;
1157
+ maxTurns: number;
1158
+ kind: "fnol.ts.v1";
1159
+ schemaVersion: 1;
1160
+ metadata?: Record<string, unknown> | undefined;
1161
+ };
1162
+ newWithId(input: Omit<{
1163
+ id: string;
1164
+ initialUserMessage: string;
1165
+ userProfile: Record<string, unknown>;
1166
+ fieldsToCollect: Record<string, {
1167
+ description: string;
1168
+ required?: boolean | undefined;
1169
+ expected?: unknown;
1170
+ valueType?: "string" | "number" | "boolean" | "object" | undefined;
1171
+ }>;
1172
+ maxTurns: number;
1173
+ kind: "fnol.ts.v1";
1174
+ schemaVersion: 1;
1175
+ metadata?: Record<string, unknown> | undefined;
1176
+ }, "id" | "kind" | "schemaVersion">, generator: IdGenerator): Promise<{
1177
+ id: string;
1178
+ initialUserMessage: string;
1179
+ userProfile: Record<string, unknown>;
1180
+ fieldsToCollect: Record<string, {
1181
+ description: string;
1182
+ required?: boolean | undefined;
1183
+ expected?: unknown;
1184
+ valueType?: "string" | "number" | "boolean" | "object" | undefined;
1185
+ }>;
1186
+ maxTurns: number;
1187
+ kind: "fnol.ts.v1";
1188
+ schemaVersion: 1;
1189
+ metadata?: Record<string, unknown> | undefined;
1190
+ }>;
1191
+ };
1192
+ type FNOLTestCaseV1 = z$1.infer<typeof FNOLTestCaseSchemaV1>;
1193
+ declare const FNOLConversationMessageSchemaV1: z$1.ZodObject<{
1194
+ role: z$1.ZodEnum<{
1195
+ system: "system";
1196
+ user: "user";
1197
+ assistant: "assistant";
1198
+ }>;
1199
+ content: z$1.ZodString;
1200
+ }, z$1.core.$strip>;
1201
+ declare const FNOLResponseSchemaV1: z$1.ZodObject<Omit<Omit<{
1202
+ id: z$1.ZodString;
1203
+ kind: z$1.ZodString;
1204
+ schemaVersion: z$1.ZodNumber;
1205
+ startedAt: z$1.ZodNumber;
1206
+ completedAt: z$1.ZodNumber;
1207
+ testCaseId: z$1.ZodString;
1208
+ metadata: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>>;
1209
+ }, "kind" | "schemaVersion"> & {
1210
+ data: z$1.ZodString;
1211
+ modelSlug: z$1.ZodString;
1212
+ provider: z$1.ZodString;
1213
+ systemPromptId: z$1.ZodOptional<z$1.ZodString>;
1214
+ inputTokensUsed: z$1.ZodOptional<z$1.ZodNumber>;
1215
+ outputTokensUsed: z$1.ZodOptional<z$1.ZodNumber>;
1216
+ inputCost: z$1.ZodOptional<z$1.ZodString>;
1217
+ outputCost: z$1.ZodOptional<z$1.ZodString>;
1218
+ } & {
1219
+ kind: z$1.ZodString;
1220
+ schemaVersion: z$1.ZodNumber;
1221
+ }, "kind" | "schemaVersion"> & {
1222
+ /**
1223
+ * Full conversation between the target model and simulated user.
1224
+ */
1225
+ conversation: z$1.ZodArray<z$1.ZodObject<{
1226
+ role: z$1.ZodEnum<{
1227
+ system: "system";
1228
+ user: "user";
1229
+ assistant: "assistant";
1230
+ }>;
1231
+ content: z$1.ZodString;
1232
+ }, z$1.core.$strip>>;
1233
+ turnsUsed: z$1.ZodNumber;
1234
+ doneReason: z$1.ZodEnum<{
1235
+ readonly modelProvidedJson: "modelProvidedJson";
1236
+ readonly reachedMaxTurns: "reachedMaxTurns";
1237
+ readonly forcedFinalJson: "forcedFinalJson";
1238
+ }>;
1239
+ /**
1240
+ * Parsed JSON object from the target model's final answer, if available.
1241
+ */
1242
+ extracted: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>>;
1243
+ } & {
1244
+ kind: z$1.ZodLiteral<"fnol.rs.v1">;
1245
+ schemaVersion: z$1.ZodLiteral<1>;
1246
+ }, z$1.core.$strip> & {
1247
+ new: (input: Omit<{
1248
+ id: string;
1249
+ testCaseId: string;
1250
+ startedAt: number;
1251
+ completedAt: number;
1252
+ data: string;
1253
+ provider: string;
1254
+ modelSlug: string;
1255
+ conversation: {
1256
+ role: "system" | "user" | "assistant";
1257
+ content: string;
1258
+ }[];
1259
+ turnsUsed: number;
1260
+ doneReason: "modelProvidedJson" | "reachedMaxTurns" | "forcedFinalJson";
1261
+ kind: "fnol.rs.v1";
1262
+ schemaVersion: 1;
1263
+ metadata?: Record<string, unknown> | undefined;
1264
+ inputTokensUsed?: number | undefined;
1265
+ outputTokensUsed?: number | undefined;
1266
+ inputCost?: string | undefined;
1267
+ outputCost?: string | undefined;
1268
+ systemPromptId?: string | undefined;
1269
+ extracted?: Record<string, unknown> | undefined;
1270
+ }, "kind" | "schemaVersion">) => {
1271
+ id: string;
1272
+ testCaseId: string;
1273
+ startedAt: number;
1274
+ completedAt: number;
1275
+ data: string;
1276
+ provider: string;
1277
+ modelSlug: string;
1278
+ conversation: {
1279
+ role: "system" | "user" | "assistant";
1280
+ content: string;
1281
+ }[];
1282
+ turnsUsed: number;
1283
+ doneReason: "modelProvidedJson" | "reachedMaxTurns" | "forcedFinalJson";
1284
+ kind: "fnol.rs.v1";
1285
+ schemaVersion: 1;
1286
+ metadata?: Record<string, unknown> | undefined;
1287
+ inputTokensUsed?: number | undefined;
1288
+ outputTokensUsed?: number | undefined;
1289
+ inputCost?: string | undefined;
1290
+ outputCost?: string | undefined;
1291
+ systemPromptId?: string | undefined;
1292
+ extracted?: Record<string, unknown> | undefined;
1293
+ };
1294
+ newWithId(input: Omit<{
1295
+ id: string;
1296
+ testCaseId: string;
1297
+ startedAt: number;
1298
+ completedAt: number;
1299
+ data: string;
1300
+ provider: string;
1301
+ modelSlug: string;
1302
+ conversation: {
1303
+ role: "system" | "user" | "assistant";
1304
+ content: string;
1305
+ }[];
1306
+ turnsUsed: number;
1307
+ doneReason: "modelProvidedJson" | "reachedMaxTurns" | "forcedFinalJson";
1308
+ kind: "fnol.rs.v1";
1309
+ schemaVersion: 1;
1310
+ metadata?: Record<string, unknown> | undefined;
1311
+ inputTokensUsed?: number | undefined;
1312
+ outputTokensUsed?: number | undefined;
1313
+ inputCost?: string | undefined;
1314
+ outputCost?: string | undefined;
1315
+ systemPromptId?: string | undefined;
1316
+ extracted?: Record<string, unknown> | undefined;
1317
+ }, "id" | "kind" | "schemaVersion">, generator: IdGenerator): Promise<{
1318
+ id: string;
1319
+ testCaseId: string;
1320
+ startedAt: number;
1321
+ completedAt: number;
1322
+ data: string;
1323
+ provider: string;
1324
+ modelSlug: string;
1325
+ conversation: {
1326
+ role: "system" | "user" | "assistant";
1327
+ content: string;
1328
+ }[];
1329
+ turnsUsed: number;
1330
+ doneReason: "modelProvidedJson" | "reachedMaxTurns" | "forcedFinalJson";
1331
+ kind: "fnol.rs.v1";
1332
+ schemaVersion: 1;
1333
+ metadata?: Record<string, unknown> | undefined;
1334
+ inputTokensUsed?: number | undefined;
1335
+ outputTokensUsed?: number | undefined;
1336
+ inputCost?: string | undefined;
1337
+ outputCost?: string | undefined;
1338
+ systemPromptId?: string | undefined;
1339
+ extracted?: Record<string, unknown> | undefined;
1340
+ }>;
1341
+ };
1342
+ type FNOLResponseV1 = z$1.infer<typeof FNOLResponseSchemaV1>;
1343
+ declare const FNOLFieldsScoreSchemaV1: z$1.ZodObject<Omit<Omit<{
1344
+ id: z$1.ZodString;
1345
+ kind: z$1.ZodString;
1346
+ schemaVersion: z$1.ZodNumber;
1347
+ value: z$1.ZodNumber;
1348
+ responseId: z$1.ZodString;
1349
+ explanation: z$1.ZodOptional<z$1.ZodString>;
1350
+ metadata: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>>;
1351
+ scoringMethod: z$1.ZodEnum<{
1352
+ readonly ai: "ai";
1353
+ readonly human: "human";
1354
+ readonly algo: "algo";
1355
+ }>;
1356
+ }, "kind" | "schemaVersion"> & {
1357
+ scorerAIProvider: z$1.ZodOptional<z$1.ZodString>;
1358
+ scorerAIModelSlug: z$1.ZodOptional<z$1.ZodString>;
1359
+ scorerAIInputTokensUsed: z$1.ZodOptional<z$1.ZodNumber>;
1360
+ scorerAIOutputTokensUsed: z$1.ZodOptional<z$1.ZodNumber>;
1361
+ scorerAIInputCost: z$1.ZodOptional<z$1.ZodString>;
1362
+ scorerAIOutputCost: z$1.ZodOptional<z$1.ZodString>;
1363
+ } & {
1364
+ kind: z$1.ZodString;
1365
+ schemaVersion: z$1.ZodNumber;
1366
+ }, "kind" | "schemaVersion"> & {
1367
+ requiredKeys: z$1.ZodArray<z$1.ZodString>;
1368
+ presentKeys: z$1.ZodArray<z$1.ZodString>;
1369
+ missingKeys: z$1.ZodArray<z$1.ZodString>;
1370
+ mismatchedKeys: z$1.ZodArray<z$1.ZodString>;
1371
+ } & {
1372
+ kind: z$1.ZodLiteral<"fnol.sc.fields.v1">;
1373
+ schemaVersion: z$1.ZodLiteral<1>;
1374
+ }, z$1.core.$strip> & {
1375
+ new: (input: Omit<{
1376
+ id: string;
1377
+ value: number;
1378
+ responseId: string;
1379
+ scoringMethod: "ai" | "human" | "algo";
1380
+ requiredKeys: string[];
1381
+ presentKeys: string[];
1382
+ missingKeys: string[];
1383
+ mismatchedKeys: string[];
1384
+ kind: "fnol.sc.fields.v1";
1385
+ schemaVersion: 1;
1386
+ metadata?: Record<string, unknown> | undefined;
1387
+ explanation?: string | undefined;
1388
+ scorerAIProvider?: string | undefined;
1389
+ scorerAIModelSlug?: string | undefined;
1390
+ scorerAIInputTokensUsed?: number | undefined;
1391
+ scorerAIOutputTokensUsed?: number | undefined;
1392
+ scorerAIInputCost?: string | undefined;
1393
+ scorerAIOutputCost?: string | undefined;
1394
+ }, "kind" | "schemaVersion">) => {
1395
+ id: string;
1396
+ value: number;
1397
+ responseId: string;
1398
+ scoringMethod: "ai" | "human" | "algo";
1399
+ requiredKeys: string[];
1400
+ presentKeys: string[];
1401
+ missingKeys: string[];
1402
+ mismatchedKeys: string[];
1403
+ kind: "fnol.sc.fields.v1";
1404
+ schemaVersion: 1;
1405
+ metadata?: Record<string, unknown> | undefined;
1406
+ explanation?: string | undefined;
1407
+ scorerAIProvider?: string | undefined;
1408
+ scorerAIModelSlug?: string | undefined;
1409
+ scorerAIInputTokensUsed?: number | undefined;
1410
+ scorerAIOutputTokensUsed?: number | undefined;
1411
+ scorerAIInputCost?: string | undefined;
1412
+ scorerAIOutputCost?: string | undefined;
1413
+ };
1414
+ newWithId(input: Omit<{
1415
+ id: string;
1416
+ value: number;
1417
+ responseId: string;
1418
+ scoringMethod: "ai" | "human" | "algo";
1419
+ requiredKeys: string[];
1420
+ presentKeys: string[];
1421
+ missingKeys: string[];
1422
+ mismatchedKeys: string[];
1423
+ kind: "fnol.sc.fields.v1";
1424
+ schemaVersion: 1;
1425
+ metadata?: Record<string, unknown> | undefined;
1426
+ explanation?: string | undefined;
1427
+ scorerAIProvider?: string | undefined;
1428
+ scorerAIModelSlug?: string | undefined;
1429
+ scorerAIInputTokensUsed?: number | undefined;
1430
+ scorerAIOutputTokensUsed?: number | undefined;
1431
+ scorerAIInputCost?: string | undefined;
1432
+ scorerAIOutputCost?: string | undefined;
1433
+ }, "id" | "kind" | "schemaVersion">, generator: IdGenerator): Promise<{
1434
+ id: string;
1435
+ value: number;
1436
+ responseId: string;
1437
+ scoringMethod: "ai" | "human" | "algo";
1438
+ requiredKeys: string[];
1439
+ presentKeys: string[];
1440
+ missingKeys: string[];
1441
+ mismatchedKeys: string[];
1442
+ kind: "fnol.sc.fields.v1";
1443
+ schemaVersion: 1;
1444
+ metadata?: Record<string, unknown> | undefined;
1445
+ explanation?: string | undefined;
1446
+ scorerAIProvider?: string | undefined;
1447
+ scorerAIModelSlug?: string | undefined;
1448
+ scorerAIInputTokensUsed?: number | undefined;
1449
+ scorerAIOutputTokensUsed?: number | undefined;
1450
+ scorerAIInputCost?: string | undefined;
1451
+ scorerAIOutputCost?: string | undefined;
1452
+ }>;
1453
+ };
1454
+ type FNOLFieldsScoreV1 = z$1.infer<typeof FNOLFieldsScoreSchemaV1>;
1455
+ declare const FNOLLLMJudgeScoreSchemaV1: z$1.ZodObject<Omit<Omit<{
1456
+ id: z$1.ZodString;
1457
+ kind: z$1.ZodString;
1458
+ schemaVersion: z$1.ZodNumber;
1459
+ value: z$1.ZodNumber;
1460
+ responseId: z$1.ZodString;
1461
+ explanation: z$1.ZodOptional<z$1.ZodString>;
1462
+ metadata: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>>;
1463
+ scoringMethod: z$1.ZodEnum<{
1464
+ readonly ai: "ai";
1465
+ readonly human: "human";
1466
+ readonly algo: "algo";
1467
+ }>;
1468
+ }, "kind" | "schemaVersion"> & {
1469
+ scorerAIProvider: z$1.ZodOptional<z$1.ZodString>;
1470
+ scorerAIModelSlug: z$1.ZodOptional<z$1.ZodString>;
1471
+ scorerAIInputTokensUsed: z$1.ZodOptional<z$1.ZodNumber>;
1472
+ scorerAIOutputTokensUsed: z$1.ZodOptional<z$1.ZodNumber>;
1473
+ scorerAIInputCost: z$1.ZodOptional<z$1.ZodString>;
1474
+ scorerAIOutputCost: z$1.ZodOptional<z$1.ZodString>;
1475
+ } & {
1476
+ kind: z$1.ZodString;
1477
+ schemaVersion: z$1.ZodNumber;
1478
+ }, "kind" | "schemaVersion"> & {
1479
+ verdict: z$1.ZodOptional<z$1.ZodEnum<{
1480
+ pass: "pass";
1481
+ borderline: "borderline";
1482
+ fail: "fail";
1483
+ }>>;
1484
+ } & {
1485
+ kind: z$1.ZodLiteral<"fnol.sc.llm-judge.v1">;
1486
+ schemaVersion: z$1.ZodLiteral<1>;
1487
+ }, z$1.core.$strip> & {
1488
+ new: (input: Omit<{
1489
+ id: string;
1490
+ value: number;
1491
+ responseId: string;
1492
+ scoringMethod: "ai" | "human" | "algo";
1493
+ kind: "fnol.sc.llm-judge.v1";
1494
+ schemaVersion: 1;
1495
+ metadata?: Record<string, unknown> | undefined;
1496
+ explanation?: string | undefined;
1497
+ scorerAIProvider?: string | undefined;
1498
+ scorerAIModelSlug?: string | undefined;
1499
+ scorerAIInputTokensUsed?: number | undefined;
1500
+ scorerAIOutputTokensUsed?: number | undefined;
1501
+ scorerAIInputCost?: string | undefined;
1502
+ scorerAIOutputCost?: string | undefined;
1503
+ verdict?: "pass" | "borderline" | "fail" | undefined;
1504
+ }, "kind" | "schemaVersion">) => {
1505
+ id: string;
1506
+ value: number;
1507
+ responseId: string;
1508
+ scoringMethod: "ai" | "human" | "algo";
1509
+ kind: "fnol.sc.llm-judge.v1";
1510
+ schemaVersion: 1;
1511
+ metadata?: Record<string, unknown> | undefined;
1512
+ explanation?: string | undefined;
1513
+ scorerAIProvider?: string | undefined;
1514
+ scorerAIModelSlug?: string | undefined;
1515
+ scorerAIInputTokensUsed?: number | undefined;
1516
+ scorerAIOutputTokensUsed?: number | undefined;
1517
+ scorerAIInputCost?: string | undefined;
1518
+ scorerAIOutputCost?: string | undefined;
1519
+ verdict?: "pass" | "borderline" | "fail" | undefined;
1520
+ };
1521
+ newWithId(input: Omit<{
1522
+ id: string;
1523
+ value: number;
1524
+ responseId: string;
1525
+ scoringMethod: "ai" | "human" | "algo";
1526
+ kind: "fnol.sc.llm-judge.v1";
1527
+ schemaVersion: 1;
1528
+ metadata?: Record<string, unknown> | undefined;
1529
+ explanation?: string | undefined;
1530
+ scorerAIProvider?: string | undefined;
1531
+ scorerAIModelSlug?: string | undefined;
1532
+ scorerAIInputTokensUsed?: number | undefined;
1533
+ scorerAIOutputTokensUsed?: number | undefined;
1534
+ scorerAIInputCost?: string | undefined;
1535
+ scorerAIOutputCost?: string | undefined;
1536
+ verdict?: "pass" | "borderline" | "fail" | undefined;
1537
+ }, "id" | "kind" | "schemaVersion">, generator: IdGenerator): Promise<{
1538
+ id: string;
1539
+ value: number;
1540
+ responseId: string;
1541
+ scoringMethod: "ai" | "human" | "algo";
1542
+ kind: "fnol.sc.llm-judge.v1";
1543
+ schemaVersion: 1;
1544
+ metadata?: Record<string, unknown> | undefined;
1545
+ explanation?: string | undefined;
1546
+ scorerAIProvider?: string | undefined;
1547
+ scorerAIModelSlug?: string | undefined;
1548
+ scorerAIInputTokensUsed?: number | undefined;
1549
+ scorerAIOutputTokensUsed?: number | undefined;
1550
+ scorerAIInputCost?: string | undefined;
1551
+ scorerAIOutputCost?: string | undefined;
1552
+ verdict?: "pass" | "borderline" | "fail" | undefined;
1553
+ }>;
1554
+ };
1555
+ type FNOLLLMJudgeScoreV1 = z$1.infer<typeof FNOLLLMJudgeScoreSchemaV1>;
1556
+
1557
+ declare function runTestCase(params: { // Declaration only (no body in this file): takes one FNOLTestCaseV1 plus provider/scorer options and resolves to a RunnerResult.
1558
+ testCase: FNOLTestCaseV1;
1559
+ provider: AbstractLLMProvider; // Required LLM provider.
1560
+ userSimulatorProvider?: AbstractLLMProvider; // Optional. NOTE(review): fallback when omitted is not visible here - confirm in the implementation.
1561
+ scorer?: FNOLFieldsScorer | LLMJudgeScorer; // Optional; either scorer type is accepted.
1562
+ runConfig: { // Required; only `model` is mandatory inside it.
1563
+ model: string;
1564
+ userSimulatorModel?: string;
1565
+ llmJudgeModel?: string;
1566
+ temperature?: number;
1567
+ userSimulatorTemperature?: number;
1568
+ };
1569
+ systemPrompt?: SimpleSystemPromptV1;
1570
+ idGenerators?: { // Optional IdGenerator per entity. NOTE(review): presumably used to mint response/score ids - confirm.
1571
+ response?: IdGenerator;
1572
+ score?: IdGenerator;
1573
+ };
1574
+ }): Promise<RunnerResult<FNOLResponseV1, FNOLFieldsScoreV1 | FNOLLLMJudgeScoreV1>>; // The score type is a union of the fields score and the LLM-judge score.
1575
+
1576
+ declare const FNOLBaseScoreSchemaV1: z.ZodObject<Omit<{ // Zod object schema type for a base FNOL score, intersected below with `new` and `newWithId` helper members.
1577
+ id: z.ZodString;
1578
+ kind: z.ZodString;
1579
+ schemaVersion: z.ZodNumber;
1580
+ value: z.ZodNumber;
1581
+ responseId: z.ZodString;
1582
+ explanation: z.ZodOptional<z.ZodString>;
1583
+ metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
1584
+ scoringMethod: z.ZodEnum<{
1585
+ readonly ai: "ai";
1586
+ readonly human: "human";
1587
+ readonly algo: "algo";
1588
+ }>;
1589
+ }, "kind" | "schemaVersion"> & { // `kind` and `schemaVersion` are omitted from the first shape and re-added by the last intersection member below.
1590
+ scorerAIProvider: z.ZodOptional<z.ZodString>; // All scorerAI* fields in this member are optional.
1591
+ scorerAIModelSlug: z.ZodOptional<z.ZodString>;
1592
+ scorerAIInputTokensUsed: z.ZodOptional<z.ZodNumber>;
1593
+ scorerAIOutputTokensUsed: z.ZodOptional<z.ZodNumber>;
1594
+ scorerAIInputCost: z.ZodOptional<z.ZodString>; // Costs are typed as strings, unlike the numeric token counts.
1595
+ scorerAIOutputCost: z.ZodOptional<z.ZodString>;
1596
+ } & {
1597
+ kind: z.ZodString; // Re-added with the same general string/number types (not narrowed to literals in this base schema).
1598
+ schemaVersion: z.ZodNumber;
1599
+ }, zod_v4_core.$strip> & { // NOTE(review): `$strip` presumably means unknown keys are stripped on parse - confirm against the zod v4 docs.
1600
+ new: (input: Omit<{ // Synchronous helper: the input omits `kind` and `schemaVersion`; the result type includes them.
1601
+ id: string;
1602
+ value: number;
1603
+ responseId: string;
1604
+ scoringMethod: "ai" | "human" | "algo";
1605
+ kind: string;
1606
+ schemaVersion: number;
1607
+ metadata?: Record<string, unknown> | undefined;
1608
+ explanation?: string | undefined;
1609
+ scorerAIProvider?: string | undefined;
1610
+ scorerAIModelSlug?: string | undefined;
1611
+ scorerAIInputTokensUsed?: number | undefined;
1612
+ scorerAIOutputTokensUsed?: number | undefined;
1613
+ scorerAIInputCost?: string | undefined;
1614
+ scorerAIOutputCost?: string | undefined;
1615
+ }, "kind" | "schemaVersion">) => {
1616
+ id: string;
1617
+ value: number;
1618
+ responseId: string;
1619
+ scoringMethod: "ai" | "human" | "algo";
1620
+ kind: string;
1621
+ schemaVersion: number;
1622
+ metadata?: Record<string, unknown> | undefined;
1623
+ explanation?: string | undefined;
1624
+ scorerAIProvider?: string | undefined;
1625
+ scorerAIModelSlug?: string | undefined;
1626
+ scorerAIInputTokensUsed?: number | undefined;
1627
+ scorerAIOutputTokensUsed?: number | undefined;
1628
+ scorerAIInputCost?: string | undefined;
1629
+ scorerAIOutputCost?: string | undefined;
1630
+ };
1631
+ newWithId(input: Omit<{ // Async variant: the input additionally omits `id`, and an IdGenerator is passed alongside it.
1632
+ id: string;
1633
+ value: number;
1634
+ responseId: string;
1635
+ scoringMethod: "ai" | "human" | "algo";
1636
+ kind: string;
1637
+ schemaVersion: number;
1638
+ metadata?: Record<string, unknown> | undefined;
1639
+ explanation?: string | undefined;
1640
+ scorerAIProvider?: string | undefined;
1641
+ scorerAIModelSlug?: string | undefined;
1642
+ scorerAIInputTokensUsed?: number | undefined;
1643
+ scorerAIOutputTokensUsed?: number | undefined;
1644
+ scorerAIInputCost?: string | undefined;
1645
+ scorerAIOutputCost?: string | undefined;
1646
+ }, "id" | "kind" | "schemaVersion">, generator: IdGenerator): Promise<{ // Resolves to the full score shape, `id` included. NOTE(review): presumably the id comes from `generator` - confirm.
1647
+ id: string;
1648
+ value: number;
1649
+ responseId: string;
1650
+ scoringMethod: "ai" | "human" | "algo";
1651
+ kind: string;
1652
+ schemaVersion: number;
1653
+ metadata?: Record<string, unknown> | undefined;
1654
+ explanation?: string | undefined;
1655
+ scorerAIProvider?: string | undefined;
1656
+ scorerAIModelSlug?: string | undefined;
1657
+ scorerAIInputTokensUsed?: number | undefined;
1658
+ scorerAIOutputTokensUsed?: number | undefined;
1659
+ scorerAIInputCost?: string | undefined;
1660
+ scorerAIOutputCost?: string | undefined;
1661
+ }>;
1662
+ };
1663
+
1664
+ declare const FNOLFieldValueType: { // Const object of field value-type names; each readonly key maps to the identical string literal.
1665
+ readonly string: "string";
1666
+ readonly number: "number";
1667
+ readonly boolean: "boolean";
1668
+ readonly object: "object";
1669
+ };
1670
+ type FNOLFieldValueType = (typeof FNOLFieldValueType)[keyof typeof FNOLFieldValueType]; // Same-named type: the union of the object's values ("string" | "number" | "boolean" | "object").
1671
+ declare const FNOLDoneReason: { // Const object of done-reason names; each readonly key maps to the identical string literal.
1672
+ readonly modelProvidedJson: "modelProvidedJson";
1673
+ readonly reachedMaxTurns: "reachedMaxTurns";
1674
+ readonly forcedFinalJson: "forcedFinalJson";
1675
+ };
1676
+ type FNOLDoneReason = (typeof FNOLDoneReason)[keyof typeof FNOLDoneReason]; // Same-named type: the union of the three string literals above.
1677
+
1678
+ declare const index_FNOLBaseScoreSchemaV1: typeof FNOLBaseScoreSchemaV1; // `index_`-prefixed aliases; namespace `index` below re-exports each one under its unprefixed name.
1679
+ declare const index_FNOLConversationMessageSchemaV1: typeof FNOLConversationMessageSchemaV1;
1680
+ type index_FNOLDoneReason = FNOLDoneReason; // Aliased as a type only: no value alias for the FNOLDoneReason const object appears in this list.
1681
+ declare const index_FNOLFieldSchemaV1: typeof FNOLFieldSchemaV1;
1682
+ type index_FNOLFieldValueType = FNOLFieldValueType; // Aliased as a type only, same as FNOLDoneReason above.
1683
+ declare const index_FNOLFieldsScoreSchemaV1: typeof FNOLFieldsScoreSchemaV1;
1684
+ type index_FNOLFieldsScoreV1 = FNOLFieldsScoreV1;
1685
+ type index_FNOLFieldsScorer = FNOLFieldsScorer; // Aliased as both a type and a value. NOTE(review): consistent with FNOLFieldsScorer being a class - its declaration is outside this view.
1686
+ declare const index_FNOLFieldsScorer: typeof FNOLFieldsScorer;
1687
+ declare const index_FNOLLLMJudgeScoreSchemaV1: typeof FNOLLLMJudgeScoreSchemaV1;
1688
+ type index_FNOLLLMJudgeScoreV1 = FNOLLLMJudgeScoreV1;
1689
+ declare const index_FNOLResponseSchemaV1: typeof FNOLResponseSchemaV1;
1690
+ type index_FNOLResponseV1 = FNOLResponseV1;
1691
+ declare const index_FNOLTestCaseSchemaV1: typeof FNOLTestCaseSchemaV1;
1692
+ type index_FNOLTestCaseV1 = FNOLTestCaseV1;
1693
+ declare const index_runTestCase: typeof runTestCase;
1694
+ declare namespace index {
1695
+ export { index_FNOLBaseScoreSchemaV1 as FNOLBaseScoreSchemaV1, index_FNOLConversationMessageSchemaV1 as FNOLConversationMessageSchemaV1, type index_FNOLDoneReason as FNOLDoneReason, index_FNOLFieldSchemaV1 as FNOLFieldSchemaV1, type index_FNOLFieldValueType as FNOLFieldValueType, index_FNOLFieldsScoreSchemaV1 as FNOLFieldsScoreSchemaV1, type index_FNOLFieldsScoreV1 as FNOLFieldsScoreV1, index_FNOLFieldsScorer as FNOLFieldsScorer, index_FNOLLLMJudgeScoreSchemaV1 as FNOLLLMJudgeScoreSchemaV1, type index_FNOLLLMJudgeScoreV1 as FNOLLLMJudgeScoreV1, index_FNOLResponseSchemaV1 as FNOLResponseSchemaV1, type index_FNOLResponseV1 as FNOLResponseV1, index_FNOLTestCaseSchemaV1 as FNOLTestCaseSchemaV1, type index_FNOLTestCaseV1 as FNOLTestCaseV1, index_runTestCase as runTestCase }; // Entries marked `type` are type-only exports; the rest carry the aliased value.
1696
+ }
1697
+
1698
+ export { index as fnol, index$1 as mmluPro, index$2 as peerbench }; // Module exports: namespace `index` above is exposed as `fnol`; index$1 and index$2 are declared outside this view.