peerbench 0.0.2-alpha.0 → 0.0.2-alpha.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +123 -99
- package/dist/aggregators/index.d.ts +67 -0
- package/dist/aggregators/index.js +46 -0
- package/dist/aggregators/index.js.map +1 -0
- package/dist/benchmarks/index.d.ts +615 -1271
- package/dist/benchmarks/index.js +358 -805
- package/dist/benchmarks/index.js.map +1 -1
- package/dist/{chunk-DUBKY73H.js → chunk-4UBK6452.js} +13 -13
- package/dist/chunk-4UBK6452.js.map +1 -0
- package/dist/chunk-ERALDEZY.js +112 -0
- package/dist/chunk-ERALDEZY.js.map +1 -0
- package/dist/{chunk-ZJWSK4VO.js → chunk-HMQYGCKI.js} +1 -1
- package/dist/chunk-HMQYGCKI.js.map +1 -0
- package/dist/chunk-NUEOE3K5.js +8 -0
- package/dist/chunk-NUEOE3K5.js.map +1 -0
- package/dist/chunk-OQE6TQXZ.js +42 -0
- package/dist/chunk-OQE6TQXZ.js.map +1 -0
- package/dist/chunk-QY5MPNNB.js +28 -0
- package/dist/chunk-QY5MPNNB.js.map +1 -0
- package/dist/chunk-R76XA2K6.js +229 -0
- package/dist/chunk-R76XA2K6.js.map +1 -0
- package/dist/chunk-TRNCF2BG.js +35 -0
- package/dist/chunk-TRNCF2BG.js.map +1 -0
- package/dist/chunk-UHHHSYVE.js +11 -0
- package/dist/chunk-UHHHSYVE.js.map +1 -0
- package/dist/{chunk-232PY7K3.js → chunk-YY33MNMV.js} +29 -14
- package/dist/chunk-YY33MNMV.js.map +1 -0
- package/dist/chunk-ZEWI24CV.js +365 -0
- package/dist/chunk-ZEWI24CV.js.map +1 -0
- package/dist/chunk-ZXTQJFGL.js +44 -0
- package/dist/chunk-ZXTQJFGL.js.map +1 -0
- package/dist/index-BAioQhp2.d.ts +27 -0
- package/dist/index.d.ts +51 -26
- package/dist/index.js +28 -25
- package/dist/index.js.map +1 -1
- package/dist/json-file-ZwzLUbje.d.ts +73 -0
- package/dist/llm-judge-QThCZ9TQ.d.ts +67 -0
- package/dist/providers/index.d.ts +16 -19
- package/dist/providers/index.js +8 -253
- package/dist/providers/index.js.map +1 -1
- package/dist/schemas/extensions/index.d.ts +16 -2
- package/dist/schemas/extensions/index.js +9 -3
- package/dist/schemas/extensions/index.js.map +1 -1
- package/dist/schemas/index.d.ts +108 -141
- package/dist/schemas/index.js +7 -10
- package/dist/schemas/llm/index.d.ts +100 -82
- package/dist/schemas/llm/index.js +7 -29
- package/dist/schemas/llm/index.js.map +1 -1
- package/dist/scorers/index.d.ts +3 -2
- package/dist/scorers/index.js +8 -486
- package/dist/scorers/index.js.map +1 -1
- package/dist/storages/index.d.ts +69 -0
- package/dist/storages/index.js +98 -0
- package/dist/storages/index.js.map +1 -0
- package/package.json +12 -6
- package/dist/catalogs/index.d.ts +0 -75
- package/dist/catalogs/index.js +0 -88
- package/dist/catalogs/index.js.map +0 -1
- package/dist/chunk-22HU24QF.js +0 -8
- package/dist/chunk-22HU24QF.js.map +0 -1
- package/dist/chunk-232PY7K3.js.map +0 -1
- package/dist/chunk-7TREBPSJ.js +0 -26
- package/dist/chunk-7TREBPSJ.js.map +0 -1
- package/dist/chunk-DUBKY73H.js.map +0 -1
- package/dist/chunk-GVF4YZF3.js +0 -15
- package/dist/chunk-GVF4YZF3.js.map +0 -1
- package/dist/chunk-HJH3SW3L.js +0 -103
- package/dist/chunk-HJH3SW3L.js.map +0 -1
- package/dist/chunk-IUN2IUCS.js +0 -58
- package/dist/chunk-IUN2IUCS.js.map +0 -1
- package/dist/chunk-VBOM2YEG.js +0 -47
- package/dist/chunk-VBOM2YEG.js.map +0 -1
- package/dist/chunk-ZJWSK4VO.js.map +0 -1
- package/dist/data-BmN5WjZ4.d.ts +0 -57
- package/dist/generic-array-DLHWSvf1.d.ts +0 -22
- package/dist/index-WiPjF2AL.d.ts +0 -15
- package/dist/llm-judge-DIG1f1Az.d.ts +0 -67
- package/dist/simple-system-prompt-CzPYuvo0.d.ts +0 -49
- package/dist/system-prompt--0FdPWqK.d.ts +0 -58
- package/dist/utilities-BrRH32rD.d.ts +0 -30
|
@@ -1,50 +1,47 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import {
|
|
3
|
-
import * as z from 'zod';
|
|
4
|
-
import z__default, { z as z$1 } from 'zod';
|
|
1
|
+
import { I as IdGenerator } from '../index-BAioQhp2.js';
|
|
2
|
+
import z__default, { z } from 'zod';
|
|
5
3
|
import { A as AbstractLLMProvider } from '../llm-DNj_tp2T.js';
|
|
6
|
-
import { a as MCQScorer, L as
|
|
7
|
-
import {
|
|
8
|
-
import { c as RunnerResult, A as AbstractDataLoader, L as LoaderResult } from '../data-BmN5WjZ4.js';
|
|
9
|
-
import * as zod_v4_core from 'zod/v4/core';
|
|
10
|
-
import { A as AbstractScorer, B as BaseScorerResult } from '../abstract-Dec9Sc5O.js';
|
|
11
|
-
import '../schemas/index.js';
|
|
4
|
+
import { a as MCQScorer, L as LLMAsAJudgeScorer } from '../llm-judge-QThCZ9TQ.js';
|
|
5
|
+
import { J as JSONFileStorage } from '../json-file-ZwzLUbje.js';
|
|
12
6
|
import '../provider-BDjGp2y-.js';
|
|
7
|
+
import '../abstract-Dec9Sc5O.js';
|
|
13
8
|
import 'openai/resources/shared';
|
|
14
9
|
import 'openai/resources/chat/completions';
|
|
15
10
|
import '../rate-limiter-CSmVIRsM.js';
|
|
16
|
-
import '
|
|
11
|
+
import 'node:fs/promises';
|
|
17
12
|
|
|
18
|
-
declare const
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
13
|
+
declare const MCQKind: "llm/mcq";
|
|
14
|
+
declare const MCQTestCaseSchemaV1: z.ZodObject<Omit<{
|
|
15
|
+
id: z.ZodString;
|
|
16
|
+
namespace: z.ZodString;
|
|
17
|
+
schemaVersion: z.ZodNumber;
|
|
18
|
+
kind: z.ZodString;
|
|
19
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
20
|
+
}, "kind" | "namespace" | "schemaVersion"> & {
|
|
21
|
+
question: z.ZodString;
|
|
22
|
+
options: z.ZodRecord<z.ZodString, z.ZodString>;
|
|
23
|
+
correctAnswerKeys: z.ZodArray<z.ZodString>;
|
|
28
24
|
} & {
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
25
|
+
namespace: z.ZodLiteral<"peerbench.ai">;
|
|
26
|
+
kind: z.ZodLiteral<"llm/mcq.tc">;
|
|
27
|
+
schemaVersion: z.ZodLiteral<1>;
|
|
28
|
+
}, z.core.$strip> & {
|
|
32
29
|
new: (input: Omit<{
|
|
33
30
|
id: string;
|
|
34
31
|
question: string;
|
|
35
32
|
options: Record<string, string>;
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
kind: "
|
|
33
|
+
correctAnswerKeys: string[];
|
|
34
|
+
namespace: "peerbench.ai";
|
|
35
|
+
kind: "llm/mcq.tc";
|
|
39
36
|
schemaVersion: 1;
|
|
40
37
|
metadata?: Record<string, unknown> | undefined;
|
|
41
|
-
}, "kind" | "schemaVersion">) => {
|
|
38
|
+
}, "kind" | "namespace" | "schemaVersion">) => {
|
|
42
39
|
id: string;
|
|
43
40
|
question: string;
|
|
44
41
|
options: Record<string, string>;
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
kind: "
|
|
42
|
+
correctAnswerKeys: string[];
|
|
43
|
+
namespace: "peerbench.ai";
|
|
44
|
+
kind: "llm/mcq.tc";
|
|
48
45
|
schemaVersion: 1;
|
|
49
46
|
metadata?: Record<string, unknown> | undefined;
|
|
50
47
|
};
|
|
@@ -52,170 +49,179 @@ declare const PeerbenchMultipleChoiceTestCaseSchemaV1: z$1.ZodObject<Omit<{
|
|
|
52
49
|
id: string;
|
|
53
50
|
question: string;
|
|
54
51
|
options: Record<string, string>;
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
kind: "
|
|
52
|
+
correctAnswerKeys: string[];
|
|
53
|
+
namespace: "peerbench.ai";
|
|
54
|
+
kind: "llm/mcq.tc";
|
|
58
55
|
schemaVersion: 1;
|
|
59
56
|
metadata?: Record<string, unknown> | undefined;
|
|
60
|
-
}, "id" | "kind" | "schemaVersion">, generator: IdGenerator): Promise<{
|
|
57
|
+
}, "id" | "kind" | "namespace" | "schemaVersion">, generator: IdGenerator): Promise<{
|
|
61
58
|
id: string;
|
|
62
59
|
question: string;
|
|
63
60
|
options: Record<string, string>;
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
kind: "
|
|
61
|
+
correctAnswerKeys: string[];
|
|
62
|
+
namespace: "peerbench.ai";
|
|
63
|
+
kind: "llm/mcq.tc";
|
|
67
64
|
schemaVersion: 1;
|
|
68
65
|
metadata?: Record<string, unknown> | undefined;
|
|
69
66
|
}>;
|
|
70
67
|
};
|
|
71
|
-
type
|
|
72
|
-
declare const
|
|
73
|
-
id: z
|
|
74
|
-
|
|
75
|
-
schemaVersion: z
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
68
|
+
type MCQTestCaseV1 = z.infer<typeof MCQTestCaseSchemaV1>;
|
|
69
|
+
declare const MCQResponseSchemaV1: z.ZodObject<Omit<{
|
|
70
|
+
id: z.ZodString;
|
|
71
|
+
namespace: z.ZodString;
|
|
72
|
+
schemaVersion: z.ZodNumber;
|
|
73
|
+
kind: z.ZodString;
|
|
74
|
+
startedAt: z.ZodNumber;
|
|
75
|
+
completedAt: z.ZodNumber;
|
|
76
|
+
testCaseId: z.ZodString;
|
|
77
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
78
|
+
}, "kind" | "namespace" | "schemaVersion"> & {
|
|
79
|
+
data: z.ZodString;
|
|
80
|
+
modelSlug: z.ZodString;
|
|
81
|
+
provider: z.ZodString;
|
|
82
|
+
systemPromptId: z.ZodOptional<z.ZodString>;
|
|
83
|
+
inputTokensUsed: z.ZodOptional<z.ZodNumber>;
|
|
84
|
+
outputTokensUsed: z.ZodOptional<z.ZodNumber>;
|
|
85
|
+
inputCost: z.ZodOptional<z.ZodString>;
|
|
86
|
+
outputCost: z.ZodOptional<z.ZodString>;
|
|
89
87
|
} & {
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
schemaVersion: z$1.ZodLiteral<1>;
|
|
95
|
-
}, z$1.core.$strip> & {
|
|
88
|
+
namespace: z.ZodLiteral<"peerbench.ai">;
|
|
89
|
+
kind: z.ZodLiteral<"llm/mcq.rs">;
|
|
90
|
+
schemaVersion: z.ZodLiteral<1>;
|
|
91
|
+
}, z.core.$strip> & {
|
|
96
92
|
new: (input: Omit<{
|
|
97
|
-
id: string;
|
|
98
|
-
testCaseId: string;
|
|
99
93
|
startedAt: number;
|
|
100
94
|
completedAt: number;
|
|
95
|
+
id: string;
|
|
96
|
+
testCaseId: string;
|
|
101
97
|
data: string;
|
|
102
|
-
provider: string;
|
|
103
98
|
modelSlug: string;
|
|
104
|
-
|
|
99
|
+
provider: string;
|
|
100
|
+
namespace: "peerbench.ai";
|
|
101
|
+
kind: "llm/mcq.rs";
|
|
105
102
|
schemaVersion: 1;
|
|
106
103
|
metadata?: Record<string, unknown> | undefined;
|
|
104
|
+
systemPromptId?: string | undefined;
|
|
107
105
|
inputTokensUsed?: number | undefined;
|
|
108
106
|
outputTokensUsed?: number | undefined;
|
|
109
107
|
inputCost?: string | undefined;
|
|
110
108
|
outputCost?: string | undefined;
|
|
111
|
-
|
|
112
|
-
}, "kind" | "schemaVersion">) => {
|
|
113
|
-
id: string;
|
|
114
|
-
testCaseId: string;
|
|
109
|
+
}, "kind" | "namespace" | "schemaVersion">) => {
|
|
115
110
|
startedAt: number;
|
|
116
111
|
completedAt: number;
|
|
112
|
+
id: string;
|
|
113
|
+
testCaseId: string;
|
|
117
114
|
data: string;
|
|
118
|
-
provider: string;
|
|
119
115
|
modelSlug: string;
|
|
120
|
-
|
|
116
|
+
provider: string;
|
|
117
|
+
namespace: "peerbench.ai";
|
|
118
|
+
kind: "llm/mcq.rs";
|
|
121
119
|
schemaVersion: 1;
|
|
122
120
|
metadata?: Record<string, unknown> | undefined;
|
|
121
|
+
systemPromptId?: string | undefined;
|
|
123
122
|
inputTokensUsed?: number | undefined;
|
|
124
123
|
outputTokensUsed?: number | undefined;
|
|
125
124
|
inputCost?: string | undefined;
|
|
126
125
|
outputCost?: string | undefined;
|
|
127
|
-
systemPromptId?: string | undefined;
|
|
128
126
|
};
|
|
129
127
|
newWithId(input: Omit<{
|
|
130
|
-
id: string;
|
|
131
|
-
testCaseId: string;
|
|
132
128
|
startedAt: number;
|
|
133
129
|
completedAt: number;
|
|
130
|
+
id: string;
|
|
131
|
+
testCaseId: string;
|
|
134
132
|
data: string;
|
|
135
|
-
provider: string;
|
|
136
133
|
modelSlug: string;
|
|
137
|
-
|
|
134
|
+
provider: string;
|
|
135
|
+
namespace: "peerbench.ai";
|
|
136
|
+
kind: "llm/mcq.rs";
|
|
138
137
|
schemaVersion: 1;
|
|
139
138
|
metadata?: Record<string, unknown> | undefined;
|
|
139
|
+
systemPromptId?: string | undefined;
|
|
140
140
|
inputTokensUsed?: number | undefined;
|
|
141
141
|
outputTokensUsed?: number | undefined;
|
|
142
142
|
inputCost?: string | undefined;
|
|
143
143
|
outputCost?: string | undefined;
|
|
144
|
-
|
|
145
|
-
}, "id" | "kind" | "schemaVersion">, generator: IdGenerator): Promise<{
|
|
146
|
-
id: string;
|
|
147
|
-
testCaseId: string;
|
|
144
|
+
}, "id" | "kind" | "namespace" | "schemaVersion">, generator: IdGenerator): Promise<{
|
|
148
145
|
startedAt: number;
|
|
149
146
|
completedAt: number;
|
|
147
|
+
id: string;
|
|
148
|
+
testCaseId: string;
|
|
150
149
|
data: string;
|
|
151
|
-
provider: string;
|
|
152
150
|
modelSlug: string;
|
|
153
|
-
|
|
151
|
+
provider: string;
|
|
152
|
+
namespace: "peerbench.ai";
|
|
153
|
+
kind: "llm/mcq.rs";
|
|
154
154
|
schemaVersion: 1;
|
|
155
155
|
metadata?: Record<string, unknown> | undefined;
|
|
156
|
+
systemPromptId?: string | undefined;
|
|
156
157
|
inputTokensUsed?: number | undefined;
|
|
157
158
|
outputTokensUsed?: number | undefined;
|
|
158
159
|
inputCost?: string | undefined;
|
|
159
160
|
outputCost?: string | undefined;
|
|
160
|
-
systemPromptId?: string | undefined;
|
|
161
161
|
}>;
|
|
162
162
|
};
|
|
163
|
-
type
|
|
164
|
-
declare const
|
|
165
|
-
id: z
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
163
|
+
type MCQResponseV1 = z.infer<typeof MCQResponseSchemaV1>;
|
|
164
|
+
declare const MCQScoreSchemaV1: z.ZodObject<Omit<{
|
|
165
|
+
id: z.ZodString;
|
|
166
|
+
namespace: z.ZodString;
|
|
167
|
+
kind: z.ZodString;
|
|
168
|
+
schemaVersion: z.ZodNumber;
|
|
169
|
+
value: z.ZodNumber;
|
|
170
|
+
responseId: z.ZodString;
|
|
171
|
+
explanation: z.ZodOptional<z.ZodString>;
|
|
172
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
173
|
+
scoringMethod: z.ZodEnum<{
|
|
173
174
|
readonly ai: "ai";
|
|
174
175
|
readonly human: "human";
|
|
175
176
|
readonly algo: "algo";
|
|
176
177
|
}>;
|
|
177
|
-
}, "kind" | "schemaVersion"> & {
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
}, "kind" | "schemaVersion"> & {
|
|
188
|
-
extractedAnswers: z$1.ZodArray<z$1.ZodString>;
|
|
178
|
+
}, "kind" | "namespace" | "schemaVersion"> & {
|
|
179
|
+
extractedAnswers: z.ZodArray<z.ZodString>;
|
|
180
|
+
scorerAISystemPrompt: z.ZodOptional<z.ZodString>;
|
|
181
|
+
scorerAISystemPromptId: z.ZodOptional<z.ZodString>;
|
|
182
|
+
scorerAIProvider: z.ZodOptional<z.ZodString>;
|
|
183
|
+
scorerAIModelSlug: z.ZodOptional<z.ZodString>;
|
|
184
|
+
scorerAIInputTokensUsed: z.ZodOptional<z.ZodNumber>;
|
|
185
|
+
scorerAIOutputTokensUsed: z.ZodOptional<z.ZodNumber>;
|
|
186
|
+
scorerAIInputCost: z.ZodOptional<z.ZodString>;
|
|
187
|
+
scorerAIOutputCost: z.ZodOptional<z.ZodString>;
|
|
189
188
|
} & {
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
189
|
+
namespace: z.ZodLiteral<"peerbench.ai">;
|
|
190
|
+
kind: z.ZodLiteral<"llm/mcq.sc">;
|
|
191
|
+
schemaVersion: z.ZodLiteral<1>;
|
|
192
|
+
}, z.core.$strip> & {
|
|
193
193
|
new: (input: Omit<{
|
|
194
194
|
id: string;
|
|
195
195
|
value: number;
|
|
196
196
|
responseId: string;
|
|
197
197
|
scoringMethod: "ai" | "human" | "algo";
|
|
198
198
|
extractedAnswers: string[];
|
|
199
|
-
|
|
199
|
+
namespace: "peerbench.ai";
|
|
200
|
+
kind: "llm/mcq.sc";
|
|
200
201
|
schemaVersion: 1;
|
|
201
|
-
metadata?: Record<string, unknown> | undefined;
|
|
202
202
|
explanation?: string | undefined;
|
|
203
|
+
metadata?: Record<string, unknown> | undefined;
|
|
204
|
+
scorerAISystemPrompt?: string | undefined;
|
|
205
|
+
scorerAISystemPromptId?: string | undefined;
|
|
203
206
|
scorerAIProvider?: string | undefined;
|
|
204
207
|
scorerAIModelSlug?: string | undefined;
|
|
205
208
|
scorerAIInputTokensUsed?: number | undefined;
|
|
206
209
|
scorerAIOutputTokensUsed?: number | undefined;
|
|
207
210
|
scorerAIInputCost?: string | undefined;
|
|
208
211
|
scorerAIOutputCost?: string | undefined;
|
|
209
|
-
}, "kind" | "schemaVersion">) => {
|
|
212
|
+
}, "kind" | "namespace" | "schemaVersion">) => {
|
|
210
213
|
id: string;
|
|
211
214
|
value: number;
|
|
212
215
|
responseId: string;
|
|
213
216
|
scoringMethod: "ai" | "human" | "algo";
|
|
214
217
|
extractedAnswers: string[];
|
|
215
|
-
|
|
218
|
+
namespace: "peerbench.ai";
|
|
219
|
+
kind: "llm/mcq.sc";
|
|
216
220
|
schemaVersion: 1;
|
|
217
|
-
metadata?: Record<string, unknown> | undefined;
|
|
218
221
|
explanation?: string | undefined;
|
|
222
|
+
metadata?: Record<string, unknown> | undefined;
|
|
223
|
+
scorerAISystemPrompt?: string | undefined;
|
|
224
|
+
scorerAISystemPromptId?: string | undefined;
|
|
219
225
|
scorerAIProvider?: string | undefined;
|
|
220
226
|
scorerAIModelSlug?: string | undefined;
|
|
221
227
|
scorerAIInputTokensUsed?: number | undefined;
|
|
@@ -229,26 +235,32 @@ declare const PeerbenchMultipleChoiceScoreSchemaV1: z$1.ZodObject<Omit<Omit<{
|
|
|
229
235
|
responseId: string;
|
|
230
236
|
scoringMethod: "ai" | "human" | "algo";
|
|
231
237
|
extractedAnswers: string[];
|
|
232
|
-
|
|
238
|
+
namespace: "peerbench.ai";
|
|
239
|
+
kind: "llm/mcq.sc";
|
|
233
240
|
schemaVersion: 1;
|
|
234
|
-
metadata?: Record<string, unknown> | undefined;
|
|
235
241
|
explanation?: string | undefined;
|
|
242
|
+
metadata?: Record<string, unknown> | undefined;
|
|
243
|
+
scorerAISystemPrompt?: string | undefined;
|
|
244
|
+
scorerAISystemPromptId?: string | undefined;
|
|
236
245
|
scorerAIProvider?: string | undefined;
|
|
237
246
|
scorerAIModelSlug?: string | undefined;
|
|
238
247
|
scorerAIInputTokensUsed?: number | undefined;
|
|
239
248
|
scorerAIOutputTokensUsed?: number | undefined;
|
|
240
249
|
scorerAIInputCost?: string | undefined;
|
|
241
250
|
scorerAIOutputCost?: string | undefined;
|
|
242
|
-
}, "id" | "kind" | "schemaVersion">, generator: IdGenerator): Promise<{
|
|
251
|
+
}, "id" | "kind" | "namespace" | "schemaVersion">, generator: IdGenerator): Promise<{
|
|
243
252
|
id: string;
|
|
244
253
|
value: number;
|
|
245
254
|
responseId: string;
|
|
246
255
|
scoringMethod: "ai" | "human" | "algo";
|
|
247
256
|
extractedAnswers: string[];
|
|
248
|
-
|
|
257
|
+
namespace: "peerbench.ai";
|
|
258
|
+
kind: "llm/mcq.sc";
|
|
249
259
|
schemaVersion: 1;
|
|
250
|
-
metadata?: Record<string, unknown> | undefined;
|
|
251
260
|
explanation?: string | undefined;
|
|
261
|
+
metadata?: Record<string, unknown> | undefined;
|
|
262
|
+
scorerAISystemPrompt?: string | undefined;
|
|
263
|
+
scorerAISystemPromptId?: string | undefined;
|
|
252
264
|
scorerAIProvider?: string | undefined;
|
|
253
265
|
scorerAIModelSlug?: string | undefined;
|
|
254
266
|
scorerAIInputTokensUsed?: number | undefined;
|
|
@@ -257,195 +269,304 @@ declare const PeerbenchMultipleChoiceScoreSchemaV1: z$1.ZodObject<Omit<Omit<{
|
|
|
257
269
|
scorerAIOutputCost?: string | undefined;
|
|
258
270
|
}>;
|
|
259
271
|
};
|
|
260
|
-
type
|
|
272
|
+
type MCQScoreV1 = z.infer<typeof MCQScoreSchemaV1>;
|
|
261
273
|
|
|
262
|
-
declare const
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
274
|
+
declare const MultiTurnKind: "llm/multi-turn";
|
|
275
|
+
declare const MultiTurnTestCaseSchemaV1: z.ZodObject<Omit<{
|
|
276
|
+
id: z.ZodString;
|
|
277
|
+
namespace: z.ZodString;
|
|
278
|
+
schemaVersion: z.ZodNumber;
|
|
279
|
+
kind: z.ZodString;
|
|
280
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
281
|
+
}, "kind" | "namespace" | "schemaVersion"> & {
|
|
282
|
+
messages: z.ZodArray<z.ZodObject<{
|
|
283
|
+
role: z.ZodString;
|
|
284
|
+
content: z.ZodString;
|
|
285
|
+
goodAnswers: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
286
|
+
badAnswers: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
287
|
+
}, z.core.$strip>>;
|
|
288
|
+
maxTurns: z.ZodOptional<z.ZodNumber>;
|
|
289
|
+
expectedOutcome: z.ZodOptional<z.ZodString>;
|
|
270
290
|
} & {
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
291
|
+
namespace: z.ZodLiteral<"peerbench.ai">;
|
|
292
|
+
kind: z.ZodLiteral<"llm/multi-turn.tc">;
|
|
293
|
+
schemaVersion: z.ZodLiteral<1>;
|
|
294
|
+
}, z.core.$strip> & {
|
|
274
295
|
new: (input: Omit<{
|
|
275
296
|
id: string;
|
|
276
|
-
|
|
277
|
-
|
|
297
|
+
messages: {
|
|
298
|
+
role: string;
|
|
299
|
+
content: string;
|
|
300
|
+
goodAnswers?: string[] | undefined;
|
|
301
|
+
badAnswers?: string[] | undefined;
|
|
302
|
+
}[];
|
|
303
|
+
namespace: "peerbench.ai";
|
|
304
|
+
kind: "llm/multi-turn.tc";
|
|
278
305
|
schemaVersion: 1;
|
|
279
306
|
metadata?: Record<string, unknown> | undefined;
|
|
280
|
-
|
|
281
|
-
|
|
307
|
+
maxTurns?: number | undefined;
|
|
308
|
+
expectedOutcome?: string | undefined;
|
|
309
|
+
}, "kind" | "namespace" | "schemaVersion">) => {
|
|
282
310
|
id: string;
|
|
283
|
-
|
|
284
|
-
|
|
311
|
+
messages: {
|
|
312
|
+
role: string;
|
|
313
|
+
content: string;
|
|
314
|
+
goodAnswers?: string[] | undefined;
|
|
315
|
+
badAnswers?: string[] | undefined;
|
|
316
|
+
}[];
|
|
317
|
+
namespace: "peerbench.ai";
|
|
318
|
+
kind: "llm/multi-turn.tc";
|
|
285
319
|
schemaVersion: 1;
|
|
286
320
|
metadata?: Record<string, unknown> | undefined;
|
|
287
|
-
|
|
321
|
+
maxTurns?: number | undefined;
|
|
322
|
+
expectedOutcome?: string | undefined;
|
|
288
323
|
};
|
|
289
324
|
newWithId(input: Omit<{
|
|
290
325
|
id: string;
|
|
291
|
-
|
|
292
|
-
|
|
326
|
+
messages: {
|
|
327
|
+
role: string;
|
|
328
|
+
content: string;
|
|
329
|
+
goodAnswers?: string[] | undefined;
|
|
330
|
+
badAnswers?: string[] | undefined;
|
|
331
|
+
}[];
|
|
332
|
+
namespace: "peerbench.ai";
|
|
333
|
+
kind: "llm/multi-turn.tc";
|
|
293
334
|
schemaVersion: 1;
|
|
294
335
|
metadata?: Record<string, unknown> | undefined;
|
|
295
|
-
|
|
296
|
-
|
|
336
|
+
maxTurns?: number | undefined;
|
|
337
|
+
expectedOutcome?: string | undefined;
|
|
338
|
+
}, "id" | "kind" | "namespace" | "schemaVersion">, generator: IdGenerator): Promise<{
|
|
297
339
|
id: string;
|
|
298
|
-
|
|
299
|
-
|
|
340
|
+
messages: {
|
|
341
|
+
role: string;
|
|
342
|
+
content: string;
|
|
343
|
+
goodAnswers?: string[] | undefined;
|
|
344
|
+
badAnswers?: string[] | undefined;
|
|
345
|
+
}[];
|
|
346
|
+
namespace: "peerbench.ai";
|
|
347
|
+
kind: "llm/multi-turn.tc";
|
|
300
348
|
schemaVersion: 1;
|
|
301
349
|
metadata?: Record<string, unknown> | undefined;
|
|
302
|
-
|
|
350
|
+
maxTurns?: number | undefined;
|
|
351
|
+
expectedOutcome?: string | undefined;
|
|
303
352
|
}>;
|
|
304
353
|
};
|
|
305
|
-
type
|
|
306
|
-
declare const
|
|
307
|
-
id: z
|
|
308
|
-
|
|
309
|
-
schemaVersion: z
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
354
|
+
type MultiTurnTestCaseV1 = z.infer<typeof MultiTurnTestCaseSchemaV1>;
|
|
355
|
+
declare const MultiTurnResponseSchemaV1: z.ZodObject<Omit<{
|
|
356
|
+
id: z.ZodString;
|
|
357
|
+
namespace: z.ZodString;
|
|
358
|
+
schemaVersion: z.ZodNumber;
|
|
359
|
+
kind: z.ZodString;
|
|
360
|
+
startedAt: z.ZodNumber;
|
|
361
|
+
completedAt: z.ZodNumber;
|
|
362
|
+
testCaseId: z.ZodString;
|
|
363
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
364
|
+
}, "kind" | "namespace" | "schemaVersion"> & {
|
|
365
|
+
replies: z.ZodArray<z.ZodObject<{
|
|
366
|
+
messageIndex: z.ZodNumber;
|
|
367
|
+
startedAt: z.ZodNumber;
|
|
368
|
+
completedAt: z.ZodNumber;
|
|
369
|
+
data: z.ZodString;
|
|
370
|
+
inputTokensUsed: z.ZodOptional<z.ZodNumber>;
|
|
371
|
+
outputTokensUsed: z.ZodOptional<z.ZodNumber>;
|
|
372
|
+
inputCost: z.ZodOptional<z.ZodString>;
|
|
373
|
+
outputCost: z.ZodOptional<z.ZodString>;
|
|
374
|
+
}, z.core.$strip>>;
|
|
375
|
+
data: z.ZodString;
|
|
376
|
+
modelSlug: z.ZodString;
|
|
377
|
+
provider: z.ZodString;
|
|
378
|
+
systemPromptId: z.ZodOptional<z.ZodString>;
|
|
379
|
+
inputTokensUsed: z.ZodOptional<z.ZodNumber>;
|
|
380
|
+
outputTokensUsed: z.ZodOptional<z.ZodNumber>;
|
|
381
|
+
inputCost: z.ZodOptional<z.ZodString>;
|
|
382
|
+
outputCost: z.ZodOptional<z.ZodString>;
|
|
323
383
|
} & {
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
schemaVersion: z$1.ZodLiteral<1>;
|
|
329
|
-
}, z$1.core.$strip> & {
|
|
384
|
+
namespace: z.ZodLiteral<"peerbench.ai">;
|
|
385
|
+
kind: z.ZodLiteral<"llm/multi-turn.rs">;
|
|
386
|
+
schemaVersion: z.ZodLiteral<1>;
|
|
387
|
+
}, z.core.$strip> & {
|
|
330
388
|
new: (input: Omit<{
|
|
331
|
-
id: string;
|
|
332
|
-
testCaseId: string;
|
|
333
389
|
startedAt: number;
|
|
334
390
|
completedAt: number;
|
|
391
|
+
id: string;
|
|
392
|
+
testCaseId: string;
|
|
393
|
+
replies: {
|
|
394
|
+
messageIndex: number;
|
|
395
|
+
startedAt: number;
|
|
396
|
+
completedAt: number;
|
|
397
|
+
data: string;
|
|
398
|
+
inputTokensUsed?: number | undefined;
|
|
399
|
+
outputTokensUsed?: number | undefined;
|
|
400
|
+
inputCost?: string | undefined;
|
|
401
|
+
outputCost?: string | undefined;
|
|
402
|
+
}[];
|
|
335
403
|
data: string;
|
|
336
|
-
provider: string;
|
|
337
404
|
modelSlug: string;
|
|
338
|
-
|
|
405
|
+
provider: string;
|
|
406
|
+
namespace: "peerbench.ai";
|
|
407
|
+
kind: "llm/multi-turn.rs";
|
|
339
408
|
schemaVersion: 1;
|
|
340
409
|
metadata?: Record<string, unknown> | undefined;
|
|
410
|
+
systemPromptId?: string | undefined;
|
|
341
411
|
inputTokensUsed?: number | undefined;
|
|
342
412
|
outputTokensUsed?: number | undefined;
|
|
343
413
|
inputCost?: string | undefined;
|
|
344
414
|
outputCost?: string | undefined;
|
|
345
|
-
|
|
346
|
-
}, "kind" | "schemaVersion">) => {
|
|
347
|
-
id: string;
|
|
348
|
-
testCaseId: string;
|
|
415
|
+
}, "kind" | "namespace" | "schemaVersion">) => {
|
|
349
416
|
startedAt: number;
|
|
350
417
|
completedAt: number;
|
|
418
|
+
id: string;
|
|
419
|
+
testCaseId: string;
|
|
420
|
+
replies: {
|
|
421
|
+
messageIndex: number;
|
|
422
|
+
startedAt: number;
|
|
423
|
+
completedAt: number;
|
|
424
|
+
data: string;
|
|
425
|
+
inputTokensUsed?: number | undefined;
|
|
426
|
+
outputTokensUsed?: number | undefined;
|
|
427
|
+
inputCost?: string | undefined;
|
|
428
|
+
outputCost?: string | undefined;
|
|
429
|
+
}[];
|
|
351
430
|
data: string;
|
|
352
|
-
provider: string;
|
|
353
431
|
modelSlug: string;
|
|
354
|
-
|
|
432
|
+
provider: string;
|
|
433
|
+
namespace: "peerbench.ai";
|
|
434
|
+
kind: "llm/multi-turn.rs";
|
|
355
435
|
schemaVersion: 1;
|
|
356
436
|
metadata?: Record<string, unknown> | undefined;
|
|
437
|
+
systemPromptId?: string | undefined;
|
|
357
438
|
inputTokensUsed?: number | undefined;
|
|
358
439
|
outputTokensUsed?: number | undefined;
|
|
359
440
|
inputCost?: string | undefined;
|
|
360
441
|
outputCost?: string | undefined;
|
|
361
|
-
systemPromptId?: string | undefined;
|
|
362
442
|
};
|
|
363
443
|
newWithId(input: Omit<{
|
|
364
|
-
id: string;
|
|
365
|
-
testCaseId: string;
|
|
366
444
|
startedAt: number;
|
|
367
445
|
completedAt: number;
|
|
446
|
+
id: string;
|
|
447
|
+
testCaseId: string;
|
|
448
|
+
replies: {
|
|
449
|
+
messageIndex: number;
|
|
450
|
+
startedAt: number;
|
|
451
|
+
completedAt: number;
|
|
452
|
+
data: string;
|
|
453
|
+
inputTokensUsed?: number | undefined;
|
|
454
|
+
outputTokensUsed?: number | undefined;
|
|
455
|
+
inputCost?: string | undefined;
|
|
456
|
+
outputCost?: string | undefined;
|
|
457
|
+
}[];
|
|
368
458
|
data: string;
|
|
369
|
-
provider: string;
|
|
370
459
|
modelSlug: string;
|
|
371
|
-
|
|
460
|
+
provider: string;
|
|
461
|
+
namespace: "peerbench.ai";
|
|
462
|
+
kind: "llm/multi-turn.rs";
|
|
372
463
|
schemaVersion: 1;
|
|
373
464
|
metadata?: Record<string, unknown> | undefined;
|
|
465
|
+
systemPromptId?: string | undefined;
|
|
374
466
|
inputTokensUsed?: number | undefined;
|
|
375
467
|
outputTokensUsed?: number | undefined;
|
|
376
468
|
inputCost?: string | undefined;
|
|
377
469
|
outputCost?: string | undefined;
|
|
378
|
-
|
|
379
|
-
}, "id" | "kind" | "schemaVersion">, generator: IdGenerator): Promise<{
|
|
380
|
-
id: string;
|
|
381
|
-
testCaseId: string;
|
|
470
|
+
}, "id" | "kind" | "namespace" | "schemaVersion">, generator: IdGenerator): Promise<{
|
|
382
471
|
startedAt: number;
|
|
383
472
|
completedAt: number;
|
|
473
|
+
id: string;
|
|
474
|
+
testCaseId: string;
|
|
475
|
+
replies: {
|
|
476
|
+
messageIndex: number;
|
|
477
|
+
startedAt: number;
|
|
478
|
+
completedAt: number;
|
|
479
|
+
data: string;
|
|
480
|
+
inputTokensUsed?: number | undefined;
|
|
481
|
+
outputTokensUsed?: number | undefined;
|
|
482
|
+
inputCost?: string | undefined;
|
|
483
|
+
outputCost?: string | undefined;
|
|
484
|
+
}[];
|
|
384
485
|
data: string;
|
|
385
|
-
provider: string;
|
|
386
486
|
modelSlug: string;
|
|
387
|
-
|
|
487
|
+
provider: string;
|
|
488
|
+
namespace: "peerbench.ai";
|
|
489
|
+
kind: "llm/multi-turn.rs";
|
|
388
490
|
schemaVersion: 1;
|
|
389
491
|
metadata?: Record<string, unknown> | undefined;
|
|
492
|
+
systemPromptId?: string | undefined;
|
|
390
493
|
inputTokensUsed?: number | undefined;
|
|
391
494
|
outputTokensUsed?: number | undefined;
|
|
392
495
|
inputCost?: string | undefined;
|
|
393
496
|
outputCost?: string | undefined;
|
|
394
|
-
systemPromptId?: string | undefined;
|
|
395
497
|
}>;
|
|
396
498
|
};
|
|
397
|
-
type
|
|
398
|
-
declare const
|
|
399
|
-
id: z
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
499
|
+
type MultiTurnResponseV1 = z.infer<typeof MultiTurnResponseSchemaV1>;
|
|
500
|
+
declare const MultiTurnScoreSchemaV1: z.ZodObject<Omit<{
|
|
501
|
+
id: z.ZodString;
|
|
502
|
+
namespace: z.ZodString;
|
|
503
|
+
kind: z.ZodString;
|
|
504
|
+
schemaVersion: z.ZodNumber;
|
|
505
|
+
value: z.ZodNumber;
|
|
506
|
+
responseId: z.ZodString;
|
|
507
|
+
explanation: z.ZodOptional<z.ZodString>;
|
|
508
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
509
|
+
scoringMethod: z.ZodEnum<{
|
|
407
510
|
readonly ai: "ai";
|
|
408
511
|
readonly human: "human";
|
|
409
512
|
readonly algo: "algo";
|
|
410
513
|
}>;
|
|
411
|
-
}, "kind" | "schemaVersion"> & {
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
514
|
+
}, "kind" | "namespace" | "schemaVersion"> & {
|
|
515
|
+
individualScores: z.ZodArray<z.ZodObject<{
|
|
516
|
+
replyIndex: z.ZodNumber;
|
|
517
|
+
value: z.ZodNumber;
|
|
518
|
+
}, z.core.$strip>>;
|
|
519
|
+
scorerAISystemPrompt: z.ZodOptional<z.ZodString>;
|
|
520
|
+
scorerAISystemPromptId: z.ZodOptional<z.ZodString>;
|
|
521
|
+
scorerAIProvider: z.ZodOptional<z.ZodString>;
|
|
522
|
+
scorerAIModelSlug: z.ZodOptional<z.ZodString>;
|
|
523
|
+
scorerAIInputTokensUsed: z.ZodOptional<z.ZodNumber>;
|
|
524
|
+
scorerAIOutputTokensUsed: z.ZodOptional<z.ZodNumber>;
|
|
525
|
+
scorerAIInputCost: z.ZodOptional<z.ZodString>;
|
|
526
|
+
scorerAIOutputCost: z.ZodOptional<z.ZodString>;
|
|
418
527
|
} & {
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
schemaVersion: z$1.ZodLiteral<1>;
|
|
424
|
-
}, z$1.core.$strip> & {
|
|
528
|
+
namespace: z.ZodLiteral<"peerbench.ai">;
|
|
529
|
+
kind: z.ZodLiteral<"llm/multi-turn.sc">;
|
|
530
|
+
schemaVersion: z.ZodLiteral<1>;
|
|
531
|
+
}, z.core.$strip> & {
|
|
425
532
|
new: (input: Omit<{
|
|
426
533
|
id: string;
|
|
427
534
|
value: number;
|
|
428
535
|
responseId: string;
|
|
429
536
|
scoringMethod: "ai" | "human" | "algo";
|
|
430
|
-
|
|
537
|
+
individualScores: {
|
|
538
|
+
replyIndex: number;
|
|
539
|
+
value: number;
|
|
540
|
+
}[];
|
|
541
|
+
namespace: "peerbench.ai";
|
|
542
|
+
kind: "llm/multi-turn.sc";
|
|
431
543
|
schemaVersion: 1;
|
|
432
|
-
metadata?: Record<string, unknown> | undefined;
|
|
433
544
|
explanation?: string | undefined;
|
|
545
|
+
metadata?: Record<string, unknown> | undefined;
|
|
546
|
+
scorerAISystemPrompt?: string | undefined;
|
|
547
|
+
scorerAISystemPromptId?: string | undefined;
|
|
434
548
|
scorerAIProvider?: string | undefined;
|
|
435
549
|
scorerAIModelSlug?: string | undefined;
|
|
436
550
|
scorerAIInputTokensUsed?: number | undefined;
|
|
437
551
|
scorerAIOutputTokensUsed?: number | undefined;
|
|
438
552
|
scorerAIInputCost?: string | undefined;
|
|
439
553
|
scorerAIOutputCost?: string | undefined;
|
|
440
|
-
}, "kind" | "schemaVersion">) => {
|
|
554
|
+
}, "kind" | "namespace" | "schemaVersion">) => {
|
|
441
555
|
id: string;
|
|
442
556
|
value: number;
|
|
443
557
|
responseId: string;
|
|
444
558
|
scoringMethod: "ai" | "human" | "algo";
|
|
445
|
-
|
|
559
|
+
individualScores: {
|
|
560
|
+
replyIndex: number;
|
|
561
|
+
value: number;
|
|
562
|
+
}[];
|
|
563
|
+
namespace: "peerbench.ai";
|
|
564
|
+
kind: "llm/multi-turn.sc";
|
|
446
565
|
schemaVersion: 1;
|
|
447
|
-
metadata?: Record<string, unknown> | undefined;
|
|
448
566
|
explanation?: string | undefined;
|
|
567
|
+
metadata?: Record<string, unknown> | undefined;
|
|
568
|
+
scorerAISystemPrompt?: string | undefined;
|
|
569
|
+
scorerAISystemPromptId?: string | undefined;
|
|
449
570
|
scorerAIProvider?: string | undefined;
|
|
450
571
|
scorerAIModelSlug?: string | undefined;
|
|
451
572
|
scorerAIInputTokensUsed?: number | undefined;
|
|
@@ -458,25 +579,39 @@ declare const PeerbenchOpenEndedScoreSchemaV1: z$1.ZodObject<Omit<Omit<{
|
|
|
458
579
|
value: number;
|
|
459
580
|
responseId: string;
|
|
460
581
|
scoringMethod: "ai" | "human" | "algo";
|
|
461
|
-
|
|
582
|
+
individualScores: {
|
|
583
|
+
replyIndex: number;
|
|
584
|
+
value: number;
|
|
585
|
+
}[];
|
|
586
|
+
namespace: "peerbench.ai";
|
|
587
|
+
kind: "llm/multi-turn.sc";
|
|
462
588
|
schemaVersion: 1;
|
|
463
|
-
metadata?: Record<string, unknown> | undefined;
|
|
464
589
|
explanation?: string | undefined;
|
|
590
|
+
metadata?: Record<string, unknown> | undefined;
|
|
591
|
+
scorerAISystemPrompt?: string | undefined;
|
|
592
|
+
scorerAISystemPromptId?: string | undefined;
|
|
465
593
|
scorerAIProvider?: string | undefined;
|
|
466
594
|
scorerAIModelSlug?: string | undefined;
|
|
467
595
|
scorerAIInputTokensUsed?: number | undefined;
|
|
468
596
|
scorerAIOutputTokensUsed?: number | undefined;
|
|
469
597
|
scorerAIInputCost?: string | undefined;
|
|
470
598
|
scorerAIOutputCost?: string | undefined;
|
|
471
|
-
}, "id" | "kind" | "schemaVersion">, generator: IdGenerator): Promise<{
|
|
599
|
+
}, "id" | "kind" | "namespace" | "schemaVersion">, generator: IdGenerator): Promise<{
|
|
472
600
|
id: string;
|
|
473
601
|
value: number;
|
|
474
602
|
responseId: string;
|
|
475
603
|
scoringMethod: "ai" | "human" | "algo";
|
|
476
|
-
|
|
604
|
+
individualScores: {
|
|
605
|
+
replyIndex: number;
|
|
606
|
+
value: number;
|
|
607
|
+
}[];
|
|
608
|
+
namespace: "peerbench.ai";
|
|
609
|
+
kind: "llm/multi-turn.sc";
|
|
477
610
|
schemaVersion: 1;
|
|
478
|
-
metadata?: Record<string, unknown> | undefined;
|
|
479
611
|
explanation?: string | undefined;
|
|
612
|
+
metadata?: Record<string, unknown> | undefined;
|
|
613
|
+
scorerAISystemPrompt?: string | undefined;
|
|
614
|
+
scorerAISystemPromptId?: string | undefined;
|
|
480
615
|
scorerAIProvider?: string | undefined;
|
|
481
616
|
scorerAIModelSlug?: string | undefined;
|
|
482
617
|
scorerAIInputTokensUsed?: number | undefined;
|
|
@@ -485,406 +620,254 @@ declare const PeerbenchOpenEndedScoreSchemaV1: z$1.ZodObject<Omit<Omit<{
|
|
|
485
620
|
scorerAIOutputCost?: string | undefined;
|
|
486
621
|
}>;
|
|
487
622
|
};
|
|
488
|
-
type
|
|
623
|
+
type MultiTurnScoreV1 = z.infer<typeof MultiTurnScoreSchemaV1>;
|
|
489
624
|
|
|
490
|
-
declare const
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
625
|
+
declare const QAKind: "llm/qa";
|
|
626
|
+
declare const QATestCaseSchemaV1: z.ZodObject<Omit<{
|
|
627
|
+
id: z.ZodString;
|
|
628
|
+
namespace: z.ZodString;
|
|
629
|
+
schemaVersion: z.ZodNumber;
|
|
630
|
+
kind: z.ZodString;
|
|
631
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
632
|
+
}, "kind" | "namespace" | "schemaVersion"> & {
|
|
633
|
+
question: z.ZodString;
|
|
634
|
+
goodAnswers: z.ZodArray<z.ZodString>;
|
|
635
|
+
badAnswers: z.ZodArray<z.ZodString>;
|
|
499
636
|
} & {
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
637
|
+
namespace: z.ZodLiteral<"peerbench.ai">;
|
|
638
|
+
kind: z.ZodLiteral<"llm/qa.tc">;
|
|
639
|
+
schemaVersion: z.ZodLiteral<1>;
|
|
640
|
+
}, z.core.$strip> & {
|
|
503
641
|
new: (input: Omit<{
|
|
504
|
-
|
|
642
|
+
id: string;
|
|
643
|
+
question: string;
|
|
644
|
+
goodAnswers: string[];
|
|
645
|
+
badAnswers: string[];
|
|
646
|
+
namespace: "peerbench.ai";
|
|
647
|
+
kind: "llm/qa.tc";
|
|
505
648
|
schemaVersion: 1;
|
|
506
649
|
metadata?: Record<string, unknown> | undefined;
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
650
|
+
}, "kind" | "namespace" | "schemaVersion">) => {
|
|
651
|
+
id: string;
|
|
652
|
+
question: string;
|
|
653
|
+
goodAnswers: string[];
|
|
654
|
+
badAnswers: string[];
|
|
655
|
+
namespace: "peerbench.ai";
|
|
656
|
+
kind: "llm/qa.tc";
|
|
510
657
|
schemaVersion: 1;
|
|
511
658
|
metadata?: Record<string, unknown> | undefined;
|
|
512
|
-
blobTexts?: Record<string, string> | undefined;
|
|
513
659
|
};
|
|
514
660
|
newWithId(input: Omit<{
|
|
515
|
-
kind: "pb.benchmark.spec";
|
|
516
|
-
schemaVersion: 1;
|
|
517
|
-
metadata?: Record<string, unknown> | undefined;
|
|
518
|
-
blobTexts?: Record<string, string> | undefined;
|
|
519
|
-
}, "id" | "kind" | "schemaVersion">, generator: IdGenerator): Promise<{
|
|
520
|
-
kind: "pb.benchmark.spec";
|
|
521
|
-
schemaVersion: 1;
|
|
522
|
-
metadata?: Record<string, unknown> | undefined;
|
|
523
|
-
blobTexts?: Record<string, string> | undefined;
|
|
524
|
-
}>;
|
|
525
|
-
};
|
|
526
|
-
type PeerbenchBenchmarkSpecV1 = z__default.infer<typeof PeerbenchBenchmarkSpecSchemaV1>;
|
|
527
|
-
|
|
528
|
-
declare class PeerbenchJSONDataLoader extends GenericJSONArrayDataLoader<PeerbenchMultipleChoiceTestCaseV1 | PeerbenchOpenEndedTestCaseV1, PeerbenchMultipleChoiceResponseV1 | PeerbenchOpenEndedResponseV1, PeerbenchMultipleChoiceScoreV1 | PeerbenchOpenEndedScoreV1> {
|
|
529
|
-
readonly kind = "pb.load.json.data";
|
|
530
|
-
loadBenchmarkSpec(params: {
|
|
531
|
-
content: Uint8Array;
|
|
532
|
-
}): Promise<PeerbenchBenchmarkSpecV1>;
|
|
533
|
-
protected testCaseBuilder(data: any): {
|
|
534
661
|
id: string;
|
|
535
662
|
question: string;
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
kind: "
|
|
663
|
+
goodAnswers: string[];
|
|
664
|
+
badAnswers: string[];
|
|
665
|
+
namespace: "peerbench.ai";
|
|
666
|
+
kind: "llm/qa.tc";
|
|
540
667
|
schemaVersion: 1;
|
|
541
668
|
metadata?: Record<string, unknown> | undefined;
|
|
542
|
-
} | {
|
|
669
|
+
}, "id" | "kind" | "namespace" | "schemaVersion">, generator: IdGenerator): Promise<{
|
|
543
670
|
id: string;
|
|
544
671
|
question: string;
|
|
545
|
-
|
|
672
|
+
goodAnswers: string[];
|
|
673
|
+
badAnswers: string[];
|
|
674
|
+
namespace: "peerbench.ai";
|
|
675
|
+
kind: "llm/qa.tc";
|
|
546
676
|
schemaVersion: 1;
|
|
547
677
|
metadata?: Record<string, unknown> | undefined;
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
678
|
+
}>;
|
|
679
|
+
};
|
|
680
|
+
type QATestCaseV1 = z.infer<typeof QATestCaseSchemaV1>;
|
|
681
|
+
declare const QAResponseSchemaV1: z.ZodObject<Omit<{
|
|
682
|
+
id: z.ZodString;
|
|
683
|
+
namespace: z.ZodString;
|
|
684
|
+
schemaVersion: z.ZodNumber;
|
|
685
|
+
kind: z.ZodString;
|
|
686
|
+
startedAt: z.ZodNumber;
|
|
687
|
+
completedAt: z.ZodNumber;
|
|
688
|
+
testCaseId: z.ZodString;
|
|
689
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
690
|
+
}, "kind" | "namespace" | "schemaVersion"> & {
|
|
691
|
+
data: z.ZodString;
|
|
692
|
+
modelSlug: z.ZodString;
|
|
693
|
+
provider: z.ZodString;
|
|
694
|
+
systemPromptId: z.ZodOptional<z.ZodString>;
|
|
695
|
+
inputTokensUsed: z.ZodOptional<z.ZodNumber>;
|
|
696
|
+
outputTokensUsed: z.ZodOptional<z.ZodNumber>;
|
|
697
|
+
inputCost: z.ZodOptional<z.ZodString>;
|
|
698
|
+
outputCost: z.ZodOptional<z.ZodString>;
|
|
699
|
+
} & {
|
|
700
|
+
namespace: z.ZodLiteral<"peerbench.ai">;
|
|
701
|
+
kind: z.ZodLiteral<"llm/qa.rs">;
|
|
702
|
+
schemaVersion: z.ZodLiteral<1>;
|
|
703
|
+
}, z.core.$strip> & {
|
|
704
|
+
new: (input: Omit<{
|
|
705
|
+
startedAt: number;
|
|
706
|
+
completedAt: number;
|
|
551
707
|
id: string;
|
|
552
708
|
testCaseId: string;
|
|
709
|
+
data: string;
|
|
710
|
+
modelSlug: string;
|
|
711
|
+
provider: string;
|
|
712
|
+
namespace: "peerbench.ai";
|
|
713
|
+
kind: "llm/qa.rs";
|
|
714
|
+
schemaVersion: 1;
|
|
715
|
+
metadata?: Record<string, unknown> | undefined;
|
|
716
|
+
systemPromptId?: string | undefined;
|
|
717
|
+
inputTokensUsed?: number | undefined;
|
|
718
|
+
outputTokensUsed?: number | undefined;
|
|
719
|
+
inputCost?: string | undefined;
|
|
720
|
+
outputCost?: string | undefined;
|
|
721
|
+
}, "kind" | "namespace" | "schemaVersion">) => {
|
|
553
722
|
startedAt: number;
|
|
554
723
|
completedAt: number;
|
|
724
|
+
id: string;
|
|
725
|
+
testCaseId: string;
|
|
555
726
|
data: string;
|
|
556
|
-
provider: string;
|
|
557
727
|
modelSlug: string;
|
|
558
|
-
|
|
728
|
+
provider: string;
|
|
729
|
+
namespace: "peerbench.ai";
|
|
730
|
+
kind: "llm/qa.rs";
|
|
559
731
|
schemaVersion: 1;
|
|
560
732
|
metadata?: Record<string, unknown> | undefined;
|
|
733
|
+
systemPromptId?: string | undefined;
|
|
561
734
|
inputTokensUsed?: number | undefined;
|
|
562
735
|
outputTokensUsed?: number | undefined;
|
|
563
736
|
inputCost?: string | undefined;
|
|
564
737
|
outputCost?: string | undefined;
|
|
565
|
-
|
|
566
|
-
|
|
738
|
+
};
|
|
739
|
+
newWithId(input: Omit<{
|
|
740
|
+
startedAt: number;
|
|
741
|
+
completedAt: number;
|
|
567
742
|
id: string;
|
|
568
743
|
testCaseId: string;
|
|
744
|
+
data: string;
|
|
745
|
+
modelSlug: string;
|
|
746
|
+
provider: string;
|
|
747
|
+
namespace: "peerbench.ai";
|
|
748
|
+
kind: "llm/qa.rs";
|
|
749
|
+
schemaVersion: 1;
|
|
750
|
+
metadata?: Record<string, unknown> | undefined;
|
|
751
|
+
systemPromptId?: string | undefined;
|
|
752
|
+
inputTokensUsed?: number | undefined;
|
|
753
|
+
outputTokensUsed?: number | undefined;
|
|
754
|
+
inputCost?: string | undefined;
|
|
755
|
+
outputCost?: string | undefined;
|
|
756
|
+
}, "id" | "kind" | "namespace" | "schemaVersion">, generator: IdGenerator): Promise<{
|
|
569
757
|
startedAt: number;
|
|
570
758
|
completedAt: number;
|
|
759
|
+
id: string;
|
|
760
|
+
testCaseId: string;
|
|
571
761
|
data: string;
|
|
572
|
-
provider: string;
|
|
573
762
|
modelSlug: string;
|
|
574
|
-
|
|
763
|
+
provider: string;
|
|
764
|
+
namespace: "peerbench.ai";
|
|
765
|
+
kind: "llm/qa.rs";
|
|
575
766
|
schemaVersion: 1;
|
|
576
767
|
metadata?: Record<string, unknown> | undefined;
|
|
768
|
+
systemPromptId?: string | undefined;
|
|
577
769
|
inputTokensUsed?: number | undefined;
|
|
578
770
|
outputTokensUsed?: number | undefined;
|
|
579
771
|
inputCost?: string | undefined;
|
|
580
772
|
outputCost?: string | undefined;
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
773
|
+
}>;
|
|
774
|
+
};
|
|
775
|
+
type QAResponseV1 = z.infer<typeof QAResponseSchemaV1>;
|
|
776
|
+
declare const QAScoreSchemaV1: z.ZodObject<Omit<{
|
|
777
|
+
id: z.ZodString;
|
|
778
|
+
namespace: z.ZodString;
|
|
779
|
+
kind: z.ZodString;
|
|
780
|
+
schemaVersion: z.ZodNumber;
|
|
781
|
+
value: z.ZodNumber;
|
|
782
|
+
responseId: z.ZodString;
|
|
783
|
+
explanation: z.ZodOptional<z.ZodString>;
|
|
784
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
785
|
+
scoringMethod: z.ZodEnum<{
|
|
786
|
+
readonly ai: "ai";
|
|
787
|
+
readonly human: "human";
|
|
788
|
+
readonly algo: "algo";
|
|
789
|
+
}>;
|
|
790
|
+
}, "kind" | "namespace" | "schemaVersion"> & {
|
|
791
|
+
scorerAISystemPrompt: z.ZodOptional<z.ZodString>;
|
|
792
|
+
scorerAISystemPromptId: z.ZodOptional<z.ZodString>;
|
|
793
|
+
scorerAIProvider: z.ZodOptional<z.ZodString>;
|
|
794
|
+
scorerAIModelSlug: z.ZodOptional<z.ZodString>;
|
|
795
|
+
scorerAIInputTokensUsed: z.ZodOptional<z.ZodNumber>;
|
|
796
|
+
scorerAIOutputTokensUsed: z.ZodOptional<z.ZodNumber>;
|
|
797
|
+
scorerAIInputCost: z.ZodOptional<z.ZodString>;
|
|
798
|
+
scorerAIOutputCost: z.ZodOptional<z.ZodString>;
|
|
799
|
+
} & {
|
|
800
|
+
namespace: z.ZodLiteral<"peerbench.ai">;
|
|
801
|
+
kind: z.ZodLiteral<"llm/qa.sc">;
|
|
802
|
+
schemaVersion: z.ZodLiteral<1>;
|
|
803
|
+
}, z.core.$strip> & {
|
|
804
|
+
new: (input: Omit<{
|
|
584
805
|
id: string;
|
|
585
806
|
value: number;
|
|
586
807
|
responseId: string;
|
|
587
808
|
scoringMethod: "ai" | "human" | "algo";
|
|
588
|
-
|
|
589
|
-
kind: "
|
|
809
|
+
namespace: "peerbench.ai";
|
|
810
|
+
kind: "llm/qa.sc";
|
|
590
811
|
schemaVersion: 1;
|
|
591
|
-
metadata?: Record<string, unknown> | undefined;
|
|
592
812
|
explanation?: string | undefined;
|
|
813
|
+
metadata?: Record<string, unknown> | undefined;
|
|
814
|
+
scorerAISystemPrompt?: string | undefined;
|
|
815
|
+
scorerAISystemPromptId?: string | undefined;
|
|
593
816
|
scorerAIProvider?: string | undefined;
|
|
594
817
|
scorerAIModelSlug?: string | undefined;
|
|
595
818
|
scorerAIInputTokensUsed?: number | undefined;
|
|
596
819
|
scorerAIOutputTokensUsed?: number | undefined;
|
|
597
820
|
scorerAIInputCost?: string | undefined;
|
|
598
821
|
scorerAIOutputCost?: string | undefined;
|
|
599
|
-
} | {
|
|
822
|
+
}, "kind" | "namespace" | "schemaVersion">) => {
|
|
600
823
|
id: string;
|
|
601
824
|
value: number;
|
|
602
825
|
responseId: string;
|
|
603
826
|
scoringMethod: "ai" | "human" | "algo";
|
|
604
|
-
|
|
827
|
+
namespace: "peerbench.ai";
|
|
828
|
+
kind: "llm/qa.sc";
|
|
605
829
|
schemaVersion: 1;
|
|
606
|
-
metadata?: Record<string, unknown> | undefined;
|
|
607
830
|
explanation?: string | undefined;
|
|
831
|
+
metadata?: Record<string, unknown> | undefined;
|
|
832
|
+
scorerAISystemPrompt?: string | undefined;
|
|
833
|
+
scorerAISystemPromptId?: string | undefined;
|
|
608
834
|
scorerAIProvider?: string | undefined;
|
|
609
835
|
scorerAIModelSlug?: string | undefined;
|
|
610
836
|
scorerAIInputTokensUsed?: number | undefined;
|
|
611
837
|
scorerAIOutputTokensUsed?: number | undefined;
|
|
612
838
|
scorerAIInputCost?: string | undefined;
|
|
613
839
|
scorerAIOutputCost?: string | undefined;
|
|
614
|
-
} | undefined>;
|
|
615
|
-
}
|
|
616
|
-
|
|
617
|
-
type ResponseTypes = PeerbenchMultipleChoiceResponseV1 | PeerbenchOpenEndedResponseV1;
|
|
618
|
-
type ScoreTypes = PeerbenchMultipleChoiceScoreV1 | PeerbenchOpenEndedScoreV1;
|
|
619
|
-
type TestCaseTypes = PeerbenchMultipleChoiceTestCaseV1 | PeerbenchOpenEndedTestCaseV1;
|
|
620
|
-
declare function runTestCase$2(params: {
|
|
621
|
-
testCase: TestCaseTypes;
|
|
622
|
-
provider: AbstractLLMProvider;
|
|
623
|
-
scorer?: MCQScorer | LLMJudgeScorer;
|
|
624
|
-
spec?: PeerbenchBenchmarkSpecV1;
|
|
625
|
-
runConfig: {
|
|
626
|
-
model: string;
|
|
627
|
-
llmJudgeModel?: string;
|
|
628
|
-
};
|
|
629
|
-
systemPrompt?: SimpleSystemPromptV1;
|
|
630
|
-
idGenerators?: {
|
|
631
|
-
response?: IdGenerator;
|
|
632
|
-
score?: IdGenerator;
|
|
633
840
|
};
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
type index$2_PeerbenchJSONDataLoader = PeerbenchJSONDataLoader;
|
|
637
|
-
declare const index$2_PeerbenchJSONDataLoader: typeof PeerbenchJSONDataLoader;
|
|
638
|
-
declare const index$2_PeerbenchMultipleChoiceResponseSchemaV1: typeof PeerbenchMultipleChoiceResponseSchemaV1;
|
|
639
|
-
type index$2_PeerbenchMultipleChoiceResponseV1 = PeerbenchMultipleChoiceResponseV1;
|
|
640
|
-
declare const index$2_PeerbenchMultipleChoiceScoreSchemaV1: typeof PeerbenchMultipleChoiceScoreSchemaV1;
|
|
641
|
-
type index$2_PeerbenchMultipleChoiceScoreV1 = PeerbenchMultipleChoiceScoreV1;
|
|
642
|
-
declare const index$2_PeerbenchMultipleChoiceTestCaseSchemaV1: typeof PeerbenchMultipleChoiceTestCaseSchemaV1;
|
|
643
|
-
type index$2_PeerbenchMultipleChoiceTestCaseV1 = PeerbenchMultipleChoiceTestCaseV1;
|
|
644
|
-
declare const index$2_PeerbenchOpenEndedResponseSchemaV1: typeof PeerbenchOpenEndedResponseSchemaV1;
|
|
645
|
-
type index$2_PeerbenchOpenEndedResponseV1 = PeerbenchOpenEndedResponseV1;
|
|
646
|
-
declare const index$2_PeerbenchOpenEndedScoreSchemaV1: typeof PeerbenchOpenEndedScoreSchemaV1;
|
|
647
|
-
type index$2_PeerbenchOpenEndedScoreV1 = PeerbenchOpenEndedScoreV1;
|
|
648
|
-
declare const index$2_PeerbenchOpenEndedTestCaseSchemaV1: typeof PeerbenchOpenEndedTestCaseSchemaV1;
|
|
649
|
-
type index$2_PeerbenchOpenEndedTestCaseV1 = PeerbenchOpenEndedTestCaseV1;
|
|
650
|
-
declare namespace index$2 {
|
|
651
|
-
export { index$2_PeerbenchJSONDataLoader as PeerbenchJSONDataLoader, index$2_PeerbenchMultipleChoiceResponseSchemaV1 as PeerbenchMultipleChoiceResponseSchemaV1, type index$2_PeerbenchMultipleChoiceResponseV1 as PeerbenchMultipleChoiceResponseV1, index$2_PeerbenchMultipleChoiceScoreSchemaV1 as PeerbenchMultipleChoiceScoreSchemaV1, type index$2_PeerbenchMultipleChoiceScoreV1 as PeerbenchMultipleChoiceScoreV1, index$2_PeerbenchMultipleChoiceTestCaseSchemaV1 as PeerbenchMultipleChoiceTestCaseSchemaV1, type index$2_PeerbenchMultipleChoiceTestCaseV1 as PeerbenchMultipleChoiceTestCaseV1, index$2_PeerbenchOpenEndedResponseSchemaV1 as PeerbenchOpenEndedResponseSchemaV1, type index$2_PeerbenchOpenEndedResponseV1 as PeerbenchOpenEndedResponseV1, index$2_PeerbenchOpenEndedScoreSchemaV1 as PeerbenchOpenEndedScoreSchemaV1, type index$2_PeerbenchOpenEndedScoreV1 as PeerbenchOpenEndedScoreV1, index$2_PeerbenchOpenEndedTestCaseSchemaV1 as PeerbenchOpenEndedTestCaseSchemaV1, type index$2_PeerbenchOpenEndedTestCaseV1 as PeerbenchOpenEndedTestCaseV1, runTestCase$2 as runTestCase };
|
|
652
|
-
}
|
|
653
|
-
|
|
654
|
-
declare const MMLUProMainTestCaseSchemaV1: z$1.ZodObject<Omit<{
|
|
655
|
-
id: z$1.ZodString;
|
|
656
|
-
kind: z$1.ZodString;
|
|
657
|
-
schemaVersion: z$1.ZodNumber;
|
|
658
|
-
metadata: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>>;
|
|
659
|
-
}, "kind" | "schemaVersion"> & {
|
|
660
|
-
question: z$1.ZodString;
|
|
661
|
-
options: z$1.ZodRecord<z$1.ZodString, z$1.ZodString>;
|
|
662
|
-
answer: z$1.ZodString;
|
|
663
|
-
answerKey: z$1.ZodString;
|
|
664
|
-
} & {
|
|
665
|
-
kind: z$1.ZodLiteral<"mmlu-pro.ts.main">;
|
|
666
|
-
schemaVersion: z$1.ZodLiteral<1>;
|
|
667
|
-
}, z$1.core.$strip> & {
|
|
668
|
-
new: (input: Omit<{
|
|
669
|
-
id: string;
|
|
670
|
-
question: string;
|
|
671
|
-
options: Record<string, string>;
|
|
672
|
-
answer: string;
|
|
673
|
-
answerKey: string;
|
|
674
|
-
kind: "mmlu-pro.ts.main";
|
|
675
|
-
schemaVersion: 1;
|
|
676
|
-
metadata?: Record<string, unknown> | undefined;
|
|
677
|
-
}, "kind" | "schemaVersion">) => {
|
|
678
|
-
id: string;
|
|
679
|
-
question: string;
|
|
680
|
-
options: Record<string, string>;
|
|
681
|
-
answer: string;
|
|
682
|
-
answerKey: string;
|
|
683
|
-
kind: "mmlu-pro.ts.main";
|
|
684
|
-
schemaVersion: 1;
|
|
685
|
-
metadata?: Record<string, unknown> | undefined;
|
|
686
|
-
};
|
|
687
|
-
newWithId(input: Omit<{
|
|
688
|
-
id: string;
|
|
689
|
-
question: string;
|
|
690
|
-
options: Record<string, string>;
|
|
691
|
-
answer: string;
|
|
692
|
-
answerKey: string;
|
|
693
|
-
kind: "mmlu-pro.ts.main";
|
|
694
|
-
schemaVersion: 1;
|
|
695
|
-
metadata?: Record<string, unknown> | undefined;
|
|
696
|
-
}, "id" | "kind" | "schemaVersion">, generator: IdGenerator): Promise<{
|
|
697
|
-
id: string;
|
|
698
|
-
question: string;
|
|
699
|
-
options: Record<string, string>;
|
|
700
|
-
answer: string;
|
|
701
|
-
answerKey: string;
|
|
702
|
-
kind: "mmlu-pro.ts.main";
|
|
703
|
-
schemaVersion: 1;
|
|
704
|
-
metadata?: Record<string, unknown> | undefined;
|
|
705
|
-
}>;
|
|
706
|
-
};
|
|
707
|
-
type MMLUProMainTestCaseV1 = z$1.infer<typeof MMLUProMainTestCaseSchemaV1>;
|
|
708
|
-
declare const MMLUProMainResponseSchemaV1: z$1.ZodObject<Omit<Omit<{
|
|
709
|
-
id: z$1.ZodString;
|
|
710
|
-
kind: z$1.ZodString;
|
|
711
|
-
schemaVersion: z$1.ZodNumber;
|
|
712
|
-
startedAt: z$1.ZodNumber;
|
|
713
|
-
completedAt: z$1.ZodNumber;
|
|
714
|
-
testCaseId: z$1.ZodString;
|
|
715
|
-
metadata: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>>;
|
|
716
|
-
}, "kind" | "schemaVersion"> & {
|
|
717
|
-
data: z$1.ZodString;
|
|
718
|
-
modelSlug: z$1.ZodString;
|
|
719
|
-
provider: z$1.ZodString;
|
|
720
|
-
systemPromptId: z$1.ZodOptional<z$1.ZodString>;
|
|
721
|
-
inputTokensUsed: z$1.ZodOptional<z$1.ZodNumber>;
|
|
722
|
-
outputTokensUsed: z$1.ZodOptional<z$1.ZodNumber>;
|
|
723
|
-
inputCost: z$1.ZodOptional<z$1.ZodString>;
|
|
724
|
-
outputCost: z$1.ZodOptional<z$1.ZodString>;
|
|
725
|
-
} & {
|
|
726
|
-
kind: z$1.ZodString;
|
|
727
|
-
schemaVersion: z$1.ZodNumber;
|
|
728
|
-
}, "kind" | "schemaVersion"> & {
|
|
729
|
-
kind: z$1.ZodLiteral<"mmlu-pro.rs.main">;
|
|
730
|
-
schemaVersion: z$1.ZodLiteral<1>;
|
|
731
|
-
}, z$1.core.$strip> & {
|
|
732
|
-
new: (input: Omit<{
|
|
733
|
-
id: string;
|
|
734
|
-
testCaseId: string;
|
|
735
|
-
startedAt: number;
|
|
736
|
-
completedAt: number;
|
|
737
|
-
data: string;
|
|
738
|
-
provider: string;
|
|
739
|
-
modelSlug: string;
|
|
740
|
-
kind: "mmlu-pro.rs.main";
|
|
741
|
-
schemaVersion: 1;
|
|
742
|
-
metadata?: Record<string, unknown> | undefined;
|
|
743
|
-
inputTokensUsed?: number | undefined;
|
|
744
|
-
outputTokensUsed?: number | undefined;
|
|
745
|
-
inputCost?: string | undefined;
|
|
746
|
-
outputCost?: string | undefined;
|
|
747
|
-
systemPromptId?: string | undefined;
|
|
748
|
-
}, "kind" | "schemaVersion">) => {
|
|
749
|
-
id: string;
|
|
750
|
-
testCaseId: string;
|
|
751
|
-
startedAt: number;
|
|
752
|
-
completedAt: number;
|
|
753
|
-
data: string;
|
|
754
|
-
provider: string;
|
|
755
|
-
modelSlug: string;
|
|
756
|
-
kind: "mmlu-pro.rs.main";
|
|
757
|
-
schemaVersion: 1;
|
|
758
|
-
metadata?: Record<string, unknown> | undefined;
|
|
759
|
-
inputTokensUsed?: number | undefined;
|
|
760
|
-
outputTokensUsed?: number | undefined;
|
|
761
|
-
inputCost?: string | undefined;
|
|
762
|
-
outputCost?: string | undefined;
|
|
763
|
-
systemPromptId?: string | undefined;
|
|
764
|
-
};
|
|
765
|
-
newWithId(input: Omit<{
|
|
766
|
-
id: string;
|
|
767
|
-
testCaseId: string;
|
|
768
|
-
startedAt: number;
|
|
769
|
-
completedAt: number;
|
|
770
|
-
data: string;
|
|
771
|
-
provider: string;
|
|
772
|
-
modelSlug: string;
|
|
773
|
-
kind: "mmlu-pro.rs.main";
|
|
774
|
-
schemaVersion: 1;
|
|
775
|
-
metadata?: Record<string, unknown> | undefined;
|
|
776
|
-
inputTokensUsed?: number | undefined;
|
|
777
|
-
outputTokensUsed?: number | undefined;
|
|
778
|
-
inputCost?: string | undefined;
|
|
779
|
-
outputCost?: string | undefined;
|
|
780
|
-
systemPromptId?: string | undefined;
|
|
781
|
-
}, "id" | "kind" | "schemaVersion">, generator: IdGenerator): Promise<{
|
|
782
|
-
id: string;
|
|
783
|
-
testCaseId: string;
|
|
784
|
-
startedAt: number;
|
|
785
|
-
completedAt: number;
|
|
786
|
-
data: string;
|
|
787
|
-
provider: string;
|
|
788
|
-
modelSlug: string;
|
|
789
|
-
kind: "mmlu-pro.rs.main";
|
|
790
|
-
schemaVersion: 1;
|
|
791
|
-
metadata?: Record<string, unknown> | undefined;
|
|
792
|
-
inputTokensUsed?: number | undefined;
|
|
793
|
-
outputTokensUsed?: number | undefined;
|
|
794
|
-
inputCost?: string | undefined;
|
|
795
|
-
outputCost?: string | undefined;
|
|
796
|
-
systemPromptId?: string | undefined;
|
|
797
|
-
}>;
|
|
798
|
-
};
|
|
799
|
-
type MMLUProMainResponseV1 = z$1.infer<typeof MMLUProMainResponseSchemaV1>;
|
|
800
|
-
declare const MMLUProMainScoreSchemaV1: z$1.ZodObject<Omit<Omit<{
|
|
801
|
-
id: z$1.ZodString;
|
|
802
|
-
kind: z$1.ZodString;
|
|
803
|
-
schemaVersion: z$1.ZodNumber;
|
|
804
|
-
value: z$1.ZodNumber;
|
|
805
|
-
responseId: z$1.ZodString;
|
|
806
|
-
explanation: z$1.ZodOptional<z$1.ZodString>;
|
|
807
|
-
metadata: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>>;
|
|
808
|
-
scoringMethod: z$1.ZodEnum<{
|
|
809
|
-
readonly ai: "ai";
|
|
810
|
-
readonly human: "human";
|
|
811
|
-
readonly algo: "algo";
|
|
812
|
-
}>;
|
|
813
|
-
}, "kind" | "schemaVersion"> & {
|
|
814
|
-
scorerAIProvider: z$1.ZodOptional<z$1.ZodString>;
|
|
815
|
-
scorerAIModelSlug: z$1.ZodOptional<z$1.ZodString>;
|
|
816
|
-
scorerAIInputTokensUsed: z$1.ZodOptional<z$1.ZodNumber>;
|
|
817
|
-
scorerAIOutputTokensUsed: z$1.ZodOptional<z$1.ZodNumber>;
|
|
818
|
-
scorerAIInputCost: z$1.ZodOptional<z$1.ZodString>;
|
|
819
|
-
scorerAIOutputCost: z$1.ZodOptional<z$1.ZodString>;
|
|
820
|
-
} & {
|
|
821
|
-
kind: z$1.ZodString;
|
|
822
|
-
schemaVersion: z$1.ZodNumber;
|
|
823
|
-
}, "kind" | "schemaVersion"> & {
|
|
824
|
-
extractedAnswers: z$1.ZodArray<z$1.ZodString>;
|
|
825
|
-
} & {
|
|
826
|
-
kind: z$1.ZodLiteral<"mmlu-pro.sc.main">;
|
|
827
|
-
schemaVersion: z$1.ZodLiteral<1>;
|
|
828
|
-
}, z$1.core.$strip> & {
|
|
829
|
-
new: (input: Omit<{
|
|
841
|
+
newWithId(input: Omit<{
|
|
830
842
|
id: string;
|
|
831
843
|
value: number;
|
|
832
844
|
responseId: string;
|
|
833
845
|
scoringMethod: "ai" | "human" | "algo";
|
|
834
|
-
|
|
835
|
-
kind: "
|
|
846
|
+
namespace: "peerbench.ai";
|
|
847
|
+
kind: "llm/qa.sc";
|
|
836
848
|
schemaVersion: 1;
|
|
837
|
-
metadata?: Record<string, unknown> | undefined;
|
|
838
849
|
explanation?: string | undefined;
|
|
839
|
-
scorerAIProvider?: string | undefined;
|
|
840
|
-
scorerAIModelSlug?: string | undefined;
|
|
841
|
-
scorerAIInputTokensUsed?: number | undefined;
|
|
842
|
-
scorerAIOutputTokensUsed?: number | undefined;
|
|
843
|
-
scorerAIInputCost?: string | undefined;
|
|
844
|
-
scorerAIOutputCost?: string | undefined;
|
|
845
|
-
}, "kind" | "schemaVersion">) => {
|
|
846
|
-
id: string;
|
|
847
|
-
value: number;
|
|
848
|
-
responseId: string;
|
|
849
|
-
scoringMethod: "ai" | "human" | "algo";
|
|
850
|
-
extractedAnswers: string[];
|
|
851
|
-
kind: "mmlu-pro.sc.main";
|
|
852
|
-
schemaVersion: 1;
|
|
853
850
|
metadata?: Record<string, unknown> | undefined;
|
|
854
|
-
|
|
851
|
+
scorerAISystemPrompt?: string | undefined;
|
|
852
|
+
scorerAISystemPromptId?: string | undefined;
|
|
855
853
|
scorerAIProvider?: string | undefined;
|
|
856
854
|
scorerAIModelSlug?: string | undefined;
|
|
857
855
|
scorerAIInputTokensUsed?: number | undefined;
|
|
858
856
|
scorerAIOutputTokensUsed?: number | undefined;
|
|
859
857
|
scorerAIInputCost?: string | undefined;
|
|
860
858
|
scorerAIOutputCost?: string | undefined;
|
|
861
|
-
}
|
|
862
|
-
newWithId(input: Omit<{
|
|
859
|
+
}, "id" | "kind" | "namespace" | "schemaVersion">, generator: IdGenerator): Promise<{
|
|
863
860
|
id: string;
|
|
864
861
|
value: number;
|
|
865
862
|
responseId: string;
|
|
866
863
|
scoringMethod: "ai" | "human" | "algo";
|
|
867
|
-
|
|
868
|
-
kind: "
|
|
864
|
+
namespace: "peerbench.ai";
|
|
865
|
+
kind: "llm/qa.sc";
|
|
869
866
|
schemaVersion: 1;
|
|
870
|
-
metadata?: Record<string, unknown> | undefined;
|
|
871
867
|
explanation?: string | undefined;
|
|
872
|
-
scorerAIProvider?: string | undefined;
|
|
873
|
-
scorerAIModelSlug?: string | undefined;
|
|
874
|
-
scorerAIInputTokensUsed?: number | undefined;
|
|
875
|
-
scorerAIOutputTokensUsed?: number | undefined;
|
|
876
|
-
scorerAIInputCost?: string | undefined;
|
|
877
|
-
scorerAIOutputCost?: string | undefined;
|
|
878
|
-
}, "id" | "kind" | "schemaVersion">, generator: IdGenerator): Promise<{
|
|
879
|
-
id: string;
|
|
880
|
-
value: number;
|
|
881
|
-
responseId: string;
|
|
882
|
-
scoringMethod: "ai" | "human" | "algo";
|
|
883
|
-
extractedAnswers: string[];
|
|
884
|
-
kind: "mmlu-pro.sc.main";
|
|
885
|
-
schemaVersion: 1;
|
|
886
868
|
metadata?: Record<string, unknown> | undefined;
|
|
887
|
-
|
|
869
|
+
scorerAISystemPrompt?: string | undefined;
|
|
870
|
+
scorerAISystemPromptId?: string | undefined;
|
|
888
871
|
scorerAIProvider?: string | undefined;
|
|
889
872
|
scorerAIModelSlug?: string | undefined;
|
|
890
873
|
scorerAIInputTokensUsed?: number | undefined;
|
|
@@ -893,806 +876,167 @@ declare const MMLUProMainScoreSchemaV1: z$1.ZodObject<Omit<Omit<{
|
|
|
893
876
|
scorerAIOutputCost?: string | undefined;
|
|
894
877
|
}>;
|
|
895
878
|
};
|
|
896
|
-
type
|
|
879
|
+
type QAScoreV1 = z.infer<typeof QAScoreSchemaV1>;
|
|
897
880
|
|
|
898
|
-
declare const
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
new: (input: Omit<{
|
|
907
|
-
kind: "mmlu-pro.benchmark.spec";
|
|
881
|
+
declare const peerbenchRunner: (params: {
|
|
882
|
+
testCase: {
|
|
883
|
+
id: string;
|
|
884
|
+
question: string;
|
|
885
|
+
options: Record<string, string>;
|
|
886
|
+
correctAnswerKeys: string[];
|
|
887
|
+
namespace: "peerbench.ai";
|
|
888
|
+
kind: "llm/mcq.tc";
|
|
908
889
|
schemaVersion: 1;
|
|
909
890
|
metadata?: Record<string, unknown> | undefined;
|
|
910
|
-
}
|
|
911
|
-
|
|
891
|
+
} | {
|
|
892
|
+
id: string;
|
|
893
|
+
question: string;
|
|
894
|
+
goodAnswers: string[];
|
|
895
|
+
badAnswers: string[];
|
|
896
|
+
namespace: "peerbench.ai";
|
|
897
|
+
kind: "llm/qa.tc";
|
|
912
898
|
schemaVersion: 1;
|
|
913
899
|
metadata?: Record<string, unknown> | undefined;
|
|
914
900
|
};
|
|
915
|
-
newWithId(input: Omit<{
|
|
916
|
-
kind: "mmlu-pro.benchmark.spec";
|
|
917
|
-
schemaVersion: 1;
|
|
918
|
-
metadata?: Record<string, unknown> | undefined;
|
|
919
|
-
}, "id" | "kind" | "schemaVersion">, generator: IdGenerator): Promise<{
|
|
920
|
-
kind: "mmlu-pro.benchmark.spec";
|
|
921
|
-
schemaVersion: 1;
|
|
922
|
-
metadata?: Record<string, unknown> | undefined;
|
|
923
|
-
}>;
|
|
924
|
-
};
|
|
925
|
-
type MMLUProBenchmarkSpecV1 = z__default.infer<typeof MMLUProBenchmarkSpecSchemaV1>;
|
|
926
|
-
|
|
927
|
-
declare class MMLUProJSONDataLoader extends AbstractDataLoader {
|
|
928
|
-
readonly kind = "mmlu-pro.load.json.data";
|
|
929
|
-
loadData(params: {
|
|
930
|
-
content: Uint8Array;
|
|
931
|
-
}): LoaderResult<MMLUProMainTestCaseV1, MMLUProMainResponseV1, MMLUProMainScoreV1>;
|
|
932
|
-
loadBenchmarkSpec(params: {
|
|
933
|
-
content: Uint8Array;
|
|
934
|
-
}): Promise<MMLUProBenchmarkSpecV1>;
|
|
935
|
-
}
|
|
936
|
-
declare class MMLUProParquetDataLoader extends AbstractDataLoader {
|
|
937
|
-
readonly kind = "mmlu-pro.load.parquet.data";
|
|
938
|
-
loadData(params: {
|
|
939
|
-
content: Uint8Array;
|
|
940
|
-
}): Promise<LoaderResult<MMLUProMainTestCaseV1, MMLUProMainResponseV1, MMLUProMainScoreV1>>;
|
|
941
|
-
loadBenchmarkSpec(params: {
|
|
942
|
-
content: Uint8Array;
|
|
943
|
-
}): Promise<MMLUProBenchmarkSpecV1>;
|
|
944
|
-
}
|
|
945
|
-
|
|
946
|
-
declare function runTestCase$1(params: {
|
|
947
|
-
testCase: MMLUProMainTestCaseV1;
|
|
948
901
|
provider: AbstractLLMProvider;
|
|
949
|
-
scorer?: MCQScorer |
|
|
950
|
-
spec?: MMLUProBenchmarkSpecV1;
|
|
902
|
+
scorer?: MCQScorer | LLMAsAJudgeScorer | undefined;
|
|
951
903
|
runConfig: {
|
|
952
904
|
model: string;
|
|
953
|
-
llmJudgeModel?: string;
|
|
905
|
+
llmJudgeModel?: string | undefined;
|
|
906
|
+
llmJudgeSystemPrompt?: {
|
|
907
|
+
id: string;
|
|
908
|
+
version: number;
|
|
909
|
+
content: string;
|
|
910
|
+
namespace: "peerbench.ai";
|
|
911
|
+
kind: `${string}/simple.sys-prompt`;
|
|
912
|
+
schemaVersion: 1;
|
|
913
|
+
metadata?: Record<string, unknown> | undefined;
|
|
914
|
+
} | undefined;
|
|
915
|
+
llmJudgeFieldsToExtract?: Record<string, z__default.ZodType<unknown, unknown, z__default.core.$ZodTypeInternals<unknown, unknown>>> | undefined;
|
|
916
|
+
systemPrompt?: {
|
|
917
|
+
id: string;
|
|
918
|
+
version: number;
|
|
919
|
+
content: string;
|
|
920
|
+
namespace: "peerbench.ai";
|
|
921
|
+
kind: `${string}/simple.sys-prompt`;
|
|
922
|
+
schemaVersion: 1;
|
|
923
|
+
metadata?: Record<string, unknown> | undefined;
|
|
924
|
+
} | undefined;
|
|
925
|
+
templateVariables?: Record<string, string> | undefined;
|
|
954
926
|
};
|
|
955
|
-
systemPrompt?: SimpleSystemPromptV1;
|
|
956
927
|
idGenerators?: {
|
|
957
928
|
response?: IdGenerator;
|
|
958
929
|
score?: IdGenerator;
|
|
959
930
|
};
|
|
960
|
-
})
|
|
961
|
-
|
|
962
|
-
declare const BaseMMLUProScoreSchemaV1: z.ZodObject<Omit<{
|
|
963
|
-
id: z.ZodString;
|
|
964
|
-
kind: z.ZodString;
|
|
965
|
-
schemaVersion: z.ZodNumber;
|
|
966
|
-
value: z.ZodNumber;
|
|
967
|
-
responseId: z.ZodString;
|
|
968
|
-
explanation: z.ZodOptional<z.ZodString>;
|
|
969
|
-
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
970
|
-
scoringMethod: z.ZodEnum<{
|
|
971
|
-
readonly ai: "ai";
|
|
972
|
-
readonly human: "human";
|
|
973
|
-
readonly algo: "algo";
|
|
974
|
-
}>;
|
|
975
|
-
}, "kind" | "schemaVersion"> & {
|
|
976
|
-
scorerAIProvider: z.ZodOptional<z.ZodString>;
|
|
977
|
-
scorerAIModelSlug: z.ZodOptional<z.ZodString>;
|
|
978
|
-
scorerAIInputTokensUsed: z.ZodOptional<z.ZodNumber>;
|
|
979
|
-
scorerAIOutputTokensUsed: z.ZodOptional<z.ZodNumber>;
|
|
980
|
-
scorerAIInputCost: z.ZodOptional<z.ZodString>;
|
|
981
|
-
scorerAIOutputCost: z.ZodOptional<z.ZodString>;
|
|
982
|
-
} & {
|
|
983
|
-
kind: z.ZodString;
|
|
984
|
-
schemaVersion: z.ZodNumber;
|
|
985
|
-
}, zod_v4_core.$strip> & {
|
|
986
|
-
new: (input: Omit<{
|
|
987
|
-
id: string;
|
|
988
|
-
value: number;
|
|
989
|
-
responseId: string;
|
|
990
|
-
scoringMethod: "ai" | "human" | "algo";
|
|
991
|
-
kind: string;
|
|
992
|
-
schemaVersion: number;
|
|
993
|
-
metadata?: Record<string, unknown> | undefined;
|
|
994
|
-
explanation?: string | undefined;
|
|
995
|
-
scorerAIProvider?: string | undefined;
|
|
996
|
-
scorerAIModelSlug?: string | undefined;
|
|
997
|
-
scorerAIInputTokensUsed?: number | undefined;
|
|
998
|
-
scorerAIOutputTokensUsed?: number | undefined;
|
|
999
|
-
scorerAIInputCost?: string | undefined;
|
|
1000
|
-
scorerAIOutputCost?: string | undefined;
|
|
1001
|
-
}, "kind" | "schemaVersion">) => {
|
|
1002
|
-
id: string;
|
|
1003
|
-
value: number;
|
|
1004
|
-
responseId: string;
|
|
1005
|
-
scoringMethod: "ai" | "human" | "algo";
|
|
1006
|
-
kind: string;
|
|
1007
|
-
schemaVersion: number;
|
|
1008
|
-
metadata?: Record<string, unknown> | undefined;
|
|
1009
|
-
explanation?: string | undefined;
|
|
1010
|
-
scorerAIProvider?: string | undefined;
|
|
1011
|
-
scorerAIModelSlug?: string | undefined;
|
|
1012
|
-
scorerAIInputTokensUsed?: number | undefined;
|
|
1013
|
-
scorerAIOutputTokensUsed?: number | undefined;
|
|
1014
|
-
scorerAIInputCost?: string | undefined;
|
|
1015
|
-
scorerAIOutputCost?: string | undefined;
|
|
1016
|
-
};
|
|
1017
|
-
newWithId(input: Omit<{
|
|
1018
|
-
id: string;
|
|
1019
|
-
value: number;
|
|
1020
|
-
responseId: string;
|
|
1021
|
-
scoringMethod: "ai" | "human" | "algo";
|
|
1022
|
-
kind: string;
|
|
1023
|
-
schemaVersion: number;
|
|
1024
|
-
metadata?: Record<string, unknown> | undefined;
|
|
1025
|
-
explanation?: string | undefined;
|
|
1026
|
-
scorerAIProvider?: string | undefined;
|
|
1027
|
-
scorerAIModelSlug?: string | undefined;
|
|
1028
|
-
scorerAIInputTokensUsed?: number | undefined;
|
|
1029
|
-
scorerAIOutputTokensUsed?: number | undefined;
|
|
1030
|
-
scorerAIInputCost?: string | undefined;
|
|
1031
|
-
scorerAIOutputCost?: string | undefined;
|
|
1032
|
-
}, "id" | "kind" | "schemaVersion">, generator: IdGenerator): Promise<{
|
|
1033
|
-
id: string;
|
|
1034
|
-
value: number;
|
|
1035
|
-
responseId: string;
|
|
1036
|
-
scoringMethod: "ai" | "human" | "algo";
|
|
1037
|
-
kind: string;
|
|
1038
|
-
schemaVersion: number;
|
|
1039
|
-
metadata?: Record<string, unknown> | undefined;
|
|
1040
|
-
explanation?: string | undefined;
|
|
1041
|
-
scorerAIProvider?: string | undefined;
|
|
1042
|
-
scorerAIModelSlug?: string | undefined;
|
|
1043
|
-
scorerAIInputTokensUsed?: number | undefined;
|
|
1044
|
-
scorerAIOutputTokensUsed?: number | undefined;
|
|
1045
|
-
scorerAIInputCost?: string | undefined;
|
|
1046
|
-
scorerAIOutputCost?: string | undefined;
|
|
1047
|
-
}>;
|
|
1048
|
-
};
|
|
1049
|
-
|
|
1050
|
-
declare const index$1_BaseMMLUProScoreSchemaV1: typeof BaseMMLUProScoreSchemaV1;
|
|
1051
|
-
declare const index$1_MMLUProBenchmarkSpecSchemaV1: typeof MMLUProBenchmarkSpecSchemaV1;
|
|
1052
|
-
type index$1_MMLUProBenchmarkSpecV1 = MMLUProBenchmarkSpecV1;
|
|
1053
|
-
type index$1_MMLUProJSONDataLoader = MMLUProJSONDataLoader;
|
|
1054
|
-
declare const index$1_MMLUProJSONDataLoader: typeof MMLUProJSONDataLoader;
|
|
1055
|
-
declare const index$1_MMLUProMainResponseSchemaV1: typeof MMLUProMainResponseSchemaV1;
|
|
1056
|
-
type index$1_MMLUProMainResponseV1 = MMLUProMainResponseV1;
|
|
1057
|
-
declare const index$1_MMLUProMainScoreSchemaV1: typeof MMLUProMainScoreSchemaV1;
|
|
1058
|
-
type index$1_MMLUProMainScoreV1 = MMLUProMainScoreV1;
|
|
1059
|
-
declare const index$1_MMLUProMainTestCaseSchemaV1: typeof MMLUProMainTestCaseSchemaV1;
|
|
1060
|
-
type index$1_MMLUProMainTestCaseV1 = MMLUProMainTestCaseV1;
|
|
1061
|
-
type index$1_MMLUProParquetDataLoader = MMLUProParquetDataLoader;
|
|
1062
|
-
declare const index$1_MMLUProParquetDataLoader: typeof MMLUProParquetDataLoader;
|
|
1063
|
-
declare namespace index$1 {
|
|
1064
|
-
export { index$1_BaseMMLUProScoreSchemaV1 as BaseMMLUProScoreSchemaV1, index$1_MMLUProBenchmarkSpecSchemaV1 as MMLUProBenchmarkSpecSchemaV1, type index$1_MMLUProBenchmarkSpecV1 as MMLUProBenchmarkSpecV1, index$1_MMLUProJSONDataLoader as MMLUProJSONDataLoader, index$1_MMLUProMainResponseSchemaV1 as MMLUProMainResponseSchemaV1, type index$1_MMLUProMainResponseV1 as MMLUProMainResponseV1, index$1_MMLUProMainScoreSchemaV1 as MMLUProMainScoreSchemaV1, type index$1_MMLUProMainScoreV1 as MMLUProMainScoreV1, index$1_MMLUProMainTestCaseSchemaV1 as MMLUProMainTestCaseSchemaV1, type index$1_MMLUProMainTestCaseV1 as MMLUProMainTestCaseV1, index$1_MMLUProParquetDataLoader as MMLUProParquetDataLoader, runTestCase$1 as runTestCase };
|
|
1065
|
-
}
|
|
1066
|
-
|
|
1067
|
-
declare class FNOLFieldsScorer extends AbstractScorer {
|
|
1068
|
-
readonly kind = "fnol.fields";
|
|
1069
|
-
score(params: {
|
|
1070
|
-
fieldsToCollect: Record<string, {
|
|
1071
|
-
required?: boolean;
|
|
1072
|
-
expected?: unknown;
|
|
1073
|
-
description?: string;
|
|
1074
|
-
}>;
|
|
1075
|
-
extracted?: Record<string, unknown>;
|
|
1076
|
-
}): Promise<BaseScorerResult & {
|
|
1077
|
-
requiredKeys: string[];
|
|
1078
|
-
presentKeys: string[];
|
|
1079
|
-
missingKeys: string[];
|
|
1080
|
-
mismatchedKeys: string[];
|
|
1081
|
-
}>;
|
|
1082
|
-
}
|
|
1083
|
-
|
|
1084
|
-
declare const FNOLFieldSchemaV1: z$1.ZodObject<{
|
|
1085
|
-
description: z$1.ZodString;
|
|
1086
|
-
required: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
1087
|
-
expected: z$1.ZodOptional<z$1.ZodUnknown>;
|
|
1088
|
-
valueType: z$1.ZodOptional<z$1.ZodEnum<{
|
|
1089
|
-
readonly string: "string";
|
|
1090
|
-
readonly number: "number";
|
|
1091
|
-
readonly boolean: "boolean";
|
|
1092
|
-
readonly object: "object";
|
|
1093
|
-
}>>;
|
|
1094
|
-
}, z$1.core.$strip>;
|
|
1095
|
-
declare const FNOLTestCaseSchemaV1: z$1.ZodObject<Omit<{
|
|
1096
|
-
id: z$1.ZodString;
|
|
1097
|
-
kind: z$1.ZodString;
|
|
1098
|
-
schemaVersion: z$1.ZodNumber;
|
|
1099
|
-
metadata: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>>;
|
|
1100
|
-
}, "kind" | "schemaVersion"> & {
|
|
1101
|
-
/**
|
|
1102
|
-
* Scenario starter message. This is what the "user" would say initially.
|
|
1103
|
-
*/
|
|
1104
|
-
initialUserMessage: z$1.ZodString;
|
|
1105
|
-
/**
|
|
1106
|
-
* Private/structured information about the user and the incident.
|
|
1107
|
-
* This is used by the user simulator LLM to answer the target model questions.
|
|
1108
|
-
*/
|
|
1109
|
-
userProfile: z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>;
|
|
1110
|
-
/**
|
|
1111
|
-
* The fields the target model must collect.
|
|
1112
|
-
* Keys are canonical identifiers (e.g. "policyNumber", "dateOfLoss").
|
|
1113
|
-
*/
|
|
1114
|
-
fieldsToCollect: z$1.ZodRecord<z$1.ZodString, z$1.ZodObject<{
|
|
1115
|
-
description: z$1.ZodString;
|
|
1116
|
-
required: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
1117
|
-
expected: z$1.ZodOptional<z$1.ZodUnknown>;
|
|
1118
|
-
valueType: z$1.ZodOptional<z$1.ZodEnum<{
|
|
1119
|
-
readonly string: "string";
|
|
1120
|
-
readonly number: "number";
|
|
1121
|
-
readonly boolean: "boolean";
|
|
1122
|
-
readonly object: "object";
|
|
1123
|
-
}>>;
|
|
1124
|
-
}, z$1.core.$strip>>;
|
|
1125
|
-
/**
|
|
1126
|
-
* Maximum number of back-and-forth turns (target question + user answer).
|
|
1127
|
-
*/
|
|
1128
|
-
maxTurns: z$1.ZodDefault<z$1.ZodNumber>;
|
|
1129
|
-
} & {
|
|
1130
|
-
kind: z$1.ZodLiteral<"fnol.ts.v1">;
|
|
1131
|
-
schemaVersion: z$1.ZodLiteral<1>;
|
|
1132
|
-
}, z$1.core.$strip> & {
|
|
1133
|
-
new: (input: Omit<{
|
|
1134
|
-
id: string;
|
|
1135
|
-
initialUserMessage: string;
|
|
1136
|
-
userProfile: Record<string, unknown>;
|
|
1137
|
-
fieldsToCollect: Record<string, {
|
|
1138
|
-
description: string;
|
|
1139
|
-
required?: boolean | undefined;
|
|
1140
|
-
expected?: unknown;
|
|
1141
|
-
valueType?: "string" | "number" | "boolean" | "object" | undefined;
|
|
1142
|
-
}>;
|
|
1143
|
-
maxTurns: number;
|
|
1144
|
-
kind: "fnol.ts.v1";
|
|
1145
|
-
schemaVersion: 1;
|
|
1146
|
-
metadata?: Record<string, unknown> | undefined;
|
|
1147
|
-
}, "kind" | "schemaVersion">) => {
|
|
1148
|
-
id: string;
|
|
1149
|
-
initialUserMessage: string;
|
|
1150
|
-
userProfile: Record<string, unknown>;
|
|
1151
|
-
fieldsToCollect: Record<string, {
|
|
1152
|
-
description: string;
|
|
1153
|
-
required?: boolean | undefined;
|
|
1154
|
-
expected?: unknown;
|
|
1155
|
-
valueType?: "string" | "number" | "boolean" | "object" | undefined;
|
|
1156
|
-
}>;
|
|
1157
|
-
maxTurns: number;
|
|
1158
|
-
kind: "fnol.ts.v1";
|
|
1159
|
-
schemaVersion: 1;
|
|
1160
|
-
metadata?: Record<string, unknown> | undefined;
|
|
1161
|
-
};
|
|
1162
|
-
newWithId(input: Omit<{
|
|
1163
|
-
id: string;
|
|
1164
|
-
initialUserMessage: string;
|
|
1165
|
-
userProfile: Record<string, unknown>;
|
|
1166
|
-
fieldsToCollect: Record<string, {
|
|
1167
|
-
description: string;
|
|
1168
|
-
required?: boolean | undefined;
|
|
1169
|
-
expected?: unknown;
|
|
1170
|
-
valueType?: "string" | "number" | "boolean" | "object" | undefined;
|
|
1171
|
-
}>;
|
|
1172
|
-
maxTurns: number;
|
|
1173
|
-
kind: "fnol.ts.v1";
|
|
1174
|
-
schemaVersion: 1;
|
|
1175
|
-
metadata?: Record<string, unknown> | undefined;
|
|
1176
|
-
}, "id" | "kind" | "schemaVersion">, generator: IdGenerator): Promise<{
|
|
1177
|
-
id: string;
|
|
1178
|
-
initialUserMessage: string;
|
|
1179
|
-
userProfile: Record<string, unknown>;
|
|
1180
|
-
fieldsToCollect: Record<string, {
|
|
1181
|
-
description: string;
|
|
1182
|
-
required?: boolean | undefined;
|
|
1183
|
-
expected?: unknown;
|
|
1184
|
-
valueType?: "string" | "number" | "boolean" | "object" | undefined;
|
|
1185
|
-
}>;
|
|
1186
|
-
maxTurns: number;
|
|
1187
|
-
kind: "fnol.ts.v1";
|
|
1188
|
-
schemaVersion: 1;
|
|
1189
|
-
metadata?: Record<string, unknown> | undefined;
|
|
1190
|
-
}>;
|
|
1191
|
-
};
|
|
1192
|
-
type FNOLTestCaseV1 = z$1.infer<typeof FNOLTestCaseSchemaV1>;
|
|
1193
|
-
declare const FNOLConversationMessageSchemaV1: z$1.ZodObject<{
|
|
1194
|
-
role: z$1.ZodEnum<{
|
|
1195
|
-
system: "system";
|
|
1196
|
-
user: "user";
|
|
1197
|
-
assistant: "assistant";
|
|
1198
|
-
}>;
|
|
1199
|
-
content: z$1.ZodString;
|
|
1200
|
-
}, z$1.core.$strip>;
|
|
1201
|
-
declare const FNOLResponseSchemaV1: z$1.ZodObject<Omit<Omit<{
|
|
1202
|
-
id: z$1.ZodString;
|
|
1203
|
-
kind: z$1.ZodString;
|
|
1204
|
-
schemaVersion: z$1.ZodNumber;
|
|
1205
|
-
startedAt: z$1.ZodNumber;
|
|
1206
|
-
completedAt: z$1.ZodNumber;
|
|
1207
|
-
testCaseId: z$1.ZodString;
|
|
1208
|
-
metadata: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>>;
|
|
1209
|
-
}, "kind" | "schemaVersion"> & {
|
|
1210
|
-
data: z$1.ZodString;
|
|
1211
|
-
modelSlug: z$1.ZodString;
|
|
1212
|
-
provider: z$1.ZodString;
|
|
1213
|
-
systemPromptId: z$1.ZodOptional<z$1.ZodString>;
|
|
1214
|
-
inputTokensUsed: z$1.ZodOptional<z$1.ZodNumber>;
|
|
1215
|
-
outputTokensUsed: z$1.ZodOptional<z$1.ZodNumber>;
|
|
1216
|
-
inputCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1217
|
-
outputCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1218
|
-
} & {
|
|
1219
|
-
kind: z$1.ZodString;
|
|
1220
|
-
schemaVersion: z$1.ZodNumber;
|
|
1221
|
-
}, "kind" | "schemaVersion"> & {
|
|
1222
|
-
/**
|
|
1223
|
-
* Full conversation between the target model and simulated user.
|
|
1224
|
-
*/
|
|
1225
|
-
conversation: z$1.ZodArray<z$1.ZodObject<{
|
|
1226
|
-
role: z$1.ZodEnum<{
|
|
1227
|
-
system: "system";
|
|
1228
|
-
user: "user";
|
|
1229
|
-
assistant: "assistant";
|
|
1230
|
-
}>;
|
|
1231
|
-
content: z$1.ZodString;
|
|
1232
|
-
}, z$1.core.$strip>>;
|
|
1233
|
-
turnsUsed: z$1.ZodNumber;
|
|
1234
|
-
doneReason: z$1.ZodEnum<{
|
|
1235
|
-
readonly modelProvidedJson: "modelProvidedJson";
|
|
1236
|
-
readonly reachedMaxTurns: "reachedMaxTurns";
|
|
1237
|
-
readonly forcedFinalJson: "forcedFinalJson";
|
|
1238
|
-
}>;
|
|
1239
|
-
/**
|
|
1240
|
-
* Parsed JSON object from the target model's final answer, if available.
|
|
1241
|
-
*/
|
|
1242
|
-
extracted: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>>;
|
|
1243
|
-
} & {
|
|
1244
|
-
kind: z$1.ZodLiteral<"fnol.rs.v1">;
|
|
1245
|
-
schemaVersion: z$1.ZodLiteral<1>;
|
|
1246
|
-
}, z$1.core.$strip> & {
|
|
1247
|
-
new: (input: Omit<{
|
|
1248
|
-
id: string;
|
|
1249
|
-
testCaseId: string;
|
|
931
|
+
}) => Promise<{
|
|
932
|
+
response: {
|
|
1250
933
|
startedAt: number;
|
|
1251
934
|
completedAt: number;
|
|
1252
|
-
data: string;
|
|
1253
|
-
provider: string;
|
|
1254
|
-
modelSlug: string;
|
|
1255
|
-
conversation: {
|
|
1256
|
-
role: "system" | "user" | "assistant";
|
|
1257
|
-
content: string;
|
|
1258
|
-
}[];
|
|
1259
|
-
turnsUsed: number;
|
|
1260
|
-
doneReason: "modelProvidedJson" | "reachedMaxTurns" | "forcedFinalJson";
|
|
1261
|
-
kind: "fnol.rs.v1";
|
|
1262
|
-
schemaVersion: 1;
|
|
1263
|
-
metadata?: Record<string, unknown> | undefined;
|
|
1264
|
-
inputTokensUsed?: number | undefined;
|
|
1265
|
-
outputTokensUsed?: number | undefined;
|
|
1266
|
-
inputCost?: string | undefined;
|
|
1267
|
-
outputCost?: string | undefined;
|
|
1268
|
-
systemPromptId?: string | undefined;
|
|
1269
|
-
extracted?: Record<string, unknown> | undefined;
|
|
1270
|
-
}, "kind" | "schemaVersion">) => {
|
|
1271
935
|
id: string;
|
|
1272
936
|
testCaseId: string;
|
|
1273
|
-
startedAt: number;
|
|
1274
|
-
completedAt: number;
|
|
1275
937
|
data: string;
|
|
1276
|
-
provider: string;
|
|
1277
938
|
modelSlug: string;
|
|
1278
|
-
|
|
1279
|
-
|
|
1280
|
-
|
|
1281
|
-
}[];
|
|
1282
|
-
turnsUsed: number;
|
|
1283
|
-
doneReason: "modelProvidedJson" | "reachedMaxTurns" | "forcedFinalJson";
|
|
1284
|
-
kind: "fnol.rs.v1";
|
|
939
|
+
provider: string;
|
|
940
|
+
namespace: "peerbench.ai";
|
|
941
|
+
kind: "llm/mcq.rs";
|
|
1285
942
|
schemaVersion: 1;
|
|
1286
943
|
metadata?: Record<string, unknown> | undefined;
|
|
944
|
+
systemPromptId?: string | undefined;
|
|
1287
945
|
inputTokensUsed?: number | undefined;
|
|
1288
946
|
outputTokensUsed?: number | undefined;
|
|
1289
947
|
inputCost?: string | undefined;
|
|
1290
948
|
outputCost?: string | undefined;
|
|
1291
|
-
|
|
1292
|
-
extracted?: Record<string, unknown> | undefined;
|
|
1293
|
-
};
|
|
1294
|
-
newWithId(input: Omit<{
|
|
1295
|
-
id: string;
|
|
1296
|
-
testCaseId: string;
|
|
949
|
+
} | {
|
|
1297
950
|
startedAt: number;
|
|
1298
951
|
completedAt: number;
|
|
1299
|
-
data: string;
|
|
1300
|
-
provider: string;
|
|
1301
|
-
modelSlug: string;
|
|
1302
|
-
conversation: {
|
|
1303
|
-
role: "system" | "user" | "assistant";
|
|
1304
|
-
content: string;
|
|
1305
|
-
}[];
|
|
1306
|
-
turnsUsed: number;
|
|
1307
|
-
doneReason: "modelProvidedJson" | "reachedMaxTurns" | "forcedFinalJson";
|
|
1308
|
-
kind: "fnol.rs.v1";
|
|
1309
|
-
schemaVersion: 1;
|
|
1310
|
-
metadata?: Record<string, unknown> | undefined;
|
|
1311
|
-
inputTokensUsed?: number | undefined;
|
|
1312
|
-
outputTokensUsed?: number | undefined;
|
|
1313
|
-
inputCost?: string | undefined;
|
|
1314
|
-
outputCost?: string | undefined;
|
|
1315
|
-
systemPromptId?: string | undefined;
|
|
1316
|
-
extracted?: Record<string, unknown> | undefined;
|
|
1317
|
-
}, "id" | "kind" | "schemaVersion">, generator: IdGenerator): Promise<{
|
|
1318
952
|
id: string;
|
|
1319
953
|
testCaseId: string;
|
|
1320
|
-
startedAt: number;
|
|
1321
|
-
completedAt: number;
|
|
1322
954
|
data: string;
|
|
1323
|
-
provider: string;
|
|
1324
955
|
modelSlug: string;
|
|
1325
|
-
|
|
1326
|
-
|
|
1327
|
-
|
|
1328
|
-
}[];
|
|
1329
|
-
turnsUsed: number;
|
|
1330
|
-
doneReason: "modelProvidedJson" | "reachedMaxTurns" | "forcedFinalJson";
|
|
1331
|
-
kind: "fnol.rs.v1";
|
|
956
|
+
provider: string;
|
|
957
|
+
namespace: "peerbench.ai";
|
|
958
|
+
kind: "llm/qa.rs";
|
|
1332
959
|
schemaVersion: 1;
|
|
1333
960
|
metadata?: Record<string, unknown> | undefined;
|
|
961
|
+
systemPromptId?: string | undefined;
|
|
1334
962
|
inputTokensUsed?: number | undefined;
|
|
1335
963
|
outputTokensUsed?: number | undefined;
|
|
1336
964
|
inputCost?: string | undefined;
|
|
1337
965
|
outputCost?: string | undefined;
|
|
1338
|
-
systemPromptId?: string | undefined;
|
|
1339
|
-
extracted?: Record<string, unknown> | undefined;
|
|
1340
|
-
}>;
|
|
1341
|
-
};
|
|
1342
|
-
type FNOLResponseV1 = z$1.infer<typeof FNOLResponseSchemaV1>;
|
|
1343
|
-
declare const FNOLFieldsScoreSchemaV1: z$1.ZodObject<Omit<Omit<{
|
|
1344
|
-
id: z$1.ZodString;
|
|
1345
|
-
kind: z$1.ZodString;
|
|
1346
|
-
schemaVersion: z$1.ZodNumber;
|
|
1347
|
-
value: z$1.ZodNumber;
|
|
1348
|
-
responseId: z$1.ZodString;
|
|
1349
|
-
explanation: z$1.ZodOptional<z$1.ZodString>;
|
|
1350
|
-
metadata: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>>;
|
|
1351
|
-
scoringMethod: z$1.ZodEnum<{
|
|
1352
|
-
readonly ai: "ai";
|
|
1353
|
-
readonly human: "human";
|
|
1354
|
-
readonly algo: "algo";
|
|
1355
|
-
}>;
|
|
1356
|
-
}, "kind" | "schemaVersion"> & {
|
|
1357
|
-
scorerAIProvider: z$1.ZodOptional<z$1.ZodString>;
|
|
1358
|
-
scorerAIModelSlug: z$1.ZodOptional<z$1.ZodString>;
|
|
1359
|
-
scorerAIInputTokensUsed: z$1.ZodOptional<z$1.ZodNumber>;
|
|
1360
|
-
scorerAIOutputTokensUsed: z$1.ZodOptional<z$1.ZodNumber>;
|
|
1361
|
-
scorerAIInputCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1362
|
-
scorerAIOutputCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1363
|
-
} & {
|
|
1364
|
-
kind: z$1.ZodString;
|
|
1365
|
-
schemaVersion: z$1.ZodNumber;
|
|
1366
|
-
}, "kind" | "schemaVersion"> & {
|
|
1367
|
-
requiredKeys: z$1.ZodArray<z$1.ZodString>;
|
|
1368
|
-
presentKeys: z$1.ZodArray<z$1.ZodString>;
|
|
1369
|
-
missingKeys: z$1.ZodArray<z$1.ZodString>;
|
|
1370
|
-
mismatchedKeys: z$1.ZodArray<z$1.ZodString>;
|
|
1371
|
-
} & {
|
|
1372
|
-
kind: z$1.ZodLiteral<"fnol.sc.fields.v1">;
|
|
1373
|
-
schemaVersion: z$1.ZodLiteral<1>;
|
|
1374
|
-
}, z$1.core.$strip> & {
|
|
1375
|
-
new: (input: Omit<{
|
|
1376
|
-
id: string;
|
|
1377
|
-
value: number;
|
|
1378
|
-
responseId: string;
|
|
1379
|
-
scoringMethod: "ai" | "human" | "algo";
|
|
1380
|
-
requiredKeys: string[];
|
|
1381
|
-
presentKeys: string[];
|
|
1382
|
-
missingKeys: string[];
|
|
1383
|
-
mismatchedKeys: string[];
|
|
1384
|
-
kind: "fnol.sc.fields.v1";
|
|
1385
|
-
schemaVersion: 1;
|
|
1386
|
-
metadata?: Record<string, unknown> | undefined;
|
|
1387
|
-
explanation?: string | undefined;
|
|
1388
|
-
scorerAIProvider?: string | undefined;
|
|
1389
|
-
scorerAIModelSlug?: string | undefined;
|
|
1390
|
-
scorerAIInputTokensUsed?: number | undefined;
|
|
1391
|
-
scorerAIOutputTokensUsed?: number | undefined;
|
|
1392
|
-
scorerAIInputCost?: string | undefined;
|
|
1393
|
-
scorerAIOutputCost?: string | undefined;
|
|
1394
|
-
}, "kind" | "schemaVersion">) => {
|
|
1395
|
-
id: string;
|
|
1396
|
-
value: number;
|
|
1397
|
-
responseId: string;
|
|
1398
|
-
scoringMethod: "ai" | "human" | "algo";
|
|
1399
|
-
requiredKeys: string[];
|
|
1400
|
-
presentKeys: string[];
|
|
1401
|
-
missingKeys: string[];
|
|
1402
|
-
mismatchedKeys: string[];
|
|
1403
|
-
kind: "fnol.sc.fields.v1";
|
|
1404
|
-
schemaVersion: 1;
|
|
1405
|
-
metadata?: Record<string, unknown> | undefined;
|
|
1406
|
-
explanation?: string | undefined;
|
|
1407
|
-
scorerAIProvider?: string | undefined;
|
|
1408
|
-
scorerAIModelSlug?: string | undefined;
|
|
1409
|
-
scorerAIInputTokensUsed?: number | undefined;
|
|
1410
|
-
scorerAIOutputTokensUsed?: number | undefined;
|
|
1411
|
-
scorerAIInputCost?: string | undefined;
|
|
1412
|
-
scorerAIOutputCost?: string | undefined;
|
|
1413
966
|
};
|
|
1414
|
-
|
|
1415
|
-
id: string;
|
|
1416
|
-
value: number;
|
|
1417
|
-
responseId: string;
|
|
1418
|
-
scoringMethod: "ai" | "human" | "algo";
|
|
1419
|
-
requiredKeys: string[];
|
|
1420
|
-
presentKeys: string[];
|
|
1421
|
-
missingKeys: string[];
|
|
1422
|
-
mismatchedKeys: string[];
|
|
1423
|
-
kind: "fnol.sc.fields.v1";
|
|
1424
|
-
schemaVersion: 1;
|
|
1425
|
-
metadata?: Record<string, unknown> | undefined;
|
|
1426
|
-
explanation?: string | undefined;
|
|
1427
|
-
scorerAIProvider?: string | undefined;
|
|
1428
|
-
scorerAIModelSlug?: string | undefined;
|
|
1429
|
-
scorerAIInputTokensUsed?: number | undefined;
|
|
1430
|
-
scorerAIOutputTokensUsed?: number | undefined;
|
|
1431
|
-
scorerAIInputCost?: string | undefined;
|
|
1432
|
-
scorerAIOutputCost?: string | undefined;
|
|
1433
|
-
}, "id" | "kind" | "schemaVersion">, generator: IdGenerator): Promise<{
|
|
1434
|
-
id: string;
|
|
1435
|
-
value: number;
|
|
1436
|
-
responseId: string;
|
|
1437
|
-
scoringMethod: "ai" | "human" | "algo";
|
|
1438
|
-
requiredKeys: string[];
|
|
1439
|
-
presentKeys: string[];
|
|
1440
|
-
missingKeys: string[];
|
|
1441
|
-
mismatchedKeys: string[];
|
|
1442
|
-
kind: "fnol.sc.fields.v1";
|
|
1443
|
-
schemaVersion: 1;
|
|
1444
|
-
metadata?: Record<string, unknown> | undefined;
|
|
1445
|
-
explanation?: string | undefined;
|
|
1446
|
-
scorerAIProvider?: string | undefined;
|
|
1447
|
-
scorerAIModelSlug?: string | undefined;
|
|
1448
|
-
scorerAIInputTokensUsed?: number | undefined;
|
|
1449
|
-
scorerAIOutputTokensUsed?: number | undefined;
|
|
1450
|
-
scorerAIInputCost?: string | undefined;
|
|
1451
|
-
scorerAIOutputCost?: string | undefined;
|
|
1452
|
-
}>;
|
|
1453
|
-
};
|
|
1454
|
-
type FNOLFieldsScoreV1 = z$1.infer<typeof FNOLFieldsScoreSchemaV1>;
|
|
1455
|
-
declare const FNOLLLMJudgeScoreSchemaV1: z$1.ZodObject<Omit<Omit<{
|
|
1456
|
-
id: z$1.ZodString;
|
|
1457
|
-
kind: z$1.ZodString;
|
|
1458
|
-
schemaVersion: z$1.ZodNumber;
|
|
1459
|
-
value: z$1.ZodNumber;
|
|
1460
|
-
responseId: z$1.ZodString;
|
|
1461
|
-
explanation: z$1.ZodOptional<z$1.ZodString>;
|
|
1462
|
-
metadata: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>>;
|
|
1463
|
-
scoringMethod: z$1.ZodEnum<{
|
|
1464
|
-
readonly ai: "ai";
|
|
1465
|
-
readonly human: "human";
|
|
1466
|
-
readonly algo: "algo";
|
|
1467
|
-
}>;
|
|
1468
|
-
}, "kind" | "schemaVersion"> & {
|
|
1469
|
-
scorerAIProvider: z$1.ZodOptional<z$1.ZodString>;
|
|
1470
|
-
scorerAIModelSlug: z$1.ZodOptional<z$1.ZodString>;
|
|
1471
|
-
scorerAIInputTokensUsed: z$1.ZodOptional<z$1.ZodNumber>;
|
|
1472
|
-
scorerAIOutputTokensUsed: z$1.ZodOptional<z$1.ZodNumber>;
|
|
1473
|
-
scorerAIInputCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1474
|
-
scorerAIOutputCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1475
|
-
} & {
|
|
1476
|
-
kind: z$1.ZodString;
|
|
1477
|
-
schemaVersion: z$1.ZodNumber;
|
|
1478
|
-
}, "kind" | "schemaVersion"> & {
|
|
1479
|
-
verdict: z$1.ZodOptional<z$1.ZodEnum<{
|
|
1480
|
-
pass: "pass";
|
|
1481
|
-
borderline: "borderline";
|
|
1482
|
-
fail: "fail";
|
|
1483
|
-
}>>;
|
|
1484
|
-
} & {
|
|
1485
|
-
kind: z$1.ZodLiteral<"fnol.sc.llm-judge.v1">;
|
|
1486
|
-
schemaVersion: z$1.ZodLiteral<1>;
|
|
1487
|
-
}, z$1.core.$strip> & {
|
|
1488
|
-
new: (input: Omit<{
|
|
967
|
+
score?: {
|
|
1489
968
|
id: string;
|
|
1490
969
|
value: number;
|
|
1491
970
|
responseId: string;
|
|
1492
971
|
scoringMethod: "ai" | "human" | "algo";
|
|
1493
|
-
|
|
972
|
+
extractedAnswers: string[];
|
|
973
|
+
namespace: "peerbench.ai";
|
|
974
|
+
kind: "llm/mcq.sc";
|
|
1494
975
|
schemaVersion: 1;
|
|
1495
|
-
metadata?: Record<string, unknown> | undefined;
|
|
1496
976
|
explanation?: string | undefined;
|
|
1497
|
-
scorerAIProvider?: string | undefined;
|
|
1498
|
-
scorerAIModelSlug?: string | undefined;
|
|
1499
|
-
scorerAIInputTokensUsed?: number | undefined;
|
|
1500
|
-
scorerAIOutputTokensUsed?: number | undefined;
|
|
1501
|
-
scorerAIInputCost?: string | undefined;
|
|
1502
|
-
scorerAIOutputCost?: string | undefined;
|
|
1503
|
-
verdict?: "pass" | "borderline" | "fail" | undefined;
|
|
1504
|
-
}, "kind" | "schemaVersion">) => {
|
|
1505
|
-
id: string;
|
|
1506
|
-
value: number;
|
|
1507
|
-
responseId: string;
|
|
1508
|
-
scoringMethod: "ai" | "human" | "algo";
|
|
1509
|
-
kind: "fnol.sc.llm-judge.v1";
|
|
1510
|
-
schemaVersion: 1;
|
|
1511
977
|
metadata?: Record<string, unknown> | undefined;
|
|
1512
|
-
|
|
978
|
+
scorerAISystemPrompt?: string | undefined;
|
|
979
|
+
scorerAISystemPromptId?: string | undefined;
|
|
1513
980
|
scorerAIProvider?: string | undefined;
|
|
1514
981
|
scorerAIModelSlug?: string | undefined;
|
|
1515
982
|
scorerAIInputTokensUsed?: number | undefined;
|
|
1516
983
|
scorerAIOutputTokensUsed?: number | undefined;
|
|
1517
984
|
scorerAIInputCost?: string | undefined;
|
|
1518
985
|
scorerAIOutputCost?: string | undefined;
|
|
1519
|
-
|
|
1520
|
-
};
|
|
1521
|
-
newWithId(input: Omit<{
|
|
986
|
+
} | {
|
|
1522
987
|
id: string;
|
|
1523
988
|
value: number;
|
|
1524
989
|
responseId: string;
|
|
1525
990
|
scoringMethod: "ai" | "human" | "algo";
|
|
1526
|
-
|
|
991
|
+
namespace: "peerbench.ai";
|
|
992
|
+
kind: "llm/qa.sc";
|
|
1527
993
|
schemaVersion: 1;
|
|
1528
|
-
metadata?: Record<string, unknown> | undefined;
|
|
1529
994
|
explanation?: string | undefined;
|
|
1530
|
-
scorerAIProvider?: string | undefined;
|
|
1531
|
-
scorerAIModelSlug?: string | undefined;
|
|
1532
|
-
scorerAIInputTokensUsed?: number | undefined;
|
|
1533
|
-
scorerAIOutputTokensUsed?: number | undefined;
|
|
1534
|
-
scorerAIInputCost?: string | undefined;
|
|
1535
|
-
scorerAIOutputCost?: string | undefined;
|
|
1536
|
-
verdict?: "pass" | "borderline" | "fail" | undefined;
|
|
1537
|
-
}, "id" | "kind" | "schemaVersion">, generator: IdGenerator): Promise<{
|
|
1538
|
-
id: string;
|
|
1539
|
-
value: number;
|
|
1540
|
-
responseId: string;
|
|
1541
|
-
scoringMethod: "ai" | "human" | "algo";
|
|
1542
|
-
kind: "fnol.sc.llm-judge.v1";
|
|
1543
|
-
schemaVersion: 1;
|
|
1544
995
|
metadata?: Record<string, unknown> | undefined;
|
|
1545
|
-
|
|
996
|
+
scorerAISystemPrompt?: string | undefined;
|
|
997
|
+
scorerAISystemPromptId?: string | undefined;
|
|
1546
998
|
scorerAIProvider?: string | undefined;
|
|
1547
999
|
scorerAIModelSlug?: string | undefined;
|
|
1548
1000
|
scorerAIInputTokensUsed?: number | undefined;
|
|
1549
1001
|
scorerAIOutputTokensUsed?: number | undefined;
|
|
1550
1002
|
scorerAIInputCost?: string | undefined;
|
|
1551
1003
|
scorerAIOutputCost?: string | undefined;
|
|
1552
|
-
|
|
1553
|
-
|
|
1554
|
-
};
|
|
1555
|
-
type FNOLLLMJudgeScoreV1 = z$1.infer<typeof FNOLLLMJudgeScoreSchemaV1>;
|
|
1556
|
-
|
|
1557
|
-
declare function runTestCase(params: {
|
|
1558
|
-
testCase: FNOLTestCaseV1;
|
|
1559
|
-
provider: AbstractLLMProvider;
|
|
1560
|
-
userSimulatorProvider?: AbstractLLMProvider;
|
|
1561
|
-
scorer?: FNOLFieldsScorer | LLMJudgeScorer;
|
|
1562
|
-
runConfig: {
|
|
1563
|
-
model: string;
|
|
1564
|
-
userSimulatorModel?: string;
|
|
1565
|
-
llmJudgeModel?: string;
|
|
1566
|
-
temperature?: number;
|
|
1567
|
-
userSimulatorTemperature?: number;
|
|
1568
|
-
};
|
|
1569
|
-
systemPrompt?: SimpleSystemPromptV1;
|
|
1570
|
-
idGenerators?: {
|
|
1571
|
-
response?: IdGenerator;
|
|
1572
|
-
score?: IdGenerator;
|
|
1573
|
-
};
|
|
1574
|
-
}): Promise<RunnerResult<FNOLResponseV1, FNOLFieldsScoreV1 | FNOLLLMJudgeScoreV1>>;
|
|
1575
|
-
|
|
1576
|
-
declare const FNOLBaseScoreSchemaV1: z.ZodObject<Omit<{
|
|
1577
|
-
id: z.ZodString;
|
|
1578
|
-
kind: z.ZodString;
|
|
1579
|
-
schemaVersion: z.ZodNumber;
|
|
1580
|
-
value: z.ZodNumber;
|
|
1581
|
-
responseId: z.ZodString;
|
|
1582
|
-
explanation: z.ZodOptional<z.ZodString>;
|
|
1583
|
-
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
1584
|
-
scoringMethod: z.ZodEnum<{
|
|
1585
|
-
readonly ai: "ai";
|
|
1586
|
-
readonly human: "human";
|
|
1587
|
-
readonly algo: "algo";
|
|
1588
|
-
}>;
|
|
1589
|
-
}, "kind" | "schemaVersion"> & {
|
|
1590
|
-
scorerAIProvider: z.ZodOptional<z.ZodString>;
|
|
1591
|
-
scorerAIModelSlug: z.ZodOptional<z.ZodString>;
|
|
1592
|
-
scorerAIInputTokensUsed: z.ZodOptional<z.ZodNumber>;
|
|
1593
|
-
scorerAIOutputTokensUsed: z.ZodOptional<z.ZodNumber>;
|
|
1594
|
-
scorerAIInputCost: z.ZodOptional<z.ZodString>;
|
|
1595
|
-
scorerAIOutputCost: z.ZodOptional<z.ZodString>;
|
|
1596
|
-
} & {
|
|
1597
|
-
kind: z.ZodString;
|
|
1598
|
-
schemaVersion: z.ZodNumber;
|
|
1599
|
-
}, zod_v4_core.$strip> & {
|
|
1600
|
-
new: (input: Omit<{
|
|
1601
|
-
id: string;
|
|
1602
|
-
value: number;
|
|
1603
|
-
responseId: string;
|
|
1604
|
-
scoringMethod: "ai" | "human" | "algo";
|
|
1605
|
-
kind: string;
|
|
1606
|
-
schemaVersion: number;
|
|
1607
|
-
metadata?: Record<string, unknown> | undefined;
|
|
1608
|
-
explanation?: string | undefined;
|
|
1609
|
-
scorerAIProvider?: string | undefined;
|
|
1610
|
-
scorerAIModelSlug?: string | undefined;
|
|
1611
|
-
scorerAIInputTokensUsed?: number | undefined;
|
|
1612
|
-
scorerAIOutputTokensUsed?: number | undefined;
|
|
1613
|
-
scorerAIInputCost?: string | undefined;
|
|
1614
|
-
scorerAIOutputCost?: string | undefined;
|
|
1615
|
-
}, "kind" | "schemaVersion">) => {
|
|
1616
|
-
id: string;
|
|
1617
|
-
value: number;
|
|
1618
|
-
responseId: string;
|
|
1619
|
-
scoringMethod: "ai" | "human" | "algo";
|
|
1620
|
-
kind: string;
|
|
1621
|
-
schemaVersion: number;
|
|
1622
|
-
metadata?: Record<string, unknown> | undefined;
|
|
1623
|
-
explanation?: string | undefined;
|
|
1624
|
-
scorerAIProvider?: string | undefined;
|
|
1625
|
-
scorerAIModelSlug?: string | undefined;
|
|
1626
|
-
scorerAIInputTokensUsed?: number | undefined;
|
|
1627
|
-
scorerAIOutputTokensUsed?: number | undefined;
|
|
1628
|
-
scorerAIInputCost?: string | undefined;
|
|
1629
|
-
scorerAIOutputCost?: string | undefined;
|
|
1630
|
-
};
|
|
1631
|
-
newWithId(input: Omit<{
|
|
1632
|
-
id: string;
|
|
1633
|
-
value: number;
|
|
1634
|
-
responseId: string;
|
|
1635
|
-
scoringMethod: "ai" | "human" | "algo";
|
|
1636
|
-
kind: string;
|
|
1637
|
-
schemaVersion: number;
|
|
1638
|
-
metadata?: Record<string, unknown> | undefined;
|
|
1639
|
-
explanation?: string | undefined;
|
|
1640
|
-
scorerAIProvider?: string | undefined;
|
|
1641
|
-
scorerAIModelSlug?: string | undefined;
|
|
1642
|
-
scorerAIInputTokensUsed?: number | undefined;
|
|
1643
|
-
scorerAIOutputTokensUsed?: number | undefined;
|
|
1644
|
-
scorerAIInputCost?: string | undefined;
|
|
1645
|
-
scorerAIOutputCost?: string | undefined;
|
|
1646
|
-
}, "id" | "kind" | "schemaVersion">, generator: IdGenerator): Promise<{
|
|
1647
|
-
id: string;
|
|
1648
|
-
value: number;
|
|
1649
|
-
responseId: string;
|
|
1650
|
-
scoringMethod: "ai" | "human" | "algo";
|
|
1651
|
-
kind: string;
|
|
1652
|
-
schemaVersion: number;
|
|
1653
|
-
metadata?: Record<string, unknown> | undefined;
|
|
1654
|
-
explanation?: string | undefined;
|
|
1655
|
-
scorerAIProvider?: string | undefined;
|
|
1656
|
-
scorerAIModelSlug?: string | undefined;
|
|
1657
|
-
scorerAIInputTokensUsed?: number | undefined;
|
|
1658
|
-
scorerAIOutputTokensUsed?: number | undefined;
|
|
1659
|
-
scorerAIInputCost?: string | undefined;
|
|
1660
|
-
scorerAIOutputCost?: string | undefined;
|
|
1661
|
-
}>;
|
|
1662
|
-
};
|
|
1004
|
+
} | undefined;
|
|
1005
|
+
}>;
|
|
1663
1006
|
|
|
1664
|
-
declare
|
|
1665
|
-
|
|
1666
|
-
|
|
1667
|
-
|
|
1668
|
-
|
|
1669
|
-
}
|
|
1670
|
-
type FNOLFieldValueType = (typeof FNOLFieldValueType)[keyof typeof FNOLFieldValueType];
|
|
1671
|
-
declare const FNOLDoneReason: {
|
|
1672
|
-
readonly modelProvidedJson: "modelProvidedJson";
|
|
1673
|
-
readonly reachedMaxTurns: "reachedMaxTurns";
|
|
1674
|
-
readonly forcedFinalJson: "forcedFinalJson";
|
|
1675
|
-
};
|
|
1676
|
-
type FNOLDoneReason = (typeof FNOLDoneReason)[keyof typeof FNOLDoneReason];
|
|
1007
|
+
declare class PeerbenchJSONStorage extends JSONFileStorage<MCQTestCaseV1 | MCQResponseV1 | MCQScoreV1 | QATestCaseV1 | QAResponseV1 | QAScoreV1 | MultiTurnTestCaseV1 | MultiTurnResponseV1 | MultiTurnScoreV1> {
|
|
1008
|
+
constructor(config: {
|
|
1009
|
+
path: string;
|
|
1010
|
+
chunkSize?: number;
|
|
1011
|
+
});
|
|
1012
|
+
}
|
|
1677
1013
|
|
|
1678
|
-
declare const
|
|
1679
|
-
declare const
|
|
1680
|
-
type
|
|
1681
|
-
declare const
|
|
1682
|
-
type
|
|
1683
|
-
declare const
|
|
1684
|
-
type
|
|
1685
|
-
|
|
1686
|
-
declare const
|
|
1687
|
-
|
|
1688
|
-
|
|
1689
|
-
|
|
1690
|
-
|
|
1691
|
-
|
|
1692
|
-
type
|
|
1693
|
-
declare const
|
|
1014
|
+
declare const index_MCQKind: typeof MCQKind;
|
|
1015
|
+
declare const index_MCQResponseSchemaV1: typeof MCQResponseSchemaV1;
|
|
1016
|
+
type index_MCQResponseV1 = MCQResponseV1;
|
|
1017
|
+
declare const index_MCQScoreSchemaV1: typeof MCQScoreSchemaV1;
|
|
1018
|
+
type index_MCQScoreV1 = MCQScoreV1;
|
|
1019
|
+
declare const index_MCQTestCaseSchemaV1: typeof MCQTestCaseSchemaV1;
|
|
1020
|
+
type index_MCQTestCaseV1 = MCQTestCaseV1;
|
|
1021
|
+
declare const index_MultiTurnKind: typeof MultiTurnKind;
|
|
1022
|
+
declare const index_MultiTurnResponseSchemaV1: typeof MultiTurnResponseSchemaV1;
|
|
1023
|
+
type index_MultiTurnResponseV1 = MultiTurnResponseV1;
|
|
1024
|
+
declare const index_MultiTurnScoreSchemaV1: typeof MultiTurnScoreSchemaV1;
|
|
1025
|
+
type index_MultiTurnScoreV1 = MultiTurnScoreV1;
|
|
1026
|
+
declare const index_MultiTurnTestCaseSchemaV1: typeof MultiTurnTestCaseSchemaV1;
|
|
1027
|
+
type index_MultiTurnTestCaseV1 = MultiTurnTestCaseV1;
|
|
1028
|
+
type index_PeerbenchJSONStorage = PeerbenchJSONStorage;
|
|
1029
|
+
declare const index_PeerbenchJSONStorage: typeof PeerbenchJSONStorage;
|
|
1030
|
+
declare const index_QAKind: typeof QAKind;
|
|
1031
|
+
declare const index_QAResponseSchemaV1: typeof QAResponseSchemaV1;
|
|
1032
|
+
type index_QAResponseV1 = QAResponseV1;
|
|
1033
|
+
declare const index_QAScoreSchemaV1: typeof QAScoreSchemaV1;
|
|
1034
|
+
type index_QAScoreV1 = QAScoreV1;
|
|
1035
|
+
declare const index_QATestCaseSchemaV1: typeof QATestCaseSchemaV1;
|
|
1036
|
+
type index_QATestCaseV1 = QATestCaseV1;
|
|
1037
|
+
declare const index_peerbenchRunner: typeof peerbenchRunner;
|
|
1694
1038
|
declare namespace index {
|
|
1695
|
-
export {
|
|
1039
|
+
export { index_MCQKind as MCQKind, index_MCQResponseSchemaV1 as MCQResponseSchemaV1, type index_MCQResponseV1 as MCQResponseV1, index_MCQScoreSchemaV1 as MCQScoreSchemaV1, type index_MCQScoreV1 as MCQScoreV1, index_MCQTestCaseSchemaV1 as MCQTestCaseSchemaV1, type index_MCQTestCaseV1 as MCQTestCaseV1, index_MultiTurnKind as MultiTurnKind, index_MultiTurnResponseSchemaV1 as MultiTurnResponseSchemaV1, type index_MultiTurnResponseV1 as MultiTurnResponseV1, index_MultiTurnScoreSchemaV1 as MultiTurnScoreSchemaV1, type index_MultiTurnScoreV1 as MultiTurnScoreV1, index_MultiTurnTestCaseSchemaV1 as MultiTurnTestCaseSchemaV1, type index_MultiTurnTestCaseV1 as MultiTurnTestCaseV1, index_PeerbenchJSONStorage as PeerbenchJSONStorage, index_QAKind as QAKind, index_QAResponseSchemaV1 as QAResponseSchemaV1, type index_QAResponseV1 as QAResponseV1, index_QAScoreSchemaV1 as QAScoreSchemaV1, type index_QAScoreV1 as QAScoreV1, index_QATestCaseSchemaV1 as QATestCaseSchemaV1, type index_QATestCaseV1 as QATestCaseV1, index_peerbenchRunner as peerbenchRunner };
|
|
1696
1040
|
}
|
|
1697
1041
|
|
|
1698
|
-
export { index as
|
|
1042
|
+
export { index as peerbench };
|