peerbench 0.0.7 → 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/aggregators/abstract.d.ts +10 -0
- package/dist/aggregators/index.d.ts +2 -67
- package/dist/aggregators/llm/avg.d.ts +26 -0
- package/dist/benchmarks/examples/echo-basic/index.d.ts +4 -0
- package/dist/benchmarks/examples/echo-basic/runner.d.ts +273 -0
- package/dist/benchmarks/examples/echo-basic/schema-sets/echo.v1.d.ts +241 -0
- package/dist/benchmarks/examples/echo-basic/storages/json.d.ts +14 -0
- package/dist/benchmarks/examples/echo-basic/storages/text.d.ts +24 -0
- package/dist/benchmarks/examples/exact-match-scorer/index.d.ts +4 -0
- package/dist/benchmarks/examples/exact-match-scorer/runner.d.ts +428 -0
- package/dist/benchmarks/examples/exact-match-scorer/schema-sets/exact-match.v1.d.ts +287 -0
- package/dist/benchmarks/examples/exact-match-scorer/scorer.d.ts +30 -0
- package/dist/benchmarks/examples/exact-match-scorer/storages/json.d.ts +8 -0
- package/dist/benchmarks/examples/text-transform/index.d.ts +4 -0
- package/dist/benchmarks/examples/text-transform/runner.d.ts +524 -0
- package/dist/benchmarks/examples/text-transform/schema-sets/echo.v1.d.ts +211 -0
- package/dist/benchmarks/examples/text-transform/schema-sets/namespace.d.ts +1 -0
- package/dist/benchmarks/examples/text-transform/schema-sets/reverse.v1.d.ts +216 -0
- package/dist/benchmarks/examples/text-transform/storages/json.d.ts +9 -0
- package/dist/benchmarks/index.d.ts +1 -1667
- package/dist/benchmarks/index.js +16 -16
- package/dist/benchmarks/peerbench/index.d.ts +5 -0
- package/dist/benchmarks/peerbench/runner.d.ts +754 -0
- package/dist/benchmarks/peerbench/schema-sets/mcq.v1.d.ts +261 -0
- package/dist/benchmarks/peerbench/schema-sets/multi-turn.v1.d.ts +351 -0
- package/dist/benchmarks/peerbench/schema-sets/qa.v1.d.ts +256 -0
- package/dist/benchmarks/peerbench/storages/json.d.ts +10 -0
- package/dist/{chunk-ZXTQJFGL.js → chunk-Q6GSOHOP.js} +4 -4
- package/dist/constants.d.ts +4 -0
- package/dist/errors/index.d.ts +2 -0
- package/dist/errors/peerbench.d.ts +6 -0
- package/dist/errors/polyfill.d.ts +1 -0
- package/dist/examples/basic.d.ts +1 -0
- package/dist/helpers/define-runner.d.ts +45 -0
- package/dist/helpers/index.d.ts +1 -0
- package/dist/index.d.ts +6 -101
- package/dist/index.js +3 -3
- package/dist/providers/abstract/llm.d.ts +20 -0
- package/dist/{provider-DnEBdl1n.d.ts → providers/abstract/provider.d.ts} +2 -4
- package/dist/providers/example/echo.d.ts +12 -0
- package/dist/providers/example/restapi.d.ts +37 -0
- package/dist/providers/index.d.ts +5 -96
- package/dist/providers/mastra.d.ts +40 -0
- package/dist/providers/openai.d.ts +29 -0
- package/dist/providers/openrouter.d.ts +27 -0
- package/dist/schemas/extensions/index.d.ts +18 -22
- package/dist/schemas/extensions/response/llm.d.ts +14 -0
- package/dist/schemas/extensions/score/llm-as-a-judge-scorer.d.ts +15 -0
- package/dist/schemas/id.d.ts +2 -0
- package/dist/schemas/index.d.ts +4 -200
- package/dist/schemas/llm/index.d.ts +2 -116
- package/dist/schemas/llm/index.js +2 -2
- package/dist/schemas/llm/simple-system-prompt.d.ts +51 -0
- package/dist/schemas/llm/system-prompt.d.ts +59 -0
- package/dist/schemas/response.d.ts +63 -0
- package/dist/schemas/schema-definer.d.ts +47 -0
- package/dist/schemas/score.d.ts +73 -0
- package/dist/schemas/test-case.d.ts +57 -0
- package/dist/{abstract-BdgLjkNC.d.ts → scorers/abstract.d.ts} +2 -4
- package/dist/scorers/index.d.ts +4 -68
- package/dist/scorers/llm-judge.d.ts +55 -0
- package/dist/scorers/mcq.d.ts +19 -0
- package/dist/scorers/mcq.test.d.ts +1 -0
- package/dist/scorers/regex.d.ts +58 -0
- package/dist/scorers/regex.test.d.ts +1 -0
- package/dist/storages/abstract.d.ts +7 -0
- package/dist/storages/examples/http.d.ts +1 -0
- package/dist/storages/examples/sqlite.d.ts +1 -0
- package/dist/storages/file.d.ts +43 -0
- package/dist/storages/http.d.ts +22 -0
- package/dist/storages/index.d.ts +5 -69
- package/dist/storages/json-file.d.ts +21 -0
- package/dist/storages/sqlite.d.ts +41 -0
- package/dist/types/index.d.ts +17 -0
- package/dist/types/runner.d.ts +18 -0
- package/dist/utilities.d.ts +9 -0
- package/dist/utils/id-generator.d.ts +2 -0
- package/dist/utils/index.d.ts +5 -0
- package/dist/utils/json.d.ts +17 -0
- package/dist/utils/llm.d.ts +7 -0
- package/dist/{rate-limiter-CSmVIRsM.d.ts → utils/rate-limiter.d.ts} +3 -5
- package/dist/utils/sleep.d.ts +1 -0
- package/dist/utils/string.d.ts +8 -0
- package/package.json +3 -3
- package/dist/index-Cn20kPrz.d.ts +0 -27
- package/dist/json-file-Bgv9TLcX.d.ts +0 -74
- package/dist/llm-8ecJmwKJ.d.ts +0 -23
- package/dist/llm-judge-BuF80-5-.d.ts +0 -75
- /package/dist/{chunk-ZXTQJFGL.js.map → chunk-Q6GSOHOP.js.map} +0 -0
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import { BaseResponseV1, BaseTestCaseV1 } from "../schemas/index.js";
|
|
2
|
+
import { BaseScoreV1 } from "../schemas/score";
|
|
3
|
+
export declare abstract class AbstractAggregator {
|
|
4
|
+
abstract push(params: {
|
|
5
|
+
score: BaseScoreV1;
|
|
6
|
+
testCase?: BaseTestCaseV1;
|
|
7
|
+
response?: BaseResponseV1;
|
|
8
|
+
}): Promise<void>;
|
|
9
|
+
abstract aggregate(config?: unknown): Promise<unknown>;
|
|
10
|
+
}
|
|
@@ -1,67 +1,2 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
import z__default from 'zod';
|
|
4
|
-
import '../provider-DnEBdl1n.js';
|
|
5
|
-
import '../abstract-BdgLjkNC.js';
|
|
6
|
-
|
|
7
|
-
declare abstract class AbstractAggregator {
|
|
8
|
-
abstract push(params: {
|
|
9
|
-
score: BaseScoreV1;
|
|
10
|
-
testCase?: BaseTestCaseV1;
|
|
11
|
-
response?: BaseResponseV1;
|
|
12
|
-
}): Promise<void>;
|
|
13
|
-
abstract aggregate(config?: unknown): Promise<unknown>;
|
|
14
|
-
}
|
|
15
|
-
|
|
16
|
-
/**
|
|
17
|
-
* Provides a set of fields that holds information about the LLM and its response.
|
|
18
|
-
*/
|
|
19
|
-
declare const ExtensionLLMResponseFieldsV1: {
|
|
20
|
-
data: z__default.ZodString;
|
|
21
|
-
modelSlug: z__default.ZodString;
|
|
22
|
-
provider: z__default.ZodString;
|
|
23
|
-
systemPromptId: z__default.ZodOptional<z__default.ZodString>;
|
|
24
|
-
inputTokensUsed: z__default.ZodOptional<z__default.ZodNumber>;
|
|
25
|
-
outputTokensUsed: z__default.ZodOptional<z__default.ZodNumber>;
|
|
26
|
-
inputCost: z__default.ZodOptional<z__default.ZodString>;
|
|
27
|
-
outputCost: z__default.ZodOptional<z__default.ZodString>;
|
|
28
|
-
};
|
|
29
|
-
|
|
30
|
-
/**
|
|
31
|
-
* Provides a set of fields that holds information about the LLM model
|
|
32
|
-
* that was used to judge the response.
|
|
33
|
-
*/
|
|
34
|
-
declare const ExtensionLLMAsAJudgeScoreFieldsV1: {
|
|
35
|
-
scorerAISystemPrompt: z__default.ZodOptional<z__default.ZodString>;
|
|
36
|
-
scorerAISystemPromptId: z__default.ZodOptional<z__default.ZodString>;
|
|
37
|
-
scorerAIProvider: z__default.ZodOptional<z__default.ZodString>;
|
|
38
|
-
scorerAIModelSlug: z__default.ZodOptional<z__default.ZodString>;
|
|
39
|
-
scorerAIInputTokensUsed: z__default.ZodOptional<z__default.ZodNumber>;
|
|
40
|
-
scorerAIOutputTokensUsed: z__default.ZodOptional<z__default.ZodNumber>;
|
|
41
|
-
scorerAIInputCost: z__default.ZodOptional<z__default.ZodString>;
|
|
42
|
-
scorerAIOutputCost: z__default.ZodOptional<z__default.ZodString>;
|
|
43
|
-
};
|
|
44
|
-
|
|
45
|
-
declare class AvgAggregator extends AbstractAggregator {
|
|
46
|
-
private separateBySystemPrompt;
|
|
47
|
-
private scores;
|
|
48
|
-
constructor(params: {
|
|
49
|
-
separateBySystemPrompt?: boolean;
|
|
50
|
-
});
|
|
51
|
-
push(params: {
|
|
52
|
-
score: InferExtension<typeof ExtensionLLMAsAJudgeScoreFieldsV1, BaseScoreV1>;
|
|
53
|
-
response: InferExtension<typeof ExtensionLLMResponseFieldsV1, BaseResponseV1>;
|
|
54
|
-
testCase?: BaseTestCaseV1;
|
|
55
|
-
}): Promise<void>;
|
|
56
|
-
aggregate(): Promise<{
|
|
57
|
-
[k: string]: {
|
|
58
|
-
average: number;
|
|
59
|
-
model: string;
|
|
60
|
-
total: number;
|
|
61
|
-
count: number;
|
|
62
|
-
systemPromptId?: string;
|
|
63
|
-
};
|
|
64
|
-
}>;
|
|
65
|
-
}
|
|
66
|
-
|
|
67
|
-
export { AbstractAggregator, AvgAggregator };
|
|
1
|
+
export * from "./abstract";
|
|
2
|
+
export * from "./llm/avg";
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import { BaseResponseV1, BaseScoreV1, BaseTestCaseV1 } from "../../schemas/index.js";
|
|
2
|
+
import { AbstractAggregator } from "../abstract";
|
|
3
|
+
import { InferExtension } from "../../utilities";
|
|
4
|
+
import { ExtensionLLMResponseFieldsV1 } from "../../schemas/extensions/response/llm";
|
|
5
|
+
import { ExtensionLLMAsAJudgeScoreFieldsV1 } from "../../schemas/extensions/score/llm-as-a-judge-scorer";
|
|
6
|
+
export declare class AvgAggregator extends AbstractAggregator {
|
|
7
|
+
private separateBySystemPrompt;
|
|
8
|
+
private scores;
|
|
9
|
+
constructor(params: {
|
|
10
|
+
separateBySystemPrompt?: boolean;
|
|
11
|
+
});
|
|
12
|
+
push(params: {
|
|
13
|
+
score: InferExtension<typeof ExtensionLLMAsAJudgeScoreFieldsV1, BaseScoreV1>;
|
|
14
|
+
response: InferExtension<typeof ExtensionLLMResponseFieldsV1, BaseResponseV1>;
|
|
15
|
+
testCase?: BaseTestCaseV1;
|
|
16
|
+
}): Promise<void>;
|
|
17
|
+
aggregate(): Promise<{
|
|
18
|
+
[k: string]: {
|
|
19
|
+
average: number;
|
|
20
|
+
model: string;
|
|
21
|
+
total: number;
|
|
22
|
+
count: number;
|
|
23
|
+
systemPromptId?: string;
|
|
24
|
+
};
|
|
25
|
+
}>;
|
|
26
|
+
}
|
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
import { AbstractLLMProvider } from "../../../providers/index.js";
|
|
2
|
+
import { z } from "zod";
|
|
3
|
+
/**
|
|
4
|
+
* Runners are the backbone of a benchmark. They are responsible for executing the test cases and producing
|
|
5
|
+
* the responses and scores. As the benchmark builder, you define what schemas the runner can work with,
|
|
6
|
+
* which providers and scorers are supported and what configurations can be passed by the caller
|
|
7
|
+
* at the execution phase.
|
|
8
|
+
*/
|
|
9
|
+
export declare const echoBasicRunner: ((params: {
|
|
10
|
+
testCase: {
|
|
11
|
+
id: string;
|
|
12
|
+
input: string;
|
|
13
|
+
namespace: "example.peerbench.ai";
|
|
14
|
+
kind: "llm/echo-basic.tc";
|
|
15
|
+
schemaVersion: 1;
|
|
16
|
+
metadata?: Record<string, unknown> | undefined;
|
|
17
|
+
};
|
|
18
|
+
provider: AbstractLLMProvider;
|
|
19
|
+
scorer?: undefined;
|
|
20
|
+
runConfig: {
|
|
21
|
+
model: string;
|
|
22
|
+
};
|
|
23
|
+
idGenerators?: {
|
|
24
|
+
response?: import("../../../index.js").IdGenerator;
|
|
25
|
+
score?: import("../../../index.js").IdGenerator;
|
|
26
|
+
};
|
|
27
|
+
}) => Promise<{
|
|
28
|
+
response: {
|
|
29
|
+
startedAt: number;
|
|
30
|
+
completedAt: number;
|
|
31
|
+
id: string;
|
|
32
|
+
testCaseId: string;
|
|
33
|
+
data: string;
|
|
34
|
+
modelSlug: string;
|
|
35
|
+
provider: string;
|
|
36
|
+
namespace: "example.peerbench.ai";
|
|
37
|
+
kind: "llm/echo-basic.rs";
|
|
38
|
+
schemaVersion: 1;
|
|
39
|
+
metadata?: Record<string, unknown> | undefined;
|
|
40
|
+
systemPromptId?: string | undefined;
|
|
41
|
+
inputTokensUsed?: number | undefined;
|
|
42
|
+
outputTokensUsed?: number | undefined;
|
|
43
|
+
inputCost?: string | undefined;
|
|
44
|
+
outputCost?: string | undefined;
|
|
45
|
+
};
|
|
46
|
+
score?: {
|
|
47
|
+
id: string;
|
|
48
|
+
value: number;
|
|
49
|
+
responseId: string;
|
|
50
|
+
scoringMethod: "ai" | "human" | "algo";
|
|
51
|
+
namespace: "example.peerbench.ai";
|
|
52
|
+
kind: "llm/echo-basic.sc";
|
|
53
|
+
schemaVersion: 1;
|
|
54
|
+
metadata?: Record<string, unknown> | undefined;
|
|
55
|
+
explanation?: string | undefined;
|
|
56
|
+
} | undefined;
|
|
57
|
+
}>) & {
|
|
58
|
+
config: {
|
|
59
|
+
runConfigSchema: z.ZodObject<{
|
|
60
|
+
model: z.ZodString;
|
|
61
|
+
}, z.core.$strip>;
|
|
62
|
+
schemaSets: [{
|
|
63
|
+
readonly testCase: z.ZodObject<Omit<{
|
|
64
|
+
id: z.ZodString;
|
|
65
|
+
namespace: z.ZodString;
|
|
66
|
+
schemaVersion: z.ZodNumber;
|
|
67
|
+
kind: z.ZodString;
|
|
68
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
69
|
+
}, "kind" | "namespace" | "schemaVersion"> & {
|
|
70
|
+
input: z.ZodString;
|
|
71
|
+
} & {
|
|
72
|
+
namespace: z.ZodLiteral<"example.peerbench.ai">;
|
|
73
|
+
kind: z.ZodLiteral<"llm/echo-basic.tc">;
|
|
74
|
+
schemaVersion: z.ZodLiteral<1>;
|
|
75
|
+
}, z.core.$strip> & {
|
|
76
|
+
new: (input: Omit<{
|
|
77
|
+
id: string;
|
|
78
|
+
input: string;
|
|
79
|
+
namespace: "example.peerbench.ai";
|
|
80
|
+
kind: "llm/echo-basic.tc";
|
|
81
|
+
schemaVersion: 1;
|
|
82
|
+
metadata?: Record<string, unknown> | undefined;
|
|
83
|
+
}, "kind" | "namespace" | "schemaVersion">) => {
|
|
84
|
+
id: string;
|
|
85
|
+
input: string;
|
|
86
|
+
namespace: "example.peerbench.ai";
|
|
87
|
+
kind: "llm/echo-basic.tc";
|
|
88
|
+
schemaVersion: 1;
|
|
89
|
+
metadata?: Record<string, unknown> | undefined;
|
|
90
|
+
};
|
|
91
|
+
newWithId(input: Omit<{
|
|
92
|
+
id: string;
|
|
93
|
+
input: string;
|
|
94
|
+
namespace: "example.peerbench.ai";
|
|
95
|
+
kind: "llm/echo-basic.tc";
|
|
96
|
+
schemaVersion: 1;
|
|
97
|
+
metadata?: Record<string, unknown> | undefined;
|
|
98
|
+
}, "kind" | "id" | "namespace" | "schemaVersion">, generator: import("../../../index.js").IdGenerator): Promise<{
|
|
99
|
+
id: string;
|
|
100
|
+
input: string;
|
|
101
|
+
namespace: "example.peerbench.ai";
|
|
102
|
+
kind: "llm/echo-basic.tc";
|
|
103
|
+
schemaVersion: 1;
|
|
104
|
+
metadata?: Record<string, unknown> | undefined;
|
|
105
|
+
}>;
|
|
106
|
+
};
|
|
107
|
+
readonly response: z.ZodObject<Omit<{
|
|
108
|
+
id: z.ZodString;
|
|
109
|
+
namespace: z.ZodString;
|
|
110
|
+
schemaVersion: z.ZodNumber;
|
|
111
|
+
kind: z.ZodString;
|
|
112
|
+
startedAt: z.ZodNumber;
|
|
113
|
+
completedAt: z.ZodNumber;
|
|
114
|
+
testCaseId: z.ZodString;
|
|
115
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
116
|
+
}, "kind" | "namespace" | "schemaVersion"> & {
|
|
117
|
+
data: z.ZodString;
|
|
118
|
+
modelSlug: z.ZodString;
|
|
119
|
+
provider: z.ZodString;
|
|
120
|
+
systemPromptId: z.ZodOptional<z.ZodString>;
|
|
121
|
+
inputTokensUsed: z.ZodOptional<z.ZodNumber>;
|
|
122
|
+
outputTokensUsed: z.ZodOptional<z.ZodNumber>;
|
|
123
|
+
inputCost: z.ZodOptional<z.ZodString>;
|
|
124
|
+
outputCost: z.ZodOptional<z.ZodString>;
|
|
125
|
+
} & {
|
|
126
|
+
namespace: z.ZodLiteral<"example.peerbench.ai">;
|
|
127
|
+
kind: z.ZodLiteral<"llm/echo-basic.rs">;
|
|
128
|
+
schemaVersion: z.ZodLiteral<1>;
|
|
129
|
+
}, z.core.$strip> & {
|
|
130
|
+
new: (input: Omit<{
|
|
131
|
+
startedAt: number;
|
|
132
|
+
completedAt: number;
|
|
133
|
+
id: string;
|
|
134
|
+
testCaseId: string;
|
|
135
|
+
data: string;
|
|
136
|
+
modelSlug: string;
|
|
137
|
+
provider: string;
|
|
138
|
+
namespace: "example.peerbench.ai";
|
|
139
|
+
kind: "llm/echo-basic.rs";
|
|
140
|
+
schemaVersion: 1;
|
|
141
|
+
metadata?: Record<string, unknown> | undefined;
|
|
142
|
+
systemPromptId?: string | undefined;
|
|
143
|
+
inputTokensUsed?: number | undefined;
|
|
144
|
+
outputTokensUsed?: number | undefined;
|
|
145
|
+
inputCost?: string | undefined;
|
|
146
|
+
outputCost?: string | undefined;
|
|
147
|
+
}, "kind" | "namespace" | "schemaVersion">) => {
|
|
148
|
+
startedAt: number;
|
|
149
|
+
completedAt: number;
|
|
150
|
+
id: string;
|
|
151
|
+
testCaseId: string;
|
|
152
|
+
data: string;
|
|
153
|
+
modelSlug: string;
|
|
154
|
+
provider: string;
|
|
155
|
+
namespace: "example.peerbench.ai";
|
|
156
|
+
kind: "llm/echo-basic.rs";
|
|
157
|
+
schemaVersion: 1;
|
|
158
|
+
metadata?: Record<string, unknown> | undefined;
|
|
159
|
+
systemPromptId?: string | undefined;
|
|
160
|
+
inputTokensUsed?: number | undefined;
|
|
161
|
+
outputTokensUsed?: number | undefined;
|
|
162
|
+
inputCost?: string | undefined;
|
|
163
|
+
outputCost?: string | undefined;
|
|
164
|
+
};
|
|
165
|
+
newWithId(input: Omit<{
|
|
166
|
+
startedAt: number;
|
|
167
|
+
completedAt: number;
|
|
168
|
+
id: string;
|
|
169
|
+
testCaseId: string;
|
|
170
|
+
data: string;
|
|
171
|
+
modelSlug: string;
|
|
172
|
+
provider: string;
|
|
173
|
+
namespace: "example.peerbench.ai";
|
|
174
|
+
kind: "llm/echo-basic.rs";
|
|
175
|
+
schemaVersion: 1;
|
|
176
|
+
metadata?: Record<string, unknown> | undefined;
|
|
177
|
+
systemPromptId?: string | undefined;
|
|
178
|
+
inputTokensUsed?: number | undefined;
|
|
179
|
+
outputTokensUsed?: number | undefined;
|
|
180
|
+
inputCost?: string | undefined;
|
|
181
|
+
outputCost?: string | undefined;
|
|
182
|
+
}, "kind" | "id" | "namespace" | "schemaVersion">, generator: import("../../../index.js").IdGenerator): Promise<{
|
|
183
|
+
startedAt: number;
|
|
184
|
+
completedAt: number;
|
|
185
|
+
id: string;
|
|
186
|
+
testCaseId: string;
|
|
187
|
+
data: string;
|
|
188
|
+
modelSlug: string;
|
|
189
|
+
provider: string;
|
|
190
|
+
namespace: "example.peerbench.ai";
|
|
191
|
+
kind: "llm/echo-basic.rs";
|
|
192
|
+
schemaVersion: 1;
|
|
193
|
+
metadata?: Record<string, unknown> | undefined;
|
|
194
|
+
systemPromptId?: string | undefined;
|
|
195
|
+
inputTokensUsed?: number | undefined;
|
|
196
|
+
outputTokensUsed?: number | undefined;
|
|
197
|
+
inputCost?: string | undefined;
|
|
198
|
+
outputCost?: string | undefined;
|
|
199
|
+
}>;
|
|
200
|
+
};
|
|
201
|
+
readonly score: z.ZodObject<Omit<{
|
|
202
|
+
id: z.ZodString;
|
|
203
|
+
namespace: z.ZodString;
|
|
204
|
+
kind: z.ZodString;
|
|
205
|
+
schemaVersion: z.ZodNumber;
|
|
206
|
+
value: z.ZodNumber;
|
|
207
|
+
responseId: z.ZodString;
|
|
208
|
+
explanation: z.ZodOptional<z.ZodString>;
|
|
209
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
210
|
+
scoringMethod: z.ZodEnum<{
|
|
211
|
+
readonly ai: "ai";
|
|
212
|
+
readonly human: "human";
|
|
213
|
+
readonly algo: "algo";
|
|
214
|
+
}>;
|
|
215
|
+
}, "kind" | "namespace" | "schemaVersion"> & {
|
|
216
|
+
namespace: z.ZodLiteral<"example.peerbench.ai">;
|
|
217
|
+
kind: z.ZodLiteral<"llm/echo-basic.sc">;
|
|
218
|
+
schemaVersion: z.ZodLiteral<1>;
|
|
219
|
+
}, z.core.$strip> & {
|
|
220
|
+
new: (input: Omit<{
|
|
221
|
+
id: string;
|
|
222
|
+
value: number;
|
|
223
|
+
responseId: string;
|
|
224
|
+
scoringMethod: "ai" | "human" | "algo";
|
|
225
|
+
namespace: "example.peerbench.ai";
|
|
226
|
+
kind: "llm/echo-basic.sc";
|
|
227
|
+
schemaVersion: 1;
|
|
228
|
+
metadata?: Record<string, unknown> | undefined;
|
|
229
|
+
explanation?: string | undefined;
|
|
230
|
+
}, "kind" | "namespace" | "schemaVersion">) => {
|
|
231
|
+
id: string;
|
|
232
|
+
value: number;
|
|
233
|
+
responseId: string;
|
|
234
|
+
scoringMethod: "ai" | "human" | "algo";
|
|
235
|
+
namespace: "example.peerbench.ai";
|
|
236
|
+
kind: "llm/echo-basic.sc";
|
|
237
|
+
schemaVersion: 1;
|
|
238
|
+
metadata?: Record<string, unknown> | undefined;
|
|
239
|
+
explanation?: string | undefined;
|
|
240
|
+
};
|
|
241
|
+
newWithId(input: Omit<{
|
|
242
|
+
id: string;
|
|
243
|
+
value: number;
|
|
244
|
+
responseId: string;
|
|
245
|
+
scoringMethod: "ai" | "human" | "algo";
|
|
246
|
+
namespace: "example.peerbench.ai";
|
|
247
|
+
kind: "llm/echo-basic.sc";
|
|
248
|
+
schemaVersion: 1;
|
|
249
|
+
metadata?: Record<string, unknown> | undefined;
|
|
250
|
+
explanation?: string | undefined;
|
|
251
|
+
}, "kind" | "id" | "namespace" | "schemaVersion">, generator: import("../../../index.js").IdGenerator): Promise<{
|
|
252
|
+
id: string;
|
|
253
|
+
value: number;
|
|
254
|
+
responseId: string;
|
|
255
|
+
scoringMethod: "ai" | "human" | "algo";
|
|
256
|
+
namespace: "example.peerbench.ai";
|
|
257
|
+
kind: "llm/echo-basic.sc";
|
|
258
|
+
schemaVersion: 1;
|
|
259
|
+
metadata?: Record<string, unknown> | undefined;
|
|
260
|
+
explanation?: string | undefined;
|
|
261
|
+
}>;
|
|
262
|
+
};
|
|
263
|
+
}];
|
|
264
|
+
providers: [typeof AbstractLLMProvider];
|
|
265
|
+
scorers: [];
|
|
266
|
+
parseRunConfig?: boolean;
|
|
267
|
+
defaults?: {
|
|
268
|
+
scorer?: undefined;
|
|
269
|
+
responseIdGenerator?: import("../../../index.js").IdGenerator;
|
|
270
|
+
scoreIdGenerator?: import("../../../index.js").IdGenerator;
|
|
271
|
+
} | undefined;
|
|
272
|
+
};
|
|
273
|
+
};
|
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
/**
|
|
3
|
+
* Schemas are the core components of a benchmark. They are the entities which hold the data.
|
|
4
|
+
* There are three different base types of entities inside a benchmark, which are connected to each other:
|
|
5
|
+
* - Test case: Entity that holds a single input/task. That task is often paired with an expected output (e.g. multiple choice questions, Q&A)
|
|
6
|
+
* - Response: Entity that holds an output for a test case in a structured format.
|
|
7
|
+
* - Score: Entity that holds scoring result for a response.
|
|
8
|
+
*
|
|
9
|
+
* The hierarchy starts from test case, then response, then score. Every test case definition
|
|
10
|
+
* must have a corresponding response and every response must have a corresponding score definition.
|
|
11
|
+
* This way we can easily identify each object in its own context.
|
|
12
|
+
*
|
|
13
|
+
* All of those entities are versioned via the `schemaVersion` field. Versions are recommended to
|
|
14
|
+
* start at 1 and increment whenever the schema changes in a breaking way. Handling version changes
|
|
15
|
+
* is the runtime's responsibility.
|
|
16
|
+
*
|
|
17
|
+
* The `kind` field is a unique string that identifies the type of the entity. Depending on the
|
|
18
|
+
* entity type, kind field is extended as "*.tc" (test case), "*.rs" (response) or "*.sc" (score). Defining this field
|
|
19
|
+
* allows you to have strong type safety and identity for the raw objects that are stored outside your codebase.
|
|
20
|
+
*
|
|
21
|
+
* The `namespace` field is another string identifier that tells you where this object definition comes from.
|
|
22
|
+
* It is recommended to use domain-like syntax for it (such as "peerbench.ai" or "example.com") to avoid
|
|
23
|
+
* collisions with other namespaces out there.
|
|
24
|
+
*
|
|
25
|
+
* Each entity also has a field that refers to the upper level entity in the hierarchy via its ID.
|
|
26
|
+
* For instance: Scores have a `responseId` field and responses have a `testCaseId` field.
|
|
27
|
+
* This way we can easily identify which entity is generated for which entity.
|
|
28
|
+
*
|
|
29
|
+
* Managing those relations is the runtime's responsibility.
|
|
30
|
+
*
|
|
31
|
+
* Each entity must follow its base definition defined by the SDK.
|
|
32
|
+
* As long as the given `baseSchema` is an extension of the SDK level base schema (e.g `BaseTestCaseSchemaV1`),
|
|
33
|
+
* you can pass a different base schema. You can extend the well-known fields for your schemas using the extensions
|
|
34
|
+
* provided by the SDK. For instance, below we use `ExtensionLLMResponseFieldsV1` to extend the
|
|
35
|
+
* response schema with LLM specific fields (because our benchmark focuses on LLM chat models).
|
|
36
|
+
*/
|
|
37
|
+
export declare const EchoBasicNamespace: "example.peerbench.ai";
|
|
38
|
+
export declare const EchoBasicKind: "llm/echo-basic";
|
|
39
|
+
export declare const EchoBasicTestCaseSchemaV1: z.ZodObject<Omit<{
|
|
40
|
+
id: z.ZodString;
|
|
41
|
+
namespace: z.ZodString;
|
|
42
|
+
schemaVersion: z.ZodNumber;
|
|
43
|
+
kind: z.ZodString;
|
|
44
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
45
|
+
}, "kind" | "namespace" | "schemaVersion"> & {
|
|
46
|
+
input: z.ZodString;
|
|
47
|
+
} & {
|
|
48
|
+
namespace: z.ZodLiteral<"example.peerbench.ai">;
|
|
49
|
+
kind: z.ZodLiteral<"llm/echo-basic.tc">;
|
|
50
|
+
schemaVersion: z.ZodLiteral<1>;
|
|
51
|
+
}, z.core.$strip> & {
|
|
52
|
+
new: (input: Omit<{
|
|
53
|
+
id: string;
|
|
54
|
+
input: string;
|
|
55
|
+
namespace: "example.peerbench.ai";
|
|
56
|
+
kind: "llm/echo-basic.tc";
|
|
57
|
+
schemaVersion: 1;
|
|
58
|
+
metadata?: Record<string, unknown> | undefined;
|
|
59
|
+
}, "kind" | "namespace" | "schemaVersion">) => {
|
|
60
|
+
id: string;
|
|
61
|
+
input: string;
|
|
62
|
+
namespace: "example.peerbench.ai";
|
|
63
|
+
kind: "llm/echo-basic.tc";
|
|
64
|
+
schemaVersion: 1;
|
|
65
|
+
metadata?: Record<string, unknown> | undefined;
|
|
66
|
+
};
|
|
67
|
+
newWithId(input: Omit<{
|
|
68
|
+
id: string;
|
|
69
|
+
input: string;
|
|
70
|
+
namespace: "example.peerbench.ai";
|
|
71
|
+
kind: "llm/echo-basic.tc";
|
|
72
|
+
schemaVersion: 1;
|
|
73
|
+
metadata?: Record<string, unknown> | undefined;
|
|
74
|
+
}, "kind" | "id" | "namespace" | "schemaVersion">, generator: import("../../../../index.js").IdGenerator): Promise<{
|
|
75
|
+
id: string;
|
|
76
|
+
input: string;
|
|
77
|
+
namespace: "example.peerbench.ai";
|
|
78
|
+
kind: "llm/echo-basic.tc";
|
|
79
|
+
schemaVersion: 1;
|
|
80
|
+
metadata?: Record<string, unknown> | undefined;
|
|
81
|
+
}>;
|
|
82
|
+
};
|
|
83
|
+
export type EchoBasicTestCaseV1 = z.infer<typeof EchoBasicTestCaseSchemaV1>;
|
|
84
|
+
export declare const EchoBasicResponseSchemaV1: z.ZodObject<Omit<{
|
|
85
|
+
id: z.ZodString;
|
|
86
|
+
namespace: z.ZodString;
|
|
87
|
+
schemaVersion: z.ZodNumber;
|
|
88
|
+
kind: z.ZodString;
|
|
89
|
+
startedAt: z.ZodNumber;
|
|
90
|
+
completedAt: z.ZodNumber;
|
|
91
|
+
testCaseId: z.ZodString;
|
|
92
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
93
|
+
}, "kind" | "namespace" | "schemaVersion"> & {
|
|
94
|
+
data: z.ZodString;
|
|
95
|
+
modelSlug: z.ZodString;
|
|
96
|
+
provider: z.ZodString;
|
|
97
|
+
systemPromptId: z.ZodOptional<z.ZodString>;
|
|
98
|
+
inputTokensUsed: z.ZodOptional<z.ZodNumber>;
|
|
99
|
+
outputTokensUsed: z.ZodOptional<z.ZodNumber>;
|
|
100
|
+
inputCost: z.ZodOptional<z.ZodString>;
|
|
101
|
+
outputCost: z.ZodOptional<z.ZodString>;
|
|
102
|
+
} & {
|
|
103
|
+
namespace: z.ZodLiteral<"example.peerbench.ai">;
|
|
104
|
+
kind: z.ZodLiteral<"llm/echo-basic.rs">;
|
|
105
|
+
schemaVersion: z.ZodLiteral<1>;
|
|
106
|
+
}, z.core.$strip> & {
|
|
107
|
+
new: (input: Omit<{
|
|
108
|
+
startedAt: number;
|
|
109
|
+
completedAt: number;
|
|
110
|
+
id: string;
|
|
111
|
+
testCaseId: string;
|
|
112
|
+
data: string;
|
|
113
|
+
modelSlug: string;
|
|
114
|
+
provider: string;
|
|
115
|
+
namespace: "example.peerbench.ai";
|
|
116
|
+
kind: "llm/echo-basic.rs";
|
|
117
|
+
schemaVersion: 1;
|
|
118
|
+
metadata?: Record<string, unknown> | undefined;
|
|
119
|
+
systemPromptId?: string | undefined;
|
|
120
|
+
inputTokensUsed?: number | undefined;
|
|
121
|
+
outputTokensUsed?: number | undefined;
|
|
122
|
+
inputCost?: string | undefined;
|
|
123
|
+
outputCost?: string | undefined;
|
|
124
|
+
}, "kind" | "namespace" | "schemaVersion">) => {
|
|
125
|
+
startedAt: number;
|
|
126
|
+
completedAt: number;
|
|
127
|
+
id: string;
|
|
128
|
+
testCaseId: string;
|
|
129
|
+
data: string;
|
|
130
|
+
modelSlug: string;
|
|
131
|
+
provider: string;
|
|
132
|
+
namespace: "example.peerbench.ai";
|
|
133
|
+
kind: "llm/echo-basic.rs";
|
|
134
|
+
schemaVersion: 1;
|
|
135
|
+
metadata?: Record<string, unknown> | undefined;
|
|
136
|
+
systemPromptId?: string | undefined;
|
|
137
|
+
inputTokensUsed?: number | undefined;
|
|
138
|
+
outputTokensUsed?: number | undefined;
|
|
139
|
+
inputCost?: string | undefined;
|
|
140
|
+
outputCost?: string | undefined;
|
|
141
|
+
};
|
|
142
|
+
newWithId(input: Omit<{
|
|
143
|
+
startedAt: number;
|
|
144
|
+
completedAt: number;
|
|
145
|
+
id: string;
|
|
146
|
+
testCaseId: string;
|
|
147
|
+
data: string;
|
|
148
|
+
modelSlug: string;
|
|
149
|
+
provider: string;
|
|
150
|
+
namespace: "example.peerbench.ai";
|
|
151
|
+
kind: "llm/echo-basic.rs";
|
|
152
|
+
schemaVersion: 1;
|
|
153
|
+
metadata?: Record<string, unknown> | undefined;
|
|
154
|
+
systemPromptId?: string | undefined;
|
|
155
|
+
inputTokensUsed?: number | undefined;
|
|
156
|
+
outputTokensUsed?: number | undefined;
|
|
157
|
+
inputCost?: string | undefined;
|
|
158
|
+
outputCost?: string | undefined;
|
|
159
|
+
}, "kind" | "id" | "namespace" | "schemaVersion">, generator: import("../../../../index.js").IdGenerator): Promise<{
|
|
160
|
+
startedAt: number;
|
|
161
|
+
completedAt: number;
|
|
162
|
+
id: string;
|
|
163
|
+
testCaseId: string;
|
|
164
|
+
data: string;
|
|
165
|
+
modelSlug: string;
|
|
166
|
+
provider: string;
|
|
167
|
+
namespace: "example.peerbench.ai";
|
|
168
|
+
kind: "llm/echo-basic.rs";
|
|
169
|
+
schemaVersion: 1;
|
|
170
|
+
metadata?: Record<string, unknown> | undefined;
|
|
171
|
+
systemPromptId?: string | undefined;
|
|
172
|
+
inputTokensUsed?: number | undefined;
|
|
173
|
+
outputTokensUsed?: number | undefined;
|
|
174
|
+
inputCost?: string | undefined;
|
|
175
|
+
outputCost?: string | undefined;
|
|
176
|
+
}>;
|
|
177
|
+
};
|
|
178
|
+
export type EchoBasicResponseV1 = z.infer<typeof EchoBasicResponseSchemaV1>;
|
|
179
|
+
export declare const EchoBasicScoreSchemaV1: z.ZodObject<Omit<{
|
|
180
|
+
id: z.ZodString;
|
|
181
|
+
namespace: z.ZodString;
|
|
182
|
+
kind: z.ZodString;
|
|
183
|
+
schemaVersion: z.ZodNumber;
|
|
184
|
+
value: z.ZodNumber;
|
|
185
|
+
responseId: z.ZodString;
|
|
186
|
+
explanation: z.ZodOptional<z.ZodString>;
|
|
187
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
188
|
+
scoringMethod: z.ZodEnum<{
|
|
189
|
+
readonly ai: "ai";
|
|
190
|
+
readonly human: "human";
|
|
191
|
+
readonly algo: "algo";
|
|
192
|
+
}>;
|
|
193
|
+
}, "kind" | "namespace" | "schemaVersion"> & {
|
|
194
|
+
namespace: z.ZodLiteral<"example.peerbench.ai">;
|
|
195
|
+
kind: z.ZodLiteral<"llm/echo-basic.sc">;
|
|
196
|
+
schemaVersion: z.ZodLiteral<1>;
|
|
197
|
+
}, z.core.$strip> & {
|
|
198
|
+
new: (input: Omit<{
|
|
199
|
+
id: string;
|
|
200
|
+
value: number;
|
|
201
|
+
responseId: string;
|
|
202
|
+
scoringMethod: "ai" | "human" | "algo";
|
|
203
|
+
namespace: "example.peerbench.ai";
|
|
204
|
+
kind: "llm/echo-basic.sc";
|
|
205
|
+
schemaVersion: 1;
|
|
206
|
+
metadata?: Record<string, unknown> | undefined;
|
|
207
|
+
explanation?: string | undefined;
|
|
208
|
+
}, "kind" | "namespace" | "schemaVersion">) => {
|
|
209
|
+
id: string;
|
|
210
|
+
value: number;
|
|
211
|
+
responseId: string;
|
|
212
|
+
scoringMethod: "ai" | "human" | "algo";
|
|
213
|
+
namespace: "example.peerbench.ai";
|
|
214
|
+
kind: "llm/echo-basic.sc";
|
|
215
|
+
schemaVersion: 1;
|
|
216
|
+
metadata?: Record<string, unknown> | undefined;
|
|
217
|
+
explanation?: string | undefined;
|
|
218
|
+
};
|
|
219
|
+
newWithId(input: Omit<{
|
|
220
|
+
id: string;
|
|
221
|
+
value: number;
|
|
222
|
+
responseId: string;
|
|
223
|
+
scoringMethod: "ai" | "human" | "algo";
|
|
224
|
+
namespace: "example.peerbench.ai";
|
|
225
|
+
kind: "llm/echo-basic.sc";
|
|
226
|
+
schemaVersion: 1;
|
|
227
|
+
metadata?: Record<string, unknown> | undefined;
|
|
228
|
+
explanation?: string | undefined;
|
|
229
|
+
}, "kind" | "id" | "namespace" | "schemaVersion">, generator: import("../../../../index.js").IdGenerator): Promise<{
|
|
230
|
+
id: string;
|
|
231
|
+
value: number;
|
|
232
|
+
responseId: string;
|
|
233
|
+
scoringMethod: "ai" | "human" | "algo";
|
|
234
|
+
namespace: "example.peerbench.ai";
|
|
235
|
+
kind: "llm/echo-basic.sc";
|
|
236
|
+
schemaVersion: 1;
|
|
237
|
+
metadata?: Record<string, unknown> | undefined;
|
|
238
|
+
explanation?: string | undefined;
|
|
239
|
+
}>;
|
|
240
|
+
};
|
|
241
|
+
export type EchoBasicScoreV1 = z.infer<typeof EchoBasicScoreSchemaV1>;
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import { JSONFileStorage } from "../../../../storages/json-file";
|
|
2
|
+
import { EchoBasicResponseV1, EchoBasicScoreV1, EchoBasicTestCaseV1 } from "../schema-sets/echo.v1";
|
|
3
|
+
/**
|
|
4
|
+
* Storages are the abstractions that are responsible for persisting benchmark entities outside your codebase.
|
|
5
|
+
*
|
|
6
|
+
* This example uses one of the predefined Storage implementations from the SDK which is JSONFileStorage.
|
|
7
|
+
* This storage allows you to store benchmark entities in a regular JSON file as an array of objects.
|
|
8
|
+
*/
|
|
9
|
+
export declare class EchoBasicJSONStorage extends JSONFileStorage<EchoBasicTestCaseV1 | EchoBasicResponseV1 | EchoBasicScoreV1> {
|
|
10
|
+
constructor(config: {
|
|
11
|
+
path: string;
|
|
12
|
+
chunkSize?: number;
|
|
13
|
+
});
|
|
14
|
+
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import { FileStorage } from "../../../../storages/file";
|
|
2
|
+
import { EchoBasicResponseV1, EchoBasicScoreV1, EchoBasicTestCaseV1 } from "../schema-sets/echo.v1";
|
|
3
|
+
/**
|
|
4
|
+
* Example custom file storage implementation that stores the benchmark entities in a custom formatted text file.
|
|
5
|
+
*
|
|
6
|
+
* The custom file format expected is:
|
|
7
|
+
*
|
|
8
|
+
* ```
|
|
9
|
+
* BEGIN PEERBENCH ENTITY
|
|
10
|
+
* id: <id>
|
|
11
|
+
* namespace: <namespace>
|
|
12
|
+
* kind: <kind>
|
|
13
|
+
* schemaVersion: <schemaVersion>
|
|
14
|
+
* json: <json>
|
|
15
|
+
* END PEERBENCH ENTITY
|
|
16
|
+
* ```
|
|
17
|
+
*/
|
|
18
|
+
export declare class EchoBasicTextStorage extends FileStorage<EchoBasicEntityV1> {
|
|
19
|
+
constructor(config: {
|
|
20
|
+
path: string;
|
|
21
|
+
});
|
|
22
|
+
}
|
|
23
|
+
type EchoBasicEntityV1 = EchoBasicTestCaseV1 | EchoBasicResponseV1 | EchoBasicScoreV1;
|
|
24
|
+
export {};
|