peerbench 0.0.10 → 0.0.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +71 -58
- package/dist/benchmarks/examples/echo-basic/runner.d.ts +11 -254
- package/dist/benchmarks/examples/echo-basic/schema-sets/echo.v1.d.ts +25 -25
- package/dist/benchmarks/examples/exact-match-scorer/runner.d.ts +38 -386
- package/dist/benchmarks/examples/exact-match-scorer/schema-sets/exact-match.v1.d.ts +25 -25
- package/dist/benchmarks/examples/text-transform/runner.d.ts +32 -480
- package/dist/benchmarks/examples/text-transform/schema-sets/echo.v1.d.ts +25 -25
- package/dist/benchmarks/examples/text-transform/schema-sets/reverse.v1.d.ts +25 -25
- package/dist/benchmarks/index.js +180 -248
- package/dist/benchmarks/index.js.map +1 -1
- package/dist/benchmarks/peerbench/index.d.ts +2 -1
- package/dist/benchmarks/peerbench/mcq-runner.d.ts +78 -0
- package/dist/benchmarks/peerbench/qa-runner.d.ts +77 -0
- package/dist/benchmarks/peerbench/schema-sets/mcq.v1.d.ts +25 -25
- package/dist/benchmarks/peerbench/schema-sets/multi-turn.v1.d.ts +25 -25
- package/dist/benchmarks/peerbench/schema-sets/qa.v1.d.ts +25 -25
- package/dist/chunk-6WDCU5BP.js +9 -0
- package/dist/chunk-6WDCU5BP.js.map +1 -0
- package/dist/{chunk-YY33MNMV.js → chunk-7KMGLEYP.js} +2 -2
- package/dist/{chunk-TRNCF2BG.js → chunk-HBGC6BDW.js} +1 -1
- package/dist/chunk-HBGC6BDW.js.map +1 -0
- package/dist/{chunk-HMQYGCKI.js → chunk-ZJWSK4VO.js} +1 -1
- package/dist/chunk-ZJWSK4VO.js.map +1 -0
- package/dist/dev.d.ts +22 -0
- package/dist/helpers/define-runner.d.ts +2 -45
- package/dist/index.js +2 -2
- package/dist/providers/ai-sdk.d.ts +24 -0
- package/dist/providers/callables/callable.d.ts +4 -0
- package/dist/providers/callables/llm.d.ts +41 -0
- package/dist/providers/example/echo.d.ts +12 -11
- package/dist/providers/example/restapi.d.ts +11 -18
- package/dist/providers/index.d.ts +4 -2
- package/dist/providers/index.js +380 -9
- package/dist/providers/index.js.map +1 -1
- package/dist/providers/mastra.d.ts +16 -21
- package/dist/providers/openai.d.ts +25 -10
- package/dist/providers/openrouter.d.ts +6 -8
- package/dist/schemas/extensions/index.js +1 -1
- package/dist/schemas/extensions/response/llm.d.ts +17 -0
- package/dist/schemas/index.js +2 -2
- package/dist/schemas/llm/index.js +36 -7
- package/dist/schemas/llm/index.js.map +1 -1
- package/dist/schemas/llm/simple-system-prompt.d.ts +3 -3
- package/dist/schemas/llm/system-prompt.d.ts +7 -7
- package/dist/schemas/response.d.ts +7 -7
- package/dist/schemas/schema-definer.d.ts +5 -5
- package/dist/schemas/score.d.ts +7 -7
- package/dist/schemas/test-case.d.ts +7 -7
- package/dist/scorers/abstract.d.ts +1 -1
- package/dist/scorers/index.js +377 -7
- package/dist/scorers/index.js.map +1 -1
- package/dist/scorers/llm-judge.d.ts +6 -6
- package/dist/types/index.d.ts +0 -5
- package/dist/types/runner.d.ts +13 -17
- package/package.json +8 -7
- package/dist/benchmarks/peerbench/runner.d.ts +0 -754
- package/dist/chunk-3JHDJEY3.js +0 -374
- package/dist/chunk-3JHDJEY3.js.map +0 -1
- package/dist/chunk-HMQYGCKI.js.map +0 -1
- package/dist/chunk-Q6GSOHOP.js +0 -44
- package/dist/chunk-Q6GSOHOP.js.map +0 -1
- package/dist/chunk-RTEAK4II.js +0 -37
- package/dist/chunk-RTEAK4II.js.map +0 -1
- package/dist/chunk-SMLNDQFX.js +0 -244
- package/dist/chunk-SMLNDQFX.js.map +0 -1
- package/dist/chunk-TRNCF2BG.js.map +0 -1
- package/dist/providers/abstract/llm.d.ts +0 -20
- /package/dist/{chunk-YY33MNMV.js.map → chunk-7KMGLEYP.js.map} +0 -0
- /package/dist/providers/{abstract/provider.d.ts → abstract.d.ts} +0 -0
package/README.md
CHANGED
|
@@ -1,21 +1,21 @@
|
|
|
1
1
|
# `peerbench` SDK
|
|
2
2
|
|
|
3
|
-
This package is the shared
|
|
3
|
+
This package is the shared "domain core" for _building benchmarks_ in a standardized, portable way. It gives you a consistent set of _persistable entities_ (schemas + types), and a consistent set of _runtime contracts_ (runners, scorers, providers, storages, aggregators) so the same benchmark can run in a CLI, a web app, a worker, or anything else.
|
|
4
4
|
|
|
5
|
-
If you
|
|
5
|
+
If you're implementing a new benchmark, the SDK is the part that keeps it portable instead of glued to one runtime. If you're integrating peerbench SDK into a runtime, the SDK is the part you don't want to rewrite in every repo.
|
|
6
6
|
|
|
7
7
|
> - _Runtime_ refers to the codebase that uses peerbench SDK (a CLI, a web app, a background service, etc.)
|
|
8
8
|
> - This package does not support CommonJS
|
|
9
9
|
|
|
10
10
|
## What is a benchmark?
|
|
11
11
|
|
|
12
|
-
A benchmark is a structured way to ask:
|
|
12
|
+
A benchmark is a structured way to ask: "How well does a system perform on a set of tasks, under a set of rules?"
|
|
13
13
|
|
|
14
14
|
If you look at widely-used benchmarks, the pattern is always the same even when the tasks are different:
|
|
15
15
|
|
|
16
16
|
- In MMLU-Pro, each item is a question (often multiple choice) and the score is about correctness across categories.
|
|
17
17
|
- In BIG-bench style task suites, you have many different task types and you want a consistent way to run and score them.
|
|
18
|
-
- In HELM-style evaluations, you care about not only
|
|
18
|
+
- In HELM-style evaluations, you care about not only "did it answer correctly", but also how you ran it (prompting setup, constraints, metadata) and how you report results.
|
|
19
19
|
|
|
20
20
|
Those benchmarks differ in details, but they all boil down to the same building blocks: a dataset of test cases, a way to run a system on each test case, and a way to score the output. The peerbench SDK is designed so these patterns can be represented with the same portable shape.
|
|
21
21
|
|
|
@@ -23,7 +23,7 @@ Those benchmarks differ in details, but they all boil down to the same building
|
|
|
23
23
|
|
|
24
24
|
Now that we agree on what a benchmark is, we can talk about how peerbench represents it.
|
|
25
25
|
|
|
26
|
-
peerbench is deliberately boring here. It doesn
|
|
26
|
+
peerbench is deliberately boring here. It doesn't try to invent a new "benchmark framework". It gives you a small set of building blocks that you can compose. If you understand these pieces, you can read any benchmark implementation and know where to look.
|
|
27
27
|
|
|
28
28
|
### Entities (the things you store)
|
|
29
29
|
|
|
@@ -40,8 +40,8 @@ Everything else in the SDK exists to create these entities in a predictable way.
|
|
|
40
40
|
Three fields show up everywhere:
|
|
41
41
|
|
|
42
42
|
- `kind` tells you _what type_ of entity something is. It is a stable string you pick (descriptive).
|
|
43
|
-
- `schemaVersion` tells you _which version_ of that entity shape you
|
|
44
|
-
- `namespace` tells you which
|
|
43
|
+
- `schemaVersion` tells you _which version_ of that entity shape you're looking at.
|
|
44
|
+
- `namespace` tells you which "owner" defines that kind (e.g. peerbench.ai).
|
|
45
45
|
|
|
46
46
|
This is why peerbench leans on [Zod](https://zod.dev) schemas: it keeps the persisted data contract explicit and runtime-validated.
|
|
47
47
|
|
|
@@ -57,36 +57,36 @@ peerbench SDK provides some pre-defined storage abstractions you can use out of
|
|
|
57
57
|
|
|
58
58
|
### Provider (how you talk to a model)
|
|
59
59
|
|
|
60
|
-
A provider is the runtime bridge to a model endpoint.
|
|
60
|
+
A provider is the runtime bridge to a model endpoint. It's an API client factory that creates **callables** — lightweight objects that have the model/agent configuration baked in and can be invoked by runners.
|
|
61
61
|
|
|
62
|
-
Runners do not talk to
|
|
62
|
+
Runners do not talk to providers directly. They receive a **callable** (today that's `CallableLLM` for message-based LLM communication). That gives you a clean seam:
|
|
63
63
|
|
|
64
|
-
- benchmark code doesn
|
|
64
|
+
- benchmark code doesn't care where the model lives
|
|
65
65
|
- runtimes can swap providers without rewriting benchmark code
|
|
66
66
|
|
|
67
|
+
Each provider extends `AbstractProvider`. Provider classes have the custom logic to interact with a third-party service. They are expected to have a factory method such as `.model()` or `.agent()` that returns a callable unit for their infrastructure. Callable units are what runners receive — they provide a uniform interface so runners don't need any special treatment for the underlying provider implementation.
|
|
68
|
+
|
|
67
69
|
If you already have your own service in front of the model, you can still model it as a provider. The example in `packages/sdk-0.2/src/providers/example/restapi.ts` shows this pattern.
|
|
68
70
|
|
|
69
71
|
### Runner (how you execute one test case)
|
|
70
72
|
|
|
71
|
-
A runner is the execution part of a benchmark. A runner function takes whatever inputs it needs, calls a
|
|
73
|
+
A runner is the execution part of a benchmark. A runner function takes whatever inputs it needs, calls a callable's `forward()`, and produces a `Response`. It may also produce a `Score` (via a scorer).
|
|
72
74
|
|
|
73
|
-
Runners are
|
|
75
|
+
Runners are intended to be "per test case" because it keeps the benchmark logic small and easy to compose. Running a whole dataset is orchestration, and orchestration is where runtimes differ (parallelism, retries, persistence, budgets, progress UI).
|
|
74
76
|
|
|
75
77
|
There is no restriction that a benchmark must have exactly one runner. You can export multiple runner functions (different modes, different prompts, different providers, different scoring strategies). The runtime just needs to pick the runner it wants to use.
|
|
76
78
|
|
|
77
|
-
One practical convention you will see in the examples is `runConfig`. It’s runner-specific, and it’s recommended to keep it as a simple JSON-serializable object so you can store it alongside your run and reproduce it later.
|
|
78
|
-
|
|
79
79
|
### Scorer (how you judge a response)
|
|
80
80
|
|
|
81
81
|
A scorer produces a numeric result. Some scorers are deterministic (same input → same output). Some scorers are non-deterministic (for example "LLM as a judge").
|
|
82
82
|
|
|
83
|
-
A scorer takes what it needs. Sometimes it
|
|
83
|
+
A scorer takes what it needs. Sometimes it's "expected + actual strings". Sometimes it's "a list of required fields + a JSON output". The runner decides what to pass into the scorer, because the runner is the piece that knows how the benchmark is structured.
|
|
84
84
|
|
|
85
85
|
If your benchmark can be scored in multiple ways, a runner can accept multiple scorer implementations and choose between them. The examples in `packages/sdk-0.2/src/benchmarks/examples/` show what that looks like in code.
|
|
86
86
|
|
|
87
87
|
## What the SDK does vs what the runtime does
|
|
88
88
|
|
|
89
|
-
It
|
|
89
|
+
It's easy to accidentally push "too much responsibility" to the SDK and end up with a framework you can't escape. It's also easy to push "too much responsibility" to the runtime and end up with copy-pasted benchmark logic.
|
|
90
90
|
|
|
91
91
|
This SDK tries to draw a clean line:
|
|
92
92
|
|
|
@@ -106,9 +106,9 @@ The runtime is responsible for:
|
|
|
106
106
|
|
|
107
107
|
If you keep that boundary, benchmarks stay portable and runtimes stay free to evolve.
|
|
108
108
|
|
|
109
|
-
## If you
|
|
109
|
+
## If you're implementing a benchmark
|
|
110
110
|
|
|
111
|
-
The easiest way to think about
|
|
111
|
+
The easiest way to think about "implementing a benchmark" is: you are implementing a small domain module that can be imported by multiple runtimes. That means your job is mostly about making your benchmark _self-contained and explicit_.
|
|
112
112
|
|
|
113
113
|
In practice, the benchmark implementer is responsible for:
|
|
114
114
|
|
|
@@ -121,11 +121,11 @@ Once those are in place, runtimes can focus on orchestration and product concern
|
|
|
121
121
|
|
|
122
122
|
Peerbench does not assume your new benchmarks will be part of the SDK itself. The normal expectation is that your benchmark code lives in your runtime (or in its own package), and it uses `peerbench` as a dependency for schemas, base types, and contracts.
|
|
123
123
|
|
|
124
|
-
Benchmarks can implement everything themselves, but they can also reuse the SDK
|
|
124
|
+
Benchmarks can implement everything themselves, but they can also reuse the SDK's predefined building blocks. Where possible, it is recommended to stick with SDK base types (e.g. `AbstractProvider`, `CallableLLM`) and implementations, because doing so increases compatibility with other tooling that speaks "Peerbench entities".
|
|
125
125
|
|
|
126
126
|
## A benchmark, step by step
|
|
127
127
|
|
|
128
|
-
A
|
|
128
|
+
A "benchmark" in this SDK is not a magical object. It is a small folder that exports a few well-known pieces. The simplest complete benchmark usually includes:
|
|
129
129
|
|
|
130
130
|
1. schemas (test case / response / score)
|
|
131
131
|
2. a runner (how a single test case is executed)
|
|
@@ -137,7 +137,6 @@ You can see a compact, end-to-end reference in:
|
|
|
137
137
|
- `packages/sdk-0.2/src/benchmarks/examples/echo-basic/`
|
|
138
138
|
- `packages/sdk-0.2/src/benchmarks/examples/text-transform/`
|
|
139
139
|
- `packages/sdk-0.2/src/benchmarks/examples/exact-match-scorer/`
|
|
140
|
-
- `packages/sdk-0.2/src/benchmarks/examples/mcq-qa-templated/`
|
|
141
140
|
|
|
142
141
|
### 1) Schemas: the source of truth
|
|
143
142
|
|
|
@@ -149,9 +148,9 @@ In `packages/sdk-0.2/src/benchmarks/examples/echo-basic/schema-sets/echo.v1.ts`
|
|
|
149
148
|
- define a response schema for that test case
|
|
150
149
|
- define a score schema for that response
|
|
151
150
|
|
|
152
|
-
The hierarchy starts from test case → response → score, and we keep the relationship by storing IDs (`testCaseId`, `responseId`). That relationship is
|
|
151
|
+
The hierarchy starts from test case → response → score, and we keep the relationship by storing IDs (`testCaseId`, `responseId`). That relationship is "real data", so the runtime is usually the one that persists it and queries it.
|
|
153
152
|
|
|
154
|
-
Here is what
|
|
153
|
+
Here is what "defining a test case schema" looks like in practice (trimmed to the idea):
|
|
155
154
|
|
|
156
155
|
```ts
|
|
157
156
|
import { z } from "zod";
|
|
@@ -170,9 +169,22 @@ export const MyTestCaseSchemaV1 = defineTestCaseSchema({
|
|
|
170
169
|
|
|
171
170
|
### 2) Provider: how runners talk to models
|
|
172
171
|
|
|
173
|
-
Runners communicate with models through a provider
|
|
172
|
+
Runners communicate with models through a callable created by a provider. That's how the same benchmark can run against different backends without rewriting the benchmark.
|
|
173
|
+
|
|
174
|
+
A provider is an API client factory. You create a provider once (with API keys, rate limiters, etc.), then call its factory method to create callables — lightweight objects with a `forward()` method and the model baked in:
|
|
175
|
+
|
|
176
|
+
```ts
|
|
177
|
+
import { OpenRouterProvider } from "peerbench/providers";
|
|
178
|
+
|
|
179
|
+
const provider = new OpenRouterProvider({ apiKey: "..." });
|
|
180
|
+
const target = provider.model({ model: "gpt-4o" });
|
|
181
|
+
|
|
182
|
+
// target.slug === "gpt-4o"
|
|
183
|
+
// target.provider.kind === "peerbench.ai/llm/openrouter.ai"
|
|
184
|
+
// target.forward({ messages }) — model is already captured
|
|
185
|
+
```
|
|
174
186
|
|
|
175
|
-
If you already have a service in front of your model, the REST API provider example shows the pattern:
|
|
187
|
+
If you already have a service in front of your model, the REST API provider example (`src/providers/example/restapi.ts`) shows the pattern: extend `AbstractProvider`, implement a factory method that returns a `CallableLLM`, and translate messages to HTTP requests inside the `forward()` arrow function.
|
|
176
188
|
|
|
177
189
|
### 3) Runner: run one test case
|
|
178
190
|
|
|
@@ -182,15 +194,15 @@ This is intentional. Running many test cases is orchestration, and orchestration
|
|
|
182
194
|
|
|
183
195
|
In the example runners (e.g. `packages/sdk-0.2/src/benchmarks/examples/echo-basic/runner.ts`) you can see the responsibilities:
|
|
184
196
|
|
|
185
|
-
- format a test case into
|
|
186
|
-
- call `
|
|
187
|
-
- map
|
|
197
|
+
- format a test case into callable-friendly input (`messages[]`)
|
|
198
|
+
- call `target.forward({ messages })`
|
|
199
|
+
- map the output into a `Response` entity
|
|
188
200
|
- if a scorer is provided, turn scorer output into a `Score` entity
|
|
189
201
|
|
|
190
202
|
Here is the idea in a minimal form:
|
|
191
203
|
|
|
192
204
|
```ts
|
|
193
|
-
const providerResponse = await
|
|
205
|
+
const providerResponse = await target.forward({ messages });
|
|
194
206
|
|
|
195
207
|
const response = ResponseSchemaV1.new({
|
|
196
208
|
id: "runtime-generates-id",
|
|
@@ -198,8 +210,8 @@ const response = ResponseSchemaV1.new({
|
|
|
198
210
|
data: providerResponse.data,
|
|
199
211
|
startedAt: providerResponse.startedAt,
|
|
200
212
|
completedAt: providerResponse.completedAt,
|
|
201
|
-
modelSlug:
|
|
202
|
-
provider: provider.kind,
|
|
213
|
+
modelSlug: target.slug,
|
|
214
|
+
provider: target.provider.kind,
|
|
203
215
|
});
|
|
204
216
|
```
|
|
205
217
|
|
|
@@ -207,7 +219,7 @@ const response = ResponseSchemaV1.new({
|
|
|
207
219
|
|
|
208
220
|
Some benchmarks are easy to score deterministically (string match, regex extraction, set coverage). Some benchmarks need semantic judgment. Some benchmarks want both.
|
|
209
221
|
|
|
210
|
-
That
|
|
222
|
+
That's why scorers are separate objects and why runners can accept more than one scorer implementation.
|
|
211
223
|
|
|
212
224
|
The examples show:
|
|
213
225
|
|
|
@@ -219,11 +231,11 @@ The examples show:
|
|
|
219
231
|
|
|
220
232
|
## Usage: run a single test case end-to-end
|
|
221
233
|
|
|
222
|
-
First, define schemas and a runner (this is the
|
|
234
|
+
First, define schemas and a runner (this is the "portable benchmark code"):
|
|
223
235
|
|
|
224
236
|
```ts
|
|
225
237
|
import { defineRunner, idGeneratorUUIDv7 } from "peerbench";
|
|
226
|
-
import {
|
|
238
|
+
import { CallableLLM } from "peerbench/providers";
|
|
227
239
|
import {
|
|
228
240
|
BaseResponseSchemaV1,
|
|
229
241
|
BaseScoreSchemaV1,
|
|
@@ -232,7 +244,7 @@ import {
|
|
|
232
244
|
defineScoreSchema,
|
|
233
245
|
defineTestCaseSchema,
|
|
234
246
|
} from "peerbench/schemas";
|
|
235
|
-
import {
|
|
247
|
+
import { ExtensionLLMResponseFieldsV1 } from "peerbench/schemas/extensions";
|
|
236
248
|
import z from "zod";
|
|
237
249
|
|
|
238
250
|
const Namespace = "example.peerbench.ai" as const;
|
|
@@ -251,7 +263,7 @@ const ResponseSchemaV1 = defineResponseSchema({
|
|
|
251
263
|
namespace: Namespace,
|
|
252
264
|
kind: Kind,
|
|
253
265
|
schemaVersion: 1,
|
|
254
|
-
fields: { ...
|
|
266
|
+
fields: { ...ExtensionLLMResponseFieldsV1 },
|
|
255
267
|
});
|
|
256
268
|
|
|
257
269
|
const ScoreSchemaV1 = defineScoreSchema({
|
|
@@ -262,22 +274,19 @@ const ScoreSchemaV1 = defineScoreSchema({
|
|
|
262
274
|
fields: {},
|
|
263
275
|
});
|
|
264
276
|
|
|
277
|
+
type TestCaseV1 = z.infer<typeof TestCaseSchemaV1.schema>;
|
|
278
|
+
|
|
265
279
|
export const runner = defineRunner(
|
|
266
|
-
{
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
runConfigSchema: { model: z.string() },
|
|
277
|
-
},
|
|
278
|
-
async ({ testCase, provider, runConfig, idGenerators }) => {
|
|
279
|
-
const providerResponse = await provider.forward({
|
|
280
|
-
model: runConfig.model,
|
|
280
|
+
async (params: {
|
|
281
|
+
testCase: TestCaseV1;
|
|
282
|
+
target: CallableLLM;
|
|
283
|
+
idGenerators?: {
|
|
284
|
+
response?: (input: unknown) => string;
|
|
285
|
+
};
|
|
286
|
+
}) => {
|
|
287
|
+
const { testCase, target } = params;
|
|
288
|
+
|
|
289
|
+
const providerResponse = await target.forward({
|
|
281
290
|
messages: [{ role: "user", content: `Echo:\n${testCase.input}` }],
|
|
282
291
|
});
|
|
283
292
|
|
|
@@ -287,14 +296,14 @@ export const runner = defineRunner(
|
|
|
287
296
|
startedAt: providerResponse.startedAt,
|
|
288
297
|
completedAt: providerResponse.completedAt,
|
|
289
298
|
testCaseId: testCase.id,
|
|
290
|
-
modelSlug:
|
|
291
|
-
provider: provider.kind,
|
|
299
|
+
modelSlug: target.slug,
|
|
300
|
+
provider: target.provider.kind,
|
|
292
301
|
inputTokensUsed: providerResponse.inputTokensUsed,
|
|
293
302
|
outputTokensUsed: providerResponse.outputTokensUsed,
|
|
294
303
|
inputCost: providerResponse.inputCost,
|
|
295
304
|
outputCost: providerResponse.outputCost,
|
|
296
305
|
},
|
|
297
|
-
idGenerators?.response ?? idGeneratorUUIDv7
|
|
306
|
+
params.idGenerators?.response ?? idGeneratorUUIDv7
|
|
298
307
|
);
|
|
299
308
|
|
|
300
309
|
return { response };
|
|
@@ -304,13 +313,18 @@ export const runner = defineRunner(
|
|
|
304
313
|
|
|
305
314
|
## Usage: what the runtime adds (orchestration)
|
|
306
315
|
|
|
307
|
-
Once you have a runner, the runtime
|
|
316
|
+
Once you have a runner, the runtime's job is mostly about repetition and persistence.
|
|
308
317
|
|
|
309
318
|
For example, a very small orchestrator might do:
|
|
310
319
|
|
|
311
320
|
```ts
|
|
321
|
+
import { OpenRouterProvider } from "peerbench/providers";
|
|
322
|
+
|
|
323
|
+
const provider = new OpenRouterProvider({ apiKey: "..." });
|
|
324
|
+
const target = provider.model({ model: "gpt-4o" });
|
|
325
|
+
|
|
312
326
|
for (const testCase of testCases) {
|
|
313
|
-
const result = await runner({ testCase,
|
|
327
|
+
const result = await runner({ testCase, target });
|
|
314
328
|
// store `result.response` and `result.score` somewhere durable
|
|
315
329
|
// decide how to handle errors, retries, progress, and budgets
|
|
316
330
|
}
|
|
@@ -325,9 +339,8 @@ The examples under `packages/sdk-0.2/src/benchmarks/examples/` each teach one id
|
|
|
325
339
|
- `echo-basic`: minimal schema set + runner + storage examples
|
|
326
340
|
- `text-transform`: one runner supports multiple kinds + deterministic scoring
|
|
327
341
|
- `exact-match-scorer`: scorer dispatch pattern (algo scorer vs LLM judge scorer)
|
|
328
|
-
- `mcq-qa-templated`: template variables + MCQ/QA tasks
|
|
329
342
|
|
|
330
343
|
## Design notes
|
|
331
344
|
|
|
332
|
-
- Schemas are runtime-validated (Zod) so
|
|
345
|
+
- Schemas are runtime-validated (Zod) so "type-only drift" doesn't silently corrupt stored data.
|
|
333
346
|
- Runners are per-test-case so they stay small and portable; runtimes keep orchestration control.
|
|
@@ -1,35 +1,19 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import {
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
* at the execution phase.
|
|
8
|
-
*/
|
|
9
|
-
export declare const echoBasicRunner: ((params: {
|
|
10
|
-
testCase: {
|
|
11
|
-
id: string;
|
|
12
|
-
input: string;
|
|
13
|
-
namespace: "example.peerbench.ai";
|
|
14
|
-
kind: "llm/echo-basic.tc";
|
|
15
|
-
schemaVersion: 1;
|
|
16
|
-
metadata?: Record<string, unknown> | undefined;
|
|
17
|
-
};
|
|
18
|
-
provider: AbstractLLMProvider;
|
|
19
|
-
scorer?: undefined;
|
|
20
|
-
runConfig: {
|
|
21
|
-
model: string;
|
|
22
|
-
};
|
|
1
|
+
import { CallableLLM } from "../../../providers/index.js";
|
|
2
|
+
import { IdGenerator } from "../../../types";
|
|
3
|
+
import { EchoBasicTestCaseV1 } from "./schema-sets/echo.v1";
|
|
4
|
+
export declare const echoBasicRunner: (params: {
|
|
5
|
+
testCase: EchoBasicTestCaseV1;
|
|
6
|
+
target: CallableLLM;
|
|
23
7
|
idGenerators?: {
|
|
24
|
-
response?:
|
|
25
|
-
score?:
|
|
8
|
+
response?: IdGenerator;
|
|
9
|
+
score?: IdGenerator;
|
|
26
10
|
};
|
|
27
11
|
}) => Promise<{
|
|
28
12
|
response: {
|
|
29
|
-
startedAt: number;
|
|
30
|
-
completedAt: number;
|
|
31
13
|
id: string;
|
|
32
14
|
testCaseId: string;
|
|
15
|
+
startedAt: number;
|
|
16
|
+
completedAt: number;
|
|
33
17
|
data: string;
|
|
34
18
|
modelSlug: string;
|
|
35
19
|
provider: string;
|
|
@@ -43,231 +27,4 @@ export declare const echoBasicRunner: ((params: {
|
|
|
43
27
|
inputCost?: string | undefined;
|
|
44
28
|
outputCost?: string | undefined;
|
|
45
29
|
};
|
|
46
|
-
|
|
47
|
-
id: string;
|
|
48
|
-
value: number;
|
|
49
|
-
responseId: string;
|
|
50
|
-
scoringMethod: "ai" | "human" | "algo";
|
|
51
|
-
namespace: "example.peerbench.ai";
|
|
52
|
-
kind: "llm/echo-basic.sc";
|
|
53
|
-
schemaVersion: 1;
|
|
54
|
-
metadata?: Record<string, unknown> | undefined;
|
|
55
|
-
explanation?: string | undefined;
|
|
56
|
-
} | undefined;
|
|
57
|
-
}>) & {
|
|
58
|
-
config: {
|
|
59
|
-
runConfigSchema: z.ZodObject<{
|
|
60
|
-
model: z.ZodString;
|
|
61
|
-
}, z.core.$strip>;
|
|
62
|
-
schemaSets: [{
|
|
63
|
-
readonly testCase: z.ZodObject<Omit<{
|
|
64
|
-
id: z.ZodString;
|
|
65
|
-
namespace: z.ZodString;
|
|
66
|
-
schemaVersion: z.ZodNumber;
|
|
67
|
-
kind: z.ZodString;
|
|
68
|
-
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
69
|
-
}, "kind" | "namespace" | "schemaVersion"> & {
|
|
70
|
-
input: z.ZodString;
|
|
71
|
-
} & {
|
|
72
|
-
namespace: z.ZodLiteral<"example.peerbench.ai">;
|
|
73
|
-
kind: z.ZodLiteral<"llm/echo-basic.tc">;
|
|
74
|
-
schemaVersion: z.ZodLiteral<1>;
|
|
75
|
-
}, z.core.$strip> & {
|
|
76
|
-
new: (input: Omit<{
|
|
77
|
-
id: string;
|
|
78
|
-
input: string;
|
|
79
|
-
namespace: "example.peerbench.ai";
|
|
80
|
-
kind: "llm/echo-basic.tc";
|
|
81
|
-
schemaVersion: 1;
|
|
82
|
-
metadata?: Record<string, unknown> | undefined;
|
|
83
|
-
}, "kind" | "namespace" | "schemaVersion">) => {
|
|
84
|
-
id: string;
|
|
85
|
-
input: string;
|
|
86
|
-
namespace: "example.peerbench.ai";
|
|
87
|
-
kind: "llm/echo-basic.tc";
|
|
88
|
-
schemaVersion: 1;
|
|
89
|
-
metadata?: Record<string, unknown> | undefined;
|
|
90
|
-
};
|
|
91
|
-
newWithId(input: Omit<{
|
|
92
|
-
id: string;
|
|
93
|
-
input: string;
|
|
94
|
-
namespace: "example.peerbench.ai";
|
|
95
|
-
kind: "llm/echo-basic.tc";
|
|
96
|
-
schemaVersion: 1;
|
|
97
|
-
metadata?: Record<string, unknown> | undefined;
|
|
98
|
-
}, "kind" | "id" | "namespace" | "schemaVersion">, generator: import("../../../index.js").IdGenerator): Promise<{
|
|
99
|
-
id: string;
|
|
100
|
-
input: string;
|
|
101
|
-
namespace: "example.peerbench.ai";
|
|
102
|
-
kind: "llm/echo-basic.tc";
|
|
103
|
-
schemaVersion: 1;
|
|
104
|
-
metadata?: Record<string, unknown> | undefined;
|
|
105
|
-
}>;
|
|
106
|
-
};
|
|
107
|
-
readonly response: z.ZodObject<Omit<{
|
|
108
|
-
id: z.ZodString;
|
|
109
|
-
namespace: z.ZodString;
|
|
110
|
-
schemaVersion: z.ZodNumber;
|
|
111
|
-
kind: z.ZodString;
|
|
112
|
-
startedAt: z.ZodNumber;
|
|
113
|
-
completedAt: z.ZodNumber;
|
|
114
|
-
testCaseId: z.ZodString;
|
|
115
|
-
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
116
|
-
}, "kind" | "namespace" | "schemaVersion"> & {
|
|
117
|
-
data: z.ZodString;
|
|
118
|
-
modelSlug: z.ZodString;
|
|
119
|
-
provider: z.ZodString;
|
|
120
|
-
systemPromptId: z.ZodOptional<z.ZodString>;
|
|
121
|
-
inputTokensUsed: z.ZodOptional<z.ZodNumber>;
|
|
122
|
-
outputTokensUsed: z.ZodOptional<z.ZodNumber>;
|
|
123
|
-
inputCost: z.ZodOptional<z.ZodString>;
|
|
124
|
-
outputCost: z.ZodOptional<z.ZodString>;
|
|
125
|
-
} & {
|
|
126
|
-
namespace: z.ZodLiteral<"example.peerbench.ai">;
|
|
127
|
-
kind: z.ZodLiteral<"llm/echo-basic.rs">;
|
|
128
|
-
schemaVersion: z.ZodLiteral<1>;
|
|
129
|
-
}, z.core.$strip> & {
|
|
130
|
-
new: (input: Omit<{
|
|
131
|
-
startedAt: number;
|
|
132
|
-
completedAt: number;
|
|
133
|
-
id: string;
|
|
134
|
-
testCaseId: string;
|
|
135
|
-
data: string;
|
|
136
|
-
modelSlug: string;
|
|
137
|
-
provider: string;
|
|
138
|
-
namespace: "example.peerbench.ai";
|
|
139
|
-
kind: "llm/echo-basic.rs";
|
|
140
|
-
schemaVersion: 1;
|
|
141
|
-
metadata?: Record<string, unknown> | undefined;
|
|
142
|
-
systemPromptId?: string | undefined;
|
|
143
|
-
inputTokensUsed?: number | undefined;
|
|
144
|
-
outputTokensUsed?: number | undefined;
|
|
145
|
-
inputCost?: string | undefined;
|
|
146
|
-
outputCost?: string | undefined;
|
|
147
|
-
}, "kind" | "namespace" | "schemaVersion">) => {
|
|
148
|
-
startedAt: number;
|
|
149
|
-
completedAt: number;
|
|
150
|
-
id: string;
|
|
151
|
-
testCaseId: string;
|
|
152
|
-
data: string;
|
|
153
|
-
modelSlug: string;
|
|
154
|
-
provider: string;
|
|
155
|
-
namespace: "example.peerbench.ai";
|
|
156
|
-
kind: "llm/echo-basic.rs";
|
|
157
|
-
schemaVersion: 1;
|
|
158
|
-
metadata?: Record<string, unknown> | undefined;
|
|
159
|
-
systemPromptId?: string | undefined;
|
|
160
|
-
inputTokensUsed?: number | undefined;
|
|
161
|
-
outputTokensUsed?: number | undefined;
|
|
162
|
-
inputCost?: string | undefined;
|
|
163
|
-
outputCost?: string | undefined;
|
|
164
|
-
};
|
|
165
|
-
newWithId(input: Omit<{
|
|
166
|
-
startedAt: number;
|
|
167
|
-
completedAt: number;
|
|
168
|
-
id: string;
|
|
169
|
-
testCaseId: string;
|
|
170
|
-
data: string;
|
|
171
|
-
modelSlug: string;
|
|
172
|
-
provider: string;
|
|
173
|
-
namespace: "example.peerbench.ai";
|
|
174
|
-
kind: "llm/echo-basic.rs";
|
|
175
|
-
schemaVersion: 1;
|
|
176
|
-
metadata?: Record<string, unknown> | undefined;
|
|
177
|
-
systemPromptId?: string | undefined;
|
|
178
|
-
inputTokensUsed?: number | undefined;
|
|
179
|
-
outputTokensUsed?: number | undefined;
|
|
180
|
-
inputCost?: string | undefined;
|
|
181
|
-
outputCost?: string | undefined;
|
|
182
|
-
}, "kind" | "id" | "namespace" | "schemaVersion">, generator: import("../../../index.js").IdGenerator): Promise<{
|
|
183
|
-
startedAt: number;
|
|
184
|
-
completedAt: number;
|
|
185
|
-
id: string;
|
|
186
|
-
testCaseId: string;
|
|
187
|
-
data: string;
|
|
188
|
-
modelSlug: string;
|
|
189
|
-
provider: string;
|
|
190
|
-
namespace: "example.peerbench.ai";
|
|
191
|
-
kind: "llm/echo-basic.rs";
|
|
192
|
-
schemaVersion: 1;
|
|
193
|
-
metadata?: Record<string, unknown> | undefined;
|
|
194
|
-
systemPromptId?: string | undefined;
|
|
195
|
-
inputTokensUsed?: number | undefined;
|
|
196
|
-
outputTokensUsed?: number | undefined;
|
|
197
|
-
inputCost?: string | undefined;
|
|
198
|
-
outputCost?: string | undefined;
|
|
199
|
-
}>;
|
|
200
|
-
};
|
|
201
|
-
readonly score: z.ZodObject<Omit<{
|
|
202
|
-
id: z.ZodString;
|
|
203
|
-
namespace: z.ZodString;
|
|
204
|
-
kind: z.ZodString;
|
|
205
|
-
schemaVersion: z.ZodNumber;
|
|
206
|
-
value: z.ZodNumber;
|
|
207
|
-
responseId: z.ZodString;
|
|
208
|
-
explanation: z.ZodOptional<z.ZodString>;
|
|
209
|
-
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
210
|
-
scoringMethod: z.ZodEnum<{
|
|
211
|
-
readonly ai: "ai";
|
|
212
|
-
readonly human: "human";
|
|
213
|
-
readonly algo: "algo";
|
|
214
|
-
}>;
|
|
215
|
-
}, "kind" | "namespace" | "schemaVersion"> & {
|
|
216
|
-
namespace: z.ZodLiteral<"example.peerbench.ai">;
|
|
217
|
-
kind: z.ZodLiteral<"llm/echo-basic.sc">;
|
|
218
|
-
schemaVersion: z.ZodLiteral<1>;
|
|
219
|
-
}, z.core.$strip> & {
|
|
220
|
-
new: (input: Omit<{
|
|
221
|
-
id: string;
|
|
222
|
-
value: number;
|
|
223
|
-
responseId: string;
|
|
224
|
-
scoringMethod: "ai" | "human" | "algo";
|
|
225
|
-
namespace: "example.peerbench.ai";
|
|
226
|
-
kind: "llm/echo-basic.sc";
|
|
227
|
-
schemaVersion: 1;
|
|
228
|
-
metadata?: Record<string, unknown> | undefined;
|
|
229
|
-
explanation?: string | undefined;
|
|
230
|
-
}, "kind" | "namespace" | "schemaVersion">) => {
|
|
231
|
-
id: string;
|
|
232
|
-
value: number;
|
|
233
|
-
responseId: string;
|
|
234
|
-
scoringMethod: "ai" | "human" | "algo";
|
|
235
|
-
namespace: "example.peerbench.ai";
|
|
236
|
-
kind: "llm/echo-basic.sc";
|
|
237
|
-
schemaVersion: 1;
|
|
238
|
-
metadata?: Record<string, unknown> | undefined;
|
|
239
|
-
explanation?: string | undefined;
|
|
240
|
-
};
|
|
241
|
-
newWithId(input: Omit<{
|
|
242
|
-
id: string;
|
|
243
|
-
value: number;
|
|
244
|
-
responseId: string;
|
|
245
|
-
scoringMethod: "ai" | "human" | "algo";
|
|
246
|
-
namespace: "example.peerbench.ai";
|
|
247
|
-
kind: "llm/echo-basic.sc";
|
|
248
|
-
schemaVersion: 1;
|
|
249
|
-
metadata?: Record<string, unknown> | undefined;
|
|
250
|
-
explanation?: string | undefined;
|
|
251
|
-
}, "kind" | "id" | "namespace" | "schemaVersion">, generator: import("../../../index.js").IdGenerator): Promise<{
|
|
252
|
-
id: string;
|
|
253
|
-
value: number;
|
|
254
|
-
responseId: string;
|
|
255
|
-
scoringMethod: "ai" | "human" | "algo";
|
|
256
|
-
namespace: "example.peerbench.ai";
|
|
257
|
-
kind: "llm/echo-basic.sc";
|
|
258
|
-
schemaVersion: 1;
|
|
259
|
-
metadata?: Record<string, unknown> | undefined;
|
|
260
|
-
explanation?: string | undefined;
|
|
261
|
-
}>;
|
|
262
|
-
};
|
|
263
|
-
}];
|
|
264
|
-
providers: [typeof AbstractLLMProvider];
|
|
265
|
-
scorers: [];
|
|
266
|
-
parseRunConfig?: boolean;
|
|
267
|
-
defaults?: {
|
|
268
|
-
scorer?: undefined;
|
|
269
|
-
responseIdGenerator?: import("../../../index.js").IdGenerator;
|
|
270
|
-
scoreIdGenerator?: import("../../../index.js").IdGenerator;
|
|
271
|
-
} | undefined;
|
|
272
|
-
};
|
|
273
|
-
};
|
|
30
|
+
}>;
|