peerbench 0.0.10 → 0.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. package/README.md +71 -58
  2. package/dist/benchmarks/examples/echo-basic/runner.d.ts +11 -254
  3. package/dist/benchmarks/examples/echo-basic/schema-sets/echo.v1.d.ts +25 -25
  4. package/dist/benchmarks/examples/exact-match-scorer/runner.d.ts +38 -386
  5. package/dist/benchmarks/examples/exact-match-scorer/schema-sets/exact-match.v1.d.ts +25 -25
  6. package/dist/benchmarks/examples/text-transform/runner.d.ts +32 -480
  7. package/dist/benchmarks/examples/text-transform/schema-sets/echo.v1.d.ts +25 -25
  8. package/dist/benchmarks/examples/text-transform/schema-sets/reverse.v1.d.ts +25 -25
  9. package/dist/benchmarks/index.js +180 -248
  10. package/dist/benchmarks/index.js.map +1 -1
  11. package/dist/benchmarks/peerbench/index.d.ts +2 -1
  12. package/dist/benchmarks/peerbench/mcq-runner.d.ts +78 -0
  13. package/dist/benchmarks/peerbench/qa-runner.d.ts +77 -0
  14. package/dist/benchmarks/peerbench/schema-sets/mcq.v1.d.ts +25 -25
  15. package/dist/benchmarks/peerbench/schema-sets/multi-turn.v1.d.ts +25 -25
  16. package/dist/benchmarks/peerbench/schema-sets/qa.v1.d.ts +25 -25
  17. package/dist/chunk-6WDCU5BP.js +9 -0
  18. package/dist/chunk-6WDCU5BP.js.map +1 -0
  19. package/dist/{chunk-YY33MNMV.js → chunk-7KMGLEYP.js} +2 -2
  20. package/dist/{chunk-TRNCF2BG.js → chunk-HBGC6BDW.js} +1 -1
  21. package/dist/chunk-HBGC6BDW.js.map +1 -0
  22. package/dist/{chunk-HMQYGCKI.js → chunk-ZJWSK4VO.js} +1 -1
  23. package/dist/chunk-ZJWSK4VO.js.map +1 -0
  24. package/dist/dev.d.ts +22 -0
  25. package/dist/helpers/define-runner.d.ts +2 -45
  26. package/dist/index.js +2 -2
  27. package/dist/providers/ai-sdk.d.ts +24 -0
  28. package/dist/providers/callables/callable.d.ts +4 -0
  29. package/dist/providers/callables/llm.d.ts +41 -0
  30. package/dist/providers/example/echo.d.ts +12 -11
  31. package/dist/providers/example/restapi.d.ts +11 -18
  32. package/dist/providers/index.d.ts +4 -2
  33. package/dist/providers/index.js +380 -9
  34. package/dist/providers/index.js.map +1 -1
  35. package/dist/providers/mastra.d.ts +16 -21
  36. package/dist/providers/openai.d.ts +25 -10
  37. package/dist/providers/openrouter.d.ts +6 -8
  38. package/dist/schemas/extensions/index.js +1 -1
  39. package/dist/schemas/extensions/response/llm.d.ts +17 -0
  40. package/dist/schemas/index.js +2 -2
  41. package/dist/schemas/llm/index.js +36 -7
  42. package/dist/schemas/llm/index.js.map +1 -1
  43. package/dist/schemas/llm/simple-system-prompt.d.ts +3 -3
  44. package/dist/schemas/llm/system-prompt.d.ts +7 -7
  45. package/dist/schemas/response.d.ts +7 -7
  46. package/dist/schemas/schema-definer.d.ts +5 -5
  47. package/dist/schemas/score.d.ts +7 -7
  48. package/dist/schemas/test-case.d.ts +7 -7
  49. package/dist/scorers/abstract.d.ts +1 -1
  50. package/dist/scorers/index.js +377 -7
  51. package/dist/scorers/index.js.map +1 -1
  52. package/dist/scorers/llm-judge.d.ts +6 -6
  53. package/dist/types/index.d.ts +0 -5
  54. package/dist/types/runner.d.ts +13 -17
  55. package/package.json +8 -7
  56. package/dist/benchmarks/peerbench/runner.d.ts +0 -754
  57. package/dist/chunk-3JHDJEY3.js +0 -374
  58. package/dist/chunk-3JHDJEY3.js.map +0 -1
  59. package/dist/chunk-HMQYGCKI.js.map +0 -1
  60. package/dist/chunk-Q6GSOHOP.js +0 -44
  61. package/dist/chunk-Q6GSOHOP.js.map +0 -1
  62. package/dist/chunk-RTEAK4II.js +0 -37
  63. package/dist/chunk-RTEAK4II.js.map +0 -1
  64. package/dist/chunk-SMLNDQFX.js +0 -244
  65. package/dist/chunk-SMLNDQFX.js.map +0 -1
  66. package/dist/chunk-TRNCF2BG.js.map +0 -1
  67. package/dist/providers/abstract/llm.d.ts +0 -20
  68. package/dist/{chunk-YY33MNMV.js.map → chunk-7KMGLEYP.js.map} +0 -0
  69. package/dist/providers/{abstract/provider.d.ts → abstract.d.ts} +0 -0
package/README.md CHANGED
@@ -1,21 +1,21 @@
1
1
  # `peerbench` SDK
2
2
 
3
- This package is the shared domain core for _building benchmarks_ in a standardized, portable way. It gives you a consistent set of _persistable entities_ (schemas + types), and a consistent set of _runtime contracts_ (runners, scorers, providers, storages, aggregators) so the same benchmark can run in a CLI, a web app, a worker, or anything else.
3
+ This package is the shared "domain core" for _building benchmarks_ in a standardized, portable way. It gives you a consistent set of _persistable entities_ (schemas + types), and a consistent set of _runtime contracts_ (runners, scorers, providers, storages, aggregators) so the same benchmark can run in a CLI, a web app, a worker, or anything else.
4
4
 
5
- If you’re implementing a new benchmark, the SDK is the part that keeps it portable instead of glued to one runtime. If you’re integrating peerbench SDK into a runtime, the SDK is the part you don’t want to rewrite in every repo.
5
+ If you're implementing a new benchmark, the SDK is the part that keeps it portable instead of glued to one runtime. If you're integrating peerbench SDK into a runtime, the SDK is the part you don't want to rewrite in every repo.
6
6
 
7
7
  > - _Runtime_ refers to the codebase that uses peerbench SDK (a CLI, a webapp, a background service etc.)
8
8
  > - This package does not support CommonJS
9
9
 
10
10
  ## What is a benchmark?
11
11
 
12
- A benchmark is a structured way to ask: “How well does a system perform on a set of tasks, under a set of rules?”
12
+ A benchmark is a structured way to ask: "How well does a system perform on a set of tasks, under a set of rules?"
13
13
 
14
14
  If you look at widely-used benchmarks, the pattern is always the same even when the tasks are different:
15
15
 
16
16
  - In MMLU-Pro, each item is a question (often multiple choice) and the score is about correctness across categories.
17
17
  - In BIG-bench style task suites, you have many different task types and you want a consistent way to run and score them.
18
- - In HELM-style evaluations, you care about not only “did it answer correctly”, but also how you ran it (prompting setup, constraints, metadata) and how you report results.
18
+ - In HELM-style evaluations, you care about not only "did it answer correctly", but also how you ran it (prompting setup, constraints, metadata) and how you report results.
19
19
 
20
20
  Those benchmarks differ in details, but they all boil down to the same building blocks: a dataset of test cases, a way to run a system on each test case, and a way to score the output. The peerbench SDK is designed so these patterns can be represented with the same portable shape.
21
21
 
@@ -23,7 +23,7 @@ Those benchmarks differ in details, but they all boil down to the same building
23
23
 
24
24
  Now that we agree on what a benchmark is, we can talk about how peerbench represents it.
25
25
 
26
- peerbench is deliberately boring here. It doesn’t try to invent a new “benchmark framework”. It gives you a small set of building blocks that you can compose. If you understand these pieces, you can read any benchmark implementation and know where to look.
26
+ peerbench is deliberately boring here. It doesn't try to invent a new "benchmark framework". It gives you a small set of building blocks that you can compose. If you understand these pieces, you can read any benchmark implementation and know where to look.
27
27
 
28
28
  ### Entities (the things you store)
29
29
 
@@ -40,8 +40,8 @@ Everything else in the SDK exists to create these entities in a predictable way.
40
40
  Three fields show up everywhere:
41
41
 
42
42
  - `kind` tells you _what type_ of entity something is. It is a stable string you pick (descriptive).
43
- - `schemaVersion` tells you _which version_ of that entity shape you’re looking at.
44
- - `namespace` tells you which owner defines that kind (e.g peerbench.ai).
43
+ - `schemaVersion` tells you _which version_ of that entity shape you're looking at.
44
+ - `namespace` tells you which "owner" defines that kind (e.g peerbench.ai).
45
45
 
46
46
  This is why peerbench leans on [Zod](https://zod.dev) schemas: it keeps the persisted data contract explicit and runtime-validated.
47
47
 
@@ -57,36 +57,36 @@ peerbench SDK provides some pre-defined storage abstractions you can use out of
57
57
 
58
58
  ### Provider (how you talk to a model)
59
59
 
60
- A provider is the runtime bridge to a model endpoint.
60
+ A provider is the runtime bridge to a model endpoint. It's an API client factory that creates **callables** — lightweight objects that have the model/agent configuration baked in and can be invoked by runners.
61
61
 
62
- Runners do not talk to models directly. They call a provider abstraction (today that’s `AbstractLLMProvider` for message-based LLM communication). That gives you a clean seam:
62
+ Runners do not talk to providers directly. They receive a **callable** (today that's `CallableLLM` for message-based LLM communication). That gives you a clean seam:
63
63
 
64
- - benchmark code doesn’t care where the model lives
64
+ - benchmark code doesn't care where the model lives
65
65
  - runtimes can swap providers without rewriting benchmark code
66
66
 
67
+ Each provider extends `AbstractProvider`. Provider classes have the custom logic to interact with a 3rd party service. They are expected to have a factory method such as `.model()` or `.agent()` that returns a callable unit for their infrastructure. Callable units are what runners receive — they provide a uniform interface so runners don't need any special treatment for the underlying provider implementation.
68
+
67
69
  If you already have your own service in front of the model, you can still model it as a provider. The example in `packages/sdk-0.2/src/providers/example/restapi.ts` shows this pattern.
68
70
 
69
71
  ### Runner (how you execute one test case)
70
72
 
71
- A runner is the execution part of a benchmark. A runner function takes whatever inputs it needs, calls a provider, and produces a `Response`. It may also produce a `Score` (via a scorer).
73
+ A runner is the execution part of a benchmark. A runner function takes whatever inputs it needs, calls a callable's `forward()`, and produces a `Response`. It may also produce a `Score` (via a scorer).
72
74
 
73
- Runners are indented to be per test case because it keeps the benchmark logic small and easy to compose. Running a whole dataset is orchestration, and orchestration is where runtimes differ (parallelism, retries, persistence, budgets, progress UI).
75
+ Runners are intended to be "per test case" because it keeps the benchmark logic small and easy to compose. Running a whole dataset is orchestration, and orchestration is where runtimes differ (parallelism, retries, persistence, budgets, progress UI).
74
76
 
75
77
  There is no restriction that a benchmark must have exactly one runner. You can export multiple runner functions (different modes, different prompts, different providers, different scoring strategies). The runtime just needs to pick the runner it wants to use.
76
78
 
77
- One practical convention you will see in the examples is `runConfig`. It’s runner-specific, and it’s recommended to kept as a simple JSON-serializable object so you can store it alongside your run and reproduce it later.
78
-
79
79
  ### Scorer (how you judge a response)
80
80
 
81
81
  A scorer produces a numeric result. Some scorers are deterministic (same input → same output). Some scorers are non-deterministic (for example "LLM as a judge").
82
82
 
83
- A scorer takes what it needs. Sometimes it’s “expected + actual strings”. Sometimes it’s “a list of required fields + a JSON output”. The runner decides what to pass into the scorer, because the runner is the piece that knows how the benchmark is structured.
83
+ A scorer takes what it needs. Sometimes it's "expected + actual strings". Sometimes it's "a list of required fields + a JSON output". The runner decides what to pass into the scorer, because the runner is the piece that knows how the benchmark is structured.
84
84
 
85
85
  If your benchmark can be scored in multiple ways, a runner can accept multiple scorer implementations and choose between them. The examples in `packages/sdk-0.2/src/benchmarks/examples/` show what that looks like in code.
86
86
 
87
87
  ## What the SDK does vs what the runtime does
88
88
 
89
- It’s easy to accidentally push too much responsibility to the SDK and end up with a framework you can’t escape. It’s also easy to push too much responsibility to the runtime and end up with copy-pasted benchmark logic.
89
+ It's easy to accidentally push "too much responsibility" to the SDK and end up with a framework you can't escape. It's also easy to push "too much responsibility" to the runtime and end up with copy-pasted benchmark logic.
90
90
 
91
91
  This SDK tries to draw a clean line:
92
92
 
@@ -106,9 +106,9 @@ The runtime is responsible for:
106
106
 
107
107
  If you keep that boundary, benchmarks stay portable and runtimes stay free to evolve.
108
108
 
109
- ## If you’re implementing a benchmark
109
+ ## If you're implementing a benchmark
110
110
 
111
- The easiest way to think about implementing a benchmark is: you are implementing a small domain module that can be imported by multiple runtimes. That means your job is mostly about making your benchmark _self-contained and explicit_.
111
+ The easiest way to think about "implementing a benchmark" is: you are implementing a small domain module that can be imported by multiple runtimes. That means your job is mostly about making your benchmark _self-contained and explicit_.
112
112
 
113
113
  In practice, the benchmark implementer is responsible for:
114
114
 
@@ -121,11 +121,11 @@ Once those are in place, runtimes can focus on orchestration and product concern
121
121
 
122
122
  Peerbench does not assume your new benchmarks will be part of the SDK itself. The normal expectation is that your benchmark code lives in your runtime (or in its own package), and it uses `peerbench` as a dependency for schemas, base types, and contracts.
123
123
 
124
- Benchmarks can implement everything themselves, but they can also reuse the SDK’s predefined building blocks. If it is possible, it is recommended to stick with SDK base types (e.g `AbstractLLMProvider`) and implementations, because it increases compatibility with other tooling that speaks “Peerbench entities”.
124
+ Benchmarks can implement everything themselves, but they can also reuse the SDK's predefined building blocks. If it is possible, it is recommended to stick with SDK base types (e.g `AbstractProvider`, `CallableLLM`) and implementations, because it increases compatibility with other tooling that speaks "Peerbench entities".
125
125
 
126
126
  ## A benchmark, step by step
127
127
 
128
- A benchmark in this SDK is not a magical object. It is a small folder that exports a few well-known pieces. The simplest complete benchmark usually includes:
128
+ A "benchmark" in this SDK is not a magical object. It is a small folder that exports a few well-known pieces. The simplest complete benchmark usually includes:
129
129
 
130
130
  1. schemas (test case / response / score)
131
131
  2. a runner (how a single test case is executed)
@@ -137,7 +137,6 @@ You can see a compact, end-to-end reference in:
137
137
  - `packages/sdk-0.2/src/benchmarks/examples/echo-basic/`
138
138
  - `packages/sdk-0.2/src/benchmarks/examples/text-transform/`
139
139
  - `packages/sdk-0.2/src/benchmarks/examples/exact-match-scorer/`
140
- - `packages/sdk-0.2/src/benchmarks/examples/mcq-qa-templated/`
141
140
 
142
141
  ### 1) Schemas: the source of truth
143
142
 
@@ -149,9 +148,9 @@ In `packages/sdk-0.2/src/benchmarks/examples/echo-basic/schema-sets/echo.v1.ts`
149
148
  - define a response schema for that test case
150
149
  - define a score schema for that response
151
150
 
152
- The hierarchy starts from test case → response → score, and we keep the relationship by storing IDs (`testCaseId`, `responseId`). That relationship is “real data”, so the runtime is usually the one that persists it and queries it.
151
+ The hierarchy starts from test case → response → score, and we keep the relationship by storing IDs (`testCaseId`, `responseId`). That relationship is "real data", so the runtime is usually the one that persists it and queries it.
153
152
 
154
- Here is what defining a test case schema looks like in practice (trimmed to the idea):
153
+ Here is what "defining a test case schema" looks like in practice (trimmed to the idea):
155
154
 
156
155
  ```ts
157
156
  import { z } from "zod";
@@ -170,9 +169,22 @@ export const MyTestCaseSchemaV1 = defineTestCaseSchema({
170
169
 
171
170
  ### 2) Provider: how runners talk to models
172
171
 
173
- Runners communicate with models through a provider implementation. That’s how the same benchmark can run against different backends without rewriting the benchmark.
172
+ Runners communicate with models through a callable created by a provider. That's how the same benchmark can run against different backends without rewriting the benchmark.
173
+
174
+ A provider is an API client factory. You create a provider once (with API keys, rate limiters, etc.), then call its factory method to create callables — lightweight objects with a `forward()` method and the model baked in:
175
+
176
+ ```ts
177
+ import { OpenRouterProvider } from "peerbench/providers";
178
+
179
+ const provider = new OpenRouterProvider({ apiKey: "..." });
180
+ const target = provider.model({ model: "gpt-4o" });
181
+
182
+ // target.slug === "gpt-4o"
183
+ // target.provider.kind === "peerbench.ai/llm/openrouter.ai"
184
+ // target.forward({ messages }) — model is already captured
185
+ ```
174
186
 
175
- If you already have a service in front of your model, the REST API provider example shows the pattern: accept the SDK’s `messages + model` input, translate it to an HTTP request, and translate the HTTP response back into a single string. Nothing else is required.
187
+ If you already have a service in front of your model, the REST API provider example (`src/providers/example/restapi.ts`) shows the pattern: extend `AbstractProvider`, implement a factory method that returns a `CallableLLM`, and translate messages to HTTP requests inside the `forward()` arrow function.
176
188
 
177
189
  ### 3) Runner: run one test case
178
190
 
@@ -182,15 +194,15 @@ This is intentional. Running many test cases is orchestration, and orchestration
182
194
 
183
195
  In the example runners (e.g. `packages/sdk-0.2/src/benchmarks/examples/echo-basic/runner.ts`) you can see the responsibilities:
184
196
 
185
- - format a test case into provider-friendly input (for chat models, `messages[]`)
186
- - call `provider.forward(...)`
187
- - map provider output into a `Response` entity
197
+ - format a test case into callable-friendly input (`messages[]`)
198
+ - call `target.forward({ messages })`
199
+ - map the output into a `Response` entity
188
200
  - if a scorer is provided, turn scorer output into a `Score` entity
189
201
 
190
202
  Here is the idea in a minimal form:
191
203
 
192
204
  ```ts
193
- const providerResponse = await provider.forward({ model, messages });
205
+ const providerResponse = await target.forward({ messages });
194
206
 
195
207
  const response = ResponseSchemaV1.new({
196
208
  id: "runtime-generates-id",
@@ -198,8 +210,8 @@ const response = ResponseSchemaV1.new({
198
210
  data: providerResponse.data,
199
211
  startedAt: providerResponse.startedAt,
200
212
  completedAt: providerResponse.completedAt,
201
- modelSlug: model,
202
- provider: provider.kind,
213
+ modelSlug: target.slug,
214
+ provider: target.provider.kind,
203
215
  });
204
216
  ```
205
217
 
@@ -207,7 +219,7 @@ const response = ResponseSchemaV1.new({
207
219
 
208
220
  Some benchmarks are easy to score deterministically (string match, regex extraction, set coverage). Some benchmarks need semantic judgment. Some benchmarks want both.
209
221
 
210
- That’s why scorers are separate objects and why runners can accept more than one scorer implementation.
222
+ That's why scorers are separate objects and why runners can accept more than one scorer implementation.
211
223
 
212
224
  The examples show:
213
225
 
@@ -219,11 +231,11 @@ The examples show:
219
231
 
220
232
  ## Usage: run a single test case end-to-end
221
233
 
222
- First, define schemas and a runner (this is the portable benchmark code):
234
+ First, define schemas and a runner (this is the "portable benchmark code"):
223
235
 
224
236
  ```ts
225
237
  import { defineRunner, idGeneratorUUIDv7 } from "peerbench";
226
- import { AbstractLLMProvider } from "peerbench/providers";
238
+ import { CallableLLM } from "peerbench/providers";
227
239
  import {
228
240
  BaseResponseSchemaV1,
229
241
  BaseScoreSchemaV1,
@@ -232,7 +244,7 @@ import {
232
244
  defineScoreSchema,
233
245
  defineTestCaseSchema,
234
246
  } from "peerbench/schemas";
235
- import { ResponseExtensions } from "peerbench/schemas/extensions";
247
+ import { ExtensionLLMResponseFieldsV1 } from "peerbench/schemas/extensions";
236
248
  import z from "zod";
237
249
 
238
250
  const Namespace = "example.peerbench.ai" as const;
@@ -251,7 +263,7 @@ const ResponseSchemaV1 = defineResponseSchema({
251
263
  namespace: Namespace,
252
264
  kind: Kind,
253
265
  schemaVersion: 1,
254
- fields: { ...ResponseExtensions.ExtensionLLMResponseFieldsV1 },
266
+ fields: { ...ExtensionLLMResponseFieldsV1 },
255
267
  });
256
268
 
257
269
  const ScoreSchemaV1 = defineScoreSchema({
@@ -262,22 +274,19 @@ const ScoreSchemaV1 = defineScoreSchema({
262
274
  fields: {},
263
275
  });
264
276
 
277
+ type TestCaseV1 = z.infer<typeof TestCaseSchemaV1.schema>;
278
+
265
279
  export const runner = defineRunner(
266
- {
267
- schemaSets: [
268
- {
269
- testCase: TestCaseSchemaV1,
270
- response: ResponseSchemaV1,
271
- score: ScoreSchemaV1,
272
- },
273
- ],
274
- providers: [AbstractLLMProvider],
275
- scorers: [],
276
- runConfigSchema: { model: z.string() },
277
- },
278
- async ({ testCase, provider, runConfig, idGenerators }) => {
279
- const providerResponse = await provider.forward({
280
- model: runConfig.model,
280
+ async (params: {
281
+ testCase: TestCaseV1;
282
+ target: CallableLLM;
283
+ idGenerators?: {
284
+ response?: (input: unknown) => string;
285
+ };
286
+ }) => {
287
+ const { testCase, target } = params;
288
+
289
+ const providerResponse = await target.forward({
281
290
  messages: [{ role: "user", content: `Echo:\n${testCase.input}` }],
282
291
  });
283
292
 
@@ -287,14 +296,14 @@ export const runner = defineRunner(
287
296
  startedAt: providerResponse.startedAt,
288
297
  completedAt: providerResponse.completedAt,
289
298
  testCaseId: testCase.id,
290
- modelSlug: runConfig.model,
291
- provider: provider.kind,
299
+ modelSlug: target.slug,
300
+ provider: target.provider.kind,
292
301
  inputTokensUsed: providerResponse.inputTokensUsed,
293
302
  outputTokensUsed: providerResponse.outputTokensUsed,
294
303
  inputCost: providerResponse.inputCost,
295
304
  outputCost: providerResponse.outputCost,
296
305
  },
297
- idGenerators?.response ?? idGeneratorUUIDv7
306
+ params.idGenerators?.response ?? idGeneratorUUIDv7
298
307
  );
299
308
 
300
309
  return { response };
@@ -304,13 +313,18 @@ export const runner = defineRunner(
304
313
 
305
314
  ## Usage: what the runtime adds (orchestration)
306
315
 
307
- Once you have a runner, the runtime’s job is mostly about repetition and persistence.
316
+ Once you have a runner, the runtime's job is mostly about repetition and persistence.
308
317
 
309
318
  For example, a very small orchestrator might do:
310
319
 
311
320
  ```ts
321
+ import { OpenRouterProvider } from "peerbench/providers";
322
+
323
+ const provider = new OpenRouterProvider({ apiKey: "..." });
324
+ const target = provider.model({ model: "gpt-4o" });
325
+
312
326
  for (const testCase of testCases) {
313
- const result = await runner({ testCase, provider, runConfig });
327
+ const result = await runner({ testCase, target });
314
328
  // store `result.response` and `result.score` somewhere durable
315
329
  // decide how to handle errors, retries, progress, and budgets
316
330
  }
@@ -325,9 +339,8 @@ The examples under `packages/sdk-0.2/src/benchmarks/examples/` each teach one id
325
339
  - `echo-basic`: minimal schema set + runner + storage examples
326
340
  - `text-transform`: one runner supports multiple kinds + deterministic scoring
327
341
  - `exact-match-scorer`: scorer dispatch pattern (algo scorer vs LLM judge scorer)
328
- - `mcq-qa-templated`: template variables + MCQ/QA tasks
329
342
 
330
343
  ## Design notes
331
344
 
332
- - Schemas are runtime-validated (Zod) so type-only drift doesn’t silently corrupt stored data.
345
+ - Schemas are runtime-validated (Zod) so "type-only drift" doesn't silently corrupt stored data.
333
346
  - Runners are per-test-case so they stay small and portable; runtimes keep orchestration control.
package/dist/benchmarks/examples/echo-basic/runner.d.ts CHANGED
@@ -1,35 +1,19 @@
1
- import { AbstractLLMProvider } from "../../../providers/index.js";
2
- import { z } from "zod";
3
- /**
4
- * Runners are the backbone of a benchmark. They are responsible for executing the test cases and producing
5
- * the responses and scores. As the benchmark builder, you define what schemas the runner can work with,
6
- * what are the providers and scorers are supported and what configurations can be passed by the caller
7
- * at the execution phase.
8
- */
9
- export declare const echoBasicRunner: ((params: {
10
- testCase: {
11
- id: string;
12
- input: string;
13
- namespace: "example.peerbench.ai";
14
- kind: "llm/echo-basic.tc";
15
- schemaVersion: 1;
16
- metadata?: Record<string, unknown> | undefined;
17
- };
18
- provider: AbstractLLMProvider;
19
- scorer?: undefined;
20
- runConfig: {
21
- model: string;
22
- };
1
+ import { CallableLLM } from "../../../providers/index.js";
2
+ import { IdGenerator } from "../../../types";
3
+ import { EchoBasicTestCaseV1 } from "./schema-sets/echo.v1";
4
+ export declare const echoBasicRunner: (params: {
5
+ testCase: EchoBasicTestCaseV1;
6
+ target: CallableLLM;
23
7
  idGenerators?: {
24
- response?: import("../../../index.js").IdGenerator;
25
- score?: import("../../../index.js").IdGenerator;
8
+ response?: IdGenerator;
9
+ score?: IdGenerator;
26
10
  };
27
11
  }) => Promise<{
28
12
  response: {
29
- startedAt: number;
30
- completedAt: number;
31
13
  id: string;
32
14
  testCaseId: string;
15
+ startedAt: number;
16
+ completedAt: number;
33
17
  data: string;
34
18
  modelSlug: string;
35
19
  provider: string;
@@ -43,231 +27,4 @@ export declare const echoBasicRunner: ((params: {
43
27
  inputCost?: string | undefined;
44
28
  outputCost?: string | undefined;
45
29
  };
46
- score?: {
47
- id: string;
48
- value: number;
49
- responseId: string;
50
- scoringMethod: "ai" | "human" | "algo";
51
- namespace: "example.peerbench.ai";
52
- kind: "llm/echo-basic.sc";
53
- schemaVersion: 1;
54
- metadata?: Record<string, unknown> | undefined;
55
- explanation?: string | undefined;
56
- } | undefined;
57
- }>) & {
58
- config: {
59
- runConfigSchema: z.ZodObject<{
60
- model: z.ZodString;
61
- }, z.core.$strip>;
62
- schemaSets: [{
63
- readonly testCase: z.ZodObject<Omit<{
64
- id: z.ZodString;
65
- namespace: z.ZodString;
66
- schemaVersion: z.ZodNumber;
67
- kind: z.ZodString;
68
- metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
69
- }, "kind" | "namespace" | "schemaVersion"> & {
70
- input: z.ZodString;
71
- } & {
72
- namespace: z.ZodLiteral<"example.peerbench.ai">;
73
- kind: z.ZodLiteral<"llm/echo-basic.tc">;
74
- schemaVersion: z.ZodLiteral<1>;
75
- }, z.core.$strip> & {
76
- new: (input: Omit<{
77
- id: string;
78
- input: string;
79
- namespace: "example.peerbench.ai";
80
- kind: "llm/echo-basic.tc";
81
- schemaVersion: 1;
82
- metadata?: Record<string, unknown> | undefined;
83
- }, "kind" | "namespace" | "schemaVersion">) => {
84
- id: string;
85
- input: string;
86
- namespace: "example.peerbench.ai";
87
- kind: "llm/echo-basic.tc";
88
- schemaVersion: 1;
89
- metadata?: Record<string, unknown> | undefined;
90
- };
91
- newWithId(input: Omit<{
92
- id: string;
93
- input: string;
94
- namespace: "example.peerbench.ai";
95
- kind: "llm/echo-basic.tc";
96
- schemaVersion: 1;
97
- metadata?: Record<string, unknown> | undefined;
98
- }, "kind" | "id" | "namespace" | "schemaVersion">, generator: import("../../../index.js").IdGenerator): Promise<{
99
- id: string;
100
- input: string;
101
- namespace: "example.peerbench.ai";
102
- kind: "llm/echo-basic.tc";
103
- schemaVersion: 1;
104
- metadata?: Record<string, unknown> | undefined;
105
- }>;
106
- };
107
- readonly response: z.ZodObject<Omit<{
108
- id: z.ZodString;
109
- namespace: z.ZodString;
110
- schemaVersion: z.ZodNumber;
111
- kind: z.ZodString;
112
- startedAt: z.ZodNumber;
113
- completedAt: z.ZodNumber;
114
- testCaseId: z.ZodString;
115
- metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
116
- }, "kind" | "namespace" | "schemaVersion"> & {
117
- data: z.ZodString;
118
- modelSlug: z.ZodString;
119
- provider: z.ZodString;
120
- systemPromptId: z.ZodOptional<z.ZodString>;
121
- inputTokensUsed: z.ZodOptional<z.ZodNumber>;
122
- outputTokensUsed: z.ZodOptional<z.ZodNumber>;
123
- inputCost: z.ZodOptional<z.ZodString>;
124
- outputCost: z.ZodOptional<z.ZodString>;
125
- } & {
126
- namespace: z.ZodLiteral<"example.peerbench.ai">;
127
- kind: z.ZodLiteral<"llm/echo-basic.rs">;
128
- schemaVersion: z.ZodLiteral<1>;
129
- }, z.core.$strip> & {
130
- new: (input: Omit<{
131
- startedAt: number;
132
- completedAt: number;
133
- id: string;
134
- testCaseId: string;
135
- data: string;
136
- modelSlug: string;
137
- provider: string;
138
- namespace: "example.peerbench.ai";
139
- kind: "llm/echo-basic.rs";
140
- schemaVersion: 1;
141
- metadata?: Record<string, unknown> | undefined;
142
- systemPromptId?: string | undefined;
143
- inputTokensUsed?: number | undefined;
144
- outputTokensUsed?: number | undefined;
145
- inputCost?: string | undefined;
146
- outputCost?: string | undefined;
147
- }, "kind" | "namespace" | "schemaVersion">) => {
148
- startedAt: number;
149
- completedAt: number;
150
- id: string;
151
- testCaseId: string;
152
- data: string;
153
- modelSlug: string;
154
- provider: string;
155
- namespace: "example.peerbench.ai";
156
- kind: "llm/echo-basic.rs";
157
- schemaVersion: 1;
158
- metadata?: Record<string, unknown> | undefined;
159
- systemPromptId?: string | undefined;
160
- inputTokensUsed?: number | undefined;
161
- outputTokensUsed?: number | undefined;
162
- inputCost?: string | undefined;
163
- outputCost?: string | undefined;
164
- };
165
- newWithId(input: Omit<{
166
- startedAt: number;
167
- completedAt: number;
168
- id: string;
169
- testCaseId: string;
170
- data: string;
171
- modelSlug: string;
172
- provider: string;
173
- namespace: "example.peerbench.ai";
174
- kind: "llm/echo-basic.rs";
175
- schemaVersion: 1;
176
- metadata?: Record<string, unknown> | undefined;
177
- systemPromptId?: string | undefined;
178
- inputTokensUsed?: number | undefined;
179
- outputTokensUsed?: number | undefined;
180
- inputCost?: string | undefined;
181
- outputCost?: string | undefined;
182
- }, "kind" | "id" | "namespace" | "schemaVersion">, generator: import("../../../index.js").IdGenerator): Promise<{
183
- startedAt: number;
184
- completedAt: number;
185
- id: string;
186
- testCaseId: string;
187
- data: string;
188
- modelSlug: string;
189
- provider: string;
190
- namespace: "example.peerbench.ai";
191
- kind: "llm/echo-basic.rs";
192
- schemaVersion: 1;
193
- metadata?: Record<string, unknown> | undefined;
194
- systemPromptId?: string | undefined;
195
- inputTokensUsed?: number | undefined;
196
- outputTokensUsed?: number | undefined;
197
- inputCost?: string | undefined;
198
- outputCost?: string | undefined;
199
- }>;
200
- };
201
- readonly score: z.ZodObject<Omit<{
202
- id: z.ZodString;
203
- namespace: z.ZodString;
204
- kind: z.ZodString;
205
- schemaVersion: z.ZodNumber;
206
- value: z.ZodNumber;
207
- responseId: z.ZodString;
208
- explanation: z.ZodOptional<z.ZodString>;
209
- metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
210
- scoringMethod: z.ZodEnum<{
211
- readonly ai: "ai";
212
- readonly human: "human";
213
- readonly algo: "algo";
214
- }>;
215
- }, "kind" | "namespace" | "schemaVersion"> & {
216
- namespace: z.ZodLiteral<"example.peerbench.ai">;
217
- kind: z.ZodLiteral<"llm/echo-basic.sc">;
218
- schemaVersion: z.ZodLiteral<1>;
219
- }, z.core.$strip> & {
220
- new: (input: Omit<{
221
- id: string;
222
- value: number;
223
- responseId: string;
224
- scoringMethod: "ai" | "human" | "algo";
225
- namespace: "example.peerbench.ai";
226
- kind: "llm/echo-basic.sc";
227
- schemaVersion: 1;
228
- metadata?: Record<string, unknown> | undefined;
229
- explanation?: string | undefined;
230
- }, "kind" | "namespace" | "schemaVersion">) => {
231
- id: string;
232
- value: number;
233
- responseId: string;
234
- scoringMethod: "ai" | "human" | "algo";
235
- namespace: "example.peerbench.ai";
236
- kind: "llm/echo-basic.sc";
237
- schemaVersion: 1;
238
- metadata?: Record<string, unknown> | undefined;
239
- explanation?: string | undefined;
240
- };
241
- newWithId(input: Omit<{
242
- id: string;
243
- value: number;
244
- responseId: string;
245
- scoringMethod: "ai" | "human" | "algo";
246
- namespace: "example.peerbench.ai";
247
- kind: "llm/echo-basic.sc";
248
- schemaVersion: 1;
249
- metadata?: Record<string, unknown> | undefined;
250
- explanation?: string | undefined;
251
- }, "kind" | "id" | "namespace" | "schemaVersion">, generator: import("../../../index.js").IdGenerator): Promise<{
252
- id: string;
253
- value: number;
254
- responseId: string;
255
- scoringMethod: "ai" | "human" | "algo";
256
- namespace: "example.peerbench.ai";
257
- kind: "llm/echo-basic.sc";
258
- schemaVersion: 1;
259
- metadata?: Record<string, unknown> | undefined;
260
- explanation?: string | undefined;
261
- }>;
262
- };
263
- }];
264
- providers: [typeof AbstractLLMProvider];
265
- scorers: [];
266
- parseRunConfig?: boolean;
267
- defaults?: {
268
- scorer?: undefined;
269
- responseIdGenerator?: import("../../../index.js").IdGenerator;
270
- scoreIdGenerator?: import("../../../index.js").IdGenerator;
271
- } | undefined;
272
- };
273
- };
30
+ }>;