peerbench 0.0.2-alpha.0 → 0.0.2-alpha.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. package/README.md +123 -99
  2. package/dist/aggregators/index.d.ts +67 -0
  3. package/dist/aggregators/index.js +46 -0
  4. package/dist/aggregators/index.js.map +1 -0
  5. package/dist/benchmarks/index.d.ts +615 -1271
  6. package/dist/benchmarks/index.js +358 -805
  7. package/dist/benchmarks/index.js.map +1 -1
  8. package/dist/{chunk-DUBKY73H.js → chunk-4UBK6452.js} +13 -13
  9. package/dist/chunk-4UBK6452.js.map +1 -0
  10. package/dist/chunk-ERALDEZY.js +112 -0
  11. package/dist/chunk-ERALDEZY.js.map +1 -0
  12. package/dist/{chunk-ZJWSK4VO.js → chunk-HMQYGCKI.js} +1 -1
  13. package/dist/chunk-HMQYGCKI.js.map +1 -0
  14. package/dist/chunk-NUEOE3K5.js +8 -0
  15. package/dist/chunk-NUEOE3K5.js.map +1 -0
  16. package/dist/chunk-OQE6TQXZ.js +42 -0
  17. package/dist/chunk-OQE6TQXZ.js.map +1 -0
  18. package/dist/chunk-QY5MPNNB.js +28 -0
  19. package/dist/chunk-QY5MPNNB.js.map +1 -0
  20. package/dist/chunk-R76XA2K6.js +229 -0
  21. package/dist/chunk-R76XA2K6.js.map +1 -0
  22. package/dist/chunk-TRNCF2BG.js +35 -0
  23. package/dist/chunk-TRNCF2BG.js.map +1 -0
  24. package/dist/chunk-UHHHSYVE.js +11 -0
  25. package/dist/chunk-UHHHSYVE.js.map +1 -0
  26. package/dist/{chunk-232PY7K3.js → chunk-YY33MNMV.js} +29 -14
  27. package/dist/chunk-YY33MNMV.js.map +1 -0
  28. package/dist/chunk-ZEWI24CV.js +365 -0
  29. package/dist/chunk-ZEWI24CV.js.map +1 -0
  30. package/dist/chunk-ZXTQJFGL.js +44 -0
  31. package/dist/chunk-ZXTQJFGL.js.map +1 -0
  32. package/dist/index-BAioQhp2.d.ts +27 -0
  33. package/dist/index.d.ts +51 -26
  34. package/dist/index.js +28 -25
  35. package/dist/index.js.map +1 -1
  36. package/dist/json-file-ZwzLUbje.d.ts +73 -0
  37. package/dist/llm-judge-QThCZ9TQ.d.ts +67 -0
  38. package/dist/providers/index.d.ts +16 -19
  39. package/dist/providers/index.js +8 -253
  40. package/dist/providers/index.js.map +1 -1
  41. package/dist/schemas/extensions/index.d.ts +16 -2
  42. package/dist/schemas/extensions/index.js +9 -3
  43. package/dist/schemas/extensions/index.js.map +1 -1
  44. package/dist/schemas/index.d.ts +108 -141
  45. package/dist/schemas/index.js +7 -10
  46. package/dist/schemas/llm/index.d.ts +100 -82
  47. package/dist/schemas/llm/index.js +7 -29
  48. package/dist/schemas/llm/index.js.map +1 -1
  49. package/dist/scorers/index.d.ts +3 -2
  50. package/dist/scorers/index.js +8 -486
  51. package/dist/scorers/index.js.map +1 -1
  52. package/dist/storages/index.d.ts +69 -0
  53. package/dist/storages/index.js +98 -0
  54. package/dist/storages/index.js.map +1 -0
  55. package/package.json +12 -6
  56. package/dist/catalogs/index.d.ts +0 -75
  57. package/dist/catalogs/index.js +0 -88
  58. package/dist/catalogs/index.js.map +0 -1
  59. package/dist/chunk-22HU24QF.js +0 -8
  60. package/dist/chunk-22HU24QF.js.map +0 -1
  61. package/dist/chunk-232PY7K3.js.map +0 -1
  62. package/dist/chunk-7TREBPSJ.js +0 -26
  63. package/dist/chunk-7TREBPSJ.js.map +0 -1
  64. package/dist/chunk-DUBKY73H.js.map +0 -1
  65. package/dist/chunk-GVF4YZF3.js +0 -15
  66. package/dist/chunk-GVF4YZF3.js.map +0 -1
  67. package/dist/chunk-HJH3SW3L.js +0 -103
  68. package/dist/chunk-HJH3SW3L.js.map +0 -1
  69. package/dist/chunk-IUN2IUCS.js +0 -58
  70. package/dist/chunk-IUN2IUCS.js.map +0 -1
  71. package/dist/chunk-VBOM2YEG.js +0 -47
  72. package/dist/chunk-VBOM2YEG.js.map +0 -1
  73. package/dist/chunk-ZJWSK4VO.js.map +0 -1
  74. package/dist/data-BmN5WjZ4.d.ts +0 -57
  75. package/dist/generic-array-DLHWSvf1.d.ts +0 -22
  76. package/dist/index-WiPjF2AL.d.ts +0 -15
  77. package/dist/llm-judge-DIG1f1Az.d.ts +0 -67
  78. package/dist/simple-system-prompt-CzPYuvo0.d.ts +0 -49
  79. package/dist/system-prompt--0FdPWqK.d.ts +0 -58
  80. package/dist/utilities-BrRH32rD.d.ts +0 -30
package/README.md CHANGED
@@ -1,12 +1,11 @@
1
1
  # `peerbench` SDK
2
2
 
3
- This package is the shared “domain core” for _building benchmarks_ in a standardized, portable way. It gives you a consistent set of _persistable entities_ (schemas + types), and a consistent set of _runtime contracts_ (loaders, runners, scorers, providers) so the same benchmark can run in a CLI, a web app, a worker, or anything else.
3
+ This package is the shared “domain core” for _building benchmarks_ in a standardized, portable way. It gives you a consistent set of _persistable entities_ (schemas + types), and a consistent set of _runtime contracts_ (runners, scorers, providers, storages, aggregators) so the same benchmark can run in a CLI, a web app, a worker, or anything else.
4
4
 
5
- > _Runtime_ refers to the codebase (a CLI, a webapp, a background service etc.) that uses the SDK.
5
+ If you’re implementing a new benchmark, the SDK is the part that keeps it portable instead of glued to one runtime. If you’re integrating peerbench SDK into a runtime, the SDK is the part you don’t want to rewrite in every repo.
6
6
 
7
- If you’re implementing a new benchmark, the SDK is the part that keeps it portable instead of glued to one runtime. If you’re integrating Peerbench into a runtime, the SDK is the part you don’t want to rewrite in every repo.
8
-
9
- > This package does not support CommonJS
7
+ > - _Runtime_ refers to the codebase that uses peerbench SDK (a CLI, a webapp, a background service etc.)
8
+ > - This package does not support CommonJS
10
9
 
11
10
  ## What is a benchmark?
12
11
 
@@ -18,39 +17,43 @@ If you look at widely-used benchmarks, the pattern is always the same even when
18
17
  - In BIG-bench style task suites, you have many different task types and you want a consistent way to run and score them.
19
18
  - In HELM-style evaluations, you care about not only “did it answer correctly”, but also how you ran it (prompting setup, constraints, metadata) and how you report results.
20
19
 
21
- Those benchmarks differ in details, but they all boil down to the same building blocks: a dataset of test cases, a way to run a system on each test case, and a way to score the output. The Peerbench SDK is designed so these patterns can be represented with the same portable shape.
20
+ Those benchmarks differ in details, but they all boil down to the same building blocks: a dataset of test cases, a way to run a system on each test case, and a way to score the output. The peerbench SDK is designed so these patterns can be represented with the same portable shape.
22
21
 
23
22
  ## The mental model
24
23
 
25
- Now that we agree on what a benchmark is, we can talk about how Peerbench represents it.
24
+ Now that we agree on what a benchmark is, we can talk about how peerbench represents it.
26
25
 
27
- Peerbench is deliberately boring here. It doesn’t try to invent a new “benchmark framework”. It gives you a small set of building blocks that you can compose. If you understand these pieces, you can read any benchmark implementation and know where to look.
26
+ peerbench is deliberately boring here. It doesn’t try to invent a new “benchmark framework”. It gives you a small set of building blocks that you can compose. If you understand these pieces, you can read any benchmark implementation and know where to look.
28
27
 
29
28
  ### Entities (the things you store)
30
29
 
31
- When you run an evaluation, you end up with data that you want to store, query, re-score, and share. Peerbench standardizes that output by modeling it as a small set of entities.
30
+ When you run an evaluation, you end up with data that you want to store, query, re-score, and share. peerbench standardizes that output by modeling it as a small set of entities.
32
31
 
33
- This SDK assumes four core entities:
32
+ This SDK assumes three core entities:
34
33
 
35
- - `BenchmarkSpec`: optional benchmark-level configuration (think: “applies to the whole dataset/run”).
36
34
  - `TestCase`: a single input/task.
37
35
  - `Response`: the model output for a specific test case (`testCaseId` points to `TestCase.id`).
38
- - `Score`: an evaluation result for a specific response (`responseId` points to `Response.id`).
36
+ - `Score` (optional): an evaluation result for a specific response (`responseId` points to `Response.id`).
39
37
 
40
38
  Everything else in the SDK exists to create these entities in a predictable way.
41
39
 
42
- Two fields show up everywhere:
40
+ Three fields show up everywhere:
43
41
 
44
42
  - `kind` tells you _what type_ of entity something is. It is a stable string you pick (descriptive).
45
43
  - `schemaVersion` tells you _which version_ of that entity shape you’re looking at.
44
+ - `namespace` tells you which “owner” defines that kind (e.g. peerbench.ai).
45
+
46
+ This is why peerbench leans on [Zod](https://zod.dev) schemas: it keeps the persisted data contract explicit and runtime-validated.
46
47
 
47
- This is why Peerbench leans on [Zod](https://zod.dev) schemas: it keeps the persisted data contract explicit and runtime-validated.
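To make those three fields concrete, here is an editor's sketch of what a persisted `TestCase` entity could look like for the echo example defined later in this README. It is illustrative only: the `id` value is made up, and any base fields beyond `id`, `namespace`, `kind`, and `schemaVersion` are assumptions; the actual stored shape is whatever the Zod schema validates.

```ts
// Illustrative only (not copied from the package). `namespace`, `kind`, and
// `schemaVersion` identify the entity shape; `input` is the benchmark-specific
// field from the echo example below; the id is a made-up UUID (the SDK exports
// an idGeneratorUUIDv7 helper for generating ids).
const storedTestCase = {
  id: "0193a1c2-5b7e-7d10-8f3a-2f4c9e6b1a00",
  namespace: "example.peerbench.ai",
  kind: "llm/echo-basic",
  schemaVersion: 1,
  input: "hello",
};
```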
48
+ ### Storage (how entities are persisted)
48
49
 
49
- ### Loader (how raw data becomes test cases)
50
+ The SDK does not prescribe how you ingest datasets. Runtimes often load test cases from JSON/JSONL, DB rows, Parquet, or an API. Storages are the abstraction that standardizes how you load data from a source into memory.
50
51
 
51
- In real projects, test cases live in many places: JSON files, JSONL streams, a database, Parquet, an API, etc.
52
+ The peerbench SDK provides some pre-defined storage implementations you can use out of the box:
52
53
 
53
- A loader is the piece that reads that raw data and returns `TestCase[]` (and optionally existing `Response[]` / `Score[]`). The important point is not the file format. The important point is that the loader is where your “raw input → Peerbench entities” mapping lives.
54
+ - file-based storage with custom codecs (`FileStorage`)
55
+ - JSON array files (`JSONFileStorage`)
56
+ - SQLite storage (codec-based)
54
57
 
55
58
  ### Provider (how you talk to a model)
56
59
 
@@ -65,13 +68,13 @@ If you already have your own service in front of the model, you can still model
65
68
 
66
69
  ### Runner (how you execute one test case)
67
70
 
68
- A runner is the execution part of a benchmark. A runner function takes whatever inputs it needs, calls a provider, and produces a `Response`. It may also produce a `Score` (directly, or via a scorer).
71
+ A runner is the execution part of a benchmark. A runner function takes whatever inputs it needs, calls a provider, and produces a `Response`. It may also produce a `Score` (via a scorer).
69
72
 
70
73
  Runners are intended to be “per test case” because it keeps the benchmark logic small and easy to compose. Running a whole dataset is orchestration, and orchestration is where runtimes differ (parallelism, retries, persistence, budgets, progress UI).
71
74
 
72
75
  There is no restriction that a benchmark must have exactly one runner. You can export multiple runner functions (different modes, different prompts, different providers, different scoring strategies). The runtime just needs to pick the runner it wants to use.
73
76
 
74
- One practical convention you will see in the examples is `runConfig`. It’s runner-specific, and it’s usually kept as a simple JSON-serializable object so you can store it alongside your run and reproduce it later. This is a best practice, not a hard restriction: if something doesn’t belong in `runConfig`, you can pass it as a normal parameter next to it.
77
+ One practical convention you will see in the examples is `runConfig`. It’s runner-specific, and it’s recommended to keep it as a simple JSON-serializable object so you can store it alongside your run and reproduce it later.
75
78
 
76
79
  ### Scorer (how you judge a response)
77
80
 
@@ -79,7 +82,7 @@ A scorer produces a numeric result. Some scorers are deterministic (same input
79
82
 
80
83
  A scorer takes what it needs. Sometimes it’s “expected + actual strings”. Sometimes it’s “a list of required fields + a JSON output”. The runner decides what to pass into the scorer, because the runner is the piece that knows how the benchmark is structured.
81
84
 
82
- If your benchmark can be scored in multiple ways, a runner can accept multiple scorer implementations and choose between them based on `scorer.kind`. The examples in `packages/sdk-0.2/src/benchmarks/example/` show what that looks like in code.
85
+ If your benchmark can be scored in multiple ways, a runner can accept multiple scorer implementations and choose between them. The examples in `packages/sdk-0.2/src/benchmarks/examples/` show what that looks like in code.
83
86
 
84
87
  ## What the SDK does vs what the runtime does
85
88
 
@@ -87,14 +90,15 @@ It’s easy to accidentally push “too much responsibility” to the SDK and en
87
90
 
88
91
  This SDK tries to draw a clean line:
89
92
 
90
- The SDK is responsible for:
93
+ It is responsible for:
91
94
 
92
95
  - defining and validating entity shapes (Zod schemas are the source of truth)
93
- - providing base contracts and reusable building blocks (schemas + loaders + runners + scorers)
96
+ - providing base contracts and reusable building blocks (schemas + runners + scorers + storages + aggregators)
94
97
  - defining provider/scorer contracts so you can swap backends without rewriting benchmarks
95
98
 
96
99
  The runtime is responsible for:
97
100
 
101
+ - sourcing test cases (JSON/DB/Parquet/API/etc.) and mapping them into `TestCase` entities
98
102
  - orchestration across many test cases (parallelism, retries, persistence, resuming, progress UI)
99
103
  - deciding how/where entities are stored (DB schema, file layout, caching)
100
104
  - secrets and private content (API keys, redacted prompts, access control)
@@ -110,7 +114,6 @@ In practice, the benchmark implementer is responsible for:
110
114
 
111
115
  - choosing stable `kind` strings (namespaced, descriptive) and bumping `schemaVersion` on breaking changes
112
116
  - defining the schemas that are safe to store and share (and keeping secrets out of them)
113
- - deciding how raw datasets map into `TestCase` entities (loader)
114
117
  - deciding how a test case is executed (runner) and how it becomes a `Response`
115
118
  - deciding how scoring works (inline in runner, a separate scorer, or multiple scorers)
116
119
 
@@ -125,19 +128,22 @@ Benchmarks can implement everything themselves, but they can also reuse the SDK
125
128
  A “benchmark” in this SDK is not a magical object. It is a small folder that exports a few well-known pieces. The simplest complete benchmark usually includes:
126
129
 
127
130
  1. schemas (test case / response / score)
128
- 2. a loader (how test cases are read from disk/DB/etc.)
129
- 3. a runner (how a single test case is executed)
130
- 4. one or more scorers (optional)
131
+ 2. a runner (how a single test case is executed)
132
+ 3. one or more scorers (if the SDK-provided scorers do not fit your benchmark)
133
+ 4. (optional) one or more storages (how entities are persisted)
131
134
 
132
135
  You can see a compact, end-to-end reference in:
133
136
 
134
- - `packages/sdk-0.2/src/benchmarks/example/basic/`
137
+ - `packages/sdk-0.2/src/benchmarks/examples/echo-basic/`
138
+ - `packages/sdk-0.2/src/benchmarks/examples/text-transform/`
139
+ - `packages/sdk-0.2/src/benchmarks/examples/exact-match-scorer/`
140
+ - `packages/sdk-0.2/src/benchmarks/examples/mcq-qa-templated/`
135
141
 
136
142
  ### 1) Schemas: the source of truth
137
143
 
138
144
  Schemas are the core of a benchmark. They are the entities that hold the data.
139
145
 
140
- In `packages/sdk-0.2/src/benchmarks/example/basic/test-cases/echo.v1.ts` you can see the pattern:
146
+ In `packages/sdk-0.2/src/benchmarks/examples/echo-basic/schema-sets/echo.v1.ts` you can see the pattern:
141
147
 
142
148
  - define a test case schema (`kind` + `schemaVersion` + benchmark fields)
143
149
  - define a response schema for that test case
@@ -153,7 +159,8 @@ import { BaseTestCaseSchemaV1, defineTestCaseSchema } from "peerbench/schemas";
153
159
 
154
160
  export const MyTestCaseSchemaV1 = defineTestCaseSchema({
155
161
  baseSchema: BaseTestCaseSchemaV1,
156
- kind: "mybench.ts.someTask",
162
+ namespace: "example.peerbench.ai",
163
+ kind: "llm/my-benchmark",
157
164
  schemaVersion: 1,
158
165
  fields: {
159
166
  prompt: z.string(),
@@ -161,38 +168,19 @@ export const MyTestCaseSchemaV1 = defineTestCaseSchema({
161
168
  });
162
169
  ```
163
170
 
164
- ### 2) Loader: how test cases become entities
165
-
166
- A loader reads external data and returns in-memory entities:
167
-
168
- ```ts
169
- type LoaderResult<TTestCase> = {
170
- testCases: TTestCase[];
171
- responses: [];
172
- scores: [];
173
- };
174
- ```
175
-
176
- In the basic example (`packages/sdk-0.2/src/benchmarks/example/basic/loader.ts`) the loader reads a JSON array and maps it into `TestCase` entities.
177
-
178
- ### 3) Provider: how runners talk to models
171
+ ### 2) Provider: how runners talk to models
179
172
 
180
173
  Runners communicate with models through a provider implementation. That’s how the same benchmark can run against different backends without rewriting the benchmark.
181
174
 
182
- There are also example providers meant to be read as reference implementations:
183
-
184
- - `packages/sdk-0.2/src/providers/example/echo.ts` (no network calls; returns deterministic content)
185
- - `packages/sdk-0.2/src/providers/example/restapi.ts` (calls your own REST “agent service”)
186
-
187
175
  If you already have a service in front of your model, the REST API provider example shows the pattern: accept the SDK’s `messages + model` input, translate it to an HTTP request, and translate the HTTP response back into a single string. Nothing else is required.
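As an editor's illustration of that translation (not the package's actual provider API), the sketch below fronts a hypothetical REST endpoint. The `forward({ model, messages })` input and the returned fields mirror how the echo-basic runner uses a provider later in this README; the endpoint, payload shape, timestamp types, and the idea that a plain object is enough rather than subclassing the SDK's abstract provider are all assumptions.

```ts
// Editor's sketch: a provider-like object fronting your own REST "agent service".
// It only demonstrates the "messages + model in, single string out" translation.
type Message = { role: "system" | "user" | "assistant"; content: string };

export const restAgentProvider = {
  kind: "example.rest-agent",
  async forward(input: { model: string; messages: Message[] }) {
    const startedAt = new Date();
    // Translate the SDK-shaped input into your service's HTTP request.
    const res = await fetch("https://agent.internal.example/v1/chat", {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify(input),
    });
    const body = (await res.json()) as { output: string };
    // Translate the HTTP response back into a single string plus metadata.
    return {
      data: body.output,
      startedAt,
      completedAt: new Date(),
      inputTokensUsed: undefined, // fill in if your service reports usage/cost
      outputTokensUsed: undefined,
      inputCost: undefined,
      outputCost: undefined,
    };
  },
};
```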
188
176
 
189
- ### 4) Runner: run one test case
177
+ ### 3) Runner: run one test case
190
178
 
191
179
  A runner function typically executes one test case and returns `{ response, score? }`.
192
180
 
193
181
  This is intentional. Running many test cases is orchestration, and orchestration is where runtimes differ the most (parallelism, retries, persistence, resuming, UI, cost limits). The runner is the small, portable unit.
194
182
 
195
- In the basic example runner (`packages/sdk-0.2/src/benchmarks/example/basic/runner.ts`) you can see the responsibilities:
183
+ In the example runners (e.g. `packages/sdk-0.2/src/benchmarks/examples/echo-basic/runner.ts`) you can see the responsibilities:
196
184
 
197
185
  - format a test case into provider-friendly input (for chat models, `messages[]`)
198
186
  - call `provider.forward(...)`
@@ -215,7 +203,7 @@ const response = ResponseSchemaV1.new({
215
203
  });
216
204
  ```
217
205
 
218
- ### 5) Scorers: optional, but powerful
206
+ ### 4) Scorers
219
207
 
220
208
  Some benchmarks are easy to score deterministically (string match, regex extraction, set coverage). Some benchmarks need semantic judgment. Some benchmarks want both.
221
209
 
@@ -223,70 +211,106 @@ That’s why scorers are separate objects and why runners can accept more than o
223
211
 
224
212
  The examples show:
225
213
 
226
- - a deterministic scorer (`packages/sdk-0.2/src/benchmarks/example/basic/scorer.ts`)
227
- - a non-deterministic scorer (`packages/sdk-0.2/src/scorers/llm-judge.ts`)
228
- - a runner that can switch based on `scorer.kind` (`packages/sdk-0.2/src/benchmarks/example/basic/runner.ts`)
214
+ - deterministic scoring inside a runner (`packages/sdk-0.2/src/benchmarks/examples/text-transform/runner.ts`)
215
+ - multi-scorer dispatch (`packages/sdk-0.2/src/benchmarks/examples/exact-match-scorer/runner.ts`)
216
+ - `MCQScorer` and `LLMAsAJudgeScorer` in `peerbench/scorers`
229
217
 
230
- ## Usage: run a single test case end-to-end
231
-
232
- First, pick a benchmark and a provider:
233
-
234
- ```ts
235
- import { example } from "peerbench/benchmarks";
236
- import { ExampleEchoLLMProvider } from "peerbench/providers";
218
+ `LLMAsAJudgeScorer` returns a normalized `value` in the `0..1` range (inclusive).
237
219
 
238
- const provider = new ExampleEchoLLMProvider();
239
- ```
220
+ ## Usage: run a single test case end-to-end
240
221
 
241
- Then build a test case entity and run it:
222
+ First, define schemas and a runner (this is the “portable benchmark code”):
242
223
 
243
224
  ```ts
244
- const testCase = example.ExampleEchoTestCaseSchemaV1.new({
245
- id: "tc-1",
246
- instruction: "Repeat the input exactly",
247
- input: "hello",
248
- expectedOutput: "hello",
225
+ import { defineRunner, idGeneratorUUIDv7 } from "peerbench";
226
+ import { AbstractLLMProvider } from "peerbench/providers";
227
+ import {
228
+ BaseResponseSchemaV1,
229
+ BaseScoreSchemaV1,
230
+ BaseTestCaseSchemaV1,
231
+ defineResponseSchema,
232
+ defineScoreSchema,
233
+ defineTestCaseSchema,
234
+ } from "peerbench/schemas";
235
+ import { ResponseExtensions } from "peerbench/schemas/extensions";
236
+ import z from "zod";
237
+
238
+ const Namespace = "example.peerbench.ai" as const;
239
+ const Kind = "llm/echo-basic" as const;
240
+
241
+ const TestCaseSchemaV1 = defineTestCaseSchema({
242
+ baseSchema: BaseTestCaseSchemaV1,
243
+ namespace: Namespace,
244
+ kind: Kind,
245
+ schemaVersion: 1,
246
+ fields: { input: z.string() },
249
247
  });
250
248
 
251
- const scorer = new example.ExampleExactMatchScorer();
249
+ const ResponseSchemaV1 = defineResponseSchema({
250
+ baseSchema: BaseResponseSchemaV1,
251
+ namespace: Namespace,
252
+ kind: Kind,
253
+ schemaVersion: 1,
254
+ fields: { ...ResponseExtensions.ExtensionLLMResponseFieldsV1 },
255
+ });
252
256
 
253
- const { response, score } = await example.runTestCase({
254
- testCase,
255
- provider,
256
- scorer,
257
- runConfig: { model: "example-model" },
257
+ const ScoreSchemaV1 = defineScoreSchema({
258
+ baseSchema: BaseScoreSchemaV1,
259
+ namespace: Namespace,
260
+ kind: Kind,
261
+ schemaVersion: 1,
262
+ fields: {},
258
263
  });
259
- ```
260
264
 
261
- If you want to load test cases instead of constructing them manually, use the loader:
265
+ export const runner = defineRunner(
266
+ {
267
+ schemaSets: [
268
+ {
269
+ testCase: TestCaseSchemaV1,
270
+ response: ResponseSchemaV1,
271
+ score: ScoreSchemaV1,
272
+ },
273
+ ],
274
+ providers: [AbstractLLMProvider],
275
+ scorers: [],
276
+ runConfigSchema: { model: z.string() },
277
+ },
278
+ async ({ testCase, provider, runConfig, idGenerators }) => {
279
+ const providerResponse = await provider.forward({
280
+ model: runConfig.model,
281
+ messages: [{ role: "user", content: `Echo:\n${testCase.input}` }],
282
+ });
262
283
 
263
- ```ts
264
- const loader = new example.ExampleJSONDataLoader();
265
- const { testCases } = await loader.loadData({
266
- content: new TextEncoder().encode(
267
- JSON.stringify([
284
+ const response = await ResponseSchemaV1.newWithId(
268
285
  {
269
- id: "tc-1",
270
- kind: "example.ts.echo",
271
- schemaVersion: 1,
272
- instruction: "Repeat the input exactly",
273
- input: "hello",
274
- expectedOutput: "hello",
286
+ data: providerResponse.data,
287
+ startedAt: providerResponse.startedAt,
288
+ completedAt: providerResponse.completedAt,
289
+ testCaseId: testCase.id,
290
+ modelSlug: runConfig.model,
291
+ provider: provider.kind,
292
+ inputTokensUsed: providerResponse.inputTokensUsed,
293
+ outputTokensUsed: providerResponse.outputTokensUsed,
294
+ inputCost: providerResponse.inputCost,
295
+ outputCost: providerResponse.outputCost,
275
296
  },
276
- ])
277
- ),
278
- });
297
+ idGenerators?.response ?? idGeneratorUUIDv7
298
+ );
299
+
300
+ return { response };
301
+ }
302
+ );
279
303
  ```
280
304
 
281
305
  ## Usage: what the runtime adds (orchestration)
282
306
 
283
- Once you have `runTestCase(...)`, the runtime’s job is mostly about repetition and persistence.
307
+ Once you have a runner, the runtime’s job is mostly about repetition and persistence.
284
308
 
285
309
  For example, a very small orchestrator might do:
286
310
 
287
311
  ```ts
288
312
  for (const testCase of testCases) {
289
- const result = await example.runTestCase({ testCase, provider, runConfig });
313
+ const result = await runner({ testCase, provider, runConfig });
290
314
  // store `result.response` and `result.score` somewhere durable
291
315
  // decide how to handle errors, retries, progress, and budgets
292
316
  }
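A slightly fuller (still illustrative) version of that loop might add a retry budget and a persistence hook. Only the `runner({ testCase, provider, runConfig })` call comes from the example above; `saveEntities` and the retry policy are the editor's placeholders for decisions the runtime owns.

```ts
// Editor's sketch extending the loop above; error handling, backoff, and
// persistence are runtime decisions, not SDK behavior.
for (const testCase of testCases) {
  let lastError: unknown;
  for (let attempt = 0; attempt < 3; attempt++) {
    try {
      const { response, score } = await runner({ testCase, provider, runConfig });
      await saveEntities({ response, score }); // hypothetical persistence hook
      lastError = undefined;
      break;
    } catch (err) {
      lastError = err; // e.g. provider timeout; try again
    }
  }
  if (lastError) {
    console.warn(`test case ${testCase.id} failed after retries`, lastError);
  }
}
```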
@@ -296,14 +320,14 @@ That loop is where your product decisions live. The SDK is intentionally not opi
296
320
 
297
321
  ## More examples to read
298
322
 
299
- The `example` benchmark is split into folders that each teach one idea:
323
+ The examples under `packages/sdk-0.2/src/benchmarks/examples/` each teach one idea:
300
324
 
301
- - `packages/sdk-0.2/src/benchmarks/example/basic/`: the simplest complete example
302
- - `packages/sdk-0.2/src/benchmarks/example/multi-kind/`: one runner, multiple test case kinds
303
- - `packages/sdk-0.2/src/benchmarks/example/multi-scorer/`: one runner, multiple scorer implementations
325
+ - `echo-basic`: minimal schema set + runner + storage examples
326
+ - `text-transform`: one runner supports multiple kinds + deterministic scoring
327
+ - `exact-match-scorer`: scorer dispatch pattern (algo scorer vs LLM judge scorer)
328
+ - `mcq-qa-templated`: template variables + MCQ/QA tasks
304
329
 
305
330
  ## Design notes
306
331
 
307
332
  - Schemas are runtime-validated (Zod) so “type-only drift” doesn’t silently corrupt stored data.
308
333
  - Runners are per-test-case so they stay small and portable; runtimes keep orchestration control.
309
- - Kinds are namespaced strings (e.g. `example.ts.echo`) to avoid collisions across benchmarks.
package/dist/aggregators/index.d.ts ADDED
@@ -0,0 +1,67 @@
1
+ import { BaseScoreV1, BaseTestCaseV1, BaseResponseV1 } from '../schemas/index.js';
2
+ import { a as InferExtension } from '../index-BAioQhp2.js';
3
+ import z__default from 'zod';
4
+ import '../provider-BDjGp2y-.js';
5
+ import '../abstract-Dec9Sc5O.js';
6
+
7
+ declare abstract class AbstractAggregator {
8
+ abstract push(params: {
9
+ score: BaseScoreV1;
10
+ testCase?: BaseTestCaseV1;
11
+ response?: BaseResponseV1;
12
+ }): Promise<void>;
13
+ abstract aggregate(config?: unknown): Promise<unknown>;
14
+ }
15
+
16
+ /**
17
+ * Provides a set of fields that holds information about the LLM and its response.
18
+ */
19
+ declare const ExtensionLLMResponseFieldsV1: {
20
+ data: z__default.ZodString;
21
+ modelSlug: z__default.ZodString;
22
+ provider: z__default.ZodString;
23
+ systemPromptId: z__default.ZodOptional<z__default.ZodString>;
24
+ inputTokensUsed: z__default.ZodOptional<z__default.ZodNumber>;
25
+ outputTokensUsed: z__default.ZodOptional<z__default.ZodNumber>;
26
+ inputCost: z__default.ZodOptional<z__default.ZodString>;
27
+ outputCost: z__default.ZodOptional<z__default.ZodString>;
28
+ };
29
+
30
+ /**
31
+ * Provides a set of fields that holds information about the LLM model
32
+ * that was used to judge the response.
33
+ */
34
+ declare const ExtensionLLMAsAJudgeScoreFieldsV1: {
35
+ scorerAISystemPrompt: z__default.ZodOptional<z__default.ZodString>;
36
+ scorerAISystemPromptId: z__default.ZodOptional<z__default.ZodString>;
37
+ scorerAIProvider: z__default.ZodOptional<z__default.ZodString>;
38
+ scorerAIModelSlug: z__default.ZodOptional<z__default.ZodString>;
39
+ scorerAIInputTokensUsed: z__default.ZodOptional<z__default.ZodNumber>;
40
+ scorerAIOutputTokensUsed: z__default.ZodOptional<z__default.ZodNumber>;
41
+ scorerAIInputCost: z__default.ZodOptional<z__default.ZodString>;
42
+ scorerAIOutputCost: z__default.ZodOptional<z__default.ZodString>;
43
+ };
44
+
45
+ declare class AvgAggregator extends AbstractAggregator {
46
+ private separateBySystemPrompt;
47
+ private scores;
48
+ constructor(params: {
49
+ separateBySystemPrompt?: boolean;
50
+ });
51
+ push(params: {
52
+ score: InferExtension<typeof ExtensionLLMAsAJudgeScoreFieldsV1, BaseScoreV1>;
53
+ response: InferExtension<typeof ExtensionLLMResponseFieldsV1, BaseResponseV1>;
54
+ testCase?: BaseTestCaseV1;
55
+ }): Promise<void>;
56
+ aggregate(): Promise<{
57
+ [k: string]: {
58
+ average: number;
59
+ model: string;
60
+ total: number;
61
+ count: number;
62
+ systemPromptId?: string;
63
+ };
64
+ }>;
65
+ }
66
+
67
+ export { AbstractAggregator, AvgAggregator };
package/dist/aggregators/index.js ADDED
@@ -0,0 +1,46 @@
1
+ import "../chunk-PZ5AY32C.js";
2
+
3
+ // src/aggregators/abstract.ts
4
+ var AbstractAggregator = class {
5
+ };
6
+
7
+ // src/aggregators/llm/avg.ts
8
+ var AvgAggregator = class extends AbstractAggregator {
9
+ separateBySystemPrompt = false;
10
+ scores = {};
11
+ constructor(params) {
12
+ super();
13
+ this.separateBySystemPrompt = params.separateBySystemPrompt ?? false;
14
+ }
15
+ async push(params) {
16
+ const model = params.response.modelSlug;
17
+ const compositeKey = model + (params.response.systemPromptId ?? "");
18
+ const key = this.separateBySystemPrompt ? compositeKey : model;
19
+ if (!this.scores[key]) {
20
+ this.scores[key] = {
21
+ model,
22
+ systemPromptId: params.response.systemPromptId,
23
+ count: 0,
24
+ total: 0
25
+ };
26
+ }
27
+ this.scores[key].total += params.score.value;
28
+ this.scores[key].count++;
29
+ }
30
+ async aggregate() {
31
+ return Object.fromEntries(
32
+ Object.entries(this.scores).map(([model, score]) => [
33
+ model,
34
+ {
35
+ ...score,
36
+ average: score.total / score.count
37
+ }
38
+ ])
39
+ );
40
+ }
41
+ };
42
+ export {
43
+ AbstractAggregator,
44
+ AvgAggregator
45
+ };
46
+ //# sourceMappingURL=index.js.map
package/dist/aggregators/index.js.map ADDED
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../../src/aggregators/abstract.ts","../../src/aggregators/llm/avg.ts"],"sourcesContent":["import { BaseResponseV1, BaseTestCaseV1 } from \"@/schemas\";\nimport { BaseScoreV1 } from \"@/schemas/score\";\n\nexport abstract class AbstractAggregator {\n abstract push(params: {\n score: BaseScoreV1;\n testCase?: BaseTestCaseV1;\n response?: BaseResponseV1;\n }): Promise<void>;\n\n abstract aggregate(config?: unknown): Promise<unknown>;\n}\n","import { BaseResponseV1, BaseScoreV1, BaseTestCaseV1 } from \"@/schemas\";\nimport { AbstractAggregator } from \"../abstract\";\nimport { InferExtension } from \"@/utilities\";\nimport { ExtensionLLMResponseFieldsV1 } from \"@/schemas/extensions/response/llm\";\nimport { ExtensionLLMAsAJudgeScoreFieldsV1 } from \"@/schemas/extensions/score/llm-as-a-judge-scorer\";\n\nexport class AvgAggregator extends AbstractAggregator {\n private separateBySystemPrompt: boolean = false;\n private scores: Record<\n string,\n {\n model: string;\n total: number;\n count: number;\n systemPromptId?: string;\n }\n > = {};\n\n constructor(params: { separateBySystemPrompt?: boolean }) {\n super();\n this.separateBySystemPrompt = params.separateBySystemPrompt ?? false;\n }\n\n override async push(params: {\n score: InferExtension<\n typeof ExtensionLLMAsAJudgeScoreFieldsV1,\n BaseScoreV1\n >;\n response: InferExtension<\n typeof ExtensionLLMResponseFieldsV1,\n BaseResponseV1\n >;\n testCase?: BaseTestCaseV1;\n }) {\n const model = params.response.modelSlug;\n const compositeKey = model + (params.response.systemPromptId ?? \"\");\n const key = this.separateBySystemPrompt ? compositeKey : model;\n\n if (!this.scores[key]) {\n this.scores[key] = {\n model,\n systemPromptId: params.response.systemPromptId,\n count: 0,\n total: 0,\n };\n }\n\n this.scores[key].total += params.score.value;\n this.scores[key].count++;\n }\n\n override async aggregate() {\n return Object.fromEntries(\n Object.entries(this.scores).map(([model, score]) => [\n model,\n {\n ...score,\n average: score.total / score.count,\n },\n ])\n );\n }\n}\n"],"mappings":";;;AAGO,IAAe,qBAAf,MAAkC;AAQzC;;;ACLO,IAAM,gBAAN,cAA4B,mBAAmB;AAAA,EAC5C,yBAAkC;AAAA,EAClC,SAQJ,CAAC;AAAA,EAEL,YAAY,QAA8C;AACxD,UAAM;AACN,SAAK,yBAAyB,OAAO,0BAA0B;AAAA,EACjE;AAAA,EAEA,MAAe,KAAK,QAUjB;AACD,UAAM,QAAQ,OAAO,SAAS;AAC9B,UAAM,eAAe,SAAS,OAAO,SAAS,kBAAkB;AAChE,UAAM,MAAM,KAAK,yBAAyB,eAAe;AAEzD,QAAI,CAAC,KAAK,OAAO,GAAG,GAAG;AACrB,WAAK,OAAO,GAAG,IAAI;AAAA,QACjB;AAAA,QACA,gBAAgB,OAAO,SAAS;AAAA,QAChC,OAAO;AAAA,QACP,OAAO;AAAA,MACT;AAAA,IACF;AAEA,SAAK,OAAO,GAAG,EAAE,SAAS,OAAO,MAAM;AACvC,SAAK,OAAO,GAAG,EAAE;AAAA,EACnB;AAAA,EAEA,MAAe,YAAY;AACzB,WAAO,OAAO;AAAA,MACZ,OAAO,QAAQ,KAAK,MAAM,EAAE,IAAI,CAAC,CAAC,OAAO,KAAK,MAAM;AAAA,QAClD;AAAA,QACA;AAAA,UACE,GAAG;AAAA,UACH,SAAS,MAAM,QAAQ,MAAM;AAAA,QAC/B;AAAA,MACF,CAAC;AAAA,IACH;AAAA,EACF;AACF;","names":[]}