npm - peerbench - Versions diffs - 0.0.10 → 0.0.12 - Mend

peerbench 0.0.10 → 0.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (69) hide show

package/README.md +71 -58
package/dist/benchmarks/examples/echo-basic/runner.d.ts +11 -254
package/dist/benchmarks/examples/echo-basic/schema-sets/echo.v1.d.ts +25 -25
package/dist/benchmarks/examples/exact-match-scorer/runner.d.ts +38 -386
package/dist/benchmarks/examples/exact-match-scorer/schema-sets/exact-match.v1.d.ts +25 -25
package/dist/benchmarks/examples/text-transform/runner.d.ts +32 -480
package/dist/benchmarks/examples/text-transform/schema-sets/echo.v1.d.ts +25 -25
package/dist/benchmarks/examples/text-transform/schema-sets/reverse.v1.d.ts +25 -25
package/dist/benchmarks/index.js +180 -248
package/dist/benchmarks/index.js.map +1 -1
package/dist/benchmarks/peerbench/index.d.ts +2 -1
package/dist/benchmarks/peerbench/mcq-runner.d.ts +78 -0
package/dist/benchmarks/peerbench/qa-runner.d.ts +77 -0
package/dist/benchmarks/peerbench/schema-sets/mcq.v1.d.ts +25 -25
package/dist/benchmarks/peerbench/schema-sets/multi-turn.v1.d.ts +25 -25
package/dist/benchmarks/peerbench/schema-sets/qa.v1.d.ts +25 -25
package/dist/chunk-6WDCU5BP.js +9 -0
package/dist/chunk-6WDCU5BP.js.map +1 -0
package/dist/{chunk-YY33MNMV.js → chunk-7KMGLEYP.js} +2 -2
package/dist/{chunk-TRNCF2BG.js → chunk-HBGC6BDW.js} +1 -1
package/dist/chunk-HBGC6BDW.js.map +1 -0
package/dist/{chunk-HMQYGCKI.js → chunk-ZJWSK4VO.js} +1 -1
package/dist/chunk-ZJWSK4VO.js.map +1 -0
package/dist/dev.d.ts +22 -0
package/dist/helpers/define-runner.d.ts +2 -45
package/dist/index.js +2 -2
package/dist/providers/ai-sdk.d.ts +24 -0
package/dist/providers/callables/callable.d.ts +4 -0
package/dist/providers/callables/llm.d.ts +41 -0
package/dist/providers/example/echo.d.ts +12 -11
package/dist/providers/example/restapi.d.ts +11 -18
package/dist/providers/index.d.ts +4 -2
package/dist/providers/index.js +380 -9
package/dist/providers/index.js.map +1 -1
package/dist/providers/mastra.d.ts +16 -21
package/dist/providers/openai.d.ts +25 -10
package/dist/providers/openrouter.d.ts +6 -8
package/dist/schemas/extensions/index.js +1 -1
package/dist/schemas/extensions/response/llm.d.ts +17 -0
package/dist/schemas/index.js +2 -2
package/dist/schemas/llm/index.js +36 -7
package/dist/schemas/llm/index.js.map +1 -1
package/dist/schemas/llm/simple-system-prompt.d.ts +3 -3
package/dist/schemas/llm/system-prompt.d.ts +7 -7
package/dist/schemas/response.d.ts +7 -7
package/dist/schemas/schema-definer.d.ts +5 -5
package/dist/schemas/score.d.ts +7 -7
package/dist/schemas/test-case.d.ts +7 -7
package/dist/scorers/abstract.d.ts +1 -1
package/dist/scorers/index.js +377 -7
package/dist/scorers/index.js.map +1 -1
package/dist/scorers/llm-judge.d.ts +6 -6
package/dist/types/index.d.ts +0 -5
package/dist/types/runner.d.ts +13 -17
package/package.json +8 -7
package/dist/benchmarks/peerbench/runner.d.ts +0 -754
package/dist/chunk-3JHDJEY3.js +0 -374
package/dist/chunk-3JHDJEY3.js.map +0 -1
package/dist/chunk-HMQYGCKI.js.map +0 -1
package/dist/chunk-Q6GSOHOP.js +0 -44
package/dist/chunk-Q6GSOHOP.js.map +0 -1
package/dist/chunk-RTEAK4II.js +0 -37
package/dist/chunk-RTEAK4II.js.map +0 -1
package/dist/chunk-SMLNDQFX.js +0 -244
package/dist/chunk-SMLNDQFX.js.map +0 -1
package/dist/chunk-TRNCF2BG.js.map +0 -1
package/dist/providers/abstract/llm.d.ts +0 -20
package/dist/{chunk-YY33MNMV.js.map → chunk-7KMGLEYP.js.map} +0 -0
package/dist/providers/{abstract/provider.d.ts → abstract.d.ts} +0 -0

package/README.md CHANGED Viewed

@@ -1,21 +1,21 @@
 # `peerbench` SDK
-This package is the shared “domain core” for _building benchmarks_ in a standardized, portable way. It gives you a consistent set of _persistable entities_ (schemas + types), and a consistent set of _runtime contracts_ (runners, scorers, providers, storages, aggregators) so the same benchmark can run in a CLI, a web app, a worker, or anything else.
+This package is the shared "domain core" for _building benchmarks_ in a standardized, portable way. It gives you a consistent set of _persistable entities_ (schemas + types), and a consistent set of _runtime contracts_ (runners, scorers, providers, storages, aggregators) so the same benchmark can run in a CLI, a web app, a worker, or anything else.
-If you’re implementing a new benchmark, the SDK is the part that keeps it portable instead of glued to one runtime. If you’re integrating peerbench SDK into a runtime, the SDK is the part you don’t want to rewrite in every repo.
+If you're implementing a new benchmark, the SDK is the part that keeps it portable instead of glued to one runtime. If you're integrating peerbench SDK into a runtime, the SDK is the part you don't want to rewrite in every repo.
 > - _Runtime_ refers to the codebase that uses peerbench SDK (a CLI, a webapp, a background service etc.)
 > - This package does not support CommonJS
 ## What is a benchmark?
-A benchmark is a structured way to ask: “How well does a system perform on a set of tasks, under a set of rules?”
+A benchmark is a structured way to ask: "How well does a system perform on a set of tasks, under a set of rules?"
 If you look at widely-used benchmarks, the pattern is always the same even when the tasks are different:
 - In MMLU-Pro, each item is a question (often multiple choice) and the score is about correctness across categories.
 - In BIG-bench style task suites, you have many different task types and you want a consistent way to run and score them.
-- In HELM-style evaluations, you care about not only “did it answer correctly”, but also how you ran it (prompting setup, constraints, metadata) and how you report results.
+- In HELM-style evaluations, you care about not only "did it answer correctly", but also how you ran it (prompting setup, constraints, metadata) and how you report results.
 Those benchmarks differ in details, but they all boil down to the same building blocks: a dataset of test cases, a way to run a system on each test case, and a way to score the output. The peerbench SDK is designed so these patterns can be represented with the same portable shape.
@@ -23,7 +23,7 @@ Those benchmarks differ in details, but they all boil down to the same building
 Now that we agree on what a benchmark is, we can talk about how peerbench represents it.
-peerbench is deliberately boring here. It doesn’t try to invent a new “benchmark framework”. It gives you a small set of building blocks that you can compose. If you understand these pieces, you can read any benchmark implementation and know where to look.
+peerbench is deliberately boring here. It doesn't try to invent a new "benchmark framework". It gives you a small set of building blocks that you can compose. If you understand these pieces, you can read any benchmark implementation and know where to look.
 ### Entities (the things you store)
@@ -40,8 +40,8 @@ Everything else in the SDK exists to create these entities in a predictable way.
 Three fields show up everywhere:
 - `kind` tells you _what type_ of entity something is. It is a stable string you pick (descriptive).
-- `schemaVersion` tells you _which version_ of that entity shape you’re looking at.
-- `namespace` tells you which “owner” defines that kind (e.g peerbench.ai).
+- `schemaVersion` tells you _which version_ of that entity shape you're looking at.
+- `namespace` tells you which "owner" defines that kind (e.g. peerbench.ai).
 This is why peerbench leans on [Zod](https://zod.dev) schemas: it keeps the persisted data contract explicit and runtime-validated.
@@ -57,36 +57,36 @@ peerbench SDK provides some pre-defined storage abstractions you can use out of
 ### Provider (how you talk to a model)
-A provider is the runtime bridge to a model endpoint.
+A provider is the runtime bridge to a model endpoint. It's an API client factory that creates **callables** — lightweight objects that have the model/agent configuration baked in and can be invoked by runners.
-Runners do not talk to models directly. They call a provider abstraction (today that’s `AbstractLLMProvider` for message-based LLM communication). That gives you a clean seam:
+Runners do not talk to providers directly. They receive a **callable** (today that's `CallableLLM` for message-based LLM communication). That gives you a clean seam:
-- benchmark code doesn’t care where the model lives
+- benchmark code doesn't care where the model lives
 - runtimes can swap providers without rewriting benchmark code
+Each provider extends `AbstractProvider`. Provider classes have the custom logic to interact with a third-party service. They are expected to have a factory method such as `.model()` or `.agent()` that returns a callable unit for their infrastructure. Callable units are what runners receive — they provide a uniform interface so runners don't need any special treatment for the underlying provider implementation.
 If you already have your own service in front of the model, you can still model it as a provider. The example in `packages/sdk-0.2/src/providers/example/restapi.ts` shows this pattern.
 ### Runner (how you execute one test case)
-A runner is the execution part of a benchmark. A runner function takes whatever inputs it needs, calls a provider, and produces a `Response`. It may also produce a `Score` (via a scorer).
+A runner is the execution part of a benchmark. A runner function takes whatever inputs it needs, calls a callable's `forward()`, and produces a `Response`. It may also produce a `Score` (via a scorer).
-Runners are indented to be “per test case” because it keeps the benchmark logic small and easy to compose. Running a whole dataset is orchestration, and orchestration is where runtimes differ (parallelism, retries, persistence, budgets, progress UI).
+Runners are intended to be "per test case" because it keeps the benchmark logic small and easy to compose. Running a whole dataset is orchestration, and orchestration is where runtimes differ (parallelism, retries, persistence, budgets, progress UI).
 There is no restriction that a benchmark must have exactly one runner. You can export multiple runner functions (different modes, different prompts, different providers, different scoring strategies). The runtime just needs to pick the runner it wants to use.
-One practical convention you will see in the examples is `runConfig`. It’s runner-specific, and it’s recommended to kept as a simple JSON-serializable object so you can store it alongside your run and reproduce it later.
 ### Scorer (how you judge a response)
 A scorer produces a numeric result. Some scorers are deterministic (same input → same output). Some scorers are non-deterministic (for example "LLM as a judge").
-A scorer takes what it needs. Sometimes it’s “expected + actual strings”. Sometimes it’s “a list of required fields + a JSON output”. The runner decides what to pass into the scorer, because the runner is the piece that knows how the benchmark is structured.
+A scorer takes what it needs. Sometimes it's "expected + actual strings". Sometimes it's "a list of required fields + a JSON output". The runner decides what to pass into the scorer, because the runner is the piece that knows how the benchmark is structured.
 If your benchmark can be scored in multiple ways, a runner can accept multiple scorer implementations and choose between them. The examples in `packages/sdk-0.2/src/benchmarks/examples/` show what that looks like in code.
 ## What the SDK does vs what the runtime does
-It’s easy to accidentally push “too much responsibility” to the SDK and end up with a framework you can’t escape. It’s also easy to push “too much responsibility” to the runtime and end up with copy-pasted benchmark logic.
+It's easy to accidentally push "too much responsibility" to the SDK and end up with a framework you can't escape. It's also easy to push "too much responsibility" to the runtime and end up with copy-pasted benchmark logic.
 This SDK tries to draw a clean line:
@@ -106,9 +106,9 @@ The runtime is responsible for:
 If you keep that boundary, benchmarks stay portable and runtimes stay free to evolve.
-## If you’re implementing a benchmark
+## If you're implementing a benchmark
-The easiest way to think about “implementing a benchmark” is: you are implementing a small domain module that can be imported by multiple runtimes. That means your job is mostly about making your benchmark _self-contained and explicit_.
+The easiest way to think about "implementing a benchmark" is: you are implementing a small domain module that can be imported by multiple runtimes. That means your job is mostly about making your benchmark _self-contained and explicit_.
 In practice, the benchmark implementer is responsible for:
@@ -121,11 +121,11 @@ Once those are in place, runtimes can focus on orchestration and product concern
 Peerbench does not assume your new benchmarks will be part of the SDK itself. The normal expectation is that your benchmark code lives in your runtime (or in its own package), and it uses `peerbench` as a dependency for schemas, base types, and contracts.
-Benchmarks can implement everything themselves, but they can also reuse the SDK’s predefined building blocks. If it is possible, it is recommended to stick with SDK base types (e.g `AbstractLLMProvider`) and implementations, because it increases compatibility with other tooling that speaks “Peerbench entities”.
+Benchmarks can implement everything themselves, but they can also reuse the SDK's predefined building blocks. Where possible, it is recommended to stick with SDK base types (e.g. `AbstractProvider`, `CallableLLM`) and implementations, because it increases compatibility with other tooling that speaks "Peerbench entities".
 ## A benchmark, step by step
-A “benchmark” in this SDK is not a magical object. It is a small folder that exports a few well-known pieces. The simplest complete benchmark usually includes:
+A "benchmark" in this SDK is not a magical object. It is a small folder that exports a few well-known pieces. The simplest complete benchmark usually includes:
 1. schemas (test case / response / score)
 2. a runner (how a single test case is executed)
@@ -137,7 +137,6 @@ You can see a compact, end-to-end reference in:
 - `packages/sdk-0.2/src/benchmarks/examples/echo-basic/`
 - `packages/sdk-0.2/src/benchmarks/examples/text-transform/`
 - `packages/sdk-0.2/src/benchmarks/examples/exact-match-scorer/`
-- `packages/sdk-0.2/src/benchmarks/examples/mcq-qa-templated/`
 ### 1) Schemas: the source of truth
@@ -149,9 +148,9 @@ In `packages/sdk-0.2/src/benchmarks/examples/echo-basic/schema-sets/echo.v1.ts`
 - define a response schema for that test case
 - define a score schema for that response
-The hierarchy starts from test case → response → score, and we keep the relationship by storing IDs (`testCaseId`, `responseId`). That relationship is “real data”, so the runtime is usually the one that persists it and queries it.
+The hierarchy starts from test case → response → score, and we keep the relationship by storing IDs (`testCaseId`, `responseId`). That relationship is "real data", so the runtime is usually the one that persists it and queries it.
-Here is what “defining a test case schema” looks like in practice (trimmed to the idea):
+Here is what "defining a test case schema" looks like in practice (trimmed to the idea):
 ```ts
 import { z } from "zod";
@@ -170,9 +169,22 @@ export const MyTestCaseSchemaV1 = defineTestCaseSchema({
 ### 2) Provider: how runners talk to models
-Runners communicate with models through a provider implementation. That’s how the same benchmark can run against different backends without rewriting the benchmark.
+Runners communicate with models through a callable created by a provider. That's how the same benchmark can run against different backends without rewriting the benchmark.
+A provider is an API client factory. You create a provider once (with API keys, rate limiters, etc.), then call its factory method to create callables — lightweight objects with a `forward()` method and the model baked in:
+```ts
+import { OpenRouterProvider } from "peerbench/providers";
+const provider = new OpenRouterProvider({ apiKey: "..." });
+const target = provider.model({ model: "gpt-4o" });
+// target.slug === "gpt-4o"
+// target.provider.kind === "peerbench.ai/llm/openrouter.ai"
+// target.forward({ messages }) — model is already captured
+```
-If you already have a service in front of your model, the REST API provider example shows the pattern: accept the SDK’s `messages + model` input, translate it to an HTTP request, and translate the HTTP response back into a single string. Nothing else is required.
+If you already have a service in front of your model, the REST API provider example (`src/providers/example/restapi.ts`) shows the pattern: extend `AbstractProvider`, implement a factory method that returns a `CallableLLM`, and translate messages to HTTP requests inside the `forward()` arrow function.
 ### 3) Runner: run one test case
@@ -182,15 +194,15 @@ This is intentional. Running many test cases is orchestration, and orchestration
 In the example runners (e.g. `packages/sdk-0.2/src/benchmarks/examples/echo-basic/runner.ts`) you can see the responsibilities:
-- format a test case into provider-friendly input (for chat models, `messages[]`)
-- call `provider.forward(...)`
-- map provider output into a `Response` entity
+- format a test case into callable-friendly input (`messages[]`)
+- call `target.forward({ messages })`
+- map the output into a `Response` entity
 - if a scorer is provided, turn scorer output into a `Score` entity
 Here is the idea in a minimal form:
 ```ts
-const providerResponse = await provider.forward({ model, messages });
+const providerResponse = await target.forward({ messages });
 const response = ResponseSchemaV1.new({
   id: "runtime-generates-id",
@@ -198,8 +210,8 @@ const response = ResponseSchemaV1.new({
   data: providerResponse.data,
   startedAt: providerResponse.startedAt,
   completedAt: providerResponse.completedAt,
-  modelSlug: model,
-  provider: provider.kind,
+  modelSlug: target.slug,
+  provider: target.provider.kind,
 });
 ```
@@ -207,7 +219,7 @@ const response = ResponseSchemaV1.new({
 Some benchmarks are easy to score deterministically (string match, regex extraction, set coverage). Some benchmarks need semantic judgment. Some benchmarks want both.
-That’s why scorers are separate objects and why runners can accept more than one scorer implementation.
+That's why scorers are separate objects and why runners can accept more than one scorer implementation.
 The examples show:
@@ -219,11 +231,11 @@ The examples show:
 ## Usage: run a single test case end-to-end
-First, define schemas and a runner (this is the “portable benchmark code”):
+First, define schemas and a runner (this is the "portable benchmark code"):
 ```ts
 import { defineRunner, idGeneratorUUIDv7 } from "peerbench";
-import { AbstractLLMProvider } from "peerbench/providers";
+import { CallableLLM } from "peerbench/providers";
 import {
   BaseResponseSchemaV1,
   BaseScoreSchemaV1,
@@ -232,7 +244,7 @@ import {
   defineScoreSchema,
   defineTestCaseSchema,
 } from "peerbench/schemas";
-import { ResponseExtensions } from "peerbench/schemas/extensions";
+import { ExtensionLLMResponseFieldsV1 } from "peerbench/schemas/extensions";
 import z from "zod";
 const Namespace = "example.peerbench.ai" as const;
@@ -251,7 +263,7 @@ const ResponseSchemaV1 = defineResponseSchema({
   namespace: Namespace,
   kind: Kind,
   schemaVersion: 1,
-  fields: { ...ResponseExtensions.ExtensionLLMResponseFieldsV1 },
+  fields: { ...ExtensionLLMResponseFieldsV1 },
 });
 const ScoreSchemaV1 = defineScoreSchema({
@@ -262,22 +274,19 @@ const ScoreSchemaV1 = defineScoreSchema({
   fields: {},
 });
+type TestCaseV1 = z.infer<typeof TestCaseSchemaV1.schema>;
 export const runner = defineRunner(
-  {
-    schemaSets: [
-      {
-        testCase: TestCaseSchemaV1,
-        response: ResponseSchemaV1,
-        score: ScoreSchemaV1,
-      },
-    ],
-    providers: [AbstractLLMProvider],
-    scorers: [],
-    runConfigSchema: { model: z.string() },
-  },
-  async ({ testCase, provider, runConfig, idGenerators }) => {
-    const providerResponse = await provider.forward({
-      model: runConfig.model,
+  async (params: {
+    testCase: TestCaseV1;
+    target: CallableLLM;
+    idGenerators?: {
+      response?: (input: unknown) => string;
+    };
+  }) => {
+    const { testCase, target } = params;
+    const providerResponse = await target.forward({
       messages: [{ role: "user", content: `Echo:\n${testCase.input}` }],
     });
@@ -287,14 +296,14 @@ export const runner = defineRunner(
         startedAt: providerResponse.startedAt,
         completedAt: providerResponse.completedAt,
         testCaseId: testCase.id,
-        modelSlug: runConfig.model,
-        provider: provider.kind,
+        modelSlug: target.slug,
+        provider: target.provider.kind,
         inputTokensUsed: providerResponse.inputTokensUsed,
         outputTokensUsed: providerResponse.outputTokensUsed,
         inputCost: providerResponse.inputCost,
         outputCost: providerResponse.outputCost,
       },
-      idGenerators?.response ?? idGeneratorUUIDv7
+      params.idGenerators?.response ?? idGeneratorUUIDv7
     );
     return { response };
@@ -304,13 +313,18 @@ export const runner = defineRunner(
 ## Usage: what the runtime adds (orchestration)
-Once you have a runner, the runtime’s job is mostly about repetition and persistence.
+Once you have a runner, the runtime's job is mostly about repetition and persistence.
 For example, a very small orchestrator might do:
 ```ts
+import { OpenRouterProvider } from "peerbench/providers";
+const provider = new OpenRouterProvider({ apiKey: "..." });
+const target = provider.model({ model: "gpt-4o" });
 for (const testCase of testCases) {
-  const result = await runner({ testCase, provider, runConfig });
+  const result = await runner({ testCase, target });
   // store `result.response` and `result.score` somewhere durable
   // decide how to handle errors, retries, progress, and budgets
 }
@@ -325,9 +339,8 @@ The examples under `packages/sdk-0.2/src/benchmarks/examples/` each teach one id
 - `echo-basic`: minimal schema set + runner + storage examples
 - `text-transform`: one runner supports multiple kinds + deterministic scoring
 - `exact-match-scorer`: scorer dispatch pattern (algo scorer vs LLM judge scorer)
-- `mcq-qa-templated`: template variables + MCQ/QA tasks
 ## Design notes
-- Schemas are runtime-validated (Zod) so “type-only drift” doesn’t silently corrupt stored data.
+- Schemas are runtime-validated (Zod) so "type-only drift" doesn't silently corrupt stored data.
 - Runners are per-test-case so they stay small and portable; runtimes keep orchestration control.

package/dist/benchmarks/examples/echo-basic/runner.d.ts CHANGED Viewed

@@ -1,35 +1,19 @@
-import { AbstractLLMProvider } from "../../../providers/index.js";
-import { z } from "zod";
-/**
- * Runners are the backbone of a benchmark. They are responsible for executing the test cases and producing
- * the responses and scores. As the benchmark builder, you define what schemas the runner can work with,
- * what are the providers and scorers are supported and what configurations can be passed by the caller
- * at the execution phase.
- */
-export declare const echoBasicRunner: ((params: {
-    testCase: {
-        id: string;
-        input: string;
-        namespace: "example.peerbench.ai";
-        kind: "llm/echo-basic.tc";
-        schemaVersion: 1;
-        metadata?: Record<string, unknown> | undefined;
-    };
-    provider: AbstractLLMProvider;
-    scorer?: undefined;
-    runConfig: {
-        model: string;
-    };
+import { CallableLLM } from "../../../providers/index.js";
+import { IdGenerator } from "../../../types";
+import { EchoBasicTestCaseV1 } from "./schema-sets/echo.v1";
+export declare const echoBasicRunner: (params: {
+    testCase: EchoBasicTestCaseV1;
+    target: CallableLLM;
     idGenerators?: {
-        response?: import("../../../index.js").IdGenerator;
-        score?: import("../../../index.js").IdGenerator;
+        response?: IdGenerator;
+        score?: IdGenerator;
     };
 }) => Promise<{
     response: {
-        startedAt: number;
-        completedAt: number;
         id: string;
         testCaseId: string;
+        startedAt: number;
+        completedAt: number;
         data: string;
         modelSlug: string;
         provider: string;
@@ -43,231 +27,4 @@ export declare const echoBasicRunner: ((params: {
         inputCost?: string | undefined;
         outputCost?: string | undefined;
     };
-    score?: {
-        id: string;
-        value: number;
-        responseId: string;
-        scoringMethod: "ai" | "human" | "algo";
-        namespace: "example.peerbench.ai";
-        kind: "llm/echo-basic.sc";
-        schemaVersion: 1;
-        metadata?: Record<string, unknown> | undefined;
-        explanation?: string | undefined;
-    } | undefined;
-}>) & {
-    config: {
-        runConfigSchema: z.ZodObject<{
-            model: z.ZodString;
-        }, z.core.$strip>;
-        schemaSets: [{
-            readonly testCase: z.ZodObject<Omit<{
-                id: z.ZodString;
-                namespace: z.ZodString;
-                schemaVersion: z.ZodNumber;
-                kind: z.ZodString;
-                metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
-            }, "kind" | "namespace" | "schemaVersion"> & {
-                input: z.ZodString;
-            } & {
-                namespace: z.ZodLiteral<"example.peerbench.ai">;
-                kind: z.ZodLiteral<"llm/echo-basic.tc">;
-                schemaVersion: z.ZodLiteral<1>;
-            }, z.core.$strip> & {
-                new: (input: Omit<{
-                    id: string;
-                    input: string;
-                    namespace: "example.peerbench.ai";
-                    kind: "llm/echo-basic.tc";
-                    schemaVersion: 1;
-                    metadata?: Record<string, unknown> | undefined;
-                }, "kind" | "namespace" | "schemaVersion">) => {
-                    id: string;
-                    input: string;
-                    namespace: "example.peerbench.ai";
-                    kind: "llm/echo-basic.tc";
-                    schemaVersion: 1;
-                    metadata?: Record<string, unknown> | undefined;
-                };
-                newWithId(input: Omit<{
-                    id: string;
-                    input: string;
-                    namespace: "example.peerbench.ai";
-                    kind: "llm/echo-basic.tc";
-                    schemaVersion: 1;
-                    metadata?: Record<string, unknown> | undefined;
-                }, "kind" | "id" | "namespace" | "schemaVersion">, generator: import("../../../index.js").IdGenerator): Promise<{
-                    id: string;
-                    input: string;
-                    namespace: "example.peerbench.ai";
-                    kind: "llm/echo-basic.tc";
-                    schemaVersion: 1;
-                    metadata?: Record<string, unknown> | undefined;
-                }>;
-            };
-            readonly response: z.ZodObject<Omit<{
-                id: z.ZodString;
-                namespace: z.ZodString;
-                schemaVersion: z.ZodNumber;
-                kind: z.ZodString;
-                startedAt: z.ZodNumber;
-                completedAt: z.ZodNumber;
-                testCaseId: z.ZodString;
-                metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
-            }, "kind" | "namespace" | "schemaVersion"> & {
-                data: z.ZodString;
-                modelSlug: z.ZodString;
-                provider: z.ZodString;
-                systemPromptId: z.ZodOptional<z.ZodString>;
-                inputTokensUsed: z.ZodOptional<z.ZodNumber>;
-                outputTokensUsed: z.ZodOptional<z.ZodNumber>;
-                inputCost: z.ZodOptional<z.ZodString>;
-                outputCost: z.ZodOptional<z.ZodString>;
-            } & {
-                namespace: z.ZodLiteral<"example.peerbench.ai">;
-                kind: z.ZodLiteral<"llm/echo-basic.rs">;
-                schemaVersion: z.ZodLiteral<1>;
-            }, z.core.$strip> & {
-                new: (input: Omit<{
-                    startedAt: number;
-                    completedAt: number;
-                    id: string;
-                    testCaseId: string;
-                    data: string;
-                    modelSlug: string;
-                    provider: string;
-                    namespace: "example.peerbench.ai";
-                    kind: "llm/echo-basic.rs";
-                    schemaVersion: 1;
-                    metadata?: Record<string, unknown> | undefined;
-                    systemPromptId?: string | undefined;
-                    inputTokensUsed?: number | undefined;
-                    outputTokensUsed?: number | undefined;
-                    inputCost?: string | undefined;
-                    outputCost?: string | undefined;
-                }, "kind" | "namespace" | "schemaVersion">) => {
-                    startedAt: number;
-                    completedAt: number;
-                    id: string;
-                    testCaseId: string;
-                    data: string;
-                    modelSlug: string;
-                    provider: string;
-                    namespace: "example.peerbench.ai";
-                    kind: "llm/echo-basic.rs";
-                    schemaVersion: 1;
-                    metadata?: Record<string, unknown> | undefined;
-                    systemPromptId?: string | undefined;
-                    inputTokensUsed?: number | undefined;
-                    outputTokensUsed?: number | undefined;
-                    inputCost?: string | undefined;
-                    outputCost?: string | undefined;
-                };
-                newWithId(input: Omit<{
-                    startedAt: number;
-                    completedAt: number;
-                    id: string;
-                    testCaseId: string;
-                    data: string;
-                    modelSlug: string;
-                    provider: string;
-                    namespace: "example.peerbench.ai";
-                    kind: "llm/echo-basic.rs";
-                    schemaVersion: 1;
-                    metadata?: Record<string, unknown> | undefined;
-                    systemPromptId?: string | undefined;
-                    inputTokensUsed?: number | undefined;
-                    outputTokensUsed?: number | undefined;
-                    inputCost?: string | undefined;
-                    outputCost?: string | undefined;
-                }, "kind" | "id" | "namespace" | "schemaVersion">, generator: import("../../../index.js").IdGenerator): Promise<{
-                    startedAt: number;
-                    completedAt: number;
-                    id: string;
-                    testCaseId: string;
-                    data: string;
-                    modelSlug: string;
-                    provider: string;
-                    namespace: "example.peerbench.ai";
-                    kind: "llm/echo-basic.rs";
-                    schemaVersion: 1;
-                    metadata?: Record<string, unknown> | undefined;
-                    systemPromptId?: string | undefined;
-                    inputTokensUsed?: number | undefined;
-                    outputTokensUsed?: number | undefined;
-                    inputCost?: string | undefined;
-                    outputCost?: string | undefined;
-                }>;
-            };
-            readonly score: z.ZodObject<Omit<{
-                id: z.ZodString;
-                namespace: z.ZodString;
-                kind: z.ZodString;
-                schemaVersion: z.ZodNumber;
-                value: z.ZodNumber;
-                responseId: z.ZodString;
-                explanation: z.ZodOptional<z.ZodString>;
-                metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
-                scoringMethod: z.ZodEnum<{
-                    readonly ai: "ai";
-                    readonly human: "human";
-                    readonly algo: "algo";
-                }>;
-            }, "kind" | "namespace" | "schemaVersion"> & {
-                namespace: z.ZodLiteral<"example.peerbench.ai">;
-                kind: z.ZodLiteral<"llm/echo-basic.sc">;
-                schemaVersion: z.ZodLiteral<1>;
-            }, z.core.$strip> & {
-                new: (input: Omit<{
-                    id: string;
-                    value: number;
-                    responseId: string;
-                    scoringMethod: "ai" | "human" | "algo";
-                    namespace: "example.peerbench.ai";
-                    kind: "llm/echo-basic.sc";
-                    schemaVersion: 1;
-                    metadata?: Record<string, unknown> | undefined;
-                    explanation?: string | undefined;
-                }, "kind" | "namespace" | "schemaVersion">) => {
-                    id: string;
-                    value: number;
-                    responseId: string;
-                    scoringMethod: "ai" | "human" | "algo";
-                    namespace: "example.peerbench.ai";
-                    kind: "llm/echo-basic.sc";
-                    schemaVersion: 1;
-                    metadata?: Record<string, unknown> | undefined;
-                    explanation?: string | undefined;
-                };
-                newWithId(input: Omit<{
-                    id: string;
-                    value: number;
-                    responseId: string;
-                    scoringMethod: "ai" | "human" | "algo";
-                    namespace: "example.peerbench.ai";
-                    kind: "llm/echo-basic.sc";
-                    schemaVersion: 1;
-                    metadata?: Record<string, unknown> | undefined;
-                    explanation?: string | undefined;
-                }, "kind" | "id" | "namespace" | "schemaVersion">, generator: import("../../../index.js").IdGenerator): Promise<{
-                    id: string;
-                    value: number;
-                    responseId: string;
-                    scoringMethod: "ai" | "human" | "algo";
-                    namespace: "example.peerbench.ai";
-                    kind: "llm/echo-basic.sc";
-                    schemaVersion: 1;
-                    metadata?: Record<string, unknown> | undefined;
-                    explanation?: string | undefined;
-                }>;
-            };
-        }];
-        providers: [typeof AbstractLLMProvider];
-        scorers: [];
-        parseRunConfig?: boolean;
-        defaults?: {
-            scorer?: undefined;
-            responseIdGenerator?: import("../../../index.js").IdGenerator;
-            scoreIdGenerator?: import("../../../index.js").IdGenerator;
-        } | undefined;
-    };
-};
+}>;