@dividedby/bench-core 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,62 @@
1
+ # bench — model × effort benchmark for Claude Code skills
2
+
3
+ Goal: build a cost/quality matrix across `model × effort` for the skills used daily,
4
+ so the cheapest config above a chosen quality floor can be picked as the daily driver.
5
+
6
+ ## `@dividedby/bench-core`
7
+
8
+ The reusable core primitives are published as `@dividedby/bench-core` (ESM, zero
9
+ runtime deps, hand-written types in `core/index.d.ts`). The CLIs in this repo
10
+ (`run.mjs`, `sweep.mjs`, `aggregate-grades.mjs`) are thin wrappers over them.
11
+
12
+ ```js
13
+ import {
14
+ executeRun, // run one (task, model, effort) cell; injectable runCli
15
+ normalize, groupByCell, // multi-judge grade aggregation
16
+ priceTokens, rederiveCostUsd, // pure cost re-derivation (pricing passed in)
17
+ createDefaultJudgeBackend, // JudgeBackend interface + no-model default
18
+ } from "@dividedby/bench-core";
19
+ ```
20
+
21
+ Only `core/**` ships in the package; fixtures, tasks, results, and pricing are
22
+ local-harness assets. Test: `npm test` (node's built-in runner, `node --test`).
23
+
24
+ ## Status
25
+ - **Stage 1 (current):** scaffold + one synthetic task + single-run runner with metrics extraction.
26
+ - Stage 2: full sweep runner (model × effort × task, 1 trial) → `aggregated.csv`.
27
+ - Stage 3: grader (objective checks + LLM judge) → `report.md`.
28
+ - Stage 4: 3 trials, replayed-real tasks, variance flags.
29
+
30
+ ## Axes
31
+ - Models: `opus`, `sonnet`, `haiku`
32
+ - Effort: `low`, `medium`, `high` (CLI also supports `xhigh`, `max`)
33
+
34
+ ## How it works
35
+ Each run executes a skill headlessly in an isolated copy of a fixture repo:
36
+
37
+ ```
38
+ claude -p "<prompt>" --model M --effort E \
39
+ --output-format json --permission-mode acceptEdits \
40
+ --append-system-prompt "<unattended constant>" (cwd = working copy)
41
+ ```
42
+
43
+ `--output-format json` returns `total_cost_usd`, `duration_ms`, `num_turns`, and token
44
+ usage directly — no transcript parsing needed. Artifacts left in the working copy are
45
+ graded later.
46
+
47
+ ### Unattended constant
48
+ Every cell gets the same appended system prompt telling the agent it is running
49
+ unattended (no user to answer "check with the user" checkpoints). Applied identically
50
+ everywhere so it does not bias the model comparison. See `UNATTENDED` in `run.mjs`.
51
+
52
+ ### Fixtures
53
+ Synthetic fixtures live in `fixtures/`. Each declares a **local-markdown** issue tracker
54
+ (`docs/agents/issue-tracker.md`) so skills like `/to-prd` write files instead of calling
55
+ `gh` — gradeable, resettable, zero external side effects.
56
+
57
+ ## Run one
58
+ ```
59
+ node run.mjs --task tasks/software-design-synthetic.json --model sonnet --effort medium
60
+ ```
61
+ Outputs `results/runs/<runId>.json` (config + metrics) and leaves the working copy under
62
+ `results/work/<runId>/`.
@@ -0,0 +1,94 @@
1
+ // Grade aggregation primitives — the canonical home (moved from aggregate-grades.mjs).
2
+ // Multi-judge aggregation: judges run on different scales, so a raw mean would let the
3
+ // wider-spread judge dominate. We per-judge z-score `overall` across that judge's cells,
4
+ // then average the z-scores. Only judges with FULL coverage (graded every cell) enter the
5
+ // normalized aggregate — you can't normalize a partial judge, and this auto-excludes
6
+ // leftover/rejected partial passes without deleting data.
7
+
8
// Judges whose within-judge ranks for the same cell sit at least this many positions
// apart (out of N cells) get that cell flagged for review.
const DISAGREE_RANK_GAP = 4;

/**
 * Normalize multi-judge grades into one comparable score per blind submission.
 *
 * Each judge's `scores.overall` values are z-scored across that judge's own cells and
 * the z-scores are averaged per cell, so a judge with a wider spread cannot dominate.
 * Only judges that graded every blindId exactly once take part; all others are
 * reported in `dropped` and ignored.
 *
 * Within-judge ranks use competition ranking (1 = best, equal scores share a rank),
 * so `rankGap` / `disagree` never depend on the order of the input array.
 *
 * @param {Array<{blindId:string, judge:string, scores:{overall:number}}>} grades
 * @returns {{
 *   cells: Array<{blindId:string, judges:number, raws:number[], rawMean:number,
 *                 normZ:number, rankGap:number, disagree:boolean}>,
 *   judges: string[],
 *   dropped: Array<{judge:string, graded:number, of:number}>
 * }} cells sorted by normZ descending; `judges` lists the full-coverage judges in
 *   first-seen order, which is also the order of each cell's `raws`.
 */
export function normalize(grades) {
  const blindIds = [...new Set(grades.map((g) => g.blindId))];

  // Bucket once per judge (first-seen order) instead of re-filtering per judge.
  const gradesByJudge = new Map();
  for (const g of grades) {
    const bucket = gradesByJudge.get(g.judge);
    if (bucket) bucket.push(g);
    else gradesByJudge.set(g.judge, [g]);
  }

  const fullJudges = [];
  const dropped = [];
  const perJudge = new Map(); // judge -> Map(blindId -> { z, rank, raw })
  for (const [judge, judgeGrades] of gradesByJudge) {
    // A judge counts only if it graded every blindId exactly once.
    const distinct = new Set(judgeGrades.map((g) => g.blindId)).size;
    if (distinct !== blindIds.length || judgeGrades.length !== blindIds.length) {
      dropped.push({ judge, graded: distinct, of: blindIds.length });
      continue;
    }
    fullJudges.push(judge);

    const rows = judgeGrades.map((g) => ({ blindId: g.blindId, raw: g.scores.overall }));
    const mean = rows.reduce((s, r) => s + r.raw, 0) / rows.length;
    const variance = rows.reduce((s, r) => s + (r.raw - mean) ** 2, 0) / rows.length;
    const std = Math.sqrt(variance); // population std over this judge's cells

    // Competition ranking: a row tied with its predecessor inherits that rank.
    const ranked = [...rows].sort((a, b) => b.raw - a.raw);
    const rankOf = new Map();
    ranked.forEach((row, i) => {
      const prev = ranked[i - 1];
      const tied = prev !== undefined && prev.raw === row.raw;
      rankOf.set(row.blindId, tied ? rankOf.get(prev.blindId) : i + 1);
    });

    const stats = new Map();
    for (const r of rows) {
      stats.set(r.blindId, {
        z: std === 0 ? 0 : (r.raw - mean) / std, // a flat judge carries no signal
        rank: rankOf.get(r.blindId),
        raw: r.raw,
      });
    }
    perJudge.set(judge, stats);
  }

  const cells = blindIds.map((blindId) => {
    const entries = fullJudges.map((j) => perJudge.get(j).get(blindId));
    const zs = entries.map((e) => e.z);
    const ranks = entries.map((e) => e.rank);
    const raws = entries.map((e) => e.raw);
    const normZ = zs.reduce((s, x) => s + x, 0) / (zs.length || 1);
    const rawMean = raws.reduce((s, x) => s + x, 0) / (raws.length || 1);
    const rankGap = ranks.length > 1 ? Math.max(...ranks) - Math.min(...ranks) : 0;
    return {
      blindId,
      judges: fullJudges.length,
      raws, // raw overalls, ordered by fullJudges
      rawMean,
      normZ,
      rankGap,
      disagree: fullJudges.length > 1 && rankGap >= DISAGREE_RANK_GAP,
    };
  });

  cells.sort((a, b) => b.normZ - a.normZ);
  return { cells, judges: fullJudges, dropped };
}
66
+
67
// Trial-to-trial std of normZ (in judge-std units) at or above which a cell's score is
// wobbly enough to flag.
const NOISY_TRIAL_STD = 0.5;

/**
 * Group normalized per-blind cells into (model, effort) cells across trials and report
 * the trial spread, which is the run-variance signal.
 *
 * @param {Array<{blindId:string, normZ:number, rawMean:number}>} blindCells
 * @param {(blindId:string)=>{model:string, effort:string}} resolve
 *   Maps a blind id back to the configuration that produced it.
 * @returns {Array<{model:string, effort:string, nTrials:number, rawMeans:number[],
 *   meanZ:number, trialStd:number, rawSpread:number, noisy:boolean}>}
 *   One row per distinct (model, effort), sorted by meanZ descending.
 */
export function groupByCell(blindCells, resolve) {
  // Key on the JSON-encoded pair: a joined string such as `${model}__${effort}` would
  // merge ("a__b", "c") with ("a", "b__c"). A Map also keeps arbitrary names from
  // colliding with Object.prototype keys.
  const byCell = new Map();
  for (const bc of blindCells) {
    const { model, effort } = resolve(bc.blindId);
    const key = JSON.stringify([model, effort]);
    let cell = byCell.get(key);
    if (!cell) {
      cell = { model, effort, trials: [] };
      byCell.set(key, cell);
    }
    cell.trials.push(bc);
  }

  const rows = [...byCell.values()].map((c) => {
    const zs = c.trials.map((t) => t.normZ);
    const rawMeans = c.trials.map((t) => t.rawMean);
    const meanZ = zs.reduce((s, x) => s + x, 0) / zs.length;
    const varZ = zs.reduce((s, x) => s + (x - meanZ) ** 2, 0) / zs.length;
    const trialStd = Math.sqrt(varZ); // population std across this cell's trials
    const rawSpread = Math.max(...rawMeans) - Math.min(...rawMeans);
    return {
      model: c.model, effort: c.effort, nTrials: c.trials.length,
      rawMeans, meanZ, trialStd, rawSpread,
      // A single trial has no spread to judge, so it is never flagged.
      noisy: c.trials.length > 1 && trialStd >= NOISY_TRIAL_STD,
    };
  });
  rows.sort((a, b) => b.meanZ - a.meanZ);
  return rows;
}
package/core/cost.mjs ADDED
@@ -0,0 +1,58 @@
1
+ // Cost re-derivation primitives. Pure: pricing is passed IN — no file reads here.
2
+ // The sweep.mjs wrapper loads pricing.json and passes the dict to rederiveCostUsd.
3
+
4
/**
 * Price a single token bundle against one model's per-million-token rates.
 * Missing or nullish token counts are treated as zero.
 * @param {{input:number,output:number,cacheWrite5m:number,cacheRead:number}} pricingRates
 * @param {{input?:number,output?:number,cacheCreation?:number,cacheRead?:number}} tokens
 * @returns {number} USD cost for this bundle (cache writes priced at the 5-minute rate).
 */
export function priceTokens(pricingRates, tokens) {
  const millions = (count) => Number(count ?? 0) / 1e6;
  const inputUsd = millions(tokens.input) * pricingRates.input;
  const outputUsd = millions(tokens.output) * pricingRates.output;
  const cacheWriteUsd = millions(tokens.cacheCreation) * pricingRates.cacheWrite5m;
  const cacheReadUsd = millions(tokens.cacheRead) * pricingRates.cacheRead;
  return inputUsd + outputUsd + cacheWriteUsd + cacheReadUsd;
}

// Family names searched for inside a model id, in priority order.
const MODEL_FAMILIES = ["opus", "sonnet", "haiku"];

/**
 * Map a model id to its pricing-dict key by family-name substring (case-insensitive).
 * @param {string} id
 * @returns {string|null} the family key, or null when no known family appears in the id.
 */
function modelIdToKey(id) {
  const lowered = String(id).toLowerCase();
  return MODEL_FAMILIES.find((family) => lowered.includes(family)) ?? null;
}

/**
 * Re-derive API-equivalent cost as an independent check against the harness's
 * total_cost_usd. Prefers the per-model modelUsage breakdown; falls back to
 * single-rate pricing of the primary-model aggregate when modelUsage is absent or
 * has no entries. Assumes 5m cache.
 * @param {string} primaryModel - pricing-dict key for the primary model (e.g. "opus").
 * @param {object} metrics - run metrics carrying inputTokens/outputTokens/cache* fields.
 * @param {object|null} modelUsage - per-model-id usage breakdown, or null.
 * @param {object} pricingDict - { opus:{...}, sonnet:{...}, ... } rate table.
 * @returns {number|null} re-derived USD, or null if pricing is missing for a model.
 */
export function rederiveCostUsd(primaryModel, metrics, modelUsage, pricingDict) {
  if (!pricingDict) return null;

  // Own-property lookup only: an unmatched id (null key) or a key such as
  // "constructor" must read as "no pricing", not resolve to an unrelated value.
  const ratesFor = (key) =>
    key != null && Object.hasOwn(pricingDict, key) ? pricingDict[key] : undefined;

  const usageEntries =
    modelUsage && typeof modelUsage === "object" ? Object.entries(modelUsage) : [];

  // An empty breakdown carries no information; pricing it would report a cost of 0,
  // so only take this path when at least one model is listed.
  if (usageEntries.length > 0) {
    let total = 0;
    for (const [id, usage] of usageEntries) {
      const rates = ratesFor(modelIdToKey(id));
      if (!rates) return null;
      total += priceTokens(rates, {
        input: usage?.inputTokens,
        output: usage?.outputTokens,
        cacheCreation: usage?.cacheCreationInputTokens,
        cacheRead: usage?.cacheReadInputTokens,
      });
    }
    return total;
  }

  const primaryRates = ratesFor(primaryModel);
  if (!primaryRates) return null;
  return priceTokens(primaryRates, {
    input: metrics.inputTokens,
    output: metrics.outputTokens,
    cacheCreation: metrics.cacheCreationTokens,
    cacheRead: metrics.cacheReadTokens,
  });
}
@@ -0,0 +1,101 @@
1
+ // The run mechanic, decoupled from argv parsing and results/ persistence.
2
+ // executeRun spawns one `claude` invocation for one (task, model, effort, trial) cell
3
+ // and returns the parsed run metrics. Fixture copy + results write live in the run.mjs
4
+ // wrapper — this primitive only invokes the CLI and shapes the metrics.
5
+
6
+ import { spawnSync } from "node:child_process";
7
+
8
// System-prompt suffix appended to EVERY cell, identically, so it cannot bias the
// model comparison. The skills contain "check with the user" checkpoints, and in
// --print mode there is no user to answer them.
export const UNATTENDED = [
  "You are running fully unattended in a benchmark harness.",
  "There is no user available to answer questions or approve checkpoints.",
  "When a skill says to check with the user or wait for confirmation, instead make the most reasonable assumption, state it briefly, and proceed to completion.",
  "Do not ask questions.",
  "Finish the task and write all artifacts to disk before ending your turn.",
].join(" ");
16
+
17
/**
 * Default CLI runner: the real `claude` spawn. Tests inject a fake via deps.runCli.
 *
 * When the process cannot be spawned or is killed by spawnSync itself (e.g. `claude`
 * is not on PATH, or output exceeds maxBuffer), spawnSync reports that through
 * `error` and may leave stdout/stderr null. The failure is appended to `stderr` so
 * the caller's debug capture explains it, and both streams are always strings.
 *
 * @param {{args:string[], cwd:string}} invocation
 * @returns {{status:number|null, stdout:string, stderr:string}} the subset executeRun consumes.
 */
function defaultRunCli({ args, cwd }) {
  const proc = spawnSync("claude", args, {
    cwd,
    encoding: "utf8",
    maxBuffer: 64 * 1024 * 1024,
  });
  const stderrParts = [];
  if (proc.stderr) stderrParts.push(proc.stderr);
  if (proc.error) stderrParts.push(`[spawn error] ${proc.error.message}`);
  return {
    status: proc.status,
    stdout: proc.stdout ?? "",
    stderr: stderrParts.join("\n"),
  };
}
27
+
28
/**
 * Execute one benchmark cell: invoke the CLI once for a (task, model, effort, trial)
 * combination and shape its JSON output into a run record. Does no file I/O of its
 * own; fixture preparation and result persistence belong to the caller.
 *
 * @param {object} config
 * @param {{id:string,prompt:string,skill?:string,source?:string,fixture?:string}} config.task
 * @param {string} config.model
 * @param {string} config.effort
 * @param {number|string} [config.trial=1]
 * @param {string} config.workDir - cwd for the CLI (caller prepares the fixture copy here).
 * @param {string} [config.appendSystemPrompt=UNATTENDED]
 * @param {object} [deps]
 * @param {(opts:{args:string[],cwd:string})=>{status:number|null,stdout:string,stderr:string}} [deps.runCli]
 *   CLI runner; defaults to the real `claude` spawn. It may also return a promise of
 *   the same shape, which is awaited.
 * @returns {Promise<object>} run record
 *   { runId, task, config, startedAt, exitCode, metrics, modelUsage, result, raw? }
 */
export async function executeRun(config, deps = {}) {
  const runCli = deps?.runCli ?? defaultRunCli;
  const { task, model, effort, workDir } = config;
  const trial = config.trial ?? "1";
  const appendSystemPrompt = config.appendSystemPrompt ?? UNATTENDED;
  const runId = `${task.id}__${model}__${effort}__t${trial}`;

  const cliArgs = [
    "-p", task.prompt,
    "--model", model,
    "--effort", effort,
    "--output-format", "json",
    "--permission-mode", "acceptEdits",
    "--append-system-prompt", appendSystemPrompt,
  ];

  const startedAt = new Date().toISOString();
  const wall0 = Date.now();
  // await is a no-op for a synchronous runner and lets an injected async one work.
  const proc = await runCli({ args: cliArgs, cwd: workDir });
  const wallMs = Date.now() - wall0;

  // Only a JSON object counts as a CLI result. Anything else (unparseable text, or a
  // bare JSON scalar such as `42`) is a parse failure whose raw output is kept below.
  let result = null;
  try {
    const parsed = JSON.parse(proc.stdout);
    if (parsed !== null && typeof parsed === "object") result = parsed;
  } catch {
    // Deliberate best-effort: leave result null; raw stdout/stderr are recorded instead.
  }

  const u = result?.usage ?? {};
  const metrics = result
    ? {
        isError: result.is_error ?? null,
        costUsd: result.total_cost_usd ?? null,
        durationMs: result.duration_ms ?? null,
        durationApiMs: result.duration_api_ms ?? null,
        wallMs,
        numTurns: result.num_turns ?? null,
        inputTokens: u.input_tokens ?? null,
        outputTokens: u.output_tokens ?? null,
        cacheCreationTokens: u.cache_creation_input_tokens ?? null,
        cacheReadTokens: u.cache_read_input_tokens ?? null,
      }
    : { parseFailed: true, wallMs };

  return {
    runId,
    task: { id: task.id, skill: task.skill, source: task.source, fixture: task.fixture },
    config: { model, effort, trial: Number(trial) },
    startedAt,
    exitCode: proc.status,
    metrics,
    // Per-model token breakdown — total_cost_usd spans every model used in the session
    // (sub-agents, auxiliary calls), so this is needed to re-derive cost accurately.
    modelUsage: result?.modelUsage ?? null,
    // The full parsed CLI result for the wrapper to persist for introspection.
    // null when stdout did not parse as a JSON object.
    result,
    // Present only on parse failure (truncated), for the wrapper to persist for debugging.
    raw: result
      ? undefined
      : { stdout: proc.stdout?.slice(0, 4000), stderr: proc.stderr?.slice(0, 4000) },
  };
}
@@ -0,0 +1,186 @@
1
+ // Hand-written types for @dividedby/bench-core. Sources stay .mjs; these declare the
2
+ // exported primitives so a TS consumer typechecks cleanly.
3
+
4
+ // ---- execute-run ----
5
+
6
/** A benchmark task: the prompt to run plus descriptive metadata. */
export interface TaskDef {
  id: string;
  prompt: string;
  skill?: string;
  source?: string;
  fixture?: string;
}

/** One (task, model, effort, trial) cell to execute. */
export interface RunConfig {
  task: TaskDef;
  model: string;
  effort: string;
  /** Defaults to 1 when omitted. */
  trial?: number | string;
  /** cwd for the CLI; the caller prepares the fixture copy here. */
  workDir: string;
  /** Defaults to UNATTENDED when omitted. */
  appendSystemPrompt?: string;
}

/** What executeRun hands to the CLI runner. */
export interface CliInvocation {
  args: string[];
  cwd: string;
}

/** What executeRun reads back from the CLI runner. */
export interface CliResult {
  status: number | null;
  stdout: string;
  stderr: string;
}

export type RunCli = (invocation: CliInvocation) => CliResult;

export interface RunDeps {
  /** Defaults to the real `claude` spawnSync wrapper. */
  runCli?: RunCli;
}

/**
 * Metrics extracted from the CLI's JSON result. When stdout did not parse, only
 * `wallMs` and `parseFailed: true` are present.
 */
export interface RunMetrics {
  isError?: boolean | null;
  costUsd?: number | null;
  durationMs?: number | null;
  durationApiMs?: number | null;
  wallMs: number;
  numTurns?: number | null;
  inputTokens?: number | null;
  outputTokens?: number | null;
  cacheCreationTokens?: number | null;
  cacheReadTokens?: number | null;
  parseFailed?: boolean;
}

export interface RunResult {
  runId: string;
  task: { id: string; skill?: string; source?: string; fixture?: string };
  config: { model: string; effort: string; trial: number };
  startedAt: string;
  exitCode: number | null;
  metrics: RunMetrics;
  /**
   * Per-model-id usage breakdown from the CLI result, or null when absent. The token
   * fields mirror ModelUsageEntry so this value can be passed straight to
   * rederiveCostUsd; any other fields the CLI reports are left as `unknown`.
   */
  modelUsage: Record<
    string,
    {
      inputTokens?: number;
      outputTokens?: number;
      cacheCreationInputTokens?: number;
      cacheReadInputTokens?: number;
      [field: string]: unknown;
    }
  > | null;
  /** Full parsed CLI result for introspection; null when stdout didn't parse. */
  result: Record<string, unknown> | null;
  /** Truncated raw output; present only when stdout didn't parse. */
  raw?: { stdout?: string; stderr?: string };
}

export declare const UNATTENDED: string;
export declare function executeRun(config: RunConfig, deps?: RunDeps): Promise<RunResult>;
71
+
72
+ // ---- aggregate ----
73
+
74
/** One judge's rubric scores for one blind submission. */
export interface Grade {
  blindId: string;
  judge: string;
  scores: { overall: number; [criterion: string]: number | string };
}

/** Aggregate for one blind submission across the full-coverage judges. */
export interface NormalizedCell {
  blindId: string;
  /** Number of full-coverage judges that contributed. */
  judges: number;
  /** Raw `overall` scores, ordered like NormalizeResult.judges. */
  raws: Array<number>;
  rawMean: number;
  /** Mean of the per-judge z-scores. */
  normZ: number;
  /** Largest difference between the judges' within-judge ranks. */
  rankGap: number;
  disagree: boolean;
}

/** A judge excluded from normalization because it lacked full coverage. */
export interface DroppedJudge {
  judge: string;
  /** Distinct blind ids this judge graded. */
  graded: number;
  /** Total distinct blind ids in the input. */
  of: number;
}

export interface NormalizeResult {
  cells: Array<NormalizedCell>;
  judges: Array<string>;
  dropped: Array<DroppedJudge>;
}

export declare function normalize(grades: Array<Grade>): NormalizeResult;

/** The subset of a normalized cell that groupByCell reads. */
export interface BlindCell {
  blindId: string;
  normZ: number;
  rawMean: number;
}

/** Maps a blind id back to the configuration that produced it. */
export type CellResolver = (blindId: string) => { model: string; effort: string };

/** One (model, effort) cell aggregated over its trials. */
export interface GroupedCell {
  model: string;
  effort: string;
  nTrials: number;
  rawMeans: Array<number>;
  meanZ: number;
  trialStd: number;
  rawSpread: number;
  noisy: boolean;
}

export declare function groupByCell(
  blindCells: Array<BlindCell>,
  resolve: CellResolver,
): Array<GroupedCell>;
127
+
128
+ // ---- cost ----
129
+
130
/** Per-million-token USD rates for one model. */
export interface PricingRates {
  input: number;
  output: number;
  /** Rate applied to cache-creation tokens. */
  cacheWrite5m: number;
  cacheRead: number;
}

/** Rate table keyed by pricing key (e.g. "opus"). */
export type PricingDict = Record<string, PricingRates>;

/** Token counts to price; omitted counts are treated as zero. */
export interface TokenBundle {
  input?: number;
  output?: number;
  cacheCreation?: number;
  cacheRead?: number;
}

export declare function priceTokens(pricingRates: PricingRates, tokens: TokenBundle): number;

/** The token fields of run metrics that cost re-derivation reads. */
export interface CostMetrics {
  inputTokens?: number | null;
  outputTokens?: number | null;
  cacheCreationTokens?: number | null;
  cacheReadTokens?: number | null;
}

/** One model's entry in the per-model usage breakdown. */
export interface ModelUsageEntry {
  inputTokens?: number;
  outputTokens?: number;
  cacheCreationInputTokens?: number;
  cacheReadInputTokens?: number;
}

/** Returns null when pricing is missing for a model that was used. */
export declare function rederiveCostUsd(
  primaryModel: string,
  metrics: CostMetrics,
  modelUsage: Record<string, ModelUsageEntry> | null,
  pricingDict: PricingDict | null,
): number | null;
168
+
169
+ // ---- judge ----
170
+
171
/** Outcome of asking a backend to grade one blind prompt. */
export interface GradeResult {
  blindId: string | null;
  /** null when no model graded (the backend deferred). */
  scores: Record<string, number | string> | null;
  prompt: string;
  /** true if scores were produced; false if grading was deferred. */
  graded: boolean;
}

export interface JudgeBackend {
  name: string;
  grade(prompt: string, schema: object): Promise<GradeResult>;
}

/** Creates the no-model backend, which returns every prompt ungraded. */
export declare function createDefaultJudgeBackend(opts?: {
  name?: string;
  blindId?: string;
}): JudgeBackend;
package/core/index.mjs ADDED
@@ -0,0 +1,5 @@
1
+ // @dividedby/bench-core — the four benchmark primitives, decoupled from argv + file I/O.
2
+ export { executeRun, UNATTENDED } from "./execute-run.mjs";
3
+ export { normalize, groupByCell } from "./aggregate.mjs";
4
+ export { priceTokens, rederiveCostUsd } from "./cost.mjs";
5
+ export { createDefaultJudgeBackend } from "./judge.mjs";
package/core/judge.mjs ADDED
@@ -0,0 +1,41 @@
1
+ // JudgeBackend interface + a default no-model backend.
2
+ // The real Opus panel is a separate issue (#138) — this is the interface only.
3
+
4
/**
 * @typedef {object} GradeResult
 * @property {string|null} blindId - the blind submission id being graded, or null
 *   when the backend was created without one.
 * @property {object|null} scores - rubric scores ({ c1..cN, overall, note }), or null
 *   when no model graded (the default backend defers).
 * @property {string} prompt - the exact prompt that would be sent to a judge model.
 * @property {boolean} graded - true if a model produced scores; false if deferred.
 */

/**
 * @typedef {object} JudgeBackend
 * @property {string} name
 * @property {(prompt:string, schema:object)=>Promise<GradeResult>} grade
 *   Grade one blind prompt against a JSON schema. A real backend calls a model and
 *   parses its JSON reply into `scores`; the default backend defers (scores=null).
 */

/**
 * Build the default no-model backend. Its `grade` never calls a model: it hands the
 * prompt back marked ungraded so a human (or a later panel backend) can score it.
 * @param {{name?:string, blindId?:string}} [opts]
 *   `name` defaults to "manual"; `blindId` is echoed into every result (null if unset).
 * @returns {JudgeBackend}
 */
export function createDefaultJudgeBackend(opts = {}) {
  // The schema argument is accepted for interface parity but unused here.
  const grade = async (prompt, _schema) => ({
    blindId: opts.blindId ?? null,
    scores: null,
    prompt,
    graded: false,
  });
  return { name: opts.name ?? "manual", grade };
}
package/package.json ADDED
@@ -0,0 +1,30 @@
1
+ {
2
+ "name": "@dividedby/bench-core",
3
+ "version": "0.1.0",
4
+ "description": "Decoupled core primitives for the model x effort benchmark harness: run execution, grade aggregation, cost re-derivation, and a judge backend interface.",
5
+ "type": "module",
6
+ "license": "MIT",
7
+ "engines": {
8
+ "node": ">=20"
9
+ },
10
+ "types": "./core/index.d.ts",
11
+ "exports": {
12
+ ".": {
13
+ "types": "./core/index.d.ts",
14
+ "import": "./core/index.mjs"
15
+ }
16
+ },
17
+ "files": [
18
+ "core"
19
+ ],
20
+ "publishConfig": {
21
+ "access": "public"
22
+ },
23
+ "repository": {
24
+ "type": "git",
25
+ "url": "git+https://github.com/dividedby/bench.git"
26
+ },
27
+ "scripts": {
28
+ "test": "node --test"
29
+ }
30
+ }