npm - @dividedby/bench-core - Versions diffs - 0.1.0 - Mend

@dividedby/bench-core 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/README.md ADDED Viewed

@@ -0,0 +1,62 @@
+# bench — model × effort benchmark for Claude Code skills
+Goal: build a cost/quality matrix across `model × effort` for the skills used daily,
+so the cheapest config above a chosen quality floor can be picked as the daily driver.
+## `@dividedby/bench-core`
+The reusable core primitives are published as `@dividedby/bench-core` (ESM, zero
+runtime deps, hand-written types in `core/index.d.ts`). The CLIs in this repo
+(`run.mjs`, `sweep.mjs`, `aggregate-grades.mjs`) are thin wrappers over them.
+```js
+import {
+  executeRun,            // run one (task, model, effort) cell; injectable runCli
+  normalize, groupByCell, // multi-judge grade aggregation
+  priceTokens, rederiveCostUsd, // pure cost re-derivation (pricing passed in)
+  createDefaultJudgeBackend,    // JudgeBackend interface + no-model default
+} from "@dividedby/bench-core";
+```
+Only `core/**` ships in the package; fixtures, tasks, results, and pricing are
+local-harness assets. Test: `npm test` (node's built-in runner, `node --test`).
+## Status
+- **Stage 1 (current):** scaffold + one synthetic task + single-run runner with metrics extraction.
+- Stage 2: full sweep runner (model × effort × task, 1 trial) → `aggregated.csv`.
+- Stage 3: grader (objective checks + LLM judge) → `report.md`.
+- Stage 4: 3 trials, replayed-real tasks, variance flags.
+## Axes
+- Models: `opus`, `sonnet`, `haiku`
+- Effort: `low`, `medium`, `high` (CLI also supports `xhigh`, `max`)
+## How it works
+Each run executes a skill headlessly in an isolated copy of a fixture repo:
+```
+claude -p "<prompt>" --model M --effort E \
+  --output-format json --permission-mode acceptEdits \
+  --append-system-prompt "<unattended constant>"   (cwd = working copy)
+```
+`--output-format json` returns `total_cost_usd`, `duration_ms`, `num_turns`, and token
+usage directly — no transcript parsing needed. Artifacts left in the working copy are
+graded later.
+### Unattended constant
+Every cell gets the same appended system prompt telling the agent it is running
+unattended (no user to answer "check with the user" checkpoints). It is applied identically
+everywhere so it does not bias the model comparison. See `UNATTENDED`, defined in
+`core/execute-run.mjs` and exported from `@dividedby/bench-core`.
+### Fixtures
+Synthetic fixtures live in `fixtures/`. Each declares a **local-markdown** issue tracker
+(`docs/agents/issue-tracker.md`) so skills like `/to-prd` write files instead of calling
+`gh` — gradeable, resettable, zero external side effects.
+## Run one
+```
+node run.mjs --task tasks/software-design-synthetic.json --model sonnet --effort medium
+```
+Outputs `results/runs/<runId>.json` (config + metrics) and leaves the working copy under
+`results/work/<runId>/`.

package/core/aggregate.mjs ADDED Viewed

@@ -0,0 +1,94 @@
+// Grade aggregation primitives — the canonical home (moved from aggregate-grades.mjs).
+// Multi-judge aggregation: judges run on different scales, so a raw mean would let the
+// wider-spread judge dominate. We per-judge z-score `overall` across that judge's cells,
+// then average the z-scores. Only judges with FULL coverage (graded every cell) enter the
+// normalized aggregate — you can't normalize a partial judge, and this auto-excludes
+// leftover/rejected partial passes without deleting data.
const DISAGREE_RANK_GAP = 4; // judges placing a cell >= this many rank positions
                             // apart (out of N cells) get flagged for review.

/**
 * Rank one judge's rows from best (1) to worst, giving tied scores the same rank.
 *
 * Ties receive the mean of the positions they occupy (midrank), so the result does
 * not depend on the order the grades were supplied in. Without ties this is the
 * plain 1..N ranking.
 * @param {{blindId:string, raw:number}[]} rows - one row per blindId for a single judge.
 * @returns {Map<string, number>} blindId -> rank (may be fractional for ties).
 */
function rankWithTies(rows) {
  const sorted = [...rows].sort((a, b) => b.raw - a.raw);
  const ranks = new Map();
  let start = 0;
  while (start < sorted.length) {
    let end = start;
    while (end + 1 < sorted.length && sorted[end + 1].raw === sorted[start].raw) end += 1;
    // Mean of the 1-based positions start+1 .. end+1.
    const shared = (start + end) / 2 + 1;
    for (let i = start; i <= end; i += 1) ranks.set(sorted[i].blindId, shared);
    start = end + 1;
  }
  return ranks;
}

/**
 * Multi-judge grade normalization (pure).
 *
 * Each judge's `overall` scores are z-scored across the cells that judge graded, and
 * a cell's `normZ` is the mean of those z-scores. Only judges that graded every
 * blindId exactly once with a finite numeric `overall` take part; the rest are
 * reported in `dropped` and otherwise ignored.
 *
 * @param {{blindId:string, judge:string, scores:{overall:number}}[]} grades
 * @returns {{cells:object[], judges:string[], dropped:{judge:string,graded:number,of:number}[]}}
 *   `cells` is sorted by `normZ` descending; `judges` lists the full-coverage judges in
 *   first-appearance order (the order used for each cell's `raws`).
 */
export function normalize(grades) {
  const blindIds = [...new Set(grades.map((g) => g.blindId))];

  // Bucket usable grades per judge in one pass. A grade whose `overall` is not a
  // finite number cannot be z-scored, so it does not count toward coverage.
  const byJudge = new Map();
  for (const g of grades) {
    let rows = byJudge.get(g.judge);
    if (!rows) {
      rows = [];
      byJudge.set(g.judge, rows);
    }
    const raw = g.scores?.overall;
    if (typeof raw === "number" && Number.isFinite(raw)) {
      rows.push({ blindId: g.blindId, raw });
    }
  }

  // A judge counts only if it graded every blindId exactly once.
  const fullJudges = [];
  const dropped = [];
  for (const [judge, rows] of byJudge) {
    const graded = new Set(rows.map((r) => r.blindId)).size;
    if (graded === blindIds.length && rows.length === blindIds.length) fullJudges.push(judge);
    else dropped.push({ judge, graded, of: blindIds.length });
  }

  // Per full judge: z-score (population std) and within-judge rank over its overalls.
  const perJudge = new Map(); // judge -> Map(blindId -> { z, rank, raw })
  for (const judge of fullJudges) {
    const rows = byJudge.get(judge);
    const mean = rows.reduce((s, r) => s + r.raw, 0) / rows.length;
    const variance = rows.reduce((s, r) => s + (r.raw - mean) ** 2, 0) / rows.length;
    const std = Math.sqrt(variance);
    const ranks = rankWithTies(rows);
    const stats = new Map();
    for (const r of rows) {
      stats.set(r.blindId, {
        // A judge with zero spread expressed no preference: every z is 0.
        z: std === 0 ? 0 : (r.raw - mean) / std,
        rank: ranks.get(r.blindId),
        raw: r.raw,
      });
    }
    perJudge.set(judge, stats);
  }

  const cells = blindIds.map((blindId) => {
    const stats = fullJudges.map((judge) => perJudge.get(judge).get(blindId));
    const zs = stats.map((s) => s.z);
    const ranks = stats.map((s) => s.rank);
    const raws = stats.map((s) => s.raw);
    // `|| 1` keeps the no-full-judge case at 0 rather than NaN.
    const normZ = zs.reduce((s, x) => s + x, 0) / (zs.length || 1);
    const rawMean = raws.reduce((s, x) => s + x, 0) / (raws.length || 1);
    const rankGap = ranks.length > 1 ? Math.max(...ranks) - Math.min(...ranks) : 0;
    return {
      blindId,
      judges: fullJudges.length,
      raws, // raw overalls, ordered by fullJudges
      rawMean,
      normZ,
      rankGap,
      disagree: fullJudges.length > 1 && rankGap >= DISAGREE_RANK_GAP,
    };
  });
  cells.sort((a, b) => b.normZ - a.normZ);
  return { cells, judges: fullJudges, dropped };
}
const NOISY_TRIAL_STD = 0.5; // trial-to-trial std of normZ (judge-std units) above
                            // which a cell's score is wobbly enough to flag.

/**
 * Group normalized per-blind cells into (model, effort) cells across trials and
 * report the trial-to-trial spread, which is the run-variance signal.
 *
 * @param {{blindId:string, normZ:number, rawMean:number}[]} blindCells
 * @param {(blindId:string)=>{model:string, effort:string}} resolve
 *   Maps a blind id back to the configuration that produced it.
 * @returns {object[]} one row per (model, effort), sorted by `meanZ` descending, each
 *   carrying `nTrials`, `rawMeans`, `meanZ`, `trialStd` (population std of normZ),
 *   `rawSpread` (max - min of rawMeans) and `noisy`.
 */
export function groupByCell(blindCells, resolve) {
  // Keyed by the JSON of the [model, effort] pair so names containing the old "__"
  // separator (or keys such as "__proto__") can neither collide nor hit the
  // Object prototype.
  const byCell = new Map();
  for (const bc of blindCells) {
    const { model, effort } = resolve(bc.blindId);
    const key = JSON.stringify([model, effort]);
    let cell = byCell.get(key);
    if (!cell) {
      cell = { model, effort, trials: [] };
      byCell.set(key, cell);
    }
    cell.trials.push(bc);
  }

  const rows = [...byCell.values()].map((c) => {
    const zs = c.trials.map((t) => t.normZ);
    const rawMeans = c.trials.map((t) => t.rawMean);
    const meanZ = zs.reduce((s, x) => s + x, 0) / zs.length;
    const varZ = zs.reduce((s, x) => s + (x - meanZ) ** 2, 0) / zs.length;
    const trialStd = Math.sqrt(varZ);
    const rawSpread = Math.max(...rawMeans) - Math.min(...rawMeans);
    return {
      model: c.model, effort: c.effort, nTrials: c.trials.length,
      rawMeans, meanZ, trialStd, rawSpread,
      // A single trial has no spread to judge, so it is never flagged.
      noisy: c.trials.length > 1 && trialStd >= NOISY_TRIAL_STD,
    };
  });
  rows.sort((a, b) => b.meanZ - a.meanZ);
  return rows;
}

package/core/cost.mjs ADDED Viewed

@@ -0,0 +1,58 @@
+// Cost re-derivation primitives. Pure: pricing is passed IN — no file reads here.
+// The sweep.mjs wrapper loads pricing.json and passes the dict to rederiveCostUsd.
/**
 * Price a single token bundle against one model's per-million rates.
 * Missing or null token counts are treated as 0.
 * @param {{input:number,output:number,cacheWrite5m:number,cacheRead:number}} pricingRates
 * @param {{input?:number,output?:number,cacheCreation?:number,cacheRead?:number}} tokens
 * @returns {number} USD cost for this bundle (assumes 5-minute cache).
 */
export function priceTokens(pricingRates, tokens) {
  const millions = (n) => Number(n ?? 0) / 1e6;
  return (
    millions(tokens.input) * pricingRates.input +
    millions(tokens.output) * pricingRates.output +
    millions(tokens.cacheCreation) * pricingRates.cacheWrite5m +
    millions(tokens.cacheRead) * pricingRates.cacheRead
  );
}

/**
 * Map a full model id onto a pricing-table key.
 *
 * With a pricing table: an exact key match wins, otherwise the longest key that
 * occurs inside the id. Without one, falls back to the three built-in family names.
 * @param {string} id - model id as it appears in the modelUsage breakdown.
 * @param {object} [pricingDict] - rate table whose keys are candidate matches.
 * @returns {string|null} the matching key, or null when nothing matches.
 */
function modelIdToKey(id, pricingDict) {
  const text = String(id);
  if (pricingDict) {
    if (Object.hasOwn(pricingDict, text)) return text;
    let best = null;
    for (const key of Object.keys(pricingDict)) {
      if (key !== "" && text.includes(key) && (best === null || key.length > best.length)) {
        best = key;
      }
    }
    return best;
  }
  if (text.includes("opus")) return "opus";
  if (text.includes("sonnet")) return "sonnet";
  if (text.includes("haiku")) return "haiku";
  return null;
}

/**
 * Re-derive API-equivalent cost as an independent check against the harness's
 * total_cost_usd. Prefers the per-model modelUsage breakdown; falls back to
 * single-rate pricing of the primary-model aggregate when modelUsage is absent or
 * has no entries. Assumes 5m cache.
 * @param {string} primaryModel - pricing-dict key for the primary model (e.g. "opus").
 * @param {object} metrics - run metrics carrying inputTokens/outputTokens/cache* fields.
 * @param {object|null} modelUsage - per-model-id usage breakdown, or null.
 * @param {object} pricingDict - { opus:{...}, sonnet:{...}, ... } rate table.
 * @returns {number|null} re-derived USD, or null if pricing is missing for a model or
 *   the rates do not yield a finite number.
 */
export function rederiveCostUsd(primaryModel, metrics, modelUsage, pricingDict) {
  if (!pricingDict) return null;
  const finiteOrNull = (usd) => (Number.isFinite(usd) ? usd : null);

  const entries = modelUsage && typeof modelUsage === "object" ? Object.entries(modelUsage) : [];
  // An empty breakdown carries no information; pricing it would report $0.
  if (entries.length > 0) {
    let total = 0;
    for (const [id, usage] of entries) {
      const key = modelIdToKey(id, pricingDict);
      const rates = key === null ? null : pricingDict[key];
      // One unpriced model makes the whole total meaningless.
      if (!rates) return null;
      total += priceTokens(rates, {
        input: usage?.inputTokens, output: usage?.outputTokens,
        cacheCreation: usage?.cacheCreationInputTokens, cacheRead: usage?.cacheReadInputTokens,
      });
    }
    return finiteOrNull(total);
  }

  const rates = Object.hasOwn(pricingDict, String(primaryModel)) ? pricingDict[primaryModel] : null;
  if (!rates) return null;
  return finiteOrNull(priceTokens(rates, {
    input: metrics?.inputTokens, output: metrics?.outputTokens,
    cacheCreation: metrics?.cacheCreationTokens, cacheRead: metrics?.cacheReadTokens,
  }));
}

package/core/execute-run.mjs ADDED Viewed

@@ -0,0 +1,101 @@
+// The run mechanic, decoupled from argv parsing and results/ persistence.
+// executeRun spawns one `claude` invocation for one (task, model, effort, trial) cell
+// and returns the parsed run metrics. Fixture copy + results write live in the run.mjs
+// wrapper — this primitive only invokes the CLI and shapes the metrics.
+import { spawnSync } from "node:child_process";
// System-prompt suffix appended to every cell. Because each (task, model, effort)
// combination receives exactly the same text, it cannot favour one model over another.
// It exists because the skills contain "check with the user" checkpoints and a
// --print run has nobody to answer them.
export const UNATTENDED = [
  "You are running fully unattended in a benchmark harness.",
  "There is no user available to answer questions or approve checkpoints.",
  "When a skill says to check with the user or wait for confirmation, instead make the most reasonable assumption, state it briefly, and proceed to completion.",
  "Do not ask questions.",
  "Finish the task and write all artifacts to disk before ending your turn.",
].join(" ");
// Default CLI runner: the real `claude` spawn. Tests inject a fake via deps.runCli.
// Returns { status, stdout, stderr } — the subset executeRun consumes.
//
// spawnSync reports a failure to launch or collect the child (binary not found,
// maxBuffer exceeded, ...) through `error` and leaves status/stdout/stderr null in the
// launch-failure case. That error is folded into `stderr` so the caller's debugging
// record says why there was no output, and stdout/stderr are always strings.
function defaultRunCli({ args, cwd }) {
  const proc = spawnSync("claude", args, {
    cwd,
    encoding: "utf8",
    maxBuffer: 64 * 1024 * 1024,
  });
  const spawnFailure = proc.error ? `spawn claude failed: ${proc.error.message}` : "";
  const stderr = [proc.stderr ?? "", spawnFailure].filter((part) => part !== "").join("\n");
  return { status: proc.status, stdout: proc.stdout ?? "", stderr };
}
/**
 * Parse the CLI's stdout as the JSON result object.
 * Returns null for anything that is not a JSON object (unparseable text, a bare
 * number/string/null, or an array), so the caller records it as a parse failure
 * instead of reading fields off a non-object.
 * @param {unknown} stdout
 * @returns {object|null}
 */
function parseCliResult(stdout) {
  try {
    const parsed = JSON.parse(stdout);
    const isPlainObject = parsed !== null && typeof parsed === "object" && !Array.isArray(parsed);
    return isPlainObject ? parsed : null;
  } catch {
    // Deliberate: the failure is surfaced through metrics.parseFailed and `raw`.
    return null;
  }
}

/**
 * Execute one benchmark cell.
 * @param {object} config
 * @param {{id:string,prompt:string,skill?:string,source?:string,fixture?:string}} config.task
 * @param {string} config.model
 * @param {string} config.effort
 * @param {number|string} [config.trial=1]
 * @param {string} config.workDir - cwd for the CLI (caller prepares the fixture copy here).
 * @param {string} [config.appendSystemPrompt=UNATTENDED]
 * @param {object} [deps]
 * @param {(opts:{args:string[],cwd:string})=>{status:number|null,stdout:string,stderr:string}} [deps.runCli]
 *   CLI runner; may return its result directly or as a promise.
 * @returns {Promise<object>} run record { runId, task, config, startedAt, exitCode, metrics, modelUsage, result, raw? }
 */
export async function executeRun(config, deps = {}) {
  const runCli = deps?.runCli ?? defaultRunCli;
  const { task, model, effort, workDir } = config;
  const trial = config.trial ?? "1";
  const appendSystemPrompt = config.appendSystemPrompt ?? UNATTENDED;
  const runId = `${task.id}__${model}__${effort}__t${trial}`;
  const cliArgs = [
    "-p", task.prompt,
    "--model", model,
    "--effort", effort,
    "--output-format", "json",
    "--permission-mode", "acceptEdits",
    "--append-system-prompt", appendSystemPrompt,
  ];

  const startedAt = new Date().toISOString();
  const wall0 = Date.now();
  // `await` is a no-op for the synchronous default runner and lets an injected
  // runner be async; wall time covers the whole call either way.
  const proc = await runCli({ args: cliArgs, cwd: workDir });
  const wallMs = Date.now() - wall0;

  const result = parseCliResult(proc.stdout);
  const usage = result?.usage ?? {};
  const metrics = result
    ? {
        isError: result.is_error ?? null,
        costUsd: result.total_cost_usd ?? null,
        durationMs: result.duration_ms ?? null,
        durationApiMs: result.duration_api_ms ?? null,
        wallMs,
        numTurns: result.num_turns ?? null,
        inputTokens: usage.input_tokens ?? null,
        outputTokens: usage.output_tokens ?? null,
        cacheCreationTokens: usage.cache_creation_input_tokens ?? null,
        cacheReadTokens: usage.cache_read_input_tokens ?? null,
      }
    : { parseFailed: true, wallMs };

  return {
    runId,
    task: { id: task.id, skill: task.skill, source: task.source, fixture: task.fixture },
    config: { model, effort, trial: Number(trial) },
    startedAt,
    exitCode: proc.status,
    metrics,
    // Per-model token breakdown — total_cost_usd spans every model used in the session
    // (sub-agents, auxiliary calls), so this is needed to re-derive cost accurately.
    modelUsage: result?.modelUsage ?? null,
    // The full parsed CLI result (modelUsage, server tool use, etc.) for the wrapper to
    // persist for introspection. null when stdout was not a JSON object.
    result,
    // Present only on parse failure, for the wrapper to persist for debugging.
    raw: result ? undefined : { stdout: proc.stdout?.slice(0, 4000), stderr: proc.stderr?.slice(0, 4000) },
  };
}

package/core/index.d.ts ADDED Viewed

@@ -0,0 +1,186 @@
+// Hand-written types for @dividedby/bench-core. Sources stay .mjs; these declare the
+// exported primitives so a TS consumer typechecks cleanly.
// ---- execute-run ----
/** A benchmark task: the prompt to run plus descriptive metadata copied into the run record. */
export interface TaskDef {
  id: string;
  prompt: string;
  skill?: string;
  source?: string;
  fixture?: string;
}
/** Everything needed to run one (task, model, effort, trial) cell. */
export interface RunConfig {
  task: TaskDef;
  model: string;
  effort: string;
  /** Trial number; defaults to 1. Recorded as a number in the run record. */
  trial?: number | string;
  /** cwd for the CLI; the caller prepares the fixture copy here. */
  workDir: string;
  /** Defaults to the exported `UNATTENDED` constant. */
  appendSystemPrompt?: string;
}
/** Arguments handed to a `RunCli` implementation. */
export interface CliInvocation {
  args: string[];
  cwd: string;
}
/** The subset of a finished process that `executeRun` consumes. */
export interface CliResult {
  status: number | null;
  stdout: string;
  stderr: string;
}
export type RunCli = (invocation: CliInvocation) => CliResult;
export interface RunDeps {
  /** Defaults to the real `claude` spawnSync wrapper. */
  runCli?: RunCli;
}
/**
 * Metrics extracted from the CLI's JSON result. When stdout could not be parsed only
 * `wallMs` and `parseFailed` are present.
 */
export interface RunMetrics {
  isError?: boolean | null;
  costUsd?: number | null;
  durationMs?: number | null;
  durationApiMs?: number | null;
  wallMs: number;
  numTurns?: number | null;
  inputTokens?: number | null;
  outputTokens?: number | null;
  cacheCreationTokens?: number | null;
  cacheReadTokens?: number | null;
  parseFailed?: boolean;
}
export interface RunResult {
  runId: string;
  task: { id: string; skill?: string; source?: string; fixture?: string };
  config: { model: string; effort: string; trial: number };
  startedAt: string;
  exitCode: number | null;
  metrics: RunMetrics;
  /**
   * Per-model usage breakdown from the CLI result, or null when absent. Typed as
   * `ModelUsageEntry` records so it can be passed straight to `rederiveCostUsd`.
   */
  modelUsage: Record<string, ModelUsageEntry> | null;
  /** Full parsed CLI result for introspection; null when stdout didn't parse. */
  result: Record<string, unknown> | null;
  /** Truncated stdout/stderr, present only when stdout didn't parse. */
  raw?: { stdout?: string; stderr?: string };
}
export declare const UNATTENDED: string;
export declare function executeRun(config: RunConfig, deps?: RunDeps): Promise<RunResult>;
// ---- aggregate ----
/** One judge's scores for one blind submission. */
export interface Grade {
  blindId: string;
  judge: string;
  scores: { overall: number; [criterion: string]: number | string };
}
/** Aggregate for one blind submission across the judges that graded every cell. */
export interface NormalizedCell {
  blindId: string;
  /** Number of full-coverage judges contributing to this cell. */
  judges: number;
  /** Raw `overall` scores, in the order of `NormalizeResult.judges`. */
  raws: Array<number>;
  rawMean: number;
  /** Mean of the per-judge z-scores. */
  normZ: number;
  /** Largest minus smallest within-judge rank for this cell. */
  rankGap: number;
  disagree: boolean;
}
/** A judge left out of the aggregate because it did not grade every cell exactly once. */
export interface DroppedJudge {
  judge: string;
  graded: number;
  of: number;
}
export interface NormalizeResult {
  cells: Array<NormalizedCell>;
  judges: Array<string>;
  dropped: Array<DroppedJudge>;
}
export declare function normalize(grades: Array<Grade>): NormalizeResult;
/** The fields of a normalized cell that `groupByCell` reads. */
export interface BlindCell {
  blindId: string;
  normZ: number;
  rawMean: number;
}
/** Maps a blind id back to the configuration that produced it. */
export type CellResolver = (blindId: string) => { model: string; effort: string };
/** Per-(model, effort) summary across trials. */
export interface GroupedCell {
  model: string;
  effort: string;
  nTrials: number;
  rawMeans: Array<number>;
  meanZ: number;
  trialStd: number;
  rawSpread: number;
  noisy: boolean;
}
export declare function groupByCell(
  blindCells: Array<BlindCell>,
  resolve: CellResolver,
): Array<GroupedCell>;
// ---- cost ----
/** Per-million-token USD rates for one model. */
export interface PricingRates {
  input: number;
  output: number;
  cacheWrite5m: number;
  cacheRead: number;
}
/** Rate table keyed by pricing key (e.g. "opus"). */
export type PricingDict = Record<string, PricingRates>;
/**
 * Token counts to price. `priceTokens` treats a missing or null count as 0, so null
 * is accepted here; this lets nullable run metrics be passed without narrowing.
 */
export interface TokenBundle {
  input?: number | null;
  output?: number | null;
  cacheCreation?: number | null;
  cacheRead?: number | null;
}
export declare function priceTokens(pricingRates: PricingRates, tokens: TokenBundle): number;
/** The token fields of the run metrics that the single-rate fallback reads. */
export interface CostMetrics {
  inputTokens?: number | null;
  outputTokens?: number | null;
  cacheCreationTokens?: number | null;
  cacheReadTokens?: number | null;
}
/** One model's entry in the CLI's per-model usage breakdown. */
export interface ModelUsageEntry {
  inputTokens?: number;
  outputTokens?: number;
  cacheCreationInputTokens?: number;
  cacheReadInputTokens?: number;
}
/**
 * Re-derive cost from token counts. Returns null when the pricing table is missing
 * or has no rates for a model involved.
 */
export declare function rederiveCostUsd(
  primaryModel: string,
  metrics: CostMetrics,
  modelUsage: Record<string, ModelUsageEntry> | null,
  pricingDict: PricingDict | null,
): number | null;
// ---- judge ----
/** Outcome of one `JudgeBackend.grade` call. */
export interface GradeResult {
  /** Blind submission id, or null when the backend was created without one. */
  blindId: string | null;
  /** Rubric scores, or null when no model graded. */
  scores: Record<string, number | string> | null;
  /** The prompt that was passed to `grade`. */
  prompt: string;
  /** False when grading was deferred rather than performed. */
  graded: boolean;
}
/** Pluggable grader: takes a prompt and a JSON schema, resolves to a `GradeResult`. */
export interface JudgeBackend {
  name: string;
  grade(prompt: string, schema: object): Promise<GradeResult>;
}
/** Options accepted by `createDefaultJudgeBackend`. */
type DefaultJudgeBackendOptions = {
  name?: string;
  blindId?: string;
};
export declare function createDefaultJudgeBackend(opts?: DefaultJudgeBackendOptions): JudgeBackend;

package/core/index.mjs ADDED Viewed

@@ -0,0 +1,5 @@
+// @dividedby/bench-core — the four benchmark primitives, decoupled from argv + file I/O.
// Run execution.
export {
  UNATTENDED,
  executeRun,
} from "./execute-run.mjs";
// Grade aggregation.
export {
  groupByCell,
  normalize,
} from "./aggregate.mjs";
// Cost re-derivation.
export {
  priceTokens,
  rederiveCostUsd,
} from "./cost.mjs";
// Judge backend interface.
export { createDefaultJudgeBackend } from "./judge.mjs";

package/core/judge.mjs ADDED Viewed

@@ -0,0 +1,41 @@
+// JudgeBackend interface + a default no-model backend.
+// The real Opus panel is a separate issue (#138) — this is the interface only.
/**
 * @typedef {object} GradeResult
 * @property {string|null} blindId - the blind submission id being graded, or null when
 *                                   the backend was created without one.
 * @property {object|null} scores - rubric scores ({ c1..cN, overall, note }), or null
 *                                  when no model graded (the default backend defers).
 * @property {string} prompt    - the exact prompt that would be sent to a judge model.
 * @property {boolean} graded   - true if a model produced scores; false if deferred.
 */
/**
 * @typedef {object} JudgeBackend
 * @property {string} name
 * @property {(prompt:string, schema:object)=>Promise<GradeResult>} grade
 *   Grade one blind prompt against a JSON schema. A real backend calls a model and
 *   parses its JSON reply into `scores`; the default backend defers (scores=null).
 */
/**
 * Default no-model backend: `grade` returns the prompt unchanged with `scores: null`
 * and `graded: false`, so a human (or a later panel backend) can score it. No model
 * is called and nothing is stored.
 * @param {{name?:string, blindId?:string}|null} [opts]
 *   `name` defaults to "manual"; `blindId` is echoed on every result (null if omitted).
 * @returns {JudgeBackend}
 */
export function createDefaultJudgeBackend(opts = {}) {
  // Tolerate an explicit null as well as an omitted argument.
  const options = opts ?? {};
  const name = options.name ?? "manual";
  const blindId = options.blindId ?? null;
  return {
    name,
    // The schema is part of the JudgeBackend contract but unused when deferring.
    async grade(prompt, _schema) {
      return {
        blindId,
        scores: null,
        prompt,
        graded: false,
      };
    },
  };
}

package/package.json ADDED Viewed

@@ -0,0 +1,30 @@
+{
+  "name": "@dividedby/bench-core",
+  "version": "0.1.0",
+  "description": "Decoupled core primitives for the model x effort benchmark harness: run execution, grade aggregation, cost re-derivation, and a judge backend interface.",
+  "type": "module",
+  "license": "MIT",
+  "engines": {
+    "node": ">=20"
+  },
+  "types": "./core/index.d.ts",
+  "exports": {
+    ".": {
+      "types": "./core/index.d.ts",
+      "import": "./core/index.mjs"
+    }
+  },
+  "files": [
+    "core"
+  ],
+  "publishConfig": {
+    "access": "public"
+  },
+  "repository": {
+    "type": "git",
+    "url": "git+https://github.com/dividedby/bench.git"
+  },
+  "scripts": {
+    "test": "node --test"
+  }
+}