npm - @tangle-network/agent-eval - Versions diffs - 0.16.2 → 0.17.0 - Mend

@tangle-network/agent-eval 0.16.2 → 0.17.0

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (5)

package/README.md CHANGED Viewed

@@ -196,10 +196,11 @@ These are the primitives any team running prompt-optimization in production need
   meta-loop (`inspectFailures` → `proposeChange` → `applyChange` →
   `evaluateChange`). Ship a `NoopResearcher` as a placeholder; real
   implementations live downstream.
-- `benchmarks/{gsm8k,swebench-lite,routing}` — reference benchmark
-  wrappers behind one `BenchmarkAdapter` shape, with deterministic
-  splits and fail-loud env-var configuration. Mostly for reproducible
-  comparisons; not core surface.
+- `benchmarks/routing` — synthetic 16-task router benchmark we own.
+  Ships in the package. Reference wrappers for GSM8K and SWE-Bench
+  Lite live under `examples/benchmarks/` — read, copy, adapt. All
+  three implement one `BenchmarkAdapter` shape with deterministic
+  splits and fail-loud env-var configuration.
 ### v0.16 changes from v0.15

package/dist/index.d.ts CHANGED Viewed

@@ -6975,103 +6975,6 @@ declare const BENCHMARK_SPLIT_SEED = "agent-eval-v1";
  */
 declare function deterministicSplit(itemId: string, seed?: string): RunSplitTag;
-/**
- * GSM8K wrapper — exact-match grading on the final numeric answer.
- *
- * The dataset itself is NOT bundled. `loadDataset` will:
- *   1. read from `process.env.AGENT_EVAL_GSM8K_PATH` if set (a JSONL
- *      file with `{ id, question, answer }` records — the standard
- *      HF mirror layout converted to JSONL);
- *   2. otherwise throw a clearly-marked error pointing to the loader.
- *
- * `evaluate` parses the final number out of the response (last
- * occurrence of a signed-decimal-or-integer literal, optionally after
- * `####`, the GSM8K answer convention) and compares to the ground-
- * truth integer. Floating-point comparisons use a 1e-6 tolerance.
- */
-interface Gsm8kPayload {
-    question: string;
-    /** Reference answer, post-#### normalization. May be a number or
-     *  a numeric string ("72", "1.5"). */
-    answer: string;
-}
-type Gsm8kItem = BenchmarkDatasetItem<Gsm8kPayload>;
-declare class Gsm8kAdapter implements BenchmarkAdapter<Gsm8kItem, Gsm8kPayload> {
-    loadDataset(split: RunSplitTag): Promise<Gsm8kItem[]>;
-    evaluate(item: Gsm8kItem, response: string): Promise<BenchmarkEvaluation>;
-    assignSplit(itemId: string): RunSplitTag;
-}
-/**
- * Parse a GSM8K-style answer. Honors the dataset's `#### N`
- * convention (the canonical answer comes after `####`); otherwise
- * returns the LAST signed numeric literal in the string.
- */
-declare function parseGsm8kAnswer(text: string): number | null;
-declare const loadDataset$2: (split: RunSplitTag) => Promise<Gsm8kItem[]>;
-declare const evaluate$2: (item: Gsm8kItem, response: string) => Promise<BenchmarkEvaluation>;
-declare const assignSplit$2: (itemId: string) => RunSplitTag;
-type index$3_Gsm8kAdapter = Gsm8kAdapter;
-declare const index$3_Gsm8kAdapter: typeof Gsm8kAdapter;
-type index$3_Gsm8kItem = Gsm8kItem;
-type index$3_Gsm8kPayload = Gsm8kPayload;
-declare const index$3_parseGsm8kAnswer: typeof parseGsm8kAnswer;
-declare namespace index$3 {
-  export { index$3_Gsm8kAdapter as Gsm8kAdapter, type index$3_Gsm8kItem as Gsm8kItem, type index$3_Gsm8kPayload as Gsm8kPayload, assignSplit$2 as assignSplit, evaluate$2 as evaluate, loadDataset$2 as loadDataset, index$3_parseGsm8kAnswer as parseGsm8kAnswer };
-}
-/**
- * SWE-Bench Lite wrapper — 30-instance subset.
- *
- * Status: STUB. The actual SWE-Bench harness needs a Docker host and
- * is too heavy to ship inside this package. We expose the contract
- * (loadDataset, evaluate, assignSplit) so consumers can plug in their
- * own grader without touching call sites.
- *
- * Wire-up paths in priority order:
- *
- *   1. `process.env.AGENT_EVAL_SWEBENCH_PATH` → JSONL with the 30
- *      lite instances + per-instance metadata (instance_id,
- *      problem_statement, base_commit, repo, FAIL_TO_PASS,
- *      PASS_TO_PASS).
- *   2. `process.env.AGENT_EVAL_SWEBENCH_GRADER_CMD` → executable
- *      that reads `{instance_id, patch}` JSON on stdin and writes
- *      `{passed, fail_to_pass_passed, pass_to_pass_passed, log}`
- *      JSON on stdout. Implementations can shell out to the
- *      official `swebench` runner here.
- *
- * If neither is set, every public method throws a clearly-marked
- * "not implemented" error. The stub fails LOUD; it never silently
- * scores zero.
- */
-interface SweBenchLitePayload {
-    instanceId: string;
-    problemStatement: string;
-    baseCommit: string;
-    repo: string;
-    failToPass: string[];
-    passToPass: string[];
-}
-type SweBenchLiteItem = BenchmarkDatasetItem<SweBenchLitePayload>;
-declare class SweBenchLiteAdapter implements BenchmarkAdapter<SweBenchLiteItem, SweBenchLitePayload> {
-    loadDataset(split: RunSplitTag): Promise<SweBenchLiteItem[]>;
-    evaluate(item: SweBenchLiteItem, response: string): Promise<BenchmarkEvaluation>;
-    assignSplit(itemId: string): RunSplitTag;
-}
-declare const loadDataset$1: (split: RunSplitTag) => Promise<SweBenchLiteItem[]>;
-declare const evaluate$1: (item: SweBenchLiteItem, response: string) => Promise<BenchmarkEvaluation>;
-declare const assignSplit$1: (itemId: string) => RunSplitTag;
-type index$2_SweBenchLiteAdapter = SweBenchLiteAdapter;
-declare const index$2_SweBenchLiteAdapter: typeof SweBenchLiteAdapter;
-type index$2_SweBenchLiteItem = SweBenchLiteItem;
-type index$2_SweBenchLitePayload = SweBenchLitePayload;
-declare namespace index$2 {
-  export { index$2_SweBenchLiteAdapter as SweBenchLiteAdapter, type index$2_SweBenchLiteItem as SweBenchLiteItem, type index$2_SweBenchLitePayload as SweBenchLitePayload, assignSplit$1 as assignSplit, evaluate$1 as evaluate, loadDataset$1 as loadDataset };
-}
 /**
  * Synthetic routing dataset. 16 tasks across 4 categories. Used as a
  * deterministic, dependency-free benchmark for any router that maps a
@@ -7153,21 +7056,21 @@ declare namespace index$1 {
 /**
  * Reference benchmark wrappers — entry point.
  *
- * Three benchmarks ship under `src/benchmarks/`:
- *   - `gsm8k`           — exact-match math reasoning (HF mirror,
- *                          dataset NOT bundled — see `gsm8k/index.ts`).
- *   - `swebench-lite`   — 30-instance SWE-Bench subset (STUB; needs
- *                          external grader).
- *   - `routing`         — synthetic 16-task router benchmark, ships
- *                          in the package.
+ * Core surface (exported here):
+ *   - The `BenchmarkAdapter` contract.
+ *   - `deterministicSplit` + `BENCHMARK_SPLIT_SEED` for split assignment.
+ *   - `routing` — synthetic 16-task router benchmark. The only novel
+ *     benchmark we built; ships in the package.
  *
- * Every benchmark exposes the same three exports — `loadDataset`,
- * `evaluate`, `assignSplit` — and a typed adapter class. Pick the
- * import path that matches the benchmark.
+ * Example wrappers (under `examples/benchmarks/`, NOT in the bundle):
+ *   - `gsm8k`         — exact-match math reasoning (HF mirror, dataset
+ *                       not bundled).
+ *   - `swebench-lite` — 30-instance SWE-Bench subset (stub; needs an
+ *                       external grader).
  *
- * Shared types (`BenchmarkAdapter`, `BenchmarkDatasetItem`,
- * `BenchmarkEvaluation`, `deterministicSplit`, `BENCHMARK_SPLIT_SEED`)
- * live in `./types`.
+ * The example wrappers are reference implementations of `BenchmarkAdapter`.
+ * Read them, copy them, adapt them. They're intentionally not in the main
+ * entry — every team will configure them differently.
  */
 declare const index_BENCHMARK_SPLIT_SEED: typeof BENCHMARK_SPLIT_SEED;
@@ -7176,7 +7079,7 @@ type index_BenchmarkDatasetItem<TPayload = unknown> = BenchmarkDatasetItem<TPayl
 type index_BenchmarkEvaluation = BenchmarkEvaluation;
 declare const index_deterministicSplit: typeof deterministicSplit;
 declare namespace index {
-  export { index_BENCHMARK_SPLIT_SEED as BENCHMARK_SPLIT_SEED, type index_BenchmarkAdapter as BenchmarkAdapter, type index_BenchmarkDatasetItem as BenchmarkDatasetItem, type index_BenchmarkEvaluation as BenchmarkEvaluation, index_deterministicSplit as deterministicSplit, index$3 as gsm8k, index$1 as routing, index$2 as swebenchLite };
+  export { index_BENCHMARK_SPLIT_SEED as BENCHMARK_SPLIT_SEED, type index_BenchmarkAdapter as BenchmarkAdapter, type index_BenchmarkDatasetItem as BenchmarkDatasetItem, type index_BenchmarkEvaluation as BenchmarkEvaluation, index_deterministicSplit as deterministicSplit, index$1 as routing };
 }
 interface ReferenceReplaySteeringRowsOptions<Input = unknown> {

package/dist/index.js CHANGED Viewed

@@ -3337,12 +3337,12 @@ var SubprocessSandboxDriver = class {
     this.defaultEnv = options.env;
   }
   async exec(phase, command, config) {
-    const { spawn: spawn2 } = await import("child_process");
+    const { spawn } = await import("child_process");
     const start = Date.now();
     const effectiveCwd = config.cwd ?? this.defaultCwd;
     const effectiveEnv = { ...process.env, ...this.defaultEnv ?? {}, ...config.env ?? {} };
     return await new Promise((resolve) => {
-      const child = spawn2(command, {
+      const child = spawn(command, {
         shell: true,
         cwd: effectiveCwd,
         env: effectiveEnv
@@ -8578,20 +8578,20 @@ function mergeLayerResults(name, perAdapter, options = {}) {
   let durationMs = 0;
   const reasonParts = [];
   const diagnostics = {};
-  for (const { adapter: adapter4, result } of perAdapter) {
+  for (const { adapter: adapter2, result } of perAdapter) {
     status = worst(status, result.status);
     if (typeof result.score === "number") {
       weightedScoreSum += result.score;
       weightCount += 1;
     }
     durationMs = mergeDuration === "sum" ? durationMs + result.durationMs : Math.max(durationMs, result.durationMs);
-    reasonParts.push(`${adapter4}: ${result.status}`);
+    reasonParts.push(`${adapter2}: ${result.status}`);
     for (const f of result.findings) {
       findings.push({
         ...f,
         layer: name,
-        message: prefix ? `${prefix(adapter4)} ${f.message}` : f.message,
-        detail: { ...f.detail ?? {}, adapter: adapter4 }
+        message: prefix ? `${prefix(adapter2)} ${f.message}` : f.message,
+        detail: { ...f.detail ?? {}, adapter: adapter2 }
       });
     }
     for (const [k, v] of Object.entries(result.diagnostics ?? {})) {
@@ -8610,8 +8610,8 @@ function mergeLayerResults(name, perAdapter, options = {}) {
     reason: reasonParts.join(" \xB7 "),
     diagnostics: Object.keys(diagnostics).length > 0 ? diagnostics : void 0,
     detail: {
-      adapters: perAdapter.map(({ adapter: adapter4, result }) => ({
-        adapter: adapter4,
+      adapters: perAdapter.map(({ adapter: adapter2, result }) => ({
+        adapter: adapter2,
         status: result.status,
         score: result.score ?? null
       })),
@@ -8637,10 +8637,10 @@ function multiToolchainLayer(config) {
           reason: "no adapters detected"
         };
       }
-      const runOne = async (adapter4) => {
-        const adapterName = config.adapterName(adapter4);
+      const runOne = async (adapter2) => {
+        const adapterName = config.adapterName(adapter2);
         try {
-          const r = await config.run(adapter4, ctx);
+          const r = await config.run(adapter2, ctx);
           return { adapter: adapterName, result: r };
         } catch (err) {
           return {
@@ -10076,8 +10076,8 @@ function formatPct(value) {
 function bySplitOrder(a, b) {
   return ALL_SPLITS.indexOf(a) - ALL_SPLITS.indexOf(b);
 }
-function runAdapter(adapter4, scenario, context) {
-  return typeof adapter4 === "function" ? adapter4(scenario, context) : adapter4.run(scenario, context);
+function runAdapter(adapter2, scenario, context) {
+  return typeof adapter2 === "function" ? adapter2(scenario, context) : adapter2.run(scenario, context);
 }
 function throwIfAborted(signal) {
   if (!signal?.aborted) return;
@@ -10968,232 +10968,18 @@ var benchmarks_exports = {};
 __export(benchmarks_exports, {
   BENCHMARK_SPLIT_SEED: () => BENCHMARK_SPLIT_SEED,
   deterministicSplit: () => deterministicSplit,
-  gsm8k: () => gsm8k_exports,
-  routing: () => routing_exports,
-  swebenchLite: () => swebench_lite_exports
+  routing: () => routing_exports
 });
-// src/benchmarks/gsm8k/index.ts
-var gsm8k_exports = {};
-__export(gsm8k_exports, {
-  Gsm8kAdapter: () => Gsm8kAdapter,
-  assignSplit: () => assignSplit,
-  evaluate: () => evaluate,
-  loadDataset: () => loadDataset,
-  parseGsm8kAnswer: () => parseGsm8kAnswer
-});
-import { existsSync as existsSync5, readFileSync as readFileSync5 } from "fs";
-var Gsm8kAdapter = class {
-  async loadDataset(split) {
-    const path = process.env.AGENT_EVAL_GSM8K_PATH;
-    if (!path) {
-      throw new Error(
-        "GSM8K dataset not provided. Set AGENT_EVAL_GSM8K_PATH to a JSONL file with {id, question, answer} records (the HF GSM8K mirror converted to JSONL)."
-      );
-    }
-    if (!existsSync5(path)) {
-      throw new Error(`AGENT_EVAL_GSM8K_PATH=${path} does not exist`);
-    }
-    const items = parseJsonl(path).filter((it) => assignSplitImpl(it.id) === split);
-    return items;
-  }
-  async evaluate(item, response) {
-    const expected = parseGsm8kAnswer(item.payload.answer);
-    const observed = parseGsm8kAnswer(response);
-    if (expected === null) {
-      return { score: 0, raw: { reason: "reference_not_numeric", expected: item.payload.answer } };
-    }
-    if (observed === null) {
-      return { score: 0, raw: { reason: "no_numeric_in_response", expected, observed: null } };
-    }
-    const ok = Math.abs(expected - observed) < 1e-6;
-    return { score: ok ? 1 : 0, raw: { expected, observed, exactMatch: ok } };
-  }
-  assignSplit(itemId) {
-    return assignSplitImpl(itemId);
-  }
-};
-function assignSplitImpl(itemId) {
-  return deterministicSplit(`gsm8k::${itemId}`);
-}
-function parseJsonl(path) {
-  const raw = readFileSync5(path, "utf8");
-  const out = [];
-  let lineNo = 0;
-  for (const line of raw.split("\n")) {
-    lineNo++;
-    const trimmed = line.trim();
-    if (!trimmed) continue;
-    let row;
-    try {
-      row = JSON.parse(trimmed);
-    } catch (e) {
-      throw new Error(`GSM8K JSONL parse error at line ${lineNo}: ${e.message}`);
-    }
-    const id = String(row.id ?? `gsm8k_${lineNo}`);
-    const question = String(row.question ?? "");
-    const answer = String(row.answer ?? "");
-    if (!question || !answer) {
-      throw new Error(`GSM8K JSONL line ${lineNo} missing question/answer`);
-    }
-    out.push({ id, payload: { question, answer } });
-  }
-  return out;
-}
-function parseGsm8kAnswer(text) {
-  if (!text) return null;
-  const afterMarker = text.match(/####\s*(-?\d[\d,]*\.?\d*)/);
-  if (afterMarker) {
-    const cleaned2 = afterMarker[1].replace(/,/g, "");
-    const v2 = Number(cleaned2);
-    if (Number.isFinite(v2)) return v2;
-  }
-  const matches2 = text.match(/-?\d[\d,]*\.?\d*/g);
-  if (!matches2 || matches2.length === 0) return null;
-  const last = matches2[matches2.length - 1];
-  const cleaned = last.replace(/,/g, "");
-  const v = Number(cleaned);
-  return Number.isFinite(v) ? v : null;
-}
-var adapter = new Gsm8kAdapter();
-var loadDataset = adapter.loadDataset.bind(adapter);
-var evaluate = adapter.evaluate.bind(adapter);
-var assignSplit = adapter.assignSplit.bind(adapter);
-// src/benchmarks/swebench-lite/index.ts
-var swebench_lite_exports = {};
-__export(swebench_lite_exports, {
-  SweBenchLiteAdapter: () => SweBenchLiteAdapter,
-  assignSplit: () => assignSplit2,
-  evaluate: () => evaluate2,
-  loadDataset: () => loadDataset2
-});
-import { existsSync as existsSync6, readFileSync as readFileSync6 } from "fs";
-import { spawn } from "child_process";
-var SweBenchLiteAdapter = class {
-  async loadDataset(split) {
-    const path = process.env.AGENT_EVAL_SWEBENCH_PATH;
-    if (!path) {
-      throw new Error(
-        "SWE-Bench Lite dataset not provided. Set AGENT_EVAL_SWEBENCH_PATH to a JSONL file with the 30 lite instances. STUB: this wrapper does not bundle the dataset; see https://www.swebench.com/lite.html for the canonical source."
-      );
-    }
-    if (!existsSync6(path)) {
-      throw new Error(`AGENT_EVAL_SWEBENCH_PATH=${path} does not exist`);
-    }
-    const all = parseJsonl2(path);
-    return all.filter((it) => assignSplitImpl2(it.id) === split);
-  }
-  async evaluate(item, response) {
-    const cmd = process.env.AGENT_EVAL_SWEBENCH_GRADER_CMD;
-    if (!cmd) {
-      throw new Error(
-        "SWE-Bench Lite grader not configured. Set AGENT_EVAL_SWEBENCH_GRADER_CMD to an executable that reads {instance_id, patch} JSON on stdin and writes {passed, fail_to_pass_passed, pass_to_pass_passed, log} JSON on stdout. TODO(swebench-lite): bundle a default Docker-based runner once the SDK stabilises (https://github.com/swe-bench/SWE-bench)."
-      );
-    }
-    const stdinPayload = JSON.stringify({ instance_id: item.payload.instanceId, patch: response });
-    const result = await runGrader(cmd, stdinPayload);
-    let parsed;
-    try {
-      parsed = JSON.parse(result.stdout);
-    } catch (e) {
-      throw new Error(
-        `SWE-Bench grader emitted non-JSON stdout: ${e.message}
-stdout=${result.stdout.slice(0, 400)}
-stderr=${result.stderr.slice(0, 400)}`
-      );
-    }
-    const passed = Boolean(parsed.passed);
-    return {
-      score: passed ? 1 : 0,
-      raw: {
-        passed,
-        failToPassPassed: Boolean(parsed.fail_to_pass_passed),
-        passToPassPassed: Boolean(parsed.pass_to_pass_passed),
-        graderLog: typeof parsed.log === "string" ? parsed.log.slice(0, 4e3) : ""
-      }
-    };
-  }
-  assignSplit(itemId) {
-    return assignSplitImpl2(itemId);
-  }
-};
-function assignSplitImpl2(itemId) {
-  return deterministicSplit(`swebench-lite::${itemId}`);
-}
-function parseJsonl2(path) {
-  const raw = readFileSync6(path, "utf8");
-  const out = [];
-  let lineNo = 0;
-  for (const line of raw.split("\n")) {
-    lineNo++;
-    const trimmed = line.trim();
-    if (!trimmed) continue;
-    const row = JSON.parse(trimmed);
-    const instanceId = String(row.instance_id ?? row.instanceId ?? "");
-    if (!instanceId) {
-      throw new Error(`swebench-lite line ${lineNo} missing instance_id`);
-    }
-    out.push({
-      id: instanceId,
-      payload: {
-        instanceId,
-        problemStatement: String(row.problem_statement ?? row.problemStatement ?? ""),
-        baseCommit: String(row.base_commit ?? row.baseCommit ?? ""),
-        repo: String(row.repo ?? ""),
-        failToPass: asStringArray(row.FAIL_TO_PASS ?? row.failToPass),
-        passToPass: asStringArray(row.PASS_TO_PASS ?? row.passToPass)
-      }
-    });
-  }
-  return out;
-}
-function asStringArray(v) {
-  if (Array.isArray(v)) return v.filter((x) => typeof x === "string");
-  if (typeof v === "string") {
-    try {
-      const parsed = JSON.parse(v);
-      if (Array.isArray(parsed)) return parsed.filter((x) => typeof x === "string");
-    } catch {
-      return [v];
-    }
-  }
-  return [];
-}
-function runGrader(cmd, stdin) {
-  return new Promise((resolve, reject) => {
-    const parts = cmd.split(/\s+/);
-    const child = spawn(parts[0], parts.slice(1), { stdio: ["pipe", "pipe", "pipe"] });
-    let stdout = "";
-    let stderr = "";
-    child.stdout.on("data", (b) => stdout += b.toString("utf8"));
-    child.stderr.on("data", (b) => stderr += b.toString("utf8"));
-    child.on("error", reject);
-    child.on("close", (code) => {
-      if (code !== 0) {
-        reject(new Error(`grader exited with code ${code}: ${stderr.slice(0, 400)}`));
-        return;
-      }
-      resolve({ stdout, stderr });
-    });
-    child.stdin.write(stdin);
-    child.stdin.end();
-  });
-}
-var adapter2 = new SweBenchLiteAdapter();
-var loadDataset2 = adapter2.loadDataset.bind(adapter2);
-var evaluate2 = adapter2.evaluate.bind(adapter2);
-var assignSplit2 = adapter2.assignSplit.bind(adapter2);
 // src/benchmarks/routing/index.ts
 var routing_exports = {};
 __export(routing_exports, {
   ROUTING_DATASET: () => ROUTING_DATASET,
   RoutingAdapter: () => RoutingAdapter,
-  assignSplit: () => assignSplit3,
-  evaluate: () => evaluate3,
+  assignSplit: () => assignSplit,
+  evaluate: () => evaluate,
   extractRouteTokens: () => extractRouteTokens,
-  loadDataset: () => loadDataset3
+  loadDataset: () => loadDataset
 });
 // src/benchmarks/routing/dataset.ts
@@ -11331,7 +11117,7 @@ var ROUTING_DATASET = [
 // src/benchmarks/routing/index.ts
 var RoutingAdapter = class {
   async loadDataset(split) {
-    return ROUTING_DATASET.map((item) => ({ id: item.id, payload: item })).filter((it) => assignSplitImpl3(it.id) === split);
+    return ROUTING_DATASET.map((item) => ({ id: item.id, payload: item })).filter((it) => assignSplitImpl(it.id) === split);
   }
   async evaluate(item, response) {
     const tokens2 = extractRouteTokens(response);
@@ -11352,20 +11138,20 @@ var RoutingAdapter = class {
     };
   }
   assignSplit(itemId) {
-    return assignSplitImpl3(itemId);
+    return assignSplitImpl(itemId);
   }
 };
-function assignSplitImpl3(itemId) {
+function assignSplitImpl(itemId) {
   return deterministicSplit(`routing::${itemId}`);
 }
 function extractRouteTokens(response) {
   const matches2 = response.match(/[a-z][a-z0-9_]*\.[a-z][a-z0-9_]*/gi);
   return matches2 ?? [];
 }
-var adapter3 = new RoutingAdapter();
-var loadDataset3 = adapter3.loadDataset.bind(adapter3);
-var evaluate3 = adapter3.evaluate.bind(adapter3);
-var assignSplit3 = adapter3.assignSplit.bind(adapter3);
+var adapter = new RoutingAdapter();
+var loadDataset = adapter.loadDataset.bind(adapter);
+var evaluate = adapter.evaluate.bind(adapter);
+var assignSplit = adapter.assignSplit.bind(adapter);
 // src/reference-replay-steering.ts
 function referenceReplayRunsToSteeringRows(runs, options = {}) {
@@ -11632,11 +11418,11 @@ function samePopulation(a, b) {
 }
 // src/jsonl-trial-cache.ts
-import { appendFileSync as appendFileSync4, existsSync as existsSync8, mkdirSync as mkdirSync4, readFileSync as readFileSync7 } from "fs";
+import { appendFileSync as appendFileSync4, existsSync as existsSync6, mkdirSync as mkdirSync4, readFileSync as readFileSync5 } from "fs";
 import { dirname as dirname4 } from "path";
 // src/locked-jsonl-appender.ts
-import { appendFileSync as appendFileSync3, existsSync as existsSync7, mkdirSync as mkdirSync3 } from "fs";
+import { appendFileSync as appendFileSync3, existsSync as existsSync5, mkdirSync as mkdirSync3 } from "fs";
 import { dirname as dirname3 } from "path";
 var mutexes = /* @__PURE__ */ new Map();
 function getMutex(path) {
@@ -11651,7 +11437,7 @@ var LockedJsonlAppender = class {
   constructor(path) {
     this.path = path;
     this.mutex = getMutex(path);
-    if (!existsSync7(dirname3(path))) {
+    if (!existsSync5(dirname3(path))) {
       mkdirSync3(dirname3(path), { recursive: true });
     }
   }
@@ -11676,8 +11462,8 @@ var JsonlTrialCache = class {
   appender;
   constructor(path) {
     this.path = path;
-    if (existsSync8(path)) {
-      for (const line of readFileSync7(path, "utf-8").split("\n")) {
+    if (existsSync6(path)) {
+      for (const line of readFileSync5(path, "utf-8").split("\n")) {
         if (!line.trim()) continue;
         try {
           const entry = JSON.parse(line);
@@ -11715,7 +11501,7 @@ var JsonlTrialCache = class {
 };
 // src/evolution-telemetry.ts
-import { appendFileSync as appendFileSync5, existsSync as existsSync9, mkdirSync as mkdirSync5, readFileSync as readFileSync8, writeFileSync } from "fs";
+import { appendFileSync as appendFileSync5, existsSync as existsSync7, mkdirSync as mkdirSync5, readFileSync as readFileSync6, writeFileSync } from "fs";
 import { dirname as dirname5 } from "path";
 var MutationTelemetry = class {
   appender;
@@ -11746,16 +11532,16 @@ var LineageRecorder = class {
     this.snapshotPath = `${path}.snapshot`;
     this.kindOf = kindOf ?? defaultKindOf;
     mkdirSync5(dirname5(path), { recursive: true });
-    if (existsSync9(this.snapshotPath)) {
+    if (existsSync7(this.snapshotPath)) {
       try {
-        const parsed = JSON.parse(readFileSync8(this.snapshotPath, "utf-8"));
+        const parsed = JSON.parse(readFileSync6(this.snapshotPath, "utf-8"));
         for (const n of parsed) this.nodes.set(n.id, n);
       } catch {
       }
     }
-    if (existsSync9(path)) {
+    if (existsSync7(path)) {
       try {
-        for (const line of readFileSync8(path, "utf-8").split("\n")) {
+        for (const line of readFileSync6(path, "utf-8").split("\n")) {
           if (!line.trim()) continue;
           try {
             const entry = JSON.parse(line);
@@ -11767,9 +11553,9 @@ var LineageRecorder = class {
       } catch {
       }
     }
-    if (existsSync9(path) && this.nodes.size === 0) {
+    if (existsSync7(path) && this.nodes.size === 0) {
       try {
-        const raw = readFileSync8(path, "utf-8").trim();
+        const raw = readFileSync6(path, "utf-8").trim();
         if (raw.startsWith("[")) {
           const parsed = JSON.parse(raw);
           for (const n of parsed) this.nodes.set(n.id, n);
@@ -11783,8 +11569,8 @@ var LineageRecorder = class {
       const prev = this.nodes.get(node.id);
       this.nodes.set(node.id, { ...prev, ...node });
       try {
-        if (existsSync9(this.path)) {
-          const head = readFileSync8(this.path, { encoding: "utf-8", flag: "r" }).slice(0, 1);
+        if (existsSync7(this.path)) {
+          const head = readFileSync6(this.path, { encoding: "utf-8", flag: "r" }).slice(0, 1);
           if (head === "[") {
             writeFileSync(this.path, "");
           }
@@ -11850,9 +11636,9 @@ var CostLedger = class {
   mutex = new Mutex();
   constructor(path) {
     this.path = path;
-    if (existsSync9(path)) {
+    if (existsSync7(path)) {
       try {
-        const loaded = JSON.parse(readFileSync8(path, "utf-8"));
+        const loaded = JSON.parse(readFileSync6(path, "utf-8"));
         for (const k of Object.keys(this.totals)) {
           if (k === "byGeneration") {
             if (loaded.byGeneration && typeof loaded.byGeneration === "object") {