@tangle-network/agent-eval 0.16.2 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -196,10 +196,11 @@ These are the primitives any team running prompt-optimization in production need
196
196
  meta-loop (`inspectFailures` → `proposeChange` → `applyChange` →
197
197
  `evaluateChange`). Ship a `NoopResearcher` as a placeholder; real
198
198
  implementations live downstream.
199
- - `benchmarks/{gsm8k,swebench-lite,routing}` — reference benchmark
200
- wrappers behind one `BenchmarkAdapter` shape, with deterministic
201
- splits and fail-loud env-var configuration. Mostly for reproducible
202
- comparisons; not core surface.
199
+ - `benchmarks/routing` — synthetic 16-task router benchmark we own.
200
+ Ships in the package. Reference wrappers for GSM8K and SWE-Bench
201
+ Lite live under `examples/benchmarks/` — read, copy, adapt. All
202
+ three implement one `BenchmarkAdapter` shape with deterministic
203
+ splits and fail-loud env-var configuration.
203
204
 
204
205
  ### v0.16 changes from v0.15
205
206
 
package/dist/index.d.ts CHANGED
@@ -6975,103 +6975,6 @@ declare const BENCHMARK_SPLIT_SEED = "agent-eval-v1";
6975
6975
  */
6976
6976
  declare function deterministicSplit(itemId: string, seed?: string): RunSplitTag;
6977
6977
 
6978
- /**
6979
- * GSM8K wrapper — exact-match grading on the final numeric answer.
6980
- *
6981
- * The dataset itself is NOT bundled. `loadDataset` will:
6982
- * 1. read from `process.env.AGENT_EVAL_GSM8K_PATH` if set (a JSONL
6983
- * file with `{ id, question, answer }` records — the standard
6984
- * HF mirror layout converted to JSONL);
6985
- * 2. otherwise throw a clearly-marked error pointing to the loader.
6986
- *
6987
- * `evaluate` parses the final number out of the response (last
6988
- * occurrence of a signed-decimal-or-integer literal, optionally after
6989
- * `####`, the GSM8K answer convention) and compares to the ground-
6990
- * truth integer. Floating-point comparisons use a 1e-6 tolerance.
6991
- */
6992
-
6993
- interface Gsm8kPayload {
6994
- question: string;
6995
- /** Reference answer, post-#### normalization. May be a number or
6996
- * a numeric string ("72", "1.5"). */
6997
- answer: string;
6998
- }
6999
- type Gsm8kItem = BenchmarkDatasetItem<Gsm8kPayload>;
7000
- declare class Gsm8kAdapter implements BenchmarkAdapter<Gsm8kItem, Gsm8kPayload> {
7001
- loadDataset(split: RunSplitTag): Promise<Gsm8kItem[]>;
7002
- evaluate(item: Gsm8kItem, response: string): Promise<BenchmarkEvaluation>;
7003
- assignSplit(itemId: string): RunSplitTag;
7004
- }
7005
- /**
7006
- * Parse a GSM8K-style answer. Honors the dataset's `#### N`
7007
- * convention (the canonical answer comes after `####`); otherwise
7008
- * returns the LAST signed numeric literal in the string.
7009
- */
7010
- declare function parseGsm8kAnswer(text: string): number | null;
7011
- declare const loadDataset$2: (split: RunSplitTag) => Promise<Gsm8kItem[]>;
7012
- declare const evaluate$2: (item: Gsm8kItem, response: string) => Promise<BenchmarkEvaluation>;
7013
- declare const assignSplit$2: (itemId: string) => RunSplitTag;
7014
-
7015
- type index$3_Gsm8kAdapter = Gsm8kAdapter;
7016
- declare const index$3_Gsm8kAdapter: typeof Gsm8kAdapter;
7017
- type index$3_Gsm8kItem = Gsm8kItem;
7018
- type index$3_Gsm8kPayload = Gsm8kPayload;
7019
- declare const index$3_parseGsm8kAnswer: typeof parseGsm8kAnswer;
7020
- declare namespace index$3 {
7021
- export { index$3_Gsm8kAdapter as Gsm8kAdapter, type index$3_Gsm8kItem as Gsm8kItem, type index$3_Gsm8kPayload as Gsm8kPayload, assignSplit$2 as assignSplit, evaluate$2 as evaluate, loadDataset$2 as loadDataset, index$3_parseGsm8kAnswer as parseGsm8kAnswer };
7022
- }
7023
-
7024
- /**
7025
- * SWE-Bench Lite wrapper — 30-instance subset.
7026
- *
7027
- * Status: STUB. The actual SWE-Bench harness needs a Docker host and
7028
- * is too heavy to ship inside this package. We expose the contract
7029
- * (loadDataset, evaluate, assignSplit) so consumers can plug in their
7030
- * own grader without touching call sites.
7031
- *
7032
- * Wire-up paths in priority order:
7033
- *
7034
- * 1. `process.env.AGENT_EVAL_SWEBENCH_PATH` → JSONL with the 30
7035
- * lite instances + per-instance metadata (instance_id,
7036
- * problem_statement, base_commit, repo, FAIL_TO_PASS,
7037
- * PASS_TO_PASS).
7038
- * 2. `process.env.AGENT_EVAL_SWEBENCH_GRADER_CMD` → executable
7039
- * that reads `{instance_id, patch}` JSON on stdin and writes
7040
- * `{passed, fail_to_pass_passed, pass_to_pass_passed, log}`
7041
- * JSON on stdout. Implementations can shell out to the
7042
- * official `swebench` runner here.
7043
- *
7044
- * If neither is set, every public method throws a clearly-marked
7045
- * "not implemented" error. The stub fails LOUD; it never silently
7046
- * scores zero.
7047
- */
7048
-
7049
- interface SweBenchLitePayload {
7050
- instanceId: string;
7051
- problemStatement: string;
7052
- baseCommit: string;
7053
- repo: string;
7054
- failToPass: string[];
7055
- passToPass: string[];
7056
- }
7057
- type SweBenchLiteItem = BenchmarkDatasetItem<SweBenchLitePayload>;
7058
- declare class SweBenchLiteAdapter implements BenchmarkAdapter<SweBenchLiteItem, SweBenchLitePayload> {
7059
- loadDataset(split: RunSplitTag): Promise<SweBenchLiteItem[]>;
7060
- evaluate(item: SweBenchLiteItem, response: string): Promise<BenchmarkEvaluation>;
7061
- assignSplit(itemId: string): RunSplitTag;
7062
- }
7063
- declare const loadDataset$1: (split: RunSplitTag) => Promise<SweBenchLiteItem[]>;
7064
- declare const evaluate$1: (item: SweBenchLiteItem, response: string) => Promise<BenchmarkEvaluation>;
7065
- declare const assignSplit$1: (itemId: string) => RunSplitTag;
7066
-
7067
- type index$2_SweBenchLiteAdapter = SweBenchLiteAdapter;
7068
- declare const index$2_SweBenchLiteAdapter: typeof SweBenchLiteAdapter;
7069
- type index$2_SweBenchLiteItem = SweBenchLiteItem;
7070
- type index$2_SweBenchLitePayload = SweBenchLitePayload;
7071
- declare namespace index$2 {
7072
- export { index$2_SweBenchLiteAdapter as SweBenchLiteAdapter, type index$2_SweBenchLiteItem as SweBenchLiteItem, type index$2_SweBenchLitePayload as SweBenchLitePayload, assignSplit$1 as assignSplit, evaluate$1 as evaluate, loadDataset$1 as loadDataset };
7073
- }
7074
-
7075
6978
  /**
7076
6979
  * Synthetic routing dataset. 16 tasks across 4 categories. Used as a
7077
6980
  * deterministic, dependency-free benchmark for any router that maps a
@@ -7153,21 +7056,21 @@ declare namespace index$1 {
7153
7056
  /**
7154
7057
  * Reference benchmark wrappers — entry point.
7155
7058
  *
7156
- * Three benchmarks ship under `src/benchmarks/`:
7157
- * - `gsm8k` exact-match math reasoning (HF mirror,
7158
- * dataset NOT bundled see `gsm8k/index.ts`).
7159
- * - `swebench-lite` 30-instance SWE-Bench subset (STUB; needs
7160
- * external grader).
7161
- * - `routing` — synthetic 16-task router benchmark, ships
7162
- * in the package.
7059
+ * Core surface (exported here):
7060
+ * - The `BenchmarkAdapter` contract.
7061
+ * - `deterministicSplit` + `BENCHMARK_SPLIT_SEED` for split assignment.
7062
+ * - `routing` synthetic 16-task router benchmark. The only novel
7063
+ * benchmark we built; ships in the package.
7163
7064
  *
7164
- * Every benchmark exposes the same three exports — `loadDataset`,
7165
- * `evaluate`, `assignSplit` — and a typed adapter class. Pick the
7166
- * import path that matches the benchmark.
7065
+ * Example wrappers (under `examples/benchmarks/`, NOT in the bundle):
7066
+ * - `gsm8k` — exact-match math reasoning (HF mirror, dataset
7067
+ * not bundled).
7068
+ * - `swebench-lite` — 30-instance SWE-Bench subset (stub; needs an
7069
+ * external grader).
7167
7070
  *
7168
- * Shared types (`BenchmarkAdapter`, `BenchmarkDatasetItem`,
7169
- * `BenchmarkEvaluation`, `deterministicSplit`, `BENCHMARK_SPLIT_SEED`)
7170
- * live in `./types`.
7071
+ * The example wrappers are reference implementations of `BenchmarkAdapter`.
7072
+ * Read them, copy them, adapt them. They're intentionally not in the main
7073
+ * entry — every team will configure them differently.
7171
7074
  */
7172
7075
 
7173
7076
  declare const index_BENCHMARK_SPLIT_SEED: typeof BENCHMARK_SPLIT_SEED;
@@ -7176,7 +7079,7 @@ type index_BenchmarkDatasetItem<TPayload = unknown> = BenchmarkDatasetItem<TPayl
7176
7079
  type index_BenchmarkEvaluation = BenchmarkEvaluation;
7177
7080
  declare const index_deterministicSplit: typeof deterministicSplit;
7178
7081
  declare namespace index {
7179
- export { index_BENCHMARK_SPLIT_SEED as BENCHMARK_SPLIT_SEED, type index_BenchmarkAdapter as BenchmarkAdapter, type index_BenchmarkDatasetItem as BenchmarkDatasetItem, type index_BenchmarkEvaluation as BenchmarkEvaluation, index_deterministicSplit as deterministicSplit, index$3 as gsm8k, index$1 as routing, index$2 as swebenchLite };
7082
+ export { index_BENCHMARK_SPLIT_SEED as BENCHMARK_SPLIT_SEED, type index_BenchmarkAdapter as BenchmarkAdapter, type index_BenchmarkDatasetItem as BenchmarkDatasetItem, type index_BenchmarkEvaluation as BenchmarkEvaluation, index_deterministicSplit as deterministicSplit, index$1 as routing };
7180
7083
  }
7181
7084
 
7182
7085
  interface ReferenceReplaySteeringRowsOptions<Input = unknown> {
package/dist/index.js CHANGED
@@ -3337,12 +3337,12 @@ var SubprocessSandboxDriver = class {
3337
3337
  this.defaultEnv = options.env;
3338
3338
  }
3339
3339
  async exec(phase, command, config) {
3340
- const { spawn: spawn2 } = await import("child_process");
3340
+ const { spawn } = await import("child_process");
3341
3341
  const start = Date.now();
3342
3342
  const effectiveCwd = config.cwd ?? this.defaultCwd;
3343
3343
  const effectiveEnv = { ...process.env, ...this.defaultEnv ?? {}, ...config.env ?? {} };
3344
3344
  return await new Promise((resolve) => {
3345
- const child = spawn2(command, {
3345
+ const child = spawn(command, {
3346
3346
  shell: true,
3347
3347
  cwd: effectiveCwd,
3348
3348
  env: effectiveEnv
@@ -8578,20 +8578,20 @@ function mergeLayerResults(name, perAdapter, options = {}) {
8578
8578
  let durationMs = 0;
8579
8579
  const reasonParts = [];
8580
8580
  const diagnostics = {};
8581
- for (const { adapter: adapter4, result } of perAdapter) {
8581
+ for (const { adapter: adapter2, result } of perAdapter) {
8582
8582
  status = worst(status, result.status);
8583
8583
  if (typeof result.score === "number") {
8584
8584
  weightedScoreSum += result.score;
8585
8585
  weightCount += 1;
8586
8586
  }
8587
8587
  durationMs = mergeDuration === "sum" ? durationMs + result.durationMs : Math.max(durationMs, result.durationMs);
8588
- reasonParts.push(`${adapter4}: ${result.status}`);
8588
+ reasonParts.push(`${adapter2}: ${result.status}`);
8589
8589
  for (const f of result.findings) {
8590
8590
  findings.push({
8591
8591
  ...f,
8592
8592
  layer: name,
8593
- message: prefix ? `${prefix(adapter4)} ${f.message}` : f.message,
8594
- detail: { ...f.detail ?? {}, adapter: adapter4 }
8593
+ message: prefix ? `${prefix(adapter2)} ${f.message}` : f.message,
8594
+ detail: { ...f.detail ?? {}, adapter: adapter2 }
8595
8595
  });
8596
8596
  }
8597
8597
  for (const [k, v] of Object.entries(result.diagnostics ?? {})) {
@@ -8610,8 +8610,8 @@ function mergeLayerResults(name, perAdapter, options = {}) {
8610
8610
  reason: reasonParts.join(" \xB7 "),
8611
8611
  diagnostics: Object.keys(diagnostics).length > 0 ? diagnostics : void 0,
8612
8612
  detail: {
8613
- adapters: perAdapter.map(({ adapter: adapter4, result }) => ({
8614
- adapter: adapter4,
8613
+ adapters: perAdapter.map(({ adapter: adapter2, result }) => ({
8614
+ adapter: adapter2,
8615
8615
  status: result.status,
8616
8616
  score: result.score ?? null
8617
8617
  })),
@@ -8637,10 +8637,10 @@ function multiToolchainLayer(config) {
8637
8637
  reason: "no adapters detected"
8638
8638
  };
8639
8639
  }
8640
- const runOne = async (adapter4) => {
8641
- const adapterName = config.adapterName(adapter4);
8640
+ const runOne = async (adapter2) => {
8641
+ const adapterName = config.adapterName(adapter2);
8642
8642
  try {
8643
- const r = await config.run(adapter4, ctx);
8643
+ const r = await config.run(adapter2, ctx);
8644
8644
  return { adapter: adapterName, result: r };
8645
8645
  } catch (err) {
8646
8646
  return {
@@ -10076,8 +10076,8 @@ function formatPct(value) {
10076
10076
  function bySplitOrder(a, b) {
10077
10077
  return ALL_SPLITS.indexOf(a) - ALL_SPLITS.indexOf(b);
10078
10078
  }
10079
- function runAdapter(adapter4, scenario, context) {
10080
- return typeof adapter4 === "function" ? adapter4(scenario, context) : adapter4.run(scenario, context);
10079
+ function runAdapter(adapter2, scenario, context) {
10080
+ return typeof adapter2 === "function" ? adapter2(scenario, context) : adapter2.run(scenario, context);
10081
10081
  }
10082
10082
  function throwIfAborted(signal) {
10083
10083
  if (!signal?.aborted) return;
@@ -10968,232 +10968,18 @@ var benchmarks_exports = {};
10968
10968
  __export(benchmarks_exports, {
10969
10969
  BENCHMARK_SPLIT_SEED: () => BENCHMARK_SPLIT_SEED,
10970
10970
  deterministicSplit: () => deterministicSplit,
10971
- gsm8k: () => gsm8k_exports,
10972
- routing: () => routing_exports,
10973
- swebenchLite: () => swebench_lite_exports
10971
+ routing: () => routing_exports
10974
10972
  });
10975
10973
 
10976
- // src/benchmarks/gsm8k/index.ts
10977
- var gsm8k_exports = {};
10978
- __export(gsm8k_exports, {
10979
- Gsm8kAdapter: () => Gsm8kAdapter,
10980
- assignSplit: () => assignSplit,
10981
- evaluate: () => evaluate,
10982
- loadDataset: () => loadDataset,
10983
- parseGsm8kAnswer: () => parseGsm8kAnswer
10984
- });
10985
- import { existsSync as existsSync5, readFileSync as readFileSync5 } from "fs";
10986
- var Gsm8kAdapter = class {
10987
- async loadDataset(split) {
10988
- const path = process.env.AGENT_EVAL_GSM8K_PATH;
10989
- if (!path) {
10990
- throw new Error(
10991
- "GSM8K dataset not provided. Set AGENT_EVAL_GSM8K_PATH to a JSONL file with {id, question, answer} records (the HF GSM8K mirror converted to JSONL)."
10992
- );
10993
- }
10994
- if (!existsSync5(path)) {
10995
- throw new Error(`AGENT_EVAL_GSM8K_PATH=${path} does not exist`);
10996
- }
10997
- const items = parseJsonl(path).filter((it) => assignSplitImpl(it.id) === split);
10998
- return items;
10999
- }
11000
- async evaluate(item, response) {
11001
- const expected = parseGsm8kAnswer(item.payload.answer);
11002
- const observed = parseGsm8kAnswer(response);
11003
- if (expected === null) {
11004
- return { score: 0, raw: { reason: "reference_not_numeric", expected: item.payload.answer } };
11005
- }
11006
- if (observed === null) {
11007
- return { score: 0, raw: { reason: "no_numeric_in_response", expected, observed: null } };
11008
- }
11009
- const ok = Math.abs(expected - observed) < 1e-6;
11010
- return { score: ok ? 1 : 0, raw: { expected, observed, exactMatch: ok } };
11011
- }
11012
- assignSplit(itemId) {
11013
- return assignSplitImpl(itemId);
11014
- }
11015
- };
11016
- function assignSplitImpl(itemId) {
11017
- return deterministicSplit(`gsm8k::${itemId}`);
11018
- }
11019
- function parseJsonl(path) {
11020
- const raw = readFileSync5(path, "utf8");
11021
- const out = [];
11022
- let lineNo = 0;
11023
- for (const line of raw.split("\n")) {
11024
- lineNo++;
11025
- const trimmed = line.trim();
11026
- if (!trimmed) continue;
11027
- let row;
11028
- try {
11029
- row = JSON.parse(trimmed);
11030
- } catch (e) {
11031
- throw new Error(`GSM8K JSONL parse error at line ${lineNo}: ${e.message}`);
11032
- }
11033
- const id = String(row.id ?? `gsm8k_${lineNo}`);
11034
- const question = String(row.question ?? "");
11035
- const answer = String(row.answer ?? "");
11036
- if (!question || !answer) {
11037
- throw new Error(`GSM8K JSONL line ${lineNo} missing question/answer`);
11038
- }
11039
- out.push({ id, payload: { question, answer } });
11040
- }
11041
- return out;
11042
- }
11043
- function parseGsm8kAnswer(text) {
11044
- if (!text) return null;
11045
- const afterMarker = text.match(/####\s*(-?\d[\d,]*\.?\d*)/);
11046
- if (afterMarker) {
11047
- const cleaned2 = afterMarker[1].replace(/,/g, "");
11048
- const v2 = Number(cleaned2);
11049
- if (Number.isFinite(v2)) return v2;
11050
- }
11051
- const matches2 = text.match(/-?\d[\d,]*\.?\d*/g);
11052
- if (!matches2 || matches2.length === 0) return null;
11053
- const last = matches2[matches2.length - 1];
11054
- const cleaned = last.replace(/,/g, "");
11055
- const v = Number(cleaned);
11056
- return Number.isFinite(v) ? v : null;
11057
- }
11058
- var adapter = new Gsm8kAdapter();
11059
- var loadDataset = adapter.loadDataset.bind(adapter);
11060
- var evaluate = adapter.evaluate.bind(adapter);
11061
- var assignSplit = adapter.assignSplit.bind(adapter);
11062
-
11063
- // src/benchmarks/swebench-lite/index.ts
11064
- var swebench_lite_exports = {};
11065
- __export(swebench_lite_exports, {
11066
- SweBenchLiteAdapter: () => SweBenchLiteAdapter,
11067
- assignSplit: () => assignSplit2,
11068
- evaluate: () => evaluate2,
11069
- loadDataset: () => loadDataset2
11070
- });
11071
- import { existsSync as existsSync6, readFileSync as readFileSync6 } from "fs";
11072
- import { spawn } from "child_process";
11073
- var SweBenchLiteAdapter = class {
11074
- async loadDataset(split) {
11075
- const path = process.env.AGENT_EVAL_SWEBENCH_PATH;
11076
- if (!path) {
11077
- throw new Error(
11078
- "SWE-Bench Lite dataset not provided. Set AGENT_EVAL_SWEBENCH_PATH to a JSONL file with the 30 lite instances. STUB: this wrapper does not bundle the dataset; see https://www.swebench.com/lite.html for the canonical source."
11079
- );
11080
- }
11081
- if (!existsSync6(path)) {
11082
- throw new Error(`AGENT_EVAL_SWEBENCH_PATH=${path} does not exist`);
11083
- }
11084
- const all = parseJsonl2(path);
11085
- return all.filter((it) => assignSplitImpl2(it.id) === split);
11086
- }
11087
- async evaluate(item, response) {
11088
- const cmd = process.env.AGENT_EVAL_SWEBENCH_GRADER_CMD;
11089
- if (!cmd) {
11090
- throw new Error(
11091
- "SWE-Bench Lite grader not configured. Set AGENT_EVAL_SWEBENCH_GRADER_CMD to an executable that reads {instance_id, patch} JSON on stdin and writes {passed, fail_to_pass_passed, pass_to_pass_passed, log} JSON on stdout. TODO(swebench-lite): bundle a default Docker-based runner once the SDK stabilises (https://github.com/swe-bench/SWE-bench)."
11092
- );
11093
- }
11094
- const stdinPayload = JSON.stringify({ instance_id: item.payload.instanceId, patch: response });
11095
- const result = await runGrader(cmd, stdinPayload);
11096
- let parsed;
11097
- try {
11098
- parsed = JSON.parse(result.stdout);
11099
- } catch (e) {
11100
- throw new Error(
11101
- `SWE-Bench grader emitted non-JSON stdout: ${e.message}
11102
- stdout=${result.stdout.slice(0, 400)}
11103
- stderr=${result.stderr.slice(0, 400)}`
11104
- );
11105
- }
11106
- const passed = Boolean(parsed.passed);
11107
- return {
11108
- score: passed ? 1 : 0,
11109
- raw: {
11110
- passed,
11111
- failToPassPassed: Boolean(parsed.fail_to_pass_passed),
11112
- passToPassPassed: Boolean(parsed.pass_to_pass_passed),
11113
- graderLog: typeof parsed.log === "string" ? parsed.log.slice(0, 4e3) : ""
11114
- }
11115
- };
11116
- }
11117
- assignSplit(itemId) {
11118
- return assignSplitImpl2(itemId);
11119
- }
11120
- };
11121
- function assignSplitImpl2(itemId) {
11122
- return deterministicSplit(`swebench-lite::${itemId}`);
11123
- }
11124
- function parseJsonl2(path) {
11125
- const raw = readFileSync6(path, "utf8");
11126
- const out = [];
11127
- let lineNo = 0;
11128
- for (const line of raw.split("\n")) {
11129
- lineNo++;
11130
- const trimmed = line.trim();
11131
- if (!trimmed) continue;
11132
- const row = JSON.parse(trimmed);
11133
- const instanceId = String(row.instance_id ?? row.instanceId ?? "");
11134
- if (!instanceId) {
11135
- throw new Error(`swebench-lite line ${lineNo} missing instance_id`);
11136
- }
11137
- out.push({
11138
- id: instanceId,
11139
- payload: {
11140
- instanceId,
11141
- problemStatement: String(row.problem_statement ?? row.problemStatement ?? ""),
11142
- baseCommit: String(row.base_commit ?? row.baseCommit ?? ""),
11143
- repo: String(row.repo ?? ""),
11144
- failToPass: asStringArray(row.FAIL_TO_PASS ?? row.failToPass),
11145
- passToPass: asStringArray(row.PASS_TO_PASS ?? row.passToPass)
11146
- }
11147
- });
11148
- }
11149
- return out;
11150
- }
11151
- function asStringArray(v) {
11152
- if (Array.isArray(v)) return v.filter((x) => typeof x === "string");
11153
- if (typeof v === "string") {
11154
- try {
11155
- const parsed = JSON.parse(v);
11156
- if (Array.isArray(parsed)) return parsed.filter((x) => typeof x === "string");
11157
- } catch {
11158
- return [v];
11159
- }
11160
- }
11161
- return [];
11162
- }
11163
- function runGrader(cmd, stdin) {
11164
- return new Promise((resolve, reject) => {
11165
- const parts = cmd.split(/\s+/);
11166
- const child = spawn(parts[0], parts.slice(1), { stdio: ["pipe", "pipe", "pipe"] });
11167
- let stdout = "";
11168
- let stderr = "";
11169
- child.stdout.on("data", (b) => stdout += b.toString("utf8"));
11170
- child.stderr.on("data", (b) => stderr += b.toString("utf8"));
11171
- child.on("error", reject);
11172
- child.on("close", (code) => {
11173
- if (code !== 0) {
11174
- reject(new Error(`grader exited with code ${code}: ${stderr.slice(0, 400)}`));
11175
- return;
11176
- }
11177
- resolve({ stdout, stderr });
11178
- });
11179
- child.stdin.write(stdin);
11180
- child.stdin.end();
11181
- });
11182
- }
11183
- var adapter2 = new SweBenchLiteAdapter();
11184
- var loadDataset2 = adapter2.loadDataset.bind(adapter2);
11185
- var evaluate2 = adapter2.evaluate.bind(adapter2);
11186
- var assignSplit2 = adapter2.assignSplit.bind(adapter2);
11187
-
11188
10974
  // src/benchmarks/routing/index.ts
11189
10975
  var routing_exports = {};
11190
10976
  __export(routing_exports, {
11191
10977
  ROUTING_DATASET: () => ROUTING_DATASET,
11192
10978
  RoutingAdapter: () => RoutingAdapter,
11193
- assignSplit: () => assignSplit3,
11194
- evaluate: () => evaluate3,
10979
+ assignSplit: () => assignSplit,
10980
+ evaluate: () => evaluate,
11195
10981
  extractRouteTokens: () => extractRouteTokens,
11196
- loadDataset: () => loadDataset3
10982
+ loadDataset: () => loadDataset
11197
10983
  });
11198
10984
 
11199
10985
  // src/benchmarks/routing/dataset.ts
@@ -11331,7 +11117,7 @@ var ROUTING_DATASET = [
11331
11117
  // src/benchmarks/routing/index.ts
11332
11118
  var RoutingAdapter = class {
11333
11119
  async loadDataset(split) {
11334
- return ROUTING_DATASET.map((item) => ({ id: item.id, payload: item })).filter((it) => assignSplitImpl3(it.id) === split);
11120
+ return ROUTING_DATASET.map((item) => ({ id: item.id, payload: item })).filter((it) => assignSplitImpl(it.id) === split);
11335
11121
  }
11336
11122
  async evaluate(item, response) {
11337
11123
  const tokens2 = extractRouteTokens(response);
@@ -11352,20 +11138,20 @@ var RoutingAdapter = class {
11352
11138
  };
11353
11139
  }
11354
11140
  assignSplit(itemId) {
11355
- return assignSplitImpl3(itemId);
11141
+ return assignSplitImpl(itemId);
11356
11142
  }
11357
11143
  };
11358
- function assignSplitImpl3(itemId) {
11144
+ function assignSplitImpl(itemId) {
11359
11145
  return deterministicSplit(`routing::${itemId}`);
11360
11146
  }
11361
11147
  function extractRouteTokens(response) {
11362
11148
  const matches2 = response.match(/[a-z][a-z0-9_]*\.[a-z][a-z0-9_]*/gi);
11363
11149
  return matches2 ?? [];
11364
11150
  }
11365
- var adapter3 = new RoutingAdapter();
11366
- var loadDataset3 = adapter3.loadDataset.bind(adapter3);
11367
- var evaluate3 = adapter3.evaluate.bind(adapter3);
11368
- var assignSplit3 = adapter3.assignSplit.bind(adapter3);
11151
+ var adapter = new RoutingAdapter();
11152
+ var loadDataset = adapter.loadDataset.bind(adapter);
11153
+ var evaluate = adapter.evaluate.bind(adapter);
11154
+ var assignSplit = adapter.assignSplit.bind(adapter);
11369
11155
 
11370
11156
  // src/reference-replay-steering.ts
11371
11157
  function referenceReplayRunsToSteeringRows(runs, options = {}) {
@@ -11632,11 +11418,11 @@ function samePopulation(a, b) {
11632
11418
  }
11633
11419
 
11634
11420
  // src/jsonl-trial-cache.ts
11635
- import { appendFileSync as appendFileSync4, existsSync as existsSync8, mkdirSync as mkdirSync4, readFileSync as readFileSync7 } from "fs";
11421
+ import { appendFileSync as appendFileSync4, existsSync as existsSync6, mkdirSync as mkdirSync4, readFileSync as readFileSync5 } from "fs";
11636
11422
  import { dirname as dirname4 } from "path";
11637
11423
 
11638
11424
  // src/locked-jsonl-appender.ts
11639
- import { appendFileSync as appendFileSync3, existsSync as existsSync7, mkdirSync as mkdirSync3 } from "fs";
11425
+ import { appendFileSync as appendFileSync3, existsSync as existsSync5, mkdirSync as mkdirSync3 } from "fs";
11640
11426
  import { dirname as dirname3 } from "path";
11641
11427
  var mutexes = /* @__PURE__ */ new Map();
11642
11428
  function getMutex(path) {
@@ -11651,7 +11437,7 @@ var LockedJsonlAppender = class {
11651
11437
  constructor(path) {
11652
11438
  this.path = path;
11653
11439
  this.mutex = getMutex(path);
11654
- if (!existsSync7(dirname3(path))) {
11440
+ if (!existsSync5(dirname3(path))) {
11655
11441
  mkdirSync3(dirname3(path), { recursive: true });
11656
11442
  }
11657
11443
  }
@@ -11676,8 +11462,8 @@ var JsonlTrialCache = class {
11676
11462
  appender;
11677
11463
  constructor(path) {
11678
11464
  this.path = path;
11679
- if (existsSync8(path)) {
11680
- for (const line of readFileSync7(path, "utf-8").split("\n")) {
11465
+ if (existsSync6(path)) {
11466
+ for (const line of readFileSync5(path, "utf-8").split("\n")) {
11681
11467
  if (!line.trim()) continue;
11682
11468
  try {
11683
11469
  const entry = JSON.parse(line);
@@ -11715,7 +11501,7 @@ var JsonlTrialCache = class {
11715
11501
  };
11716
11502
 
11717
11503
  // src/evolution-telemetry.ts
11718
- import { appendFileSync as appendFileSync5, existsSync as existsSync9, mkdirSync as mkdirSync5, readFileSync as readFileSync8, writeFileSync } from "fs";
11504
+ import { appendFileSync as appendFileSync5, existsSync as existsSync7, mkdirSync as mkdirSync5, readFileSync as readFileSync6, writeFileSync } from "fs";
11719
11505
  import { dirname as dirname5 } from "path";
11720
11506
  var MutationTelemetry = class {
11721
11507
  appender;
@@ -11746,16 +11532,16 @@ var LineageRecorder = class {
11746
11532
  this.snapshotPath = `${path}.snapshot`;
11747
11533
  this.kindOf = kindOf ?? defaultKindOf;
11748
11534
  mkdirSync5(dirname5(path), { recursive: true });
11749
- if (existsSync9(this.snapshotPath)) {
11535
+ if (existsSync7(this.snapshotPath)) {
11750
11536
  try {
11751
- const parsed = JSON.parse(readFileSync8(this.snapshotPath, "utf-8"));
11537
+ const parsed = JSON.parse(readFileSync6(this.snapshotPath, "utf-8"));
11752
11538
  for (const n of parsed) this.nodes.set(n.id, n);
11753
11539
  } catch {
11754
11540
  }
11755
11541
  }
11756
- if (existsSync9(path)) {
11542
+ if (existsSync7(path)) {
11757
11543
  try {
11758
- for (const line of readFileSync8(path, "utf-8").split("\n")) {
11544
+ for (const line of readFileSync6(path, "utf-8").split("\n")) {
11759
11545
  if (!line.trim()) continue;
11760
11546
  try {
11761
11547
  const entry = JSON.parse(line);
@@ -11767,9 +11553,9 @@ var LineageRecorder = class {
11767
11553
  } catch {
11768
11554
  }
11769
11555
  }
11770
- if (existsSync9(path) && this.nodes.size === 0) {
11556
+ if (existsSync7(path) && this.nodes.size === 0) {
11771
11557
  try {
11772
- const raw = readFileSync8(path, "utf-8").trim();
11558
+ const raw = readFileSync6(path, "utf-8").trim();
11773
11559
  if (raw.startsWith("[")) {
11774
11560
  const parsed = JSON.parse(raw);
11775
11561
  for (const n of parsed) this.nodes.set(n.id, n);
@@ -11783,8 +11569,8 @@ var LineageRecorder = class {
11783
11569
  const prev = this.nodes.get(node.id);
11784
11570
  this.nodes.set(node.id, { ...prev, ...node });
11785
11571
  try {
11786
- if (existsSync9(this.path)) {
11787
- const head = readFileSync8(this.path, { encoding: "utf-8", flag: "r" }).slice(0, 1);
11572
+ if (existsSync7(this.path)) {
11573
+ const head = readFileSync6(this.path, { encoding: "utf-8", flag: "r" }).slice(0, 1);
11788
11574
  if (head === "[") {
11789
11575
  writeFileSync(this.path, "");
11790
11576
  }
@@ -11850,9 +11636,9 @@ var CostLedger = class {
11850
11636
  mutex = new Mutex();
11851
11637
  constructor(path) {
11852
11638
  this.path = path;
11853
- if (existsSync9(path)) {
11639
+ if (existsSync7(path)) {
11854
11640
  try {
11855
- const loaded = JSON.parse(readFileSync8(path, "utf-8"));
11641
+ const loaded = JSON.parse(readFileSync6(path, "utf-8"));
11856
11642
  for (const k of Object.keys(this.totals)) {
11857
11643
  if (k === "byGeneration") {
11858
11644
  if (loaded.byGeneration && typeof loaded.byGeneration === "object") {