@tangle-network/agent-eval 0.16.1 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -4
- package/dist/index.d.ts +14 -111
- package/dist/index.js +41 -260
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -196,10 +196,11 @@ These are the primitives any team running prompt-optimization in production need
|
|
|
196
196
|
meta-loop (`inspectFailures` → `proposeChange` → `applyChange` →
|
|
197
197
|
`evaluateChange`). Ship a `NoopResearcher` as a placeholder; real
|
|
198
198
|
implementations live downstream.
|
|
199
|
-
- `benchmarks/
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
199
|
+
- `benchmarks/routing` — synthetic 16-task router benchmark we own.
|
|
200
|
+
Ships in the package. Reference wrappers for GSM8K and SWE-Bench
|
|
201
|
+
Lite live under `examples/benchmarks/` — read, copy, adapt. All
|
|
202
|
+
three implement one `BenchmarkAdapter` shape with deterministic
|
|
203
|
+
splits and fail-loud env-var configuration.
|
|
203
204
|
|
|
204
205
|
### v0.16 changes from v0.15
|
|
205
206
|
|
package/dist/index.d.ts
CHANGED
|
@@ -6975,103 +6975,6 @@ declare const BENCHMARK_SPLIT_SEED = "agent-eval-v1";
|
|
|
6975
6975
|
*/
|
|
6976
6976
|
declare function deterministicSplit(itemId: string, seed?: string): RunSplitTag;
|
|
6977
6977
|
|
|
6978
|
-
/**
|
|
6979
|
-
* GSM8K wrapper — exact-match grading on the final numeric answer.
|
|
6980
|
-
*
|
|
6981
|
-
* The dataset itself is NOT bundled. `loadDataset` will:
|
|
6982
|
-
* 1. read from `process.env.AGENT_EVAL_GSM8K_PATH` if set (a JSONL
|
|
6983
|
-
* file with `{ id, question, answer }` records — the standard
|
|
6984
|
-
* HF mirror layout converted to JSONL);
|
|
6985
|
-
* 2. otherwise throw a clearly-marked error pointing to the loader.
|
|
6986
|
-
*
|
|
6987
|
-
* `evaluate` parses the final number out of the response (last
|
|
6988
|
-
* occurrence of a signed-decimal-or-integer literal, optionally after
|
|
6989
|
-
* `####`, the GSM8K answer convention) and compares to the ground-
|
|
6990
|
-
* truth integer. Floating-point comparisons use a 1e-6 tolerance.
|
|
6991
|
-
*/
|
|
6992
|
-
|
|
6993
|
-
interface Gsm8kPayload {
|
|
6994
|
-
question: string;
|
|
6995
|
-
/** Reference answer, post-#### normalization. May be a number or
|
|
6996
|
-
* a numeric string ("72", "1.5"). */
|
|
6997
|
-
answer: string;
|
|
6998
|
-
}
|
|
6999
|
-
type Gsm8kItem = BenchmarkDatasetItem<Gsm8kPayload>;
|
|
7000
|
-
declare class Gsm8kAdapter implements BenchmarkAdapter<Gsm8kItem, Gsm8kPayload> {
|
|
7001
|
-
loadDataset(split: RunSplitTag): Promise<Gsm8kItem[]>;
|
|
7002
|
-
evaluate(item: Gsm8kItem, response: string): Promise<BenchmarkEvaluation>;
|
|
7003
|
-
assignSplit(itemId: string): RunSplitTag;
|
|
7004
|
-
}
|
|
7005
|
-
/**
|
|
7006
|
-
* Parse a GSM8K-style answer. Honors the dataset's `#### N`
|
|
7007
|
-
* convention (the canonical answer comes after `####`); otherwise
|
|
7008
|
-
* returns the LAST signed numeric literal in the string.
|
|
7009
|
-
*/
|
|
7010
|
-
declare function parseGsm8kAnswer(text: string): number | null;
|
|
7011
|
-
declare const loadDataset$2: (split: RunSplitTag) => Promise<Gsm8kItem[]>;
|
|
7012
|
-
declare const evaluate$2: (item: Gsm8kItem, response: string) => Promise<BenchmarkEvaluation>;
|
|
7013
|
-
declare const assignSplit$2: (itemId: string) => RunSplitTag;
|
|
7014
|
-
|
|
7015
|
-
type index$3_Gsm8kAdapter = Gsm8kAdapter;
|
|
7016
|
-
declare const index$3_Gsm8kAdapter: typeof Gsm8kAdapter;
|
|
7017
|
-
type index$3_Gsm8kItem = Gsm8kItem;
|
|
7018
|
-
type index$3_Gsm8kPayload = Gsm8kPayload;
|
|
7019
|
-
declare const index$3_parseGsm8kAnswer: typeof parseGsm8kAnswer;
|
|
7020
|
-
declare namespace index$3 {
|
|
7021
|
-
export { index$3_Gsm8kAdapter as Gsm8kAdapter, type index$3_Gsm8kItem as Gsm8kItem, type index$3_Gsm8kPayload as Gsm8kPayload, assignSplit$2 as assignSplit, evaluate$2 as evaluate, loadDataset$2 as loadDataset, index$3_parseGsm8kAnswer as parseGsm8kAnswer };
|
|
7022
|
-
}
|
|
7023
|
-
|
|
7024
|
-
/**
|
|
7025
|
-
* SWE-Bench Lite wrapper — 30-instance subset.
|
|
7026
|
-
*
|
|
7027
|
-
* Status: STUB. The actual SWE-Bench harness needs a Docker host and
|
|
7028
|
-
* is too heavy to ship inside this package. We expose the contract
|
|
7029
|
-
* (loadDataset, evaluate, assignSplit) so consumers can plug in their
|
|
7030
|
-
* own grader without touching call sites.
|
|
7031
|
-
*
|
|
7032
|
-
* Wire-up paths in priority order:
|
|
7033
|
-
*
|
|
7034
|
-
* 1. `process.env.AGENT_EVAL_SWEBENCH_PATH` → JSONL with the 30
|
|
7035
|
-
* lite instances + per-instance metadata (instance_id,
|
|
7036
|
-
* problem_statement, base_commit, repo, FAIL_TO_PASS,
|
|
7037
|
-
* PASS_TO_PASS).
|
|
7038
|
-
* 2. `process.env.AGENT_EVAL_SWEBENCH_GRADER_CMD` → executable
|
|
7039
|
-
* that reads `{instance_id, patch}` JSON on stdin and writes
|
|
7040
|
-
* `{passed, fail_to_pass_passed, pass_to_pass_passed, log}`
|
|
7041
|
-
* JSON on stdout. Implementations can shell out to the
|
|
7042
|
-
* official `swebench` runner here.
|
|
7043
|
-
*
|
|
7044
|
-
* If neither is set, every public method throws a clearly-marked
|
|
7045
|
-
* "not implemented" error. The stub fails LOUD; it never silently
|
|
7046
|
-
* scores zero.
|
|
7047
|
-
*/
|
|
7048
|
-
|
|
7049
|
-
interface SweBenchLitePayload {
|
|
7050
|
-
instanceId: string;
|
|
7051
|
-
problemStatement: string;
|
|
7052
|
-
baseCommit: string;
|
|
7053
|
-
repo: string;
|
|
7054
|
-
failToPass: string[];
|
|
7055
|
-
passToPass: string[];
|
|
7056
|
-
}
|
|
7057
|
-
type SweBenchLiteItem = BenchmarkDatasetItem<SweBenchLitePayload>;
|
|
7058
|
-
declare class SweBenchLiteAdapter implements BenchmarkAdapter<SweBenchLiteItem, SweBenchLitePayload> {
|
|
7059
|
-
loadDataset(split: RunSplitTag): Promise<SweBenchLiteItem[]>;
|
|
7060
|
-
evaluate(item: SweBenchLiteItem, response: string): Promise<BenchmarkEvaluation>;
|
|
7061
|
-
assignSplit(itemId: string): RunSplitTag;
|
|
7062
|
-
}
|
|
7063
|
-
declare const loadDataset$1: (split: RunSplitTag) => Promise<SweBenchLiteItem[]>;
|
|
7064
|
-
declare const evaluate$1: (item: SweBenchLiteItem, response: string) => Promise<BenchmarkEvaluation>;
|
|
7065
|
-
declare const assignSplit$1: (itemId: string) => RunSplitTag;
|
|
7066
|
-
|
|
7067
|
-
type index$2_SweBenchLiteAdapter = SweBenchLiteAdapter;
|
|
7068
|
-
declare const index$2_SweBenchLiteAdapter: typeof SweBenchLiteAdapter;
|
|
7069
|
-
type index$2_SweBenchLiteItem = SweBenchLiteItem;
|
|
7070
|
-
type index$2_SweBenchLitePayload = SweBenchLitePayload;
|
|
7071
|
-
declare namespace index$2 {
|
|
7072
|
-
export { index$2_SweBenchLiteAdapter as SweBenchLiteAdapter, type index$2_SweBenchLiteItem as SweBenchLiteItem, type index$2_SweBenchLitePayload as SweBenchLitePayload, assignSplit$1 as assignSplit, evaluate$1 as evaluate, loadDataset$1 as loadDataset };
|
|
7073
|
-
}
|
|
7074
|
-
|
|
7075
6978
|
/**
|
|
7076
6979
|
* Synthetic routing dataset. 16 tasks across 4 categories. Used as a
|
|
7077
6980
|
* deterministic, dependency-free benchmark for any router that maps a
|
|
@@ -7153,21 +7056,21 @@ declare namespace index$1 {
|
|
|
7153
7056
|
/**
|
|
7154
7057
|
* Reference benchmark wrappers — entry point.
|
|
7155
7058
|
*
|
|
7156
|
-
*
|
|
7157
|
-
* - `
|
|
7158
|
-
*
|
|
7159
|
-
* - `
|
|
7160
|
-
*
|
|
7161
|
-
* - `routing` — synthetic 16-task router benchmark, ships
|
|
7162
|
-
* in the package.
|
|
7059
|
+
* Core surface (exported here):
|
|
7060
|
+
* - The `BenchmarkAdapter` contract.
|
|
7061
|
+
* - `deterministicSplit` + `BENCHMARK_SPLIT_SEED` for split assignment.
|
|
7062
|
+
* - `routing` — synthetic 16-task router benchmark. The only novel
|
|
7063
|
+
* benchmark we built; ships in the package.
|
|
7163
7064
|
*
|
|
7164
|
-
*
|
|
7165
|
-
* `
|
|
7166
|
-
*
|
|
7065
|
+
* Example wrappers (under `examples/benchmarks/`, NOT in the bundle):
|
|
7066
|
+
* - `gsm8k` — exact-match math reasoning (HF mirror, dataset
|
|
7067
|
+
* not bundled).
|
|
7068
|
+
* - `swebench-lite` — 30-instance SWE-Bench subset (stub; needs an
|
|
7069
|
+
* external grader).
|
|
7167
7070
|
*
|
|
7168
|
-
*
|
|
7169
|
-
*
|
|
7170
|
-
*
|
|
7071
|
+
* The example wrappers are reference implementations of `BenchmarkAdapter`.
|
|
7072
|
+
* Read them, copy them, adapt them. They're intentionally not in the main
|
|
7073
|
+
* entry — every team will configure them differently.
|
|
7171
7074
|
*/
|
|
7172
7075
|
|
|
7173
7076
|
declare const index_BENCHMARK_SPLIT_SEED: typeof BENCHMARK_SPLIT_SEED;
|
|
@@ -7176,7 +7079,7 @@ type index_BenchmarkDatasetItem<TPayload = unknown> = BenchmarkDatasetItem<TPayl
|
|
|
7176
7079
|
type index_BenchmarkEvaluation = BenchmarkEvaluation;
|
|
7177
7080
|
declare const index_deterministicSplit: typeof deterministicSplit;
|
|
7178
7081
|
declare namespace index {
|
|
7179
|
-
export { index_BENCHMARK_SPLIT_SEED as BENCHMARK_SPLIT_SEED, type index_BenchmarkAdapter as BenchmarkAdapter, type index_BenchmarkDatasetItem as BenchmarkDatasetItem, type index_BenchmarkEvaluation as BenchmarkEvaluation, index_deterministicSplit as deterministicSplit, index$
|
|
7082
|
+
export { index_BENCHMARK_SPLIT_SEED as BENCHMARK_SPLIT_SEED, type index_BenchmarkAdapter as BenchmarkAdapter, type index_BenchmarkDatasetItem as BenchmarkDatasetItem, type index_BenchmarkEvaluation as BenchmarkEvaluation, index_deterministicSplit as deterministicSplit, index$1 as routing };
|
|
7180
7083
|
}
|
|
7181
7084
|
|
|
7182
7085
|
interface ReferenceReplaySteeringRowsOptions<Input = unknown> {
|
package/dist/index.js
CHANGED
|
@@ -268,12 +268,7 @@ ${codeText}`
|
|
|
268
268
|
};
|
|
269
269
|
var coherenceJudge = async (tc, { scenario, turns }) => {
|
|
270
270
|
if (turns.length < 2) {
|
|
271
|
-
return [
|
|
272
|
-
judgeName: "coherence",
|
|
273
|
-
dimension: "coherence",
|
|
274
|
-
score: 5,
|
|
275
|
-
reasoning: "Single-turn scenario \u2014 coherence not fully testable."
|
|
276
|
-
}];
|
|
271
|
+
return [];
|
|
277
272
|
}
|
|
278
273
|
const conversation = turns.map(
|
|
279
274
|
(t, i) => `Turn ${i + 1}:
|
|
@@ -3342,12 +3337,12 @@ var SubprocessSandboxDriver = class {
|
|
|
3342
3337
|
this.defaultEnv = options.env;
|
|
3343
3338
|
}
|
|
3344
3339
|
async exec(phase, command, config) {
|
|
3345
|
-
const { spawn
|
|
3340
|
+
const { spawn } = await import("child_process");
|
|
3346
3341
|
const start = Date.now();
|
|
3347
3342
|
const effectiveCwd = config.cwd ?? this.defaultCwd;
|
|
3348
3343
|
const effectiveEnv = { ...process.env, ...this.defaultEnv ?? {}, ...config.env ?? {} };
|
|
3349
3344
|
return await new Promise((resolve) => {
|
|
3350
|
-
const child =
|
|
3345
|
+
const child = spawn(command, {
|
|
3351
3346
|
shell: true,
|
|
3352
3347
|
cwd: effectiveCwd,
|
|
3353
3348
|
env: effectiveEnv
|
|
@@ -8583,20 +8578,20 @@ function mergeLayerResults(name, perAdapter, options = {}) {
|
|
|
8583
8578
|
let durationMs = 0;
|
|
8584
8579
|
const reasonParts = [];
|
|
8585
8580
|
const diagnostics = {};
|
|
8586
|
-
for (const { adapter:
|
|
8581
|
+
for (const { adapter: adapter2, result } of perAdapter) {
|
|
8587
8582
|
status = worst(status, result.status);
|
|
8588
8583
|
if (typeof result.score === "number") {
|
|
8589
8584
|
weightedScoreSum += result.score;
|
|
8590
8585
|
weightCount += 1;
|
|
8591
8586
|
}
|
|
8592
8587
|
durationMs = mergeDuration === "sum" ? durationMs + result.durationMs : Math.max(durationMs, result.durationMs);
|
|
8593
|
-
reasonParts.push(`${
|
|
8588
|
+
reasonParts.push(`${adapter2}: ${result.status}`);
|
|
8594
8589
|
for (const f of result.findings) {
|
|
8595
8590
|
findings.push({
|
|
8596
8591
|
...f,
|
|
8597
8592
|
layer: name,
|
|
8598
|
-
message: prefix ? `${prefix(
|
|
8599
|
-
detail: { ...f.detail ?? {}, adapter:
|
|
8593
|
+
message: prefix ? `${prefix(adapter2)} ${f.message}` : f.message,
|
|
8594
|
+
detail: { ...f.detail ?? {}, adapter: adapter2 }
|
|
8600
8595
|
});
|
|
8601
8596
|
}
|
|
8602
8597
|
for (const [k, v] of Object.entries(result.diagnostics ?? {})) {
|
|
@@ -8615,8 +8610,8 @@ function mergeLayerResults(name, perAdapter, options = {}) {
|
|
|
8615
8610
|
reason: reasonParts.join(" \xB7 "),
|
|
8616
8611
|
diagnostics: Object.keys(diagnostics).length > 0 ? diagnostics : void 0,
|
|
8617
8612
|
detail: {
|
|
8618
|
-
adapters: perAdapter.map(({ adapter:
|
|
8619
|
-
adapter:
|
|
8613
|
+
adapters: perAdapter.map(({ adapter: adapter2, result }) => ({
|
|
8614
|
+
adapter: adapter2,
|
|
8620
8615
|
status: result.status,
|
|
8621
8616
|
score: result.score ?? null
|
|
8622
8617
|
})),
|
|
@@ -8642,10 +8637,10 @@ function multiToolchainLayer(config) {
|
|
|
8642
8637
|
reason: "no adapters detected"
|
|
8643
8638
|
};
|
|
8644
8639
|
}
|
|
8645
|
-
const runOne = async (
|
|
8646
|
-
const adapterName = config.adapterName(
|
|
8640
|
+
const runOne = async (adapter2) => {
|
|
8641
|
+
const adapterName = config.adapterName(adapter2);
|
|
8647
8642
|
try {
|
|
8648
|
-
const r = await config.run(
|
|
8643
|
+
const r = await config.run(adapter2, ctx);
|
|
8649
8644
|
return { adapter: adapterName, result: r };
|
|
8650
8645
|
} catch (err) {
|
|
8651
8646
|
return {
|
|
@@ -10081,8 +10076,8 @@ function formatPct(value) {
|
|
|
10081
10076
|
function bySplitOrder(a, b) {
|
|
10082
10077
|
return ALL_SPLITS.indexOf(a) - ALL_SPLITS.indexOf(b);
|
|
10083
10078
|
}
|
|
10084
|
-
function runAdapter(
|
|
10085
|
-
return typeof
|
|
10079
|
+
function runAdapter(adapter2, scenario, context) {
|
|
10080
|
+
return typeof adapter2 === "function" ? adapter2(scenario, context) : adapter2.run(scenario, context);
|
|
10086
10081
|
}
|
|
10087
10082
|
function throwIfAborted(signal) {
|
|
10088
10083
|
if (!signal?.aborted) return;
|
|
@@ -10973,232 +10968,18 @@ var benchmarks_exports = {};
|
|
|
10973
10968
|
__export(benchmarks_exports, {
|
|
10974
10969
|
BENCHMARK_SPLIT_SEED: () => BENCHMARK_SPLIT_SEED,
|
|
10975
10970
|
deterministicSplit: () => deterministicSplit,
|
|
10976
|
-
|
|
10977
|
-
routing: () => routing_exports,
|
|
10978
|
-
swebenchLite: () => swebench_lite_exports
|
|
10971
|
+
routing: () => routing_exports
|
|
10979
10972
|
});
|
|
10980
10973
|
|
|
10981
|
-
// src/benchmarks/gsm8k/index.ts
|
|
10982
|
-
var gsm8k_exports = {};
|
|
10983
|
-
__export(gsm8k_exports, {
|
|
10984
|
-
Gsm8kAdapter: () => Gsm8kAdapter,
|
|
10985
|
-
assignSplit: () => assignSplit,
|
|
10986
|
-
evaluate: () => evaluate,
|
|
10987
|
-
loadDataset: () => loadDataset,
|
|
10988
|
-
parseGsm8kAnswer: () => parseGsm8kAnswer
|
|
10989
|
-
});
|
|
10990
|
-
import { existsSync as existsSync5, readFileSync as readFileSync5 } from "fs";
|
|
10991
|
-
var Gsm8kAdapter = class {
|
|
10992
|
-
async loadDataset(split) {
|
|
10993
|
-
const path = process.env.AGENT_EVAL_GSM8K_PATH;
|
|
10994
|
-
if (!path) {
|
|
10995
|
-
throw new Error(
|
|
10996
|
-
"GSM8K dataset not provided. Set AGENT_EVAL_GSM8K_PATH to a JSONL file with {id, question, answer} records (the HF GSM8K mirror converted to JSONL)."
|
|
10997
|
-
);
|
|
10998
|
-
}
|
|
10999
|
-
if (!existsSync5(path)) {
|
|
11000
|
-
throw new Error(`AGENT_EVAL_GSM8K_PATH=${path} does not exist`);
|
|
11001
|
-
}
|
|
11002
|
-
const items = parseJsonl(path).filter((it) => assignSplitImpl(it.id) === split);
|
|
11003
|
-
return items;
|
|
11004
|
-
}
|
|
11005
|
-
async evaluate(item, response) {
|
|
11006
|
-
const expected = parseGsm8kAnswer(item.payload.answer);
|
|
11007
|
-
const observed = parseGsm8kAnswer(response);
|
|
11008
|
-
if (expected === null) {
|
|
11009
|
-
return { score: 0, raw: { reason: "reference_not_numeric", expected: item.payload.answer } };
|
|
11010
|
-
}
|
|
11011
|
-
if (observed === null) {
|
|
11012
|
-
return { score: 0, raw: { reason: "no_numeric_in_response", expected, observed: null } };
|
|
11013
|
-
}
|
|
11014
|
-
const ok = Math.abs(expected - observed) < 1e-6;
|
|
11015
|
-
return { score: ok ? 1 : 0, raw: { expected, observed, exactMatch: ok } };
|
|
11016
|
-
}
|
|
11017
|
-
assignSplit(itemId) {
|
|
11018
|
-
return assignSplitImpl(itemId);
|
|
11019
|
-
}
|
|
11020
|
-
};
|
|
11021
|
-
function assignSplitImpl(itemId) {
|
|
11022
|
-
return deterministicSplit(`gsm8k::${itemId}`);
|
|
11023
|
-
}
|
|
11024
|
-
function parseJsonl(path) {
|
|
11025
|
-
const raw = readFileSync5(path, "utf8");
|
|
11026
|
-
const out = [];
|
|
11027
|
-
let lineNo = 0;
|
|
11028
|
-
for (const line of raw.split("\n")) {
|
|
11029
|
-
lineNo++;
|
|
11030
|
-
const trimmed = line.trim();
|
|
11031
|
-
if (!trimmed) continue;
|
|
11032
|
-
let row;
|
|
11033
|
-
try {
|
|
11034
|
-
row = JSON.parse(trimmed);
|
|
11035
|
-
} catch (e) {
|
|
11036
|
-
throw new Error(`GSM8K JSONL parse error at line ${lineNo}: ${e.message}`);
|
|
11037
|
-
}
|
|
11038
|
-
const id = String(row.id ?? `gsm8k_${lineNo}`);
|
|
11039
|
-
const question = String(row.question ?? "");
|
|
11040
|
-
const answer = String(row.answer ?? "");
|
|
11041
|
-
if (!question || !answer) {
|
|
11042
|
-
throw new Error(`GSM8K JSONL line ${lineNo} missing question/answer`);
|
|
11043
|
-
}
|
|
11044
|
-
out.push({ id, payload: { question, answer } });
|
|
11045
|
-
}
|
|
11046
|
-
return out;
|
|
11047
|
-
}
|
|
11048
|
-
function parseGsm8kAnswer(text) {
|
|
11049
|
-
if (!text) return null;
|
|
11050
|
-
const afterMarker = text.match(/####\s*(-?\d[\d,]*\.?\d*)/);
|
|
11051
|
-
if (afterMarker) {
|
|
11052
|
-
const cleaned2 = afterMarker[1].replace(/,/g, "");
|
|
11053
|
-
const v2 = Number(cleaned2);
|
|
11054
|
-
if (Number.isFinite(v2)) return v2;
|
|
11055
|
-
}
|
|
11056
|
-
const matches2 = text.match(/-?\d[\d,]*\.?\d*/g);
|
|
11057
|
-
if (!matches2 || matches2.length === 0) return null;
|
|
11058
|
-
const last = matches2[matches2.length - 1];
|
|
11059
|
-
const cleaned = last.replace(/,/g, "");
|
|
11060
|
-
const v = Number(cleaned);
|
|
11061
|
-
return Number.isFinite(v) ? v : null;
|
|
11062
|
-
}
|
|
11063
|
-
var adapter = new Gsm8kAdapter();
|
|
11064
|
-
var loadDataset = adapter.loadDataset.bind(adapter);
|
|
11065
|
-
var evaluate = adapter.evaluate.bind(adapter);
|
|
11066
|
-
var assignSplit = adapter.assignSplit.bind(adapter);
|
|
11067
|
-
|
|
11068
|
-
// src/benchmarks/swebench-lite/index.ts
|
|
11069
|
-
var swebench_lite_exports = {};
|
|
11070
|
-
__export(swebench_lite_exports, {
|
|
11071
|
-
SweBenchLiteAdapter: () => SweBenchLiteAdapter,
|
|
11072
|
-
assignSplit: () => assignSplit2,
|
|
11073
|
-
evaluate: () => evaluate2,
|
|
11074
|
-
loadDataset: () => loadDataset2
|
|
11075
|
-
});
|
|
11076
|
-
import { existsSync as existsSync6, readFileSync as readFileSync6 } from "fs";
|
|
11077
|
-
import { spawn } from "child_process";
|
|
11078
|
-
var SweBenchLiteAdapter = class {
|
|
11079
|
-
async loadDataset(split) {
|
|
11080
|
-
const path = process.env.AGENT_EVAL_SWEBENCH_PATH;
|
|
11081
|
-
if (!path) {
|
|
11082
|
-
throw new Error(
|
|
11083
|
-
"SWE-Bench Lite dataset not provided. Set AGENT_EVAL_SWEBENCH_PATH to a JSONL file with the 30 lite instances. STUB: this wrapper does not bundle the dataset; see https://www.swebench.com/lite.html for the canonical source."
|
|
11084
|
-
);
|
|
11085
|
-
}
|
|
11086
|
-
if (!existsSync6(path)) {
|
|
11087
|
-
throw new Error(`AGENT_EVAL_SWEBENCH_PATH=${path} does not exist`);
|
|
11088
|
-
}
|
|
11089
|
-
const all = parseJsonl2(path);
|
|
11090
|
-
return all.filter((it) => assignSplitImpl2(it.id) === split);
|
|
11091
|
-
}
|
|
11092
|
-
async evaluate(item, response) {
|
|
11093
|
-
const cmd = process.env.AGENT_EVAL_SWEBENCH_GRADER_CMD;
|
|
11094
|
-
if (!cmd) {
|
|
11095
|
-
throw new Error(
|
|
11096
|
-
"SWE-Bench Lite grader not configured. Set AGENT_EVAL_SWEBENCH_GRADER_CMD to an executable that reads {instance_id, patch} JSON on stdin and writes {passed, fail_to_pass_passed, pass_to_pass_passed, log} JSON on stdout. TODO(swebench-lite): bundle a default Docker-based runner once the SDK stabilises (https://github.com/swe-bench/SWE-bench)."
|
|
11097
|
-
);
|
|
11098
|
-
}
|
|
11099
|
-
const stdinPayload = JSON.stringify({ instance_id: item.payload.instanceId, patch: response });
|
|
11100
|
-
const result = await runGrader(cmd, stdinPayload);
|
|
11101
|
-
let parsed;
|
|
11102
|
-
try {
|
|
11103
|
-
parsed = JSON.parse(result.stdout);
|
|
11104
|
-
} catch (e) {
|
|
11105
|
-
throw new Error(
|
|
11106
|
-
`SWE-Bench grader emitted non-JSON stdout: ${e.message}
|
|
11107
|
-
stdout=${result.stdout.slice(0, 400)}
|
|
11108
|
-
stderr=${result.stderr.slice(0, 400)}`
|
|
11109
|
-
);
|
|
11110
|
-
}
|
|
11111
|
-
const passed = Boolean(parsed.passed);
|
|
11112
|
-
return {
|
|
11113
|
-
score: passed ? 1 : 0,
|
|
11114
|
-
raw: {
|
|
11115
|
-
passed,
|
|
11116
|
-
failToPassPassed: Boolean(parsed.fail_to_pass_passed),
|
|
11117
|
-
passToPassPassed: Boolean(parsed.pass_to_pass_passed),
|
|
11118
|
-
graderLog: typeof parsed.log === "string" ? parsed.log.slice(0, 4e3) : ""
|
|
11119
|
-
}
|
|
11120
|
-
};
|
|
11121
|
-
}
|
|
11122
|
-
assignSplit(itemId) {
|
|
11123
|
-
return assignSplitImpl2(itemId);
|
|
11124
|
-
}
|
|
11125
|
-
};
|
|
11126
|
-
function assignSplitImpl2(itemId) {
|
|
11127
|
-
return deterministicSplit(`swebench-lite::${itemId}`);
|
|
11128
|
-
}
|
|
11129
|
-
function parseJsonl2(path) {
|
|
11130
|
-
const raw = readFileSync6(path, "utf8");
|
|
11131
|
-
const out = [];
|
|
11132
|
-
let lineNo = 0;
|
|
11133
|
-
for (const line of raw.split("\n")) {
|
|
11134
|
-
lineNo++;
|
|
11135
|
-
const trimmed = line.trim();
|
|
11136
|
-
if (!trimmed) continue;
|
|
11137
|
-
const row = JSON.parse(trimmed);
|
|
11138
|
-
const instanceId = String(row.instance_id ?? row.instanceId ?? "");
|
|
11139
|
-
if (!instanceId) {
|
|
11140
|
-
throw new Error(`swebench-lite line ${lineNo} missing instance_id`);
|
|
11141
|
-
}
|
|
11142
|
-
out.push({
|
|
11143
|
-
id: instanceId,
|
|
11144
|
-
payload: {
|
|
11145
|
-
instanceId,
|
|
11146
|
-
problemStatement: String(row.problem_statement ?? row.problemStatement ?? ""),
|
|
11147
|
-
baseCommit: String(row.base_commit ?? row.baseCommit ?? ""),
|
|
11148
|
-
repo: String(row.repo ?? ""),
|
|
11149
|
-
failToPass: asStringArray(row.FAIL_TO_PASS ?? row.failToPass),
|
|
11150
|
-
passToPass: asStringArray(row.PASS_TO_PASS ?? row.passToPass)
|
|
11151
|
-
}
|
|
11152
|
-
});
|
|
11153
|
-
}
|
|
11154
|
-
return out;
|
|
11155
|
-
}
|
|
11156
|
-
function asStringArray(v) {
|
|
11157
|
-
if (Array.isArray(v)) return v.filter((x) => typeof x === "string");
|
|
11158
|
-
if (typeof v === "string") {
|
|
11159
|
-
try {
|
|
11160
|
-
const parsed = JSON.parse(v);
|
|
11161
|
-
if (Array.isArray(parsed)) return parsed.filter((x) => typeof x === "string");
|
|
11162
|
-
} catch {
|
|
11163
|
-
return [v];
|
|
11164
|
-
}
|
|
11165
|
-
}
|
|
11166
|
-
return [];
|
|
11167
|
-
}
|
|
11168
|
-
function runGrader(cmd, stdin) {
|
|
11169
|
-
return new Promise((resolve, reject) => {
|
|
11170
|
-
const parts = cmd.split(/\s+/);
|
|
11171
|
-
const child = spawn(parts[0], parts.slice(1), { stdio: ["pipe", "pipe", "pipe"] });
|
|
11172
|
-
let stdout = "";
|
|
11173
|
-
let stderr = "";
|
|
11174
|
-
child.stdout.on("data", (b) => stdout += b.toString("utf8"));
|
|
11175
|
-
child.stderr.on("data", (b) => stderr += b.toString("utf8"));
|
|
11176
|
-
child.on("error", reject);
|
|
11177
|
-
child.on("close", (code) => {
|
|
11178
|
-
if (code !== 0) {
|
|
11179
|
-
reject(new Error(`grader exited with code ${code}: ${stderr.slice(0, 400)}`));
|
|
11180
|
-
return;
|
|
11181
|
-
}
|
|
11182
|
-
resolve({ stdout, stderr });
|
|
11183
|
-
});
|
|
11184
|
-
child.stdin.write(stdin);
|
|
11185
|
-
child.stdin.end();
|
|
11186
|
-
});
|
|
11187
|
-
}
|
|
11188
|
-
var adapter2 = new SweBenchLiteAdapter();
|
|
11189
|
-
var loadDataset2 = adapter2.loadDataset.bind(adapter2);
|
|
11190
|
-
var evaluate2 = adapter2.evaluate.bind(adapter2);
|
|
11191
|
-
var assignSplit2 = adapter2.assignSplit.bind(adapter2);
|
|
11192
|
-
|
|
11193
10974
|
// src/benchmarks/routing/index.ts
|
|
11194
10975
|
var routing_exports = {};
|
|
11195
10976
|
__export(routing_exports, {
|
|
11196
10977
|
ROUTING_DATASET: () => ROUTING_DATASET,
|
|
11197
10978
|
RoutingAdapter: () => RoutingAdapter,
|
|
11198
|
-
assignSplit: () =>
|
|
11199
|
-
evaluate: () =>
|
|
10979
|
+
assignSplit: () => assignSplit,
|
|
10980
|
+
evaluate: () => evaluate,
|
|
11200
10981
|
extractRouteTokens: () => extractRouteTokens,
|
|
11201
|
-
loadDataset: () =>
|
|
10982
|
+
loadDataset: () => loadDataset
|
|
11202
10983
|
});
|
|
11203
10984
|
|
|
11204
10985
|
// src/benchmarks/routing/dataset.ts
|
|
@@ -11336,7 +11117,7 @@ var ROUTING_DATASET = [
|
|
|
11336
11117
|
// src/benchmarks/routing/index.ts
|
|
11337
11118
|
var RoutingAdapter = class {
|
|
11338
11119
|
async loadDataset(split) {
|
|
11339
|
-
return ROUTING_DATASET.map((item) => ({ id: item.id, payload: item })).filter((it) =>
|
|
11120
|
+
return ROUTING_DATASET.map((item) => ({ id: item.id, payload: item })).filter((it) => assignSplitImpl(it.id) === split);
|
|
11340
11121
|
}
|
|
11341
11122
|
async evaluate(item, response) {
|
|
11342
11123
|
const tokens2 = extractRouteTokens(response);
|
|
@@ -11357,20 +11138,20 @@ var RoutingAdapter = class {
|
|
|
11357
11138
|
};
|
|
11358
11139
|
}
|
|
11359
11140
|
assignSplit(itemId) {
|
|
11360
|
-
return
|
|
11141
|
+
return assignSplitImpl(itemId);
|
|
11361
11142
|
}
|
|
11362
11143
|
};
|
|
11363
|
-
function
|
|
11144
|
+
function assignSplitImpl(itemId) {
|
|
11364
11145
|
return deterministicSplit(`routing::${itemId}`);
|
|
11365
11146
|
}
|
|
11366
11147
|
function extractRouteTokens(response) {
|
|
11367
11148
|
const matches2 = response.match(/[a-z][a-z0-9_]*\.[a-z][a-z0-9_]*/gi);
|
|
11368
11149
|
return matches2 ?? [];
|
|
11369
11150
|
}
|
|
11370
|
-
var
|
|
11371
|
-
var
|
|
11372
|
-
var
|
|
11373
|
-
var
|
|
11151
|
+
var adapter = new RoutingAdapter();
|
|
11152
|
+
var loadDataset = adapter.loadDataset.bind(adapter);
|
|
11153
|
+
var evaluate = adapter.evaluate.bind(adapter);
|
|
11154
|
+
var assignSplit = adapter.assignSplit.bind(adapter);
|
|
11374
11155
|
|
|
11375
11156
|
// src/reference-replay-steering.ts
|
|
11376
11157
|
function referenceReplayRunsToSteeringRows(runs, options = {}) {
|
|
@@ -11637,11 +11418,11 @@ function samePopulation(a, b) {
|
|
|
11637
11418
|
}
|
|
11638
11419
|
|
|
11639
11420
|
// src/jsonl-trial-cache.ts
|
|
11640
|
-
import { appendFileSync as appendFileSync4, existsSync as
|
|
11421
|
+
import { appendFileSync as appendFileSync4, existsSync as existsSync6, mkdirSync as mkdirSync4, readFileSync as readFileSync5 } from "fs";
|
|
11641
11422
|
import { dirname as dirname4 } from "path";
|
|
11642
11423
|
|
|
11643
11424
|
// src/locked-jsonl-appender.ts
|
|
11644
|
-
import { appendFileSync as appendFileSync3, existsSync as
|
|
11425
|
+
import { appendFileSync as appendFileSync3, existsSync as existsSync5, mkdirSync as mkdirSync3 } from "fs";
|
|
11645
11426
|
import { dirname as dirname3 } from "path";
|
|
11646
11427
|
var mutexes = /* @__PURE__ */ new Map();
|
|
11647
11428
|
function getMutex(path) {
|
|
@@ -11656,7 +11437,7 @@ var LockedJsonlAppender = class {
|
|
|
11656
11437
|
constructor(path) {
|
|
11657
11438
|
this.path = path;
|
|
11658
11439
|
this.mutex = getMutex(path);
|
|
11659
|
-
if (!
|
|
11440
|
+
if (!existsSync5(dirname3(path))) {
|
|
11660
11441
|
mkdirSync3(dirname3(path), { recursive: true });
|
|
11661
11442
|
}
|
|
11662
11443
|
}
|
|
@@ -11681,8 +11462,8 @@ var JsonlTrialCache = class {
|
|
|
11681
11462
|
appender;
|
|
11682
11463
|
constructor(path) {
|
|
11683
11464
|
this.path = path;
|
|
11684
|
-
if (
|
|
11685
|
-
for (const line of
|
|
11465
|
+
if (existsSync6(path)) {
|
|
11466
|
+
for (const line of readFileSync5(path, "utf-8").split("\n")) {
|
|
11686
11467
|
if (!line.trim()) continue;
|
|
11687
11468
|
try {
|
|
11688
11469
|
const entry = JSON.parse(line);
|
|
@@ -11720,7 +11501,7 @@ var JsonlTrialCache = class {
|
|
|
11720
11501
|
};
|
|
11721
11502
|
|
|
11722
11503
|
// src/evolution-telemetry.ts
|
|
11723
|
-
import { appendFileSync as appendFileSync5, existsSync as
|
|
11504
|
+
import { appendFileSync as appendFileSync5, existsSync as existsSync7, mkdirSync as mkdirSync5, readFileSync as readFileSync6, writeFileSync } from "fs";
|
|
11724
11505
|
import { dirname as dirname5 } from "path";
|
|
11725
11506
|
var MutationTelemetry = class {
|
|
11726
11507
|
appender;
|
|
@@ -11751,16 +11532,16 @@ var LineageRecorder = class {
|
|
|
11751
11532
|
this.snapshotPath = `${path}.snapshot`;
|
|
11752
11533
|
this.kindOf = kindOf ?? defaultKindOf;
|
|
11753
11534
|
mkdirSync5(dirname5(path), { recursive: true });
|
|
11754
|
-
if (
|
|
11535
|
+
if (existsSync7(this.snapshotPath)) {
|
|
11755
11536
|
try {
|
|
11756
|
-
const parsed = JSON.parse(
|
|
11537
|
+
const parsed = JSON.parse(readFileSync6(this.snapshotPath, "utf-8"));
|
|
11757
11538
|
for (const n of parsed) this.nodes.set(n.id, n);
|
|
11758
11539
|
} catch {
|
|
11759
11540
|
}
|
|
11760
11541
|
}
|
|
11761
|
-
if (
|
|
11542
|
+
if (existsSync7(path)) {
|
|
11762
11543
|
try {
|
|
11763
|
-
for (const line of
|
|
11544
|
+
for (const line of readFileSync6(path, "utf-8").split("\n")) {
|
|
11764
11545
|
if (!line.trim()) continue;
|
|
11765
11546
|
try {
|
|
11766
11547
|
const entry = JSON.parse(line);
|
|
@@ -11772,9 +11553,9 @@ var LineageRecorder = class {
|
|
|
11772
11553
|
} catch {
|
|
11773
11554
|
}
|
|
11774
11555
|
}
|
|
11775
|
-
if (
|
|
11556
|
+
if (existsSync7(path) && this.nodes.size === 0) {
|
|
11776
11557
|
try {
|
|
11777
|
-
const raw =
|
|
11558
|
+
const raw = readFileSync6(path, "utf-8").trim();
|
|
11778
11559
|
if (raw.startsWith("[")) {
|
|
11779
11560
|
const parsed = JSON.parse(raw);
|
|
11780
11561
|
for (const n of parsed) this.nodes.set(n.id, n);
|
|
@@ -11788,8 +11569,8 @@ var LineageRecorder = class {
|
|
|
11788
11569
|
const prev = this.nodes.get(node.id);
|
|
11789
11570
|
this.nodes.set(node.id, { ...prev, ...node });
|
|
11790
11571
|
try {
|
|
11791
|
-
if (
|
|
11792
|
-
const head =
|
|
11572
|
+
if (existsSync7(this.path)) {
|
|
11573
|
+
const head = readFileSync6(this.path, { encoding: "utf-8", flag: "r" }).slice(0, 1);
|
|
11793
11574
|
if (head === "[") {
|
|
11794
11575
|
writeFileSync(this.path, "");
|
|
11795
11576
|
}
|
|
@@ -11855,9 +11636,9 @@ var CostLedger = class {
|
|
|
11855
11636
|
mutex = new Mutex();
|
|
11856
11637
|
constructor(path) {
|
|
11857
11638
|
this.path = path;
|
|
11858
|
-
if (
|
|
11639
|
+
if (existsSync7(path)) {
|
|
11859
11640
|
try {
|
|
11860
|
-
const loaded = JSON.parse(
|
|
11641
|
+
const loaded = JSON.parse(readFileSync6(path, "utf-8"));
|
|
11861
11642
|
for (const k of Object.keys(this.totals)) {
|
|
11862
11643
|
if (k === "byGeneration") {
|
|
11863
11644
|
if (loaded.byGeneration && typeof loaded.byGeneration === "object") {
|