@wrongstack/bench 0.260.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +107 -0
- package/dist/index.d.ts +606 -0
- package/dist/index.js +1050 -0
- package/dist/index.js.map +1 -0
- package/package.json +45 -0
- package/subsets/swe-bench-verified-50.json +22 -0
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,606 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Core contracts for the model-independent benchmark harness.
|
|
3
|
+
*
|
|
4
|
+
* The guiding principle: WrongStack is the *harness* (system prompt + tool set
|
|
5
|
+
* + agent loop + scaffolding). The model is the swappable variable. Grading is
|
|
6
|
+
* deterministic (the suite's own tests decide pass/fail — never an LLM), and
|
|
7
|
+
* every report is stamped with a {@link HarnessFingerprint} so rows are only
|
|
8
|
+
* comparable when the harness is identical.
|
|
9
|
+
*/
|
|
10
|
+
/** A single model under test — one column in the leaderboard. */
|
|
11
|
+
interface ModelCell {
|
|
12
|
+
/** Short human label shown in the report (e.g. "opus-4.8"). Must be unique. */
|
|
13
|
+
label: string;
|
|
14
|
+
/** Provider id passed to `wstack --provider` (e.g. "anthropic"). */
|
|
15
|
+
provider: string;
|
|
16
|
+
/** Model id passed to `wstack --model` (e.g. "claude-opus-4-8"). */
|
|
17
|
+
model: string;
|
|
18
|
+
}
|
|
19
|
+
/** Loaded `bench.config.json`. */
|
|
20
|
+
interface BenchConfig {
|
|
21
|
+
/** Per-task iteration cap (seeded into the isolated config). Default 40. */
|
|
22
|
+
maxIterations: number;
|
|
23
|
+
/** How many cells/tasks run concurrently. Default 4. */
|
|
24
|
+
concurrency: number;
|
|
25
|
+
/** Per-task wall-clock timeout in milliseconds. Default 600_000 (10m). */
|
|
26
|
+
timeoutMs: number;
|
|
27
|
+
/** The models to benchmark. At least one. */
|
|
28
|
+
cells: ModelCell[];
|
|
29
|
+
}
|
|
30
|
+
/** One unit of work: a single benchmark exercise/issue. */
|
|
31
|
+
interface BenchTask {
|
|
32
|
+
/** Stable id, unique within the suite (e.g. "polyglot/python/bowling"). */
|
|
33
|
+
id: string;
|
|
34
|
+
/** Suite this task belongs to. */
|
|
35
|
+
suite: SuiteId;
|
|
36
|
+
/** The instruction text handed to the agent via `--prompt`. */
|
|
37
|
+
prompt: string;
|
|
38
|
+
/**
|
|
39
|
+
* Absolute path to a template directory. The runner copies it into an
|
|
40
|
+
* isolated workdir before each cell so parallel runs never collide.
|
|
41
|
+
*/
|
|
42
|
+
templateDir: string;
|
|
43
|
+
/**
|
|
44
|
+
* Top-level entry names to omit when copying the template (e.g. `.meta` so
|
|
45
|
+
* the agent never sees the reference solution). Matched against each path's
|
|
46
|
+
* segments. Defaults to none.
|
|
47
|
+
*/
|
|
48
|
+
templateExclude?: string[] | undefined;
|
|
49
|
+
/** Opaque per-suite data the grader needs (test command, language, etc.). */
|
|
50
|
+
meta: Record<string, unknown>;
|
|
51
|
+
}
|
|
52
|
+
type SuiteId = 'polyglot' | 'swebench';
|
|
53
|
+
/** A suite knows how to enumerate its tasks and grade a finished workdir. */
|
|
54
|
+
interface BenchSuite {
|
|
55
|
+
id: SuiteId;
|
|
56
|
+
/** Discover tasks. `limit` caps the count (for cheap smoke runs). */
|
|
57
|
+
loadTasks(opts: {
|
|
58
|
+
limit?: number | undefined;
|
|
59
|
+
}): Promise<BenchTask[]>;
|
|
60
|
+
/** A stable id for the exact task subset, folded into the fingerprint. */
|
|
61
|
+
subsetId(tasks: BenchTask[]): string;
|
|
62
|
+
}
|
|
63
|
+
/** Deterministic grader verdict for one finished workdir. */
|
|
64
|
+
interface GradeResult {
|
|
65
|
+
/** Did the suite's own tests pass? This is the headline correctness signal. */
|
|
66
|
+
passed: boolean;
|
|
67
|
+
/**
|
|
68
|
+
* Whether a verdict was actually produced. Defaults to true. SWE-bench sets
|
|
69
|
+
* this false when it only exported a prediction for offline grading by the
|
|
70
|
+
* official harness — such rows are excluded from the pass rate so they don't
|
|
71
|
+
* masquerade as failures.
|
|
72
|
+
*/
|
|
73
|
+
graded?: boolean | undefined;
|
|
74
|
+
/** Optional detail (failing test names, compiler error, etc.). */
|
|
75
|
+
detail?: string | undefined;
|
|
76
|
+
}
|
|
77
|
+
/** Raw telemetry parsed from a single `wstack` subprocess run. */
|
|
78
|
+
interface RawRun {
|
|
79
|
+
/** RunResult.status from `--output-json`, or a harness-level status. */
|
|
80
|
+
status: 'completed' | 'failed' | 'aborted' | 'max_iterations' | 'timeout' | 'crashed';
|
|
81
|
+
finalText: string | null;
|
|
82
|
+
iterations: number;
|
|
83
|
+
tokensIn: number;
|
|
84
|
+
tokensOut: number;
|
|
85
|
+
costUsd: number;
|
|
86
|
+
elapsedMs: number;
|
|
87
|
+
/** Process exit code (null when killed by timeout). */
|
|
88
|
+
exitCode: number | null;
|
|
89
|
+
}
|
|
90
|
+
/** Per-(task × cell) result: telemetry + deterministic grade + tool metrics. */
|
|
91
|
+
interface TaskResult {
|
|
92
|
+
taskId: string;
|
|
93
|
+
cell: ModelCell;
|
|
94
|
+
run: RawRun;
|
|
95
|
+
grade: GradeResult;
|
|
96
|
+
/** Tool-call metrics parsed from the isolated session JSONL. */
|
|
97
|
+
tools: ToolMetrics;
|
|
98
|
+
}
|
|
99
|
+
/** Tool-level metrics derived from the session log (model-free). */
|
|
100
|
+
interface ToolMetrics {
|
|
101
|
+
totalCalls: number;
|
|
102
|
+
/** edit/write tool invocations. */
|
|
103
|
+
editCalls: number;
|
|
104
|
+
/** edit/write invocations that returned an error (failed to apply). */
|
|
105
|
+
editErrors: number;
|
|
106
|
+
/** provider 429 / retry events. */
|
|
107
|
+
rateLimitRetries: number;
|
|
108
|
+
}
|
|
109
|
+
/** Folded results for one model cell across all its tasks. */
|
|
110
|
+
interface CellResult {
|
|
111
|
+
cell: ModelCell;
|
|
112
|
+
taskCount: number;
|
|
113
|
+
/** How many tasks produced an actual graded verdict (graded !== false). */
|
|
114
|
+
gradedCount: number;
|
|
115
|
+
/** Fraction in [0,1] of GRADED tasks whose grader passed (pass@1). */
|
|
116
|
+
passRate: number;
|
|
117
|
+
/** Fraction in [0,1] of edit/write calls that applied cleanly. */
|
|
118
|
+
editApplyRate: number;
|
|
119
|
+
avgCostUsd: number;
|
|
120
|
+
avgTokensIn: number;
|
|
121
|
+
avgTokensOut: number;
|
|
122
|
+
/** Median iterations across tasks. */
|
|
123
|
+
p50Iterations: number;
|
|
124
|
+
/** Median wall-clock per task, ms. */
|
|
125
|
+
p50ElapsedMs: number;
|
|
126
|
+
/** Fraction in [0,1] of tasks that hit the timeout. */
|
|
127
|
+
timeoutRate: number;
|
|
128
|
+
totalRateLimitRetries: number;
|
|
129
|
+
}
|
|
130
|
+
/**
|
|
131
|
+
* Identifies the harness configuration. Two reports are only comparable when
|
|
132
|
+
* their fingerprints match; a prompt/tool/version change flips the hash and
|
|
133
|
+
* marks older rows stale.
|
|
134
|
+
*/
|
|
135
|
+
interface HarnessFingerprint {
|
|
136
|
+
cliVersion: string;
|
|
137
|
+
/** Sorted list of tool names available to the agent. */
|
|
138
|
+
toolNames: string[];
|
|
139
|
+
maxIterations: number;
|
|
140
|
+
yolo: boolean;
|
|
141
|
+
/** Suite subset id (the exact task set). */
|
|
142
|
+
subsetId: string;
|
|
143
|
+
/** sha256 hex (first 12 chars) of the above. */
|
|
144
|
+
hash: string;
|
|
145
|
+
}
|
|
146
|
+
/** The full report artifact written to disk. */
|
|
147
|
+
interface BenchReport {
|
|
148
|
+
suite: SuiteId;
|
|
149
|
+
/** ISO timestamp the run finished (stamped by the caller, not the harness). */
|
|
150
|
+
finishedAt: string;
|
|
151
|
+
fingerprint: HarnessFingerprint;
|
|
152
|
+
cells: CellResult[];
|
|
153
|
+
/** Every per-(task × cell) row, for reproducibility. */
|
|
154
|
+
results: TaskResult[];
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
/**
|
|
158
|
+
* Fold every per-(task × cell) result for ONE cell into its leaderboard row.
|
|
159
|
+
* All metrics are derived from deterministic signals (grader pass/fail, the
|
|
160
|
+
* `--output-json` usage block, and session-log tool counts) — nothing here
|
|
161
|
+
* consults a model.
|
|
162
|
+
*/
|
|
163
|
+
declare function aggregateCell(cell: ModelCell, results: TaskResult[]): CellResult;
|
|
164
|
+
/** Group all results by cell label and aggregate each group. */
|
|
165
|
+
declare function aggregateAll(cells: ModelCell[], results: TaskResult[]): CellResult[];
|
|
166
|
+
/** Median of a numeric array (0 for empty). Exported for tests. */
|
|
167
|
+
declare function median(values: number[]): number;
|
|
168
|
+
|
|
169
|
+
/**
|
|
170
|
+
* Parse and validate a raw `bench.config.json` object. Throws a descriptive
|
|
171
|
+
* Error on any structural problem so the CLI can surface it cleanly instead of
|
|
172
|
+
* failing deep inside the runner.
|
|
173
|
+
*/
|
|
174
|
+
declare function parseBenchConfig(raw: unknown): BenchConfig;
|
|
175
|
+
/** Load and validate a `bench.config.json` from disk. */
|
|
176
|
+
declare function loadBenchConfig(path: string): Promise<BenchConfig>;
|
|
177
|
+
|
|
178
|
+
interface ExecResult {
|
|
179
|
+
exitCode: number | null;
|
|
180
|
+
stdout: string;
|
|
181
|
+
stderr: string;
|
|
182
|
+
timedOut: boolean;
|
|
183
|
+
}
|
|
184
|
+
/**
|
|
185
|
+
* Run a command (argv form) in a directory and capture its result. Used by the
|
|
186
|
+
* deterministic graders to run a suite's own test command — the run's exit code
|
|
187
|
+
* is the pass/fail signal, no LLM involved.
|
|
188
|
+
*
|
|
189
|
+
* Never rejects; a spawn failure surfaces as exitCode null with the error on
|
|
190
|
+
* stderr.
|
|
191
|
+
*/
|
|
192
|
+
declare function execCommand(opts: {
|
|
193
|
+
command: string;
|
|
194
|
+
args: string[];
|
|
195
|
+
cwd: string;
|
|
196
|
+
timeoutMs: number;
|
|
197
|
+
env?: NodeJS.ProcessEnv | undefined;
|
|
198
|
+
/**
|
|
199
|
+
* Run through a shell. Needed for launchers that resolve platform wrappers
|
|
200
|
+
* (`npm` → npm.cmd on Windows, `./gradlew`). Defaults to true. Pass false for
|
|
201
|
+
* real executables (git, python, node, cargo, go) to avoid the shell entirely
|
|
202
|
+
* — no metacharacter interpretation, no DEP0190, no injection surface.
|
|
203
|
+
*/
|
|
204
|
+
shell?: boolean | undefined;
|
|
205
|
+
}): Promise<ExecResult>;
|
|
206
|
+
|
|
207
|
+
/**
|
|
208
|
+
* Compute the harness fingerprint. This is what makes a report
|
|
209
|
+
* "model-independent": every cell in a run shares one fingerprint, so the only
|
|
210
|
+
* thing that varies across leaderboard rows is the model. Change the CLI
|
|
211
|
+
* version, the tool roster, the iteration cap, the yolo flag, or the task
|
|
212
|
+
* subset and the hash changes — which is exactly when old numbers stop being
|
|
213
|
+
* comparable.
|
|
214
|
+
*
|
|
215
|
+
* The hash is intentionally cheap and reproducible: no timestamps, no random
|
|
216
|
+
* salt. Same inputs → same hash, on any machine.
|
|
217
|
+
*/
|
|
218
|
+
declare function computeHarnessFingerprint(input: {
|
|
219
|
+
cliVersion: string;
|
|
220
|
+
toolNames: string[];
|
|
221
|
+
maxIterations: number;
|
|
222
|
+
yolo: boolean;
|
|
223
|
+
subsetId: string;
|
|
224
|
+
}): HarnessFingerprint;
|
|
225
|
+
/** One-line human label for report headers: `wrongstack@0.255 · fp:a3f9c7 · maxIter=40 · yolo`. */
|
|
226
|
+
declare function fingerprintLabel(fp: HarnessFingerprint): string;
|
|
227
|
+
|
|
228
|
+
/**
|
|
229
|
+
* Deterministic polyglot grader: run the exercise's own test command in the
|
|
230
|
+
* finished workdir. Exit code 0 → passed. No LLM, no judgement — this is the
|
|
231
|
+
* invariant that keeps the report model-independent.
|
|
232
|
+
*
|
|
233
|
+
* For languages with a dependency-install step (JS `npm install`, etc.) the
|
|
234
|
+
* setup command runs first; if setup fails the task is graded as not-passed
|
|
235
|
+
* with the setup error as detail (it cannot be the model's fault, but the run
|
|
236
|
+
* is genuinely ungradeable, so it counts as a fail rather than crashing).
|
|
237
|
+
*/
|
|
238
|
+
declare function gradePolyglot(opts: {
|
|
239
|
+
workdir: string;
|
|
240
|
+
task: BenchTask;
|
|
241
|
+
/** Per-step timeout for setup/test commands. */
|
|
242
|
+
timeoutMs: number;
|
|
243
|
+
}): Promise<GradeResult>;
|
|
244
|
+
|
|
245
|
+
/** Injectable command runner (defaults to execCommand) for testability. */
|
|
246
|
+
type Exec = (opts: {
|
|
247
|
+
command: string;
|
|
248
|
+
args: string[];
|
|
249
|
+
cwd: string;
|
|
250
|
+
timeoutMs: number;
|
|
251
|
+
shell?: boolean | undefined;
|
|
252
|
+
}) => Promise<ExecResult>;
|
|
253
|
+
/**
|
|
254
|
+
* Extract the model's patch from a finished SWE-bench workdir.
|
|
255
|
+
*
|
|
256
|
+
* The workdir is a git checkout at the instance's base commit; the agent's
|
|
257
|
+
* edits are uncommitted. `git add -A` stages new/modified/deleted files, then
|
|
258
|
+
* `git diff --cached` produces a unified diff in the exact form the official
|
|
259
|
+
* SWE-bench harness expects as `model_patch`.
|
|
260
|
+
*
|
|
261
|
+
* Changes to files touched by the held-out `test_patch` are stripped: the
|
|
262
|
+
* harness applies the model patch and then the test patch, and the agent is
|
|
263
|
+
* told not to edit tests — dropping those sections keeps the model patch from
|
|
264
|
+
* conflicting with (or sneaking changes into) the graded tests.
|
|
265
|
+
*/
|
|
266
|
+
declare function extractModelPatch(opts: {
|
|
267
|
+
workdir: string;
|
|
268
|
+
/** The instance's held-out test patch, used to exclude test-file edits. */
|
|
269
|
+
testPatch?: string | undefined;
|
|
270
|
+
timeoutMs: number;
|
|
271
|
+
exec?: Exec | undefined;
|
|
272
|
+
}): Promise<string>;
|
|
273
|
+
/**
|
|
274
|
+
* Collect the file paths a unified diff touches. Reads both the
|
|
275
|
+
* `diff --git a/<p> b/<p>` header and the `+++ b/<p>` / `--- a/<p>` lines so it
|
|
276
|
+
* works on patches produced by git or by `diff -u`.
|
|
277
|
+
*/
|
|
278
|
+
declare function extractPatchPaths(patch: string): Set<string>;
|
|
279
|
+
/**
|
|
280
|
+
* Drop every per-file section of `patch` whose target path is in `exclude`.
|
|
281
|
+
* Sections are delimited by `diff --git` headers (git's format).
|
|
282
|
+
*/
|
|
283
|
+
declare function filterPatchExcludingPaths(patch: string, exclude: Set<string>): string;
|
|
284
|
+
/**
|
|
285
|
+
* Drop each `diff --git` section for which `shouldDrop(aPath, bPath)` is true.
|
|
286
|
+
*/
|
|
287
|
+
declare function filterPatchSections(patch: string, shouldDrop: (aPath: string, bPath: string) => boolean): string;
|
|
288
|
+
|
|
289
|
+
/**
|
|
290
|
+
* Inline (Docker) grader hook. Given an instance's model patch and held-out
|
|
291
|
+
* test data, return whether the issue is resolved — or `undefined` if it could
|
|
292
|
+
* not produce a verdict. Left injectable so the heavy, version-sensitive Docker
|
|
293
|
+
* execution is plugged in by the host (the official SWE-bench harness) rather
|
|
294
|
+
* than re-implemented and guessed-at here.
|
|
295
|
+
*/
|
|
296
|
+
type SwebenchExternalGrade = (args: {
|
|
297
|
+
instanceId: string;
|
|
298
|
+
patch: string;
|
|
299
|
+
image?: string | undefined;
|
|
300
|
+
failToPass: string[];
|
|
301
|
+
passToPass: string[];
|
|
302
|
+
testPatch?: string | undefined;
|
|
303
|
+
workdir: string;
|
|
304
|
+
timeoutMs: number;
|
|
305
|
+
}) => Promise<boolean | undefined>;
|
|
306
|
+
/**
|
|
307
|
+
* SWE-bench grader.
|
|
308
|
+
*
|
|
309
|
+
* Resolution is decided deterministically by the instance's own tests
|
|
310
|
+
* (`FAIL_TO_PASS` must pass, `PASS_TO_PASS` must still pass) inside its pinned
|
|
311
|
+
* Docker image — never an LLM. We do NOT re-implement that Docker evaluation
|
|
312
|
+
* (the official `princeton-nlp/SWE-bench` harness owns it and it is version
|
|
313
|
+
* sensitive). Instead this grader:
|
|
314
|
+
*
|
|
315
|
+
* 1. Extracts the model patch from the finished workdir (staged `git diff --cached`),
|
|
316
|
+
* excluding any edits to the held-out test files.
|
|
317
|
+
* 2. Writes a conformant per-instance prediction so the run can be graded by
|
|
318
|
+
* the official harness (`--predictions_path`).
|
|
319
|
+
* 3. If an inline grader is supplied (Docker available), runs it and returns a
|
|
320
|
+
* real pass/fail verdict; otherwise marks the row "exported, ungraded"
|
|
321
|
+
* (graded:false) so it is excluded from pass@1 rather than counted as a
|
|
322
|
+
* failure.
|
|
323
|
+
*/
|
|
324
|
+
declare function gradeSwebench(opts: {
|
|
325
|
+
workdir: string;
|
|
326
|
+
task: BenchTask;
|
|
327
|
+
cell: ModelCell;
|
|
328
|
+
timeoutMs: number;
|
|
329
|
+
/** Where per-instance prediction files are written. */
|
|
330
|
+
predictionsDir: string;
|
|
331
|
+
exec?: Exec | undefined;
|
|
332
|
+
externalGrade?: SwebenchExternalGrade | undefined;
|
|
333
|
+
}): Promise<GradeResult>;
|
|
334
|
+
|
|
335
|
+
/**
|
|
336
|
+
* Per-run isolation. Each benchmark run gets one sandbox directory tree:
|
|
337
|
+
*
|
|
338
|
+
* <sandbox>/
|
|
339
|
+
* home/ → isolated WRONGSTACK_HOME (config seed + all session JSONL)
|
|
340
|
+
* work/<id>/ → one copy of a task template per (task × cell)
|
|
341
|
+
*
|
|
342
|
+
* The isolated home keeps the bench off the developer's real ~/.wrongstack
|
|
343
|
+
* (config, sessions, models cache). Each task workdir hashes to its own
|
|
344
|
+
* project slug under home/projects/, so concurrent runs never share a session
|
|
345
|
+
* file even though they share one home.
|
|
346
|
+
*/
|
|
347
|
+
interface Sandbox {
|
|
348
|
+
/** Root sandbox dir. */
|
|
349
|
+
root: string;
|
|
350
|
+
/** Isolated WRONGSTACK_HOME. */
|
|
351
|
+
homeDir: string;
|
|
352
|
+
/** Directory that holds per-task workdirs. */
|
|
353
|
+
workRoot: string;
|
|
354
|
+
}
|
|
355
|
+
/** Create the sandbox tree and seed the isolated home's config.json. */
|
|
356
|
+
declare function createSandbox(opts: {
|
|
357
|
+
/** Where to create the sandbox. Defaults to an OS temp dir. */
|
|
358
|
+
baseDir?: string | undefined;
|
|
359
|
+
maxIterations: number;
|
|
360
|
+
yolo: boolean;
|
|
361
|
+
}): Promise<Sandbox>;
|
|
362
|
+
/**
|
|
363
|
+
* Copy a task template into a fresh workdir. The directory name embeds the
|
|
364
|
+
* cell label and task id so it is both unique (parallel-safe) and debuggable.
|
|
365
|
+
*/
|
|
366
|
+
declare function prepareWorkdir(sandbox: Sandbox, templateDir: string, taskId: string, cellLabel: string, exclude?: string[] | undefined): Promise<string>;
|
|
367
|
+
/** Remove the whole sandbox tree. Best-effort. */
|
|
368
|
+
declare function cleanupSandbox(sandbox: Sandbox): Promise<void>;
|
|
369
|
+
|
|
370
|
+
interface RunBenchmarkOptions {
|
|
371
|
+
suite: BenchSuite;
|
|
372
|
+
/** Suite-specific deterministic grader. */
|
|
373
|
+
grade: (args: {
|
|
374
|
+
workdir: string;
|
|
375
|
+
task: BenchTask;
|
|
376
|
+
cell: ModelCell;
|
|
377
|
+
timeoutMs: number;
|
|
378
|
+
}) => Promise<GradeResult>;
|
|
379
|
+
config: BenchConfig;
|
|
380
|
+
cliVersion: string;
|
|
381
|
+
/** Tool names available to the agent — folded into the fingerprint. */
|
|
382
|
+
toolNames: string[];
|
|
383
|
+
/** Node executable. */
|
|
384
|
+
nodeBin: string;
|
|
385
|
+
/** Path to the wstack CLI entry. */
|
|
386
|
+
wstackEntry: string;
|
|
387
|
+
/** Cap the number of tasks (cheap smoke runs). */
|
|
388
|
+
limit?: number | undefined;
|
|
389
|
+
/** Where the sandbox is created (default OS temp). */
|
|
390
|
+
sandboxBaseDir?: string | undefined;
|
|
391
|
+
/** Extra env for the subprocess (provider keys are inherited from process.env). */
|
|
392
|
+
env?: NodeJS.ProcessEnv | undefined;
|
|
393
|
+
/** Keep the sandbox on disk after the run (debugging). */
|
|
394
|
+
keepSandbox?: boolean | undefined;
|
|
395
|
+
/** Progress callback (one line per event). */
|
|
396
|
+
onProgress?: ((msg: string) => void) | undefined;
|
|
397
|
+
/** Injected clock for the report timestamp (tests pass a fixed value). */
|
|
398
|
+
now?: (() => string) | undefined;
|
|
399
|
+
}
|
|
400
|
+
/**
|
|
401
|
+
* Run the full benchmark: load the task subset, fan every (task × cell) pair
|
|
402
|
+
* out through isolated subprocesses, grade deterministically, and fold into a
|
|
403
|
+
* fingerprint-stamped report.
|
|
404
|
+
*/
|
|
405
|
+
declare function runBenchmark(opts: RunBenchmarkOptions): Promise<BenchReport>;
|
|
406
|
+
|
|
407
|
+
/**
|
|
408
|
+
* Write the machine-readable report artifacts:
|
|
409
|
+
* - results.jsonl → one line per (task × cell), for reproducibility
|
|
410
|
+
* - summary.json → fingerprint + folded cell results
|
|
411
|
+
*
|
|
412
|
+
* The markdown report is derived from summary.json (see report/markdown.ts), so
|
|
413
|
+
* `wstack bench report` can re-render without re-running anything.
|
|
414
|
+
*/
|
|
415
|
+
declare function writeJsonArtifacts(outDir: string, report: BenchReport): Promise<void>;
|
|
416
|
+
/** Read back a summary.json into the partial report shape markdown needs. */
|
|
417
|
+
declare function readSummary(outDir: string): Promise<Pick<BenchReport, 'suite' | 'finishedAt' | 'fingerprint' | 'cells'>>;
|
|
418
|
+
|
|
419
|
+
/**
|
|
420
|
+
* Render the human-facing leaderboard. The header carries the harness
|
|
421
|
+
* fingerprint: rows are only comparable across reports that share it. The body
|
|
422
|
+
* sorts cells by pass rate (the headline correctness metric), highest first.
|
|
423
|
+
*/
|
|
424
|
+
declare function renderMarkdownReport(report: Pick<BenchReport, 'suite' | 'finishedAt' | 'fingerprint' | 'cells'>): string;
|
|
425
|
+
/** Build the full fingerprint-stamped header line for terminal echo. */
|
|
426
|
+
declare function reportHeaderLine(fp: HarnessFingerprint): string;
|
|
427
|
+
|
|
428
|
+
/**
|
|
429
|
+
* One SWE-bench prediction row, in the exact shape the official harness
|
|
430
|
+
* (`princeton-nlp/SWE-bench`) consumes via `--predictions_path`.
|
|
431
|
+
*/
|
|
432
|
+
interface SwebenchPrediction {
|
|
433
|
+
instance_id: string;
|
|
434
|
+
/** The model/system label — becomes a column in the official report. */
|
|
435
|
+
model_name_or_path: string;
|
|
436
|
+
/** The unified diff the agent produced. */
|
|
437
|
+
model_patch: string;
|
|
438
|
+
}
|
|
439
|
+
/**
|
|
440
|
+
* Write a `predictions.jsonl` for one model cell. SWE-bench grading is delegated
|
|
441
|
+
* to the canonical, version-sensitive harness rather than re-implemented here:
|
|
442
|
+
* we own running the agent and producing a conformant patch; the official tool
|
|
443
|
+
* owns the Docker execution and pass/fail verdict.
|
|
444
|
+
*
|
|
445
|
+
* Returns the file path written.
|
|
446
|
+
*/
|
|
447
|
+
declare function writePredictionsJsonl(outDir: string, cellLabel: string, predictions: SwebenchPrediction[]): Promise<string>;
|
|
448
|
+
/**
|
|
449
|
+
* Write one instance's prediction to its own file under
|
|
450
|
+
* `<predictionsDir>/<cell>/<instance>.json`. Distinct files per instance are
|
|
451
|
+
* concurrency-safe — the SWE-bench grader runs inside the orchestrator's
|
|
452
|
+
* parallel fan-out, so appending to a shared jsonl would race. Call
|
|
453
|
+
* {@link collectCellPredictions} after the run to merge them.
|
|
454
|
+
*/
|
|
455
|
+
declare function writeInstancePrediction(predictionsDir: string, cellLabel: string, prediction: SwebenchPrediction): Promise<void>;
|
|
456
|
+
/** Read back every per-instance prediction written for one cell. */
|
|
457
|
+
declare function collectCellPredictions(predictionsDir: string, cellLabel: string): Promise<SwebenchPrediction[]>;
|
|
458
|
+
/**
|
|
459
|
+
* Parse an official SWE-bench evaluation report JSON for the set of resolved
|
|
460
|
+
* instance ids. The harness writes `resolved_ids` (newer) or a per-instance
|
|
461
|
+
* `{ resolved: bool }` map; both shapes are handled so this keeps working across
|
|
462
|
+
* harness versions.
|
|
463
|
+
*/
|
|
464
|
+
declare function parseResolvedIds(reportJson: unknown): Set<string>;
|
|
465
|
+
|
|
466
|
+
/** Everything needed to run one (task × cell) subprocess. */
|
|
467
|
+
interface RunWstackOptions {
|
|
468
|
+
/** Node executable (process.execPath). */
|
|
469
|
+
nodeBin: string;
|
|
470
|
+
/** Path to the wstack CLI entry (dist/index.js) — or a fake in tests. */
|
|
471
|
+
wstackEntry: string;
|
|
472
|
+
/** Isolated WRONGSTACK_HOME for this run. */
|
|
473
|
+
homeDir: string;
|
|
474
|
+
/** Task workdir (becomes the subprocess cwd / projectRoot). */
|
|
475
|
+
workdir: string;
|
|
476
|
+
cell: ModelCell;
|
|
477
|
+
prompt: string;
|
|
478
|
+
timeoutMs: number;
|
|
479
|
+
/** Extra env merged over process.env (e.g. provider keys already present). */
|
|
480
|
+
env?: NodeJS.ProcessEnv | undefined;
|
|
481
|
+
/** Extra CLI args appended after the standard set (Phase 2 hooks). */
|
|
482
|
+
extraArgs?: string[] | undefined;
|
|
483
|
+
}
|
|
484
|
+
/**
|
|
485
|
+
* Run the real `wstack` binary once, in single-shot `--output-json` mode, and
|
|
486
|
+
* parse its machine-readable result. This is the heart of the model-independent
|
|
487
|
+
* design: the subprocess is the *whole* harness (real wiring, real tools), and
|
|
488
|
+
* the only thing that varies between calls is `--provider`/`--model`.
|
|
489
|
+
*
|
|
490
|
+
* Never rejects — a crash, non-JSON output, or timeout becomes a RawRun with an
|
|
491
|
+
* explanatory status so the grader still produces a row.
|
|
492
|
+
*/
|
|
493
|
+
declare function runWstack(opts: RunWstackOptions): Promise<RawRun>;
|
|
494
|
+
/**
|
|
495
|
+
* Map `items` through `fn` with at most `concurrency` in flight at once.
|
|
496
|
+
* Results preserve input order. A tiny dependency-free p-limit.
|
|
497
|
+
*/
|
|
498
|
+
declare function mapWithConcurrency<T, R>(items: T[], concurrency: number, fn: (item: T, index: number) => Promise<R>): Promise<R[]>;
|
|
499
|
+
|
|
500
|
+
/**
|
|
501
|
+
* Derive model-free tool metrics from the isolated session JSONL the
|
|
502
|
+
* subprocess wrote. Everything here comes from `tool_call_end` events
|
|
503
|
+
* (`{ name, ok }`) and provider retry/error events — no LLM, no heuristics.
|
|
504
|
+
*
|
|
505
|
+
* Returns zeroed metrics (never throws) when the session log is missing or
|
|
506
|
+
* unreadable: a crashed run still produces a valid, gradeable TaskResult.
|
|
507
|
+
*/
|
|
508
|
+
declare function readToolMetrics(opts: {
|
|
509
|
+
homeDir: string;
|
|
510
|
+
/** The task workdir — the subprocess used this as its projectRoot. */
|
|
511
|
+
workdir: string;
|
|
512
|
+
}): Promise<ToolMetrics>;
|
|
513
|
+
|
|
514
|
+
/**
|
|
515
|
+
* Aider polyglot benchmark loader.
|
|
516
|
+
*
|
|
517
|
+
* The polyglot-benchmark repo (https://github.com/Aider-AI/polyglot-benchmark)
|
|
518
|
+
* lays exercises out in Exercism form:
|
|
519
|
+
*
|
|
520
|
+
* <root>/<language>/exercises/practice/<slug>/
|
|
521
|
+
* .docs/instructions.md ← problem statement
|
|
522
|
+
* .meta/config.json ← Exercism file manifest (solution/test/example)
|
|
523
|
+
* .meta/example.<ext> ← reference solution (EXCLUDED from the agent)
|
|
524
|
+
* <solution files> ← stubs the agent edits
|
|
525
|
+
* <test files> ← the hidden tests the grader runs
|
|
526
|
+
*
|
|
527
|
+
* We do NOT vendor the exercises (225 across 6 languages); the caller points
|
|
528
|
+
* `--polyglot-dir` at a local checkout.
|
|
529
|
+
*/
|
|
530
|
+
/** Per-language test command + optional dependency-install step. */
|
|
531
|
+
interface LanguageRunner {
|
|
532
|
+
/** Directory name under the polyglot root. */
|
|
533
|
+
dir: string;
|
|
534
|
+
/** argv for the test command, run in the workdir. */
|
|
535
|
+
test: (testFiles: string[]) => {
|
|
536
|
+
command: string;
|
|
537
|
+
args: string[];
|
|
538
|
+
};
|
|
539
|
+
/** argv for an optional setup/install step run before tests. */
|
|
540
|
+
setup?: {
|
|
541
|
+
command: string;
|
|
542
|
+
args: string[];
|
|
543
|
+
} | undefined;
|
|
544
|
+
}
|
|
545
|
+
declare const LANGUAGE_RUNNERS: Record<string, LanguageRunner>;
|
|
546
|
+
/** Metadata the polyglot grader reads off each task. */
|
|
547
|
+
interface PolyglotMeta {
|
|
548
|
+
language: string;
|
|
549
|
+
solutionFiles: string[];
|
|
550
|
+
testFiles: string[];
|
|
551
|
+
testCommand: {
|
|
552
|
+
command: string;
|
|
553
|
+
args: string[];
|
|
554
|
+
};
|
|
555
|
+
setupCommand?: {
|
|
556
|
+
command: string;
|
|
557
|
+
args: string[];
|
|
558
|
+
} | undefined;
|
|
559
|
+
}
|
|
560
|
+
declare function createPolyglotSuite(opts: {
|
|
561
|
+
/** Local checkout of the polyglot-benchmark repo. */
|
|
562
|
+
polyglotDir: string;
|
|
563
|
+
/** Restrict to these languages (default: all present). */
|
|
564
|
+
languages?: string[] | undefined;
|
|
565
|
+
}): BenchSuite;
|
|
566
|
+
|
|
567
|
+
/**
|
|
568
|
+
* SWE-bench Verified adapter (Phase 2).
|
|
569
|
+
*
|
|
570
|
+
* Unlike polyglot (a self-contained directory of exercises), SWE-bench requires
|
|
571
|
+
* a materialized repo at a specific base commit AND the task's pinned execution
|
|
572
|
+
* environment — which the official harness ships as per-instance Docker images.
|
|
573
|
+
* Running it without Docker is not meaningful, so this adapter is wired and
|
|
574
|
+
* fingerprint-aware but gated: `loadTasks` throws an actionable error unless a
|
|
575
|
+
* prepared dataset directory is supplied and `docker` is enabled.
|
|
576
|
+
*
|
|
577
|
+
* The fixed instance subset lives in `subsets/swe-bench-verified-50.json` so a
|
|
578
|
+
* model comparison always grades the exact same issues (reproducibility).
|
|
579
|
+
*/
|
|
580
|
+
interface SwebenchOptions {
|
|
581
|
+
/**
|
|
582
|
+
* Directory of prepared instances. Expected layout (produced by your
|
|
583
|
+
* SWE-bench setup step — see packages/bench/README.md):
|
|
584
|
+
* <datasetDir>/<instance_id>/
|
|
585
|
+
* repo/ ← git checkout at base_commit
|
|
586
|
+
* instance.json ← { problem_statement, test_patch, FAIL_TO_PASS, PASS_TO_PASS, image }
|
|
587
|
+
*/
|
|
588
|
+
datasetDir?: string | undefined;
|
|
589
|
+
/** Must be true to actually build tasks — the env is Docker-backed. */
|
|
590
|
+
docker?: boolean | undefined;
|
|
591
|
+
/** Override the committed subset file. */
|
|
592
|
+
subsetFile?: string | undefined;
|
|
593
|
+
}
|
|
594
|
+
interface SwebenchMeta {
|
|
595
|
+
instanceId: string;
|
|
596
|
+
instanceDir: string;
|
|
597
|
+
image?: string | undefined;
|
|
598
|
+
failToPass: string[];
|
|
599
|
+
passToPass: string[];
|
|
600
|
+
testPatch?: string | undefined;
|
|
601
|
+
}
|
|
602
|
+
/** Read the pinned instance-id subset (the canonical task set). */
|
|
603
|
+
declare function loadSubset(subsetFile?: string): Promise<string[]>;
|
|
604
|
+
declare function createSwebenchSuite(opts?: SwebenchOptions): BenchSuite;
|
|
605
|
+
|
|
606
|
+
export { type BenchConfig, type BenchReport, type BenchSuite, type BenchTask, type CellResult, type Exec, type ExecResult, type GradeResult, type HarnessFingerprint, LANGUAGE_RUNNERS, type ModelCell, type PolyglotMeta, type RawRun, type RunBenchmarkOptions, type RunWstackOptions, type Sandbox, type SuiteId, type SwebenchExternalGrade, type SwebenchMeta, type SwebenchOptions, type SwebenchPrediction, type TaskResult, type ToolMetrics, aggregateAll, aggregateCell, cleanupSandbox, collectCellPredictions, computeHarnessFingerprint, createPolyglotSuite, createSandbox, createSwebenchSuite, execCommand, extractModelPatch, extractPatchPaths, filterPatchExcludingPaths, filterPatchSections, fingerprintLabel, gradePolyglot, gradeSwebench, loadBenchConfig, loadSubset, mapWithConcurrency, median, parseBenchConfig, parseResolvedIds, prepareWorkdir, readSummary, readToolMetrics, renderMarkdownReport, reportHeaderLine, runBenchmark, runWstack, writeInstancePrediction, writeJsonArtifacts, writePredictionsJsonl };
|