@wrongstack/bench 0.260.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,606 @@
1
+ /**
2
+ * Core contracts for the model-independent benchmark harness.
3
+ *
4
+ * The guiding principle: WrongStack is the *harness* (system prompt + tool set
5
+ * + agent loop + scaffolding). The model is the swappable variable. Grading is
6
+ * deterministic (the suite's own tests decide pass/fail — never an LLM), and
7
+ * every report is stamped with a {@link HarnessFingerprint} so rows are only
8
+ * comparable when the harness is identical.
9
+ */
10
+ /** A single model under test — one column in the leaderboard. */
11
+ interface ModelCell {
12
+ /** Short human label shown in the report (e.g. "opus-4.8"). Must be unique. */
13
+ label: string;
14
+ /** Provider id passed to `wstack --provider` (e.g. "anthropic"). */
15
+ provider: string;
16
+ /** Model id passed to `wstack --model` (e.g. "claude-opus-4-8"). */
17
+ model: string;
18
+ }
19
+ /** Loaded `bench.config.json`. */
20
+ interface BenchConfig {
21
+ /** Per-task iteration cap (seeded into the isolated config). Default 40. */
22
+ maxIterations: number;
23
+ /** How many cells/tasks run concurrently. Default 4. */
24
+ concurrency: number;
25
+ /** Per-task wall-clock timeout in milliseconds. Default 600_000 (10m). */
26
+ timeoutMs: number;
27
+ /** The models to benchmark. At least one. */
28
+ cells: ModelCell[];
29
+ }
30
+ /** One unit of work: a single benchmark exercise/issue. */
31
+ interface BenchTask {
32
+ /** Stable id, unique within the suite (e.g. "polyglot/python/bowling"). */
33
+ id: string;
34
+ /** Suite this task belongs to. */
35
+ suite: SuiteId;
36
+ /** The instruction text handed to the agent via `--prompt`. */
37
+ prompt: string;
38
+ /**
39
+ * Absolute path to a template directory. The runner copies it into an
40
+ * isolated workdir before each cell so parallel runs never collide.
41
+ */
42
+ templateDir: string;
43
+ /**
44
+ * Top-level entry names to omit when copying the template (e.g. `.meta` so
45
+ * the agent never sees the reference solution). Matched against each path's
46
+ * segments. Defaults to none.
47
+ */
48
+ templateExclude?: string[] | undefined;
49
+ /** Opaque per-suite data the grader needs (test command, language, etc.). */
50
+ meta: Record<string, unknown>;
51
+ }
52
+ type SuiteId = 'polyglot' | 'swebench';
53
+ /** A suite knows how to enumerate its tasks and grade a finished workdir. */
54
+ interface BenchSuite {
55
+ id: SuiteId;
56
+ /** Discover tasks. `limit` caps the count (for cheap smoke runs). */
57
+ loadTasks(opts: {
58
+ limit?: number | undefined;
59
+ }): Promise<BenchTask[]>;
60
+ /** A stable id for the exact task subset, folded into the fingerprint. */
61
+ subsetId(tasks: BenchTask[]): string;
62
+ }
63
+ /** Deterministic grader verdict for one finished workdir. */
64
+ interface GradeResult {
65
+ /** Did the suite's own tests pass? This is the headline correctness signal. */
66
+ passed: boolean;
67
+ /**
68
+ * Whether a verdict was actually produced. Defaults to true. SWE-bench sets
69
+ * this false when it only exported a prediction for offline grading by the
70
+ * official harness — such rows are excluded from the pass rate so they don't
71
+ * masquerade as failures.
72
+ */
73
+ graded?: boolean | undefined;
74
+ /** Optional detail (failing test names, compiler error, etc.). */
75
+ detail?: string | undefined;
76
+ }
77
+ /** Raw telemetry parsed from a single `wstack` subprocess run. */
78
+ interface RawRun {
79
+ /** RunResult.status from `--output-json`, or a harness-level status. */
80
+ status: 'completed' | 'failed' | 'aborted' | 'max_iterations' | 'timeout' | 'crashed';
81
+ finalText: string | null;
82
+ iterations: number;
83
+ tokensIn: number;
84
+ tokensOut: number;
85
+ costUsd: number;
86
+ elapsedMs: number;
87
+ /** Process exit code (null when killed by timeout). */
88
+ exitCode: number | null;
89
+ }
90
+ /** Per-(task × cell) result: telemetry + deterministic grade + tool metrics. */
91
+ interface TaskResult {
92
+ taskId: string;
93
+ cell: ModelCell;
94
+ run: RawRun;
95
+ grade: GradeResult;
96
+ /** Tool-call metrics parsed from the isolated session JSONL. */
97
+ tools: ToolMetrics;
98
+ }
99
+ /** Tool-level metrics derived from the session log (model-free). */
100
+ interface ToolMetrics {
101
+ totalCalls: number;
102
+ /** edit/write tool invocations. */
103
+ editCalls: number;
104
+ /** edit/write invocations that returned an error (failed to apply). */
105
+ editErrors: number;
106
+ /** provider 429 / retry events. */
107
+ rateLimitRetries: number;
108
+ }
109
+ /** Folded results for one model cell across all its tasks. */
110
+ interface CellResult {
111
+ cell: ModelCell;
112
+ taskCount: number;
113
+ /** How many tasks produced an actual graded verdict (graded !== false). */
114
+ gradedCount: number;
115
+ /** Fraction in [0,1] of GRADED tasks whose grader passed (pass@1). */
116
+ passRate: number;
117
+ /** Fraction in [0,1] of edit/write calls that applied cleanly. */
118
+ editApplyRate: number;
119
+ avgCostUsd: number;
120
+ avgTokensIn: number;
121
+ avgTokensOut: number;
122
+ /** Median iterations across tasks. */
123
+ p50Iterations: number;
124
+ /** Median wall-clock per task, ms. */
125
+ p50ElapsedMs: number;
126
+ /** Fraction in [0,1] of tasks that hit the timeout. */
127
+ timeoutRate: number;
128
+ totalRateLimitRetries: number;
129
+ }
130
+ /**
131
+ * Identifies the harness configuration. Two reports are only comparable when
132
+ * their fingerprints match; a prompt/tool/version change flips the hash and
133
+ * marks older rows stale.
134
+ */
135
+ interface HarnessFingerprint {
136
+ cliVersion: string;
137
+ /** Sorted list of tool names available to the agent. */
138
+ toolNames: string[];
139
+ maxIterations: number;
140
+ yolo: boolean;
141
+ /** Suite subset id (the exact task set). */
142
+ subsetId: string;
143
+ /** sha256 hex (first 12 chars) of the above. */
144
+ hash: string;
145
+ }
146
+ /** The full report artifact written to disk. */
147
+ interface BenchReport {
148
+ suite: SuiteId;
149
+ /** ISO timestamp the run finished (stamped by the caller, not the harness). */
150
+ finishedAt: string;
151
+ fingerprint: HarnessFingerprint;
152
+ cells: CellResult[];
153
+ /** Every per-(task × cell) row, for reproducibility. */
154
+ results: TaskResult[];
155
+ }
156
+
157
+ /**
158
+ * Fold every per-(task × cell) result for ONE cell into its leaderboard row.
159
+ * All metrics are derived from deterministic signals (grader pass/fail, the
160
+ * `--output-json` usage block, and session-log tool counts) — nothing here
161
+ * consults a model.
162
+ */
163
+ declare function aggregateCell(cell: ModelCell, results: TaskResult[]): CellResult;
164
+ /** Group all results by cell label and aggregate each group. */
165
+ declare function aggregateAll(cells: ModelCell[], results: TaskResult[]): CellResult[];
166
+ /** Median of a numeric array (0 for empty). Exported for tests. */
167
+ declare function median(values: number[]): number;
168
+
169
+ /**
170
+ * Parse and validate a raw `bench.config.json` object. Throws a descriptive
171
+ * Error on any structural problem so the CLI can surface it cleanly instead of
172
+ * failing deep inside the runner.
173
+ */
174
+ declare function parseBenchConfig(raw: unknown): BenchConfig;
175
+ /** Load and validate a `bench.config.json` from disk. */
176
+ declare function loadBenchConfig(path: string): Promise<BenchConfig>;
177
+
178
+ interface ExecResult {
179
+ exitCode: number | null;
180
+ stdout: string;
181
+ stderr: string;
182
+ timedOut: boolean;
183
+ }
184
+ /**
185
+ * Run a command (argv form) in a directory and capture its result. Used by the
186
+ * deterministic graders to run a suite's own test command — the run's exit code
187
+ * is the pass/fail signal, no LLM involved.
188
+ *
189
+ * Never rejects; a spawn failure surfaces as exitCode null with the error on
190
+ * stderr.
191
+ */
192
+ declare function execCommand(opts: {
193
+ command: string;
194
+ args: string[];
195
+ cwd: string;
196
+ timeoutMs: number;
197
+ env?: NodeJS.ProcessEnv | undefined;
198
+ /**
199
+ * Run through a shell. Needed for launchers that resolve platform wrappers
200
+ * (`npm` → npm.cmd on Windows, `./gradlew`). Defaults to true. Pass false for
201
+ * real executables (git, python, node, cargo, go) to avoid the shell entirely
202
+ * — no metacharacter interpretation, no DEP0190, no injection surface.
203
+ */
204
+ shell?: boolean | undefined;
205
+ }): Promise<ExecResult>;
206
+
207
+ /**
208
+ * Compute the harness fingerprint. This is what makes a report
209
+ * "model-independent": every cell in a run shares one fingerprint, so the only
210
+ * thing that varies across leaderboard rows is the model. Change the CLI
211
+ * version, the tool roster, the iteration cap, the yolo flag, or the task
212
+ * subset and the hash changes — which is exactly when old numbers stop being
213
+ * comparable.
214
+ *
215
+ * The hash is intentionally cheap and reproducible: no timestamps, no random
216
+ * salt. Same inputs → same hash, on any machine.
217
+ */
218
+ declare function computeHarnessFingerprint(input: {
219
+ cliVersion: string;
220
+ toolNames: string[];
221
+ maxIterations: number;
222
+ yolo: boolean;
223
+ subsetId: string;
224
+ }): HarnessFingerprint;
225
+ /** One-line human label for report headers: `wrongstack@0.255 · fp:a3f9c7 · maxIter=40 · yolo`. */
226
+ declare function fingerprintLabel(fp: HarnessFingerprint): string;
227
+
228
+ /**
229
+ * Deterministic polyglot grader: run the exercise's own test command in the
230
+ * finished workdir. Exit code 0 → passed. No LLM, no judgement — this is the
231
+ * invariant that keeps the report model-independent.
232
+ *
233
+ * For languages with a dependency-install step (JS `npm install`, etc.) the
234
+ * setup command runs first; if setup fails the task is graded as not-passed
235
+ * with the setup error as detail (it cannot be the model's fault, but the run
236
+ * is genuinely ungradeable, so it counts as a fail rather than crashing).
237
+ */
238
+ declare function gradePolyglot(opts: {
239
+ workdir: string;
240
+ task: BenchTask;
241
+ /** Per-step timeout for setup/test commands. */
242
+ timeoutMs: number;
243
+ }): Promise<GradeResult>;
244
+
245
+ /** Injectable command runner (defaults to execCommand) for testability. */
246
+ type Exec = (opts: {
247
+ command: string;
248
+ args: string[];
249
+ cwd: string;
250
+ timeoutMs: number;
251
+ shell?: boolean | undefined;
252
+ }) => Promise<ExecResult>;
253
+ /**
254
+ * Extract the model's patch from a finished SWE-bench workdir.
255
+ *
256
+ * The workdir is a git checkout at the instance's base commit; the agent's
257
+ * edits are uncommitted. `git add -A` stages new/modified/deleted files, then
258
+ * `git diff --cached` produces a unified diff in the exact form the official
259
+ * SWE-bench harness expects as `model_patch`.
260
+ *
261
+ * Changes to files touched by the held-out `test_patch` are stripped: the
262
+ * harness applies the model patch and then the test patch, and the agent is
263
+ * told not to edit tests — dropping those sections keeps the model patch from
264
+ * conflicting with (or sneaking changes into) the graded tests.
265
+ */
266
+ declare function extractModelPatch(opts: {
267
+ workdir: string;
268
+ /** The instance's held-out test patch, used to exclude test-file edits. */
269
+ testPatch?: string | undefined;
270
+ timeoutMs: number;
271
+ exec?: Exec | undefined;
272
+ }): Promise<string>;
273
+ /**
274
+ * Collect the file paths a unified diff touches. Reads both the
275
+ * `diff --git a/<p> b/<p>` header and the `+++ b/<p>` / `--- a/<p>` lines so it
276
+ * works on patches produced by git or by `diff -u`.
277
+ */
278
+ declare function extractPatchPaths(patch: string): Set<string>;
279
+ /**
280
+ * Drop every per-file section of `patch` whose target path is in `exclude`.
281
+ * Sections are delimited by `diff --git` headers (git's format).
282
+ */
283
+ declare function filterPatchExcludingPaths(patch: string, exclude: Set<string>): string;
284
+ /**
285
+ * Drop each `diff --git` section for which `shouldDrop(aPath, bPath)` is true.
286
+ */
287
+ declare function filterPatchSections(patch: string, shouldDrop: (aPath: string, bPath: string) => boolean): string;
288
+
289
+ /**
290
+ * Inline (Docker) grader hook. Given an instance's model patch and held-out
291
+ * test data, return whether the issue is resolved — or `undefined` if it could
292
+ * not produce a verdict. Left injectable so the heavy, version-sensitive Docker
293
+ * execution is plugged in by the host (the official SWE-bench harness) rather
294
+ * than re-implemented and guessed-at here.
295
+ */
296
+ type SwebenchExternalGrade = (args: {
297
+ instanceId: string;
298
+ patch: string;
299
+ image?: string | undefined;
300
+ failToPass: string[];
301
+ passToPass: string[];
302
+ testPatch?: string | undefined;
303
+ workdir: string;
304
+ timeoutMs: number;
305
+ }) => Promise<boolean | undefined>;
306
+ /**
307
+ * SWE-bench grader.
308
+ *
309
+ * Resolution is decided deterministically by the instance's own tests
310
+ * (`FAIL_TO_PASS` must pass, `PASS_TO_PASS` must still pass) inside its pinned
311
+ * Docker image — never an LLM. We do NOT re-implement that Docker evaluation
312
+ * (the official `princeton-nlp/SWE-bench` harness owns it and it is version
313
+ * sensitive). Instead this grader:
314
+ *
315
+ * 1. Extracts the model patch from the finished workdir (staged `git diff --cached`),
316
+ * excluding any edits to the held-out test files.
317
+ * 2. Writes a conformant per-instance prediction so the run can be graded by
318
+ * the official harness (`--predictions_path`).
319
+ * 3. If an inline grader is supplied (Docker available), runs it and returns a
320
+ * real pass/fail verdict; otherwise marks the row "exported, ungraded"
321
+ * (graded:false) so it is excluded from pass@1 rather than counted as a
322
+ * failure.
323
+ */
324
+ declare function gradeSwebench(opts: {
325
+ workdir: string;
326
+ task: BenchTask;
327
+ cell: ModelCell;
328
+ timeoutMs: number;
329
+ /** Where per-instance prediction files are written. */
330
+ predictionsDir: string;
331
+ exec?: Exec | undefined;
332
+ externalGrade?: SwebenchExternalGrade | undefined;
333
+ }): Promise<GradeResult>;
334
+
335
+ /**
336
+ * Per-run isolation. Each benchmark run gets one sandbox directory tree:
337
+ *
338
+ * <sandbox>/
339
+ * home/ → isolated WRONGSTACK_HOME (config seed + all session JSONL)
340
+ * work/<id>/ → one copy of a task template per (task × cell)
341
+ *
342
+ * The isolated home keeps the bench off the developer's real ~/.wrongstack
343
+ * (config, sessions, models cache). Each task workdir hashes to its own
344
+ * project slug under home/projects/, so concurrent runs never share a session
345
+ * file even though they share one home.
346
+ */
347
+ interface Sandbox {
348
+ /** Root sandbox dir. */
349
+ root: string;
350
+ /** Isolated WRONGSTACK_HOME. */
351
+ homeDir: string;
352
+ /** Directory that holds per-task workdirs. */
353
+ workRoot: string;
354
+ }
355
+ /** Create the sandbox tree and seed the isolated home's config.json. */
356
+ declare function createSandbox(opts: {
357
+ /** Where to create the sandbox. Defaults to an OS temp dir. */
358
+ baseDir?: string | undefined;
359
+ maxIterations: number;
360
+ yolo: boolean;
361
+ }): Promise<Sandbox>;
362
+ /**
363
+ * Copy a task template into a fresh workdir. The directory name embeds the
364
+ * cell label and task id so it is both unique (parallel-safe) and debuggable.
365
+ */
366
+ declare function prepareWorkdir(sandbox: Sandbox, templateDir: string, taskId: string, cellLabel: string, exclude?: string[] | undefined): Promise<string>;
367
+ /** Remove the whole sandbox tree. Best-effort. */
368
+ declare function cleanupSandbox(sandbox: Sandbox): Promise<void>;
369
+
370
+ interface RunBenchmarkOptions {
371
+ suite: BenchSuite;
372
+ /** Suite-specific deterministic grader. */
373
+ grade: (args: {
374
+ workdir: string;
375
+ task: BenchTask;
376
+ cell: ModelCell;
377
+ timeoutMs: number;
378
+ }) => Promise<GradeResult>;
379
+ config: BenchConfig;
380
+ cliVersion: string;
381
+ /** Tool names available to the agent — folded into the fingerprint. */
382
+ toolNames: string[];
383
+ /** Node executable. */
384
+ nodeBin: string;
385
+ /** Path to the wstack CLI entry. */
386
+ wstackEntry: string;
387
+ /** Cap the number of tasks (cheap smoke runs). */
388
+ limit?: number | undefined;
389
+ /** Where the sandbox is created (default OS temp). */
390
+ sandboxBaseDir?: string | undefined;
391
+ /** Extra env for the subprocess (provider keys are inherited from process.env). */
392
+ env?: NodeJS.ProcessEnv | undefined;
393
+ /** Keep the sandbox on disk after the run (debugging). */
394
+ keepSandbox?: boolean | undefined;
395
+ /** Progress callback (one line per event). */
396
+ onProgress?: ((msg: string) => void) | undefined;
397
+ /** Injected clock for the report timestamp (tests pass a fixed value). */
398
+ now?: (() => string) | undefined;
399
+ }
400
+ /**
401
+ * Run the full benchmark: load the task subset, fan every (task × cell) pair
402
+ * out through isolated subprocesses, grade deterministically, and fold into a
403
+ * fingerprint-stamped report.
404
+ */
405
+ declare function runBenchmark(opts: RunBenchmarkOptions): Promise<BenchReport>;
406
+
407
+ /**
408
+ * Write the machine-readable report artifacts:
409
+ * - results.jsonl → one line per (task × cell), for reproducibility
410
+ * - summary.json → fingerprint + folded cell results
411
+ *
412
+ * The markdown report is derived from summary.json (see report/markdown.ts), so
413
+ * `wstack bench report` can re-render without re-running anything.
414
+ */
415
+ declare function writeJsonArtifacts(outDir: string, report: BenchReport): Promise<void>;
416
+ /** Read back a summary.json into the partial report shape markdown needs. */
417
+ declare function readSummary(outDir: string): Promise<Pick<BenchReport, 'suite' | 'finishedAt' | 'fingerprint' | 'cells'>>;
418
+
419
+ /**
420
+ * Render the human-facing leaderboard. The header carries the harness
421
+ * fingerprint: rows are only comparable across reports that share it. The body
422
+ * sorts cells by pass rate (the headline correctness metric), highest first.
423
+ */
424
+ declare function renderMarkdownReport(report: Pick<BenchReport, 'suite' | 'finishedAt' | 'fingerprint' | 'cells'>): string;
425
+ /** Build the full fingerprint-stamped header line for terminal echo. */
426
+ declare function reportHeaderLine(fp: HarnessFingerprint): string;
427
+
428
+ /**
429
+ * One SWE-bench prediction row, in the exact shape the official harness
430
+ * (`princeton-nlp/SWE-bench`) consumes via `--predictions_path`.
431
+ */
432
+ interface SwebenchPrediction {
433
+ instance_id: string;
434
+ /** The model/system label — becomes a column in the official report. */
435
+ model_name_or_path: string;
436
+ /** The unified diff the agent produced. */
437
+ model_patch: string;
438
+ }
439
+ /**
440
+ * Write a `predictions.jsonl` for one model cell. SWE-bench grading is delegated
441
+ * to the canonical, version-sensitive harness rather than re-implemented here:
442
+ * we own running the agent and producing a conformant patch; the official tool
443
+ * owns the Docker execution and pass/fail verdict.
444
+ *
445
+ * Returns the file path written.
446
+ */
447
+ declare function writePredictionsJsonl(outDir: string, cellLabel: string, predictions: SwebenchPrediction[]): Promise<string>;
448
+ /**
449
+ * Write one instance's prediction to its own file under
450
+ * `<predictionsDir>/<cell>/<instance>.json`. Distinct files per instance are
451
+ * concurrency-safe — the SWE-bench grader runs inside the orchestrator's
452
+ * parallel fan-out, so appending to a shared jsonl would race. Call
453
+ * {@link collectCellPredictions} after the run to merge them.
454
+ */
455
+ declare function writeInstancePrediction(predictionsDir: string, cellLabel: string, prediction: SwebenchPrediction): Promise<void>;
456
+ /** Read back every per-instance prediction written for one cell. */
457
+ declare function collectCellPredictions(predictionsDir: string, cellLabel: string): Promise<SwebenchPrediction[]>;
458
+ /**
459
+ * Parse an official SWE-bench evaluation report JSON for the set of resolved
460
+ * instance ids. The harness writes `resolved_ids` (newer) or a per-instance
461
+ * `{ resolved: bool }` map; both shapes are handled so this keeps working across
462
+ * harness versions.
463
+ */
464
+ declare function parseResolvedIds(reportJson: unknown): Set<string>;
465
+
466
+ /** Everything needed to run one (task × cell) subprocess. */
467
+ interface RunWstackOptions {
468
+ /** Node executable (process.execPath). */
469
+ nodeBin: string;
470
+ /** Path to the wstack CLI entry (dist/index.js) — or a fake in tests. */
471
+ wstackEntry: string;
472
+ /** Isolated WRONGSTACK_HOME for this run. */
473
+ homeDir: string;
474
+ /** Task workdir (becomes the subprocess cwd / projectRoot). */
475
+ workdir: string;
476
+ cell: ModelCell;
477
+ prompt: string;
478
+ timeoutMs: number;
479
+ /** Extra env merged over process.env (which already carries the provider keys). */
480
+ env?: NodeJS.ProcessEnv | undefined;
481
+ /** Extra CLI args appended after the standard set (Phase 2 hooks). */
482
+ extraArgs?: string[] | undefined;
483
+ }
484
+ /**
485
+ * Run the real `wstack` binary once, in single-shot `--output-json` mode, and
486
+ * parse its machine-readable result. This is the heart of the model-independent
487
+ * design: the subprocess is the *whole* harness (real wiring, real tools), and
488
+ * the only thing that varies between calls is `--provider`/`--model`.
489
+ *
490
+ * Never rejects — a crash, non-JSON output, or timeout becomes a RawRun with an
491
+ * explanatory status so the grader still produces a row.
492
+ */
493
+ declare function runWstack(opts: RunWstackOptions): Promise<RawRun>;
494
+ /**
495
+ * Map `items` through `fn` with at most `concurrency` in flight at once.
496
+ * Results preserve input order. A tiny dependency-free p-limit.
497
+ */
498
+ declare function mapWithConcurrency<T, R>(items: T[], concurrency: number, fn: (item: T, index: number) => Promise<R>): Promise<R[]>;
499
+
500
+ /**
501
+ * Derive model-free tool metrics from the isolated session JSONL the
502
+ * subprocess wrote. Everything here comes from `tool_call_end` events
503
+ * (`{ name, ok }`) and provider retry/error events — no LLM, no heuristics.
504
+ *
505
+ * Returns zeroed metrics (never throws) when the session log is missing or
506
+ * unreadable: a crashed run still produces a valid, gradeable TaskResult.
507
+ */
508
+ declare function readToolMetrics(opts: {
509
+ homeDir: string;
510
+ /** The task workdir — the subprocess used this as its projectRoot. */
511
+ workdir: string;
512
+ }): Promise<ToolMetrics>;
513
+
514
+ /**
515
+ * Aider polyglot benchmark loader.
516
+ *
517
+ * The polyglot-benchmark repo (https://github.com/Aider-AI/polyglot-benchmark)
518
+ * lays exercises out in Exercism form:
519
+ *
520
+ * <root>/<language>/exercises/practice/<slug>/
521
+ * .docs/instructions.md ← problem statement
522
+ * .meta/config.json ← Exercism file manifest (solution/test/example)
523
+ * .meta/example.<ext> ← reference solution (EXCLUDED from the agent)
524
+ * <solution files> ← stubs the agent edits
525
+ * <test files> ← the hidden tests the grader runs
526
+ *
527
+ * We do NOT vendor the exercises (225 across 6 languages); the caller points
528
+ * `--polyglot-dir` at a local checkout.
529
+ */
530
+ /** Per-language test command + optional dependency-install step. */
531
+ interface LanguageRunner {
532
+ /** Directory name under the polyglot root. */
533
+ dir: string;
534
+ /** argv for the test command, run in the workdir. */
535
+ test: (testFiles: string[]) => {
536
+ command: string;
537
+ args: string[];
538
+ };
539
+ /** argv for an optional setup/install step run before tests. */
540
+ setup?: {
541
+ command: string;
542
+ args: string[];
543
+ } | undefined;
544
+ }
545
+ declare const LANGUAGE_RUNNERS: Record<string, LanguageRunner>;
546
+ /** Metadata the polyglot grader reads off each task. */
547
+ interface PolyglotMeta {
548
+ language: string;
549
+ solutionFiles: string[];
550
+ testFiles: string[];
551
+ testCommand: {
552
+ command: string;
553
+ args: string[];
554
+ };
555
+ setupCommand?: {
556
+ command: string;
557
+ args: string[];
558
+ } | undefined;
559
+ }
560
+ declare function createPolyglotSuite(opts: {
561
+ /** Local checkout of the polyglot-benchmark repo. */
562
+ polyglotDir: string;
563
+ /** Restrict to these languages (default: all present). */
564
+ languages?: string[] | undefined;
565
+ }): BenchSuite;
566
+
567
+ /**
568
+ * SWE-bench Verified adapter (Phase 2).
569
+ *
570
+ * Unlike polyglot (a self-contained directory of exercises), SWE-bench requires
571
+ * a materialized repo at a specific base commit AND the task's pinned execution
572
+ * environment — which the official harness ships as per-instance Docker images.
573
+ * Running it without Docker is not meaningful, so this adapter is wired and
574
+ * fingerprint-aware but gated: `loadTasks` throws an actionable error unless a
575
+ * prepared dataset directory is supplied and `docker` is enabled.
576
+ *
577
+ * The fixed instance subset lives in `subsets/swe-bench-verified-50.json` so a
578
+ * model comparison always grades the exact same issues (reproducibility).
579
+ */
580
+ interface SwebenchOptions {
581
+ /**
582
+ * Directory of prepared instances. Expected layout (produced by your
583
+ * SWE-bench setup step — see packages/bench/README.md):
584
+ * <datasetDir>/<instance_id>/
585
+ * repo/ ← git checkout at base_commit
586
+ * instance.json ← { problem_statement, test_patch, FAIL_TO_PASS, PASS_TO_PASS, image }
587
+ */
588
+ datasetDir?: string | undefined;
589
+ /** Must be true to actually build tasks — the env is Docker-backed. */
590
+ docker?: boolean | undefined;
591
+ /** Override the committed subset file. */
592
+ subsetFile?: string | undefined;
593
+ }
594
+ interface SwebenchMeta {
595
+ instanceId: string;
596
+ instanceDir: string;
597
+ image?: string | undefined;
598
+ failToPass: string[];
599
+ passToPass: string[];
600
+ testPatch?: string | undefined;
601
+ }
602
+ /** Read the pinned instance-id subset (the canonical task set). */
603
+ declare function loadSubset(subsetFile?: string): Promise<string[]>;
604
+ declare function createSwebenchSuite(opts?: SwebenchOptions): BenchSuite;
605
+
606
+ export { type BenchConfig, type BenchReport, type BenchSuite, type BenchTask, type CellResult, type Exec, type ExecResult, type GradeResult, type HarnessFingerprint, LANGUAGE_RUNNERS, type ModelCell, type PolyglotMeta, type RawRun, type RunBenchmarkOptions, type RunWstackOptions, type Sandbox, type SuiteId, type SwebenchExternalGrade, type SwebenchMeta, type SwebenchOptions, type SwebenchPrediction, type TaskResult, type ToolMetrics, aggregateAll, aggregateCell, cleanupSandbox, collectCellPredictions, computeHarnessFingerprint, createPolyglotSuite, createSandbox, createSwebenchSuite, execCommand, extractModelPatch, extractPatchPaths, filterPatchExcludingPaths, filterPatchSections, fingerprintLabel, gradePolyglot, gradeSwebench, loadBenchConfig, loadSubset, mapWithConcurrency, median, parseBenchConfig, parseResolvedIds, prepareWorkdir, readSummary, readToolMetrics, renderMarkdownReport, reportHeaderLine, runBenchmark, runWstack, writeInstancePrediction, writeJsonArtifacts, writePredictionsJsonl };