@wrongstack/bench 0.260.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,1050 @@
1
+ import * as fs3 from 'fs/promises';
2
+ import { spawn, spawnSync } from 'child_process';
3
+ import { createHash } from 'crypto';
4
+ import * as path from 'path';
5
+ import * as os from 'os';
6
+ import { resolveWstackPaths } from '@wrongstack/core';
7
+ import { fileURLToPath } from 'url';
8
+
9
+ // src/aggregate.ts
10
/**
 * Summarise every task result that belongs to one benchmark cell.
 *
 * @param {object} cell - Cell descriptor; echoed back on the summary.
 * @param {Array<object>} results - Per-task results. Each one is read for
 *   `grade.graded`, `grade.passed`, `run.*` and `tools.*`.
 * @returns {object} Aggregate metrics for the cell.
 */
function aggregateCell(cell, results) {
  const taskCount = results.length;
  if (taskCount === 0) {
    // No tasks at all: every metric is zero except the edit-apply rate,
    // which is reported as 1.
    return {
      cell,
      taskCount: 0,
      gradedCount: 0,
      passRate: 0,
      editApplyRate: 1,
      avgCostUsd: 0,
      avgTokensIn: 0,
      avgTokensOut: 0,
      p50Iterations: 0,
      p50ElapsedMs: 0,
      timeoutRate: 0,
      totalRateLimitRetries: 0
    };
  }

  let gradedCount = 0;
  let passedCount = 0;
  let timeoutCount = 0;
  for (const result of results) {
    // A result is "graded" unless the grader explicitly reported `graded: false`.
    if (result.grade.graded !== false) {
      gradedCount += 1;
      if (result.grade.passed) passedCount += 1;
    }
    if (result.run.status === "timeout") timeoutCount += 1;
  }

  const editCalls = sum(results, (r) => r.tools.editCalls);
  const editErrors = sum(results, (r) => r.tools.editErrors);
  const perTask = (pick) => sum(results, pick) / taskCount;

  return {
    cell,
    taskCount,
    gradedCount,
    // The pass rate is measured over graded results only.
    passRate: gradedCount === 0 ? 0 : passedCount / gradedCount,
    editApplyRate: editCalls === 0 ? 1 : (editCalls - editErrors) / editCalls,
    avgCostUsd: perTask((r) => r.run.costUsd),
    avgTokensIn: perTask((r) => r.run.tokensIn),
    avgTokensOut: perTask((r) => r.run.tokensOut),
    p50Iterations: median(results.map((r) => r.run.iterations)),
    p50ElapsedMs: median(results.map((r) => r.run.elapsedMs)),
    timeoutRate: timeoutCount / taskCount,
    totalRateLimitRetries: sum(results, (r) => r.tools.rateLimitRetries)
  };
}
49
/**
 * Build one summary per configured cell, in the order the cells are given.
 *
 * @param {Array<object>} cells - Cell descriptors (matched by `label`).
 * @param {Array<object>} results - All task results across every cell.
 * @returns {Array<object>} One `aggregateCell` summary per cell.
 */
function aggregateAll(cells, results) {
  return cells.map((cell) => {
    // Results are attributed to a cell purely by label equality.
    const ownResults = results.filter((r) => r.cell.label === cell.label);
    return aggregateCell(cell, ownResults);
  });
}
57
/**
 * Median of a list of numbers; the input array is left untouched.
 *
 * @param {number[]} values
 * @returns {number} 0 for an empty list, the middle element for an odd
 *   count, and the mean of the two middle elements for an even count.
 */
function median(values) {
  const count = values.length;
  if (count === 0) return 0;
  const ordered = values.slice();
  ordered.sort((x, y) => x - y);
  const upper = Math.floor(count / 2);
  if (count % 2 !== 0) return ordered[upper];
  return (ordered[upper - 1] + ordered[upper]) / 2;
}
63
/**
 * Add up one numeric projection of every item.
 *
 * @param {Iterable<*>} items
 * @param {(item: *) => number} pick - Extracts the number to add.
 * @returns {number} The total, or 0 when there are no items.
 */
function sum(items, pick) {
  let running = 0;
  const cursor = items[Symbol.iterator]();
  for (let step = cursor.next(); !step.done; step = cursor.next()) {
    running = running + pick(step.value);
  }
  return running;
}
68
// Values used by `parseBenchConfig` when the config file omits a setting.
const DEFAULTS = {
  maxIterations: 40,
  concurrency: 4,
  // 600 000 ms, i.e. ten minutes.
  timeoutMs: 10 * 60 * 1000
};
73
/**
 * Validate an already-parsed bench config and fill in defaults.
 *
 * @param {*} raw - The decoded JSON value.
 * @returns {{maxIterations: number, concurrency: number, timeoutMs: number,
 *   cells: Array<{label: string, provider: string, model: string}>}}
 * @throws {Error} When the value is not an object, `cells` is missing or
 *   empty, a cell is malformed, two cells share a label, or a numeric
 *   setting is invalid.
 */
function parseBenchConfig(raw) {
  if (raw === null || typeof raw !== "object") {
    throw new Error("bench config must be a JSON object");
  }
  const cellsRaw = raw["cells"];
  if (!Array.isArray(cellsRaw) || cellsRaw.length === 0) {
    throw new Error('bench config "cells" must be a non-empty array');
  }

  const isNonEmptyString = (v) => typeof v === "string" && v.length !== 0;
  const usedLabels = new Set();

  const toCell = (entry, i) => {
    if (entry === null || typeof entry !== "object") {
      throw new Error(`cells[${i}] must be an object`);
    }
    const provider = entry["provider"];
    const model = entry["model"];
    if (!isNonEmptyString(provider)) {
      throw new Error(`cells[${i}].provider must be a non-empty string`);
    }
    if (!isNonEmptyString(model)) {
      throw new Error(`cells[${i}].model must be a non-empty string`);
    }
    // An explicit label wins; otherwise the cell is named "provider/model".
    let label = `${provider}/${model}`;
    if (isNonEmptyString(entry["label"])) {
      label = entry["label"];
    }
    if (usedLabels.has(label)) {
      throw new Error(`duplicate cell label "${label}" \u2014 labels must be unique`);
    }
    usedLabels.add(label);
    return { label, provider, model };
  };

  // Cells are validated before the numeric settings, so cell errors surface first.
  const cells = cellsRaw.map(toCell);
  return {
    maxIterations: positiveInt(raw["maxIterations"], DEFAULTS.maxIterations, "maxIterations"),
    concurrency: positiveInt(raw["concurrency"], DEFAULTS.concurrency, "concurrency"),
    timeoutMs: positiveInt(raw["timeoutMs"], DEFAULTS.timeoutMs, "timeoutMs"),
    cells
  };
}
108
/**
 * Read a bench config file from disk, decode it as JSON and validate it.
 *
 * @param {string} path7 - Location of the JSON config file.
 * @returns {Promise<object>} The validated config produced by `parseBenchConfig`.
 * @throws {Error} When the file cannot be read or is not valid JSON. The
 *   underlying error is attached as `cause` so its code and stack are not
 *   lost. Validation errors from `parseBenchConfig` propagate unchanged.
 */
async function loadBenchConfig(path7) {
  const reason = (err) => (err instanceof Error ? err.message : String(err));

  let raw;
  try {
    raw = await fs3.readFile(path7, "utf8");
  } catch (err) {
    throw new Error(`cannot read bench config at ${path7}: ${reason(err)}`, { cause: err });
  }

  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (err) {
    throw new Error(`bench config at ${path7} is not valid JSON: ${reason(err)}`, { cause: err });
  }

  return parseBenchConfig(parsed);
}
127
/**
 * Validate an optional numeric setting and round it down to an integer.
 *
 * @param {*} value - The raw setting; `undefined` selects the fallback.
 * @param {number} fallback - Returned unchanged when `value` is `undefined`.
 * @param {string} name - Setting name used in the error message.
 * @returns {number} An integer that is at least 1 (or the fallback).
 * @throws {Error} When the value is not a finite number, is not positive,
 *   or rounds down to 0.
 */
function positiveInt(value, fallback, name) {
  if (value === void 0) return fallback;
  const floored = typeof value === "number" && Number.isFinite(value) ? Math.floor(value) : NaN;
  // Checking the floored value also rejects fractions such as 0.5, which are
  // "positive" but would otherwise be returned as 0.
  if (!(floored >= 1)) {
    throw new Error(`${name} must be a positive number`);
  }
  return floored;
}
134
/**
 * Run a command to completion and capture its output. Never rejects: spawn
 * failures and process errors are reported through the resolved value.
 *
 * @param {object} opts
 * @param {string} opts.command - Program to run.
 * @param {string[]} opts.args - Arguments; quoted with `shellQuote` in shell mode.
 * @param {string} [opts.cwd] - Working directory.
 * @param {object} [opts.env] - Extra environment, layered over `process.env`.
 * @param {number} opts.timeoutMs - After this long the child is sent SIGKILL.
 * @param {boolean} [opts.shell=true] - Run the joined command line through a shell.
 * @returns {Promise<{exitCode: (number|null), stdout: string, stderr: string,
 *   timedOut: boolean}>} `exitCode` is null when the process could not be
 *   started or failed with an `error` event.
 */
function execCommand(opts) {
  const useShell = opts.shell ?? true;
  return new Promise((resolve) => {
    let child;
    try {
      const spawnOptions = {
        cwd: opts.cwd,
        env: { ...process.env, ...opts.env },
        windowsHide: true
      };
      if (useShell) {
        const line = [opts.command, ...opts.args.map(shellQuote)].join(" ");
        child = spawn(line, { ...spawnOptions, shell: true });
      } else {
        child = spawn(opts.command, opts.args, spawnOptions);
      }
    } catch (err) {
      resolve({
        exitCode: null,
        stdout: "",
        stderr: err instanceof Error ? err.message : String(err),
        timedOut: false
      });
      return;
    }

    // Raw chunks are kept and decoded once at the end: decoding chunk by
    // chunk corrupts a multi-byte UTF-8 character that straddles two chunks.
    const stdoutChunks = [];
    const stderrChunks = [];
    const keep = (chunks) => (d) => {
      chunks.push(Buffer.isBuffer(d) ? d : Buffer.from(d));
    };
    let errorNote = "";
    let timedOut = false;
    let settled = false;

    const timer = setTimeout(() => {
      timedOut = true;
      try {
        child.kill("SIGKILL");
      } catch {
        // Best effort: the process may already be gone.
      }
    }, opts.timeoutMs);

    child.stdout?.on("data", keep(stdoutChunks));
    child.stderr?.on("data", keep(stderrChunks));

    // Both `error` and `close` can fire; only the first one settles the promise.
    const done = (exitCode) => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      resolve({
        exitCode,
        stdout: Buffer.concat(stdoutChunks).toString("utf8"),
        stderr: Buffer.concat(stderrChunks).toString("utf8") + errorNote,
        timedOut
      });
    };

    child.on("error", (err) => {
      // The process error is appended to whatever stderr was captured so far.
      errorNote += `\n${err.message}`;
      done(null);
    });
    child.on("close", (code) => done(code));
  });
}
194
/**
 * Quote one argument so the shell hands it to the program as a single,
 * literal word.
 *
 * Arguments made only of characters that are never special to a shell are
 * returned unchanged. Everything else, including the empty string and
 * anything containing whitespace, `;`, `&`, `|`, `$`, backticks, quotes or
 * glob characters, is quoted:
 *  - on Windows with double quotes, escaping embedded `"` as `\"`;
 *  - elsewhere with single quotes, inside which a POSIX shell expands
 *    nothing, spelling an embedded `'` as `'\''`.
 *
 * @param {string} arg
 * @returns {string} The argument, quoted when needed.
 */
function shellQuote(arg) {
  if (/^[A-Za-z0-9_\-+=:,.\/@]+$/.test(arg)) return arg;
  if (process.platform === "win32") {
    return `"${arg.replace(/"/g, '\\"')}"`;
  }
  return `'${arg.replace(/'/g, "'\\''")}'`;
}
198
/**
 * Derive a short, stable identifier for the harness configuration.
 *
 * The hash is the first 12 hex characters of the SHA-256 of the JSON form of
 * `{cliVersion, toolNames (sorted), maxIterations, yolo, subsetId}`.
 *
 * @param {{cliVersion: *, toolNames: Iterable<string>, maxIterations: *,
 *   yolo: *, subsetId: *}} input
 * @returns {object} The same fields (with `toolNames` sorted) plus `hash`.
 */
function computeHarnessFingerprint(input) {
  const { cliVersion, maxIterations, yolo, subsetId } = input;
  // Sort a copy so tool order in the input does not change the hash.
  // NOTE(review): ordering uses `localeCompare`, which may depend on the
  // runtime's locale data - confirm that is acceptable for a fingerprint.
  const toolNames = Array.from(input.toolNames);
  toolNames.sort((left, right) => left.localeCompare(right));

  const described = { cliVersion, toolNames, maxIterations, yolo, subsetId };
  const digest = createHash("sha256").update(JSON.stringify(described)).digest("hex");
  return { ...described, hash: digest.slice(0, 12) };
}
217
/**
 * One-line, human-readable description of a harness fingerprint.
 *
 * @param {{cliVersion: *, hash: string, maxIterations: *, yolo: *}} fp
 * @returns {string} Segments joined by a middle dot; "yolo" is appended only
 *   when `fp.yolo` is truthy.
 */
function fingerprintLabel(fp) {
  const segments = [];
  segments.push(`wrongstack@${fp.cliVersion}`);
  segments.push(`fp:${fp.hash}`);
  segments.push(`maxIter=${fp.maxIterations}`);
  if (fp.yolo) {
    segments.push("yolo");
  }
  return segments.join(" \xB7 ");
}
222
+
223
+ // src/graders/polyglot-grader.ts
224
/**
 * Grade a polyglot task by running its optional setup command and then its
 * test command inside the task's working directory.
 *
 * @param {object} opts
 * @param {object} opts.task - Task whose `meta` carries `testCommand` and,
 *   optionally, `setupCommand` (each `{command, args}`).
 * @param {string} opts.workdir - Directory both commands run in.
 * @param {number} opts.timeoutMs - Timeout applied to each command separately.
 * @returns {Promise<{passed: boolean, detail?: string}>}
 */
async function gradePolyglot(opts) {
  const { setupCommand, testCommand } = opts.task.meta;
  const runIn = ({ command, args }) =>
    execCommand({ command, args, cwd: opts.workdir, timeoutMs: opts.timeoutMs });

  if (setupCommand) {
    const setup = await runIn(setupCommand);
    // Any non-zero (or missing) exit code from setup fails the task outright.
    if (setup.exitCode !== 0) {
      const output = tail(setup.stderr || setup.stdout);
      return {
        passed: false,
        detail: `setup failed (${setupCommand.command}): ${output}`
      };
    }
  }

  const test = await runIn(testCommand);
  if (test.timedOut) {
    return { passed: false, detail: "test command timed out" };
  }
  if (test.exitCode !== 0) {
    return { passed: false, detail: tail(test.stdout + "\n" + test.stderr) };
  }
  return { passed: true };
}
254
/**
 * Trim a string and keep at most its last 500 characters.
 *
 * @param {string} s
 * @returns {string} The trimmed text, prefixed with an ellipsis when it had
 *   to be shortened.
 */
function tail(s) {
  const LIMIT = 500;
  const trimmed = s.trim();
  if (trimmed.length <= LIMIT) {
    return trimmed;
  }
  return "\u2026" + trimmed.slice(-LIMIT);
}
258
/**
 * Write all predictions of one cell as a JSON-lines file.
 *
 * @param {string} outDir - Output directory (created when missing).
 * @param {string} cellLabel - Label used, slugified, in the file name.
 * @param {Array<object>} predictions - One JSON object per output line.
 * @returns {Promise<string>} Path of the written file.
 */
async function writePredictionsJsonl(outDir, cellLabel, predictions) {
  await fs3.mkdir(outDir, { recursive: true });
  const target = path.join(outDir, `predictions-${slug(cellLabel)}.jsonl`);
  const serialized = predictions.map((p) => JSON.stringify(p));
  const body = serialized.join("\n");
  // A trailing newline is added only when there is any content at all.
  const payload = body ? body + "\n" : body + "";
  await fs3.writeFile(target, payload, "utf8");
  return target;
}
265
/**
 * Persist a single prediction as `<predictionsDir>/<cell>/<instance>.json`.
 *
 * @param {string} predictionsDir - Root directory for predictions.
 * @param {string} cellLabel - Cell label; slugified into the sub-directory name.
 * @param {{instance_id: string}} prediction - Serialized as-is; its
 *   `instance_id` (slugified) names the file.
 * @returns {Promise<void>}
 */
async function writeInstancePrediction(predictionsDir, cellLabel, prediction) {
  const cellDir = path.join(predictionsDir, slug(cellLabel));
  await fs3.mkdir(cellDir, { recursive: true });
  const fileName = `${slug(prediction.instance_id)}.json`;
  const target = path.join(cellDir, fileName);
  await fs3.writeFile(target, JSON.stringify(prediction), "utf8");
}
274
/**
 * Load every per-instance prediction stored for a cell.
 *
 * @param {string} predictionsDir - Root directory for predictions.
 * @param {string} cellLabel - Cell label; slugified into the sub-directory name.
 * @returns {Promise<Array<*>>} Parsed predictions in file-name order. An
 *   unreadable directory yields `[]`; unreadable or malformed files are skipped.
 */
async function collectCellPredictions(predictionsDir, cellLabel) {
  const cellDir = path.join(predictionsDir, slug(cellLabel));

  let names;
  try {
    names = await fs3.readdir(cellDir);
  } catch {
    return [];
  }

  const jsonFiles = names.filter((name) => name.endsWith(".json"));
  jsonFiles.sort();

  const predictions = [];
  for (const fileName of jsonFiles) {
    try {
      const text = await fs3.readFile(path.join(cellDir, fileName), "utf8");
      predictions.push(JSON.parse(text));
    } catch {
      // Skip files that cannot be read or parsed.
    }
  }
  return predictions;
}
291
/**
 * Extract the resolved instance ids from a grading report.
 *
 * Two report shapes are understood: a top-level `resolved_ids` array (only
 * its string entries are used), or an object keyed by instance id whose
 * values carry `resolved: true`.
 *
 * @param {*} reportJson - Decoded report.
 * @returns {Set<string>} The resolved ids; empty for a non-object input.
 */
function parseResolvedIds(reportJson) {
  const resolved = new Set();
  if (reportJson === null || typeof reportJson !== "object") {
    return resolved;
  }

  const listed = reportJson["resolved_ids"];
  if (Array.isArray(listed)) {
    // When the array form is present it is authoritative.
    for (const candidate of listed) {
      if (typeof candidate === "string") resolved.add(candidate);
    }
    return resolved;
  }

  for (const key of Object.keys(reportJson)) {
    const entry = reportJson[key];
    const isObject = Boolean(entry) && typeof entry === "object";
    if (isObject && entry["resolved"] === true) {
      resolved.add(key);
    }
  }
  return resolved;
}
306
/**
 * Turn a label into a lowercase, dash-separated name of at most 60 characters.
 *
 * @param {string} s
 * @returns {string} The slug, or "cell" when nothing usable remains.
 */
function slug(s) {
  const dashed = s.toLowerCase().replace(/[^a-z0-9]+/g, "-");
  const stripped = dashed.replace(/^-+|-+$/g, "");
  const clipped = stripped.slice(0, 60);
  return clipped === "" ? "cell" : clipped;
}
309
+
310
+ // src/suites/swebench-patch.ts
311
/**
 * Produce the model's patch for a workdir: stage everything, take the cached
 * diff, then drop sections that touch test-patch files or harness artifacts.
 *
 * @param {object} opts
 * @param {string} opts.workdir - Git checkout the agent worked in.
 * @param {number} opts.timeoutMs - Timeout for each git invocation.
 * @param {string} [opts.testPatch] - Test patch whose paths are excluded.
 * @param {Function} [opts.exec] - Command runner; defaults to `execCommand`.
 * @returns {Promise<string>} The filtered diff text.
 */
async function extractModelPatch(opts) {
  const run = opts.exec ?? execCommand;
  const git = (args) =>
    run({
      command: "git",
      args,
      cwd: opts.workdir,
      timeoutMs: opts.timeoutMs,
      shell: false
    });

  // Stage first so that newly created files show up in the diff as well.
  await git(["add", "-A"]);
  const staged = await git(["diff", "--cached", "--no-color"]);

  let testPaths = new Set();
  if (opts.testPatch) {
    testPaths = extractPatchPaths(opts.testPatch);
  }

  const shouldDrop = (oldPath, newPath) => {
    if (testPaths.has(oldPath) || testPaths.has(newPath)) return true;
    return isHarnessArtifact(oldPath) || isHarnessArtifact(newPath);
  };
  return filterPatchSections(staged.stdout, shouldDrop);
}
334
/**
 * Tell whether a repo-relative path is a harness artifact: the root
 * `.gitignore`, or anything whose first path segment is `.wrongstack`.
 *
 * @param {string} p - Path using `/` separators.
 * @returns {boolean}
 */
function isHarnessArtifact(p) {
  if (p === ".gitignore") {
    return true;
  }
  const [topLevel] = p.split("/");
  return topLevel === ".wrongstack";
}
337
/**
 * Collect every file path mentioned in a patch's headers.
 *
 * `diff --git a/X b/Y` lines contribute both X and Y. `--- ` and `+++ `
 * lines contribute their path (minus an optional `a/` / `b/` prefix and any
 * trailing timestamp), except for `/dev/null`.
 *
 * @param {string} patch - Patch text.
 * @returns {Set<string>} The paths found.
 */
function extractPatchPaths(patch) {
  const found = new Set();
  const addFileHeader = (match) => {
    if (match === null) return;
    const target = match[1];
    if (target !== "/dev/null") {
      found.add(stripTimestamp(target));
    }
  };

  patch.split("\n").forEach((line) => {
    const gitHeader = /^diff --git a\/(.+?) b\/(.+)$/.exec(line);
    if (gitHeader) {
      found.add(gitHeader[1]);
      found.add(gitHeader[2]);
      return;
    }
    addFileHeader(/^--- (?:a\/)?(.+)$/.exec(line));
    addFileHeader(/^\+\+\+ (?:b\/)?(.+)$/.exec(line));
  });
  return found;
}
353
/**
 * Remove the sections of a patch whose old or new path is in `exclude`.
 *
 * @param {string} patch - Patch text.
 * @param {Set<string>} exclude - Paths to drop.
 * @returns {string} The patch without the excluded sections; the input is
 *   returned untouched when `exclude` is empty.
 */
function filterPatchExcludingPaths(patch, exclude) {
  if (exclude.size !== 0) {
    const touchesExcluded = (oldPath, newPath) => exclude.has(oldPath) || exclude.has(newPath);
    return filterPatchSections(patch, touchesExcluded);
  }
  return patch;
}
357
/**
 * Drop whole per-file sections from a git patch.
 *
 * A section starts at a `diff --git a/X b/Y` line and runs until the next
 * such line. Lines before the first header are always kept.
 *
 * @param {string} patch - Patch text.
 * @param {(oldPath: string, newPath: string) => *} shouldDrop - Called once
 *   per header; a truthy result drops that section.
 * @returns {string} The remaining lines, joined with newlines.
 */
function filterPatchSections(patch, shouldDrop) {
  let dropping = false;
  const kept = patch.split("\n").filter((line) => {
    const header = /^diff --git a\/(.+?) b\/(.+)$/.exec(line);
    if (header !== null) {
      // Each header re-evaluates whether the section that follows is dropped.
      dropping = shouldDrop(header[1], header[2]);
    }
    return !dropping;
  });
  return kept.join("\n");
}
370
/**
 * Remove the trailing timestamp from the path of a `---` / `+++` patch header.
 *
 * The cut happens at the first TAB character only, so paths that contain
 * spaces are preserved. The separator is written as an explicit `"\t"`
 * escape so it cannot be mistaken for (or silently turned into) a space.
 * NOTE(review): assumes timestamps are tab-separated from the path, as the
 * original variable name `tab` suggests - confirm against the patch producers.
 *
 * @param {string} p - Header path, possibly followed by a tab and a timestamp.
 * @returns {string} The path without the timestamp.
 */
function stripTimestamp(p) {
  const tab = p.indexOf("\t");
  return tab === -1 ? p : p.slice(0, tab);
}
374
+
375
+ // src/graders/swebench-grader.ts
376
/**
 * Grade a SWE-bench task: export the model's patch as a prediction and, when
 * an external grader is supplied, ask it for a verdict.
 *
 * @param {object} opts
 * @param {object} opts.task - Task whose `meta` holds the instance data.
 * @param {object} opts.cell - Cell; its `label` names the prediction.
 * @param {string} opts.workdir - Checkout the agent edited.
 * @param {string} opts.predictionsDir - Where the prediction file is written.
 * @param {number} opts.timeoutMs - Timeout passed to git and the external grader.
 * @param {Function} [opts.exec] - Command runner forwarded to `extractModelPatch`.
 * @param {Function} [opts.externalGrade] - Returns a verdict, or `undefined`
 *   to leave the task ungraded.
 * @returns {Promise<{passed: *, graded: boolean, detail?: string}>}
 */
async function gradeSwebench(opts) {
  const meta = opts.task.meta;
  const { workdir, timeoutMs } = opts;

  const patch = await extractModelPatch({
    workdir,
    testPatch: meta.testPatch,
    timeoutMs,
    exec: opts.exec
  });

  // The prediction is written even when the patch turns out to be empty.
  const prediction = {
    instance_id: meta.instanceId,
    model_name_or_path: opts.cell.label,
    model_patch: patch
  };
  await writeInstancePrediction(opts.predictionsDir, opts.cell.label, prediction);

  if (patch.trim().length === 0) {
    return { passed: false, graded: true, detail: "empty patch (agent made no edits)" };
  }

  let verdict = void 0;
  if (opts.externalGrade) {
    verdict = await opts.externalGrade({
      instanceId: meta.instanceId,
      patch,
      image: meta.image,
      failToPass: meta.failToPass,
      passToPass: meta.passToPass,
      testPatch: meta.testPatch,
      workdir,
      timeoutMs
    });
  }
  if (verdict !== void 0) {
    return { passed: verdict, graded: true };
  }

  // No verdict available: report the task as exported but not graded.
  return {
    passed: false,
    graded: false,
    detail: `patch exported (${patch.length} bytes) \u2014 grade with the official SWE-bench harness`
  };
}
413
/**
 * Create the directory layout for a benchmark run: a `home` directory holding
 * `config.json` and a `work` directory for per-task checkouts.
 *
 * @param {object} opts
 * @param {string} [opts.baseDir] - Root to use; a fresh temp directory
 *   (prefix `wstack-bench-`) is created when omitted.
 * @param {*} opts.yolo - Written to the config as `yolo`.
 * @param {*} opts.maxIterations - Written to the config as `tools.maxIterations`.
 * @returns {Promise<{root: string, homeDir: string, workRoot: string}>}
 */
async function createSandbox(opts) {
  let root = opts.baseDir;
  if (root === void 0 || root === null) {
    root = await fs3.mkdtemp(path.join(os.tmpdir(), "wstack-bench-"));
  }
  await fs3.mkdir(root, { recursive: true });

  const homeDir = path.join(root, "home");
  const workRoot = path.join(root, "work");
  for (const dir of [homeDir, workRoot]) {
    await fs3.mkdir(dir, { recursive: true });
  }

  const settings = {
    yolo: opts.yolo,
    tools: { maxIterations: opts.maxIterations },
    session: { auditLevel: "standard" }
  };
  const configFile = path.join(homeDir, "config.json");
  await fs3.writeFile(configFile, JSON.stringify(settings, null, 2), "utf8");

  return { root, homeDir, workRoot };
}
428
/**
 * Create a fresh working directory for one (cell, task) pair by copying the
 * task template into the sandbox.
 *
 * @param {{workRoot: string}} sandbox - Sandbox from `createSandbox`.
 * @param {string} templateDir - Directory to copy.
 * @param {string} taskId - Task id; slugified into the directory name.
 * @param {string} cellLabel - Cell label; slugified into the directory name.
 * @param {string[]} [exclude] - Path segment names (e.g. `.meta`) that must
 *   not be copied; any entry below the template containing one is skipped.
 * @returns {Promise<string>} Path of the prepared directory.
 */
async function prepareWorkdir(sandbox, templateDir, taskId, cellLabel, exclude) {
  const safe = `${slug2(cellLabel)}__${slug2(taskId)}`;
  const dest = path.join(sandbox.workRoot, safe);
  // Start from a clean slate in case a previous run left this directory behind.
  await fs3.rm(dest, { recursive: true, force: true });

  const excludeSet = new Set(exclude ?? []);
  // Only the part of the path *below* the template is inspected. Looking at
  // the full source path would reject the whole template whenever one of its
  // parent directories happens to carry an excluded name.
  const isExcluded = (src) =>
    path
      .relative(templateDir, src)
      .split(/[\\/]/)
      .some((seg) => excludeSet.has(seg));

  await fs3.cp(templateDir, dest, {
    recursive: true,
    // Drop any path whose segments include an excluded name (e.g. `.meta`),
    // so the reference solution never reaches the agent's workdir.
    filter: excludeSet.size === 0 ? void 0 : (src) => !isExcluded(src)
  });
  return dest;
}
441
/**
 * Delete a sandbox directory tree. Failures are ignored on purpose: cleanup
 * is best effort and must not fail the run.
 *
 * @param {{root: string}} sandbox
 * @returns {Promise<void>}
 */
async function cleanupSandbox(sandbox) {
  try {
    await fs3.rm(sandbox.root, { recursive: true, force: true });
  } catch {
    // Intentionally ignored.
  }
}
444
/**
 * Turn a label into a lowercase, dash-separated name of at most 60 characters.
 *
 * @param {string} s
 * @returns {string} The slug, or "x" when nothing usable remains.
 */
function slug2(s) {
  const dashed = s.toLowerCase().replace(/[^a-z0-9]+/g, "-");
  const stripped = dashed.replace(/^-+|-+$/g, "");
  const clipped = stripped.slice(0, 60);
  return clipped === "" ? "x" : clipped;
}
447
/**
 * Run the wstack CLI once, non-interactively, and turn its outcome into a
 * run record. Never rejects.
 *
 * @param {object} opts
 * @param {string} opts.nodeBin - Node executable used to start the CLI.
 * @param {string} opts.wstackEntry - CLI entry script (first argument to node).
 * @param {string} opts.prompt - Prompt passed via `--prompt`.
 * @param {{provider: string, model: string}} opts.cell - Provider/model to use.
 * @param {string} opts.workdir - Working directory of the child process.
 * @param {string} opts.homeDir - Exposed to the child as `WRONGSTACK_HOME`.
 * @param {number} opts.timeoutMs - After this long the process tree is killed.
 * @param {object} [opts.env] - Extra environment, layered over `process.env`.
 * @param {string[]} [opts.extraArgs] - Appended to the CLI arguments.
 * @returns {Promise<object>} Run record with `status`, `finalText`,
 *   `iterations`, `tokensIn`, `tokensOut`, `costUsd`, `elapsedMs`, `exitCode`.
 *   Status is "timeout" when the timer fired and "crashed" when the process
 *   could not be started or printed no parseable result.
 */
async function runWstack(opts) {
  const args = [
    opts.wstackEntry,
    "--prompt",
    opts.prompt,
    "--provider",
    opts.cell.provider,
    "--model",
    opts.cell.model,
    "--output-json",
    "--no-tui",
    "--no-interactive",
    "--no-banner",
    "--yolo",
    "--no-models-refresh",
    "--skip-index",
    ...(opts.extraArgs ?? [])
  ];
  const startedAt = Date.now();

  return new Promise((resolve) => {
    let child;
    try {
      child = spawn(opts.nodeBin, args, {
        cwd: opts.workdir,
        env: { ...process.env, ...opts.env, WRONGSTACK_HOME: opts.homeDir },
        // windowsHide + no detached: per the repo spawn convention, detached on
        // win32 voids CREATE_NO_WINDOW and pops visible consoles for grandchildren.
        windowsHide: true
      });
    } catch (err) {
      resolve(
        crashed(startedAt, `spawn failed: ${err instanceof Error ? err.message : String(err)}`)
      );
      return;
    }

    // stdout is kept as raw chunks and decoded once on close: decoding chunk
    // by chunk corrupts a multi-byte UTF-8 character that straddles two
    // chunks, which can break the JSON result line.
    const stdoutChunks = [];
    let timedOut = false;
    let settled = false;

    const timer = setTimeout(() => {
      timedOut = true;
      treeKill(child);
    }, opts.timeoutMs);

    child.stdout?.on("data", (d) => {
      stdoutChunks.push(Buffer.isBuffer(d) ? d : Buffer.from(d));
    });
    // stderr is not part of the run record, so it is drained and discarded
    // rather than accumulated without bound.
    child.stderr?.resume();

    // Both `error` and `close` can fire; only the first one settles the promise.
    const finish = (run) => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      resolve(run);
    };

    // Record used when the process produced no usable result.
    const blank = (status, elapsedMs, exitCode) => ({
      status,
      finalText: null,
      iterations: 0,
      tokensIn: 0,
      tokensOut: 0,
      costUsd: 0,
      elapsedMs,
      exitCode
    });

    child.on("error", (err) => {
      finish(crashed(startedAt, `process error: ${err.message}`));
    });

    child.on("close", (code) => {
      const elapsedMs = Date.now() - startedAt;
      if (timedOut) {
        finish(blank("timeout", elapsedMs, code));
        return;
      }
      const parsed = parseOutputJson(Buffer.concat(stdoutChunks).toString("utf8"));
      if (!parsed) {
        // stderr is intentionally not surfaced in the run record; the caller
        // can read the session log. Keep the shape minimal.
        finish(blank("crashed", elapsedMs, code));
        return;
      }
      finish({ ...parsed, elapsedMs, exitCode: code });
    });
  });
}
540
/**
 * Find the CLI's JSON result in its stdout.
 *
 * Lines are examined from last to first; the first one that starts with `{`,
 * parses as JSON and has a string `status` wins.
 *
 * @param {string} stdout - Everything the CLI printed.
 * @returns {object|undefined} `{status, finalText, iterations, tokensIn,
 *   tokensOut, costUsd}`, or `undefined` when no result line exists.
 */
function parseOutputJson(stdout) {
  const candidates = stdout
    .split("\n")
    .map((text) => text.trim())
    .filter((text) => text.startsWith("{"))
    .reverse();

  for (const text of candidates) {
    let payload;
    try {
      payload = JSON.parse(text);
    } catch {
      continue;
    }
    if (typeof payload["status"] !== "string") continue;

    // A missing `usage` object is treated as empty, so every counter becomes 0.
    const usage = payload["usage"] ?? {};
    const finalText = payload["finalText"];
    return {
      status: normalizeStatus(payload["status"]),
      finalText: typeof finalText === "string" ? finalText : null,
      iterations: num(usage["iterations"]),
      tokensIn: num(usage["input"]),
      tokensOut: num(usage["output"]),
      costUsd: num(usage["cost"])
    };
  }
  return void 0;
}
564
/**
 * Map a reported status onto the set of statuses the harness understands.
 *
 * @param {*} s - Status reported by the CLI.
 * @returns {*} `s` itself when it is one of "completed", "failed", "aborted"
 *   or "max_iterations"; otherwise "failed".
 */
function normalizeStatus(s) {
  const recognised = s === "completed" || s === "failed" || s === "aborted" || s === "max_iterations";
  if (recognised) {
    return s;
  }
  return "failed";
}
575
/**
 * Coerce an untrusted value to a usable number.
 *
 * @param {*} v
 * @returns {number} `v` when it is a finite number, otherwise 0.
 */
function num(v) {
  if (typeof v !== "number") return 0;
  if (!Number.isFinite(v)) return 0;
  return v;
}
578
/**
 * Build the run record for a process that never produced a result.
 *
 * @param {number} startedAt - `Date.now()` timestamp taken when the run began.
 * @param {string} _reason - Accepted for the call sites; not stored in the record.
 * @returns {object} A "crashed" run with zeroed counters and a null exit code.
 */
function crashed(startedAt, _reason) {
  const elapsedMs = Date.now() - startedAt;
  const zeroed = { iterations: 0, tokensIn: 0, tokensOut: 0, costUsd: 0 };
  return {
    status: "crashed",
    finalText: null,
    ...zeroed,
    elapsedMs,
    exitCode: null
  };
}
590
/**
 * Terminate a child process.
 *
 * On Windows `taskkill /T /F` is run against the pid. Elsewhere the child is
 * sent SIGTERM, followed two seconds later by SIGKILL; that follow-up timer
 * is unref'ed so it does not keep the event loop alive.
 *
 * @param {import('child_process').ChildProcess} child
 * @returns {void} Does nothing when the child has no pid.
 */
function treeKill(child) {
  const { pid } = child;
  if (pid === void 0) return;

  if (process.platform === "win32") {
    spawnSync("taskkill", ["/pid", String(pid), "/T", "/F"], { windowsHide: true });
    return;
  }

  // Signalling a process that is already gone may throw; that is fine.
  const signal = (name) => {
    try {
      child.kill(name);
    } catch {
      // Ignored.
    }
  };
  signal("SIGTERM");
  setTimeout(() => signal("SIGKILL"), 2e3).unref();
}
607
/**
 * Map over `items` with an async function, running at most `concurrency`
 * calls at a time. Results keep the order of `items`.
 *
 * @param {Array<*>} items
 * @param {number} concurrency - Maximum number of calls in flight. Values
 *   below 1 behave like 1; a value that is not a number (e.g. `undefined`
 *   or `NaN`) also behaves like 1 instead of silently running nothing.
 * @param {(item: *, index: number) => Promise<*>} fn
 * @returns {Promise<Array<*>>} Rejects with the first error thrown by `fn`.
 */
async function mapWithConcurrency(items, concurrency, fn) {
  const results = new Array(items.length);
  let next = 0;

  // A NaN limit would create zero workers and return an array of empty slots.
  const requested = Number(concurrency);
  const usable = Number.isNaN(requested) ? 1 : requested;
  const limit = Math.max(1, Math.floor(Math.min(usable, items.length || 1)));

  // Each worker repeatedly claims the next unprocessed index until none is left.
  const workers = Array.from({ length: limit }, async () => {
    while (true) {
      const i = next++;
      if (i >= items.length) return;
      results[i] = await fn(items[i], i);
    }
  });
  await Promise.all(workers);
  return results;
}
621
// Lower-cased tool names that `readToolMetrics` counts as file-editing tools.
const EDIT_TOOLS = /* @__PURE__ */ new Set(
  ["edit", "write", "multiedit", "multi_edit", "str_replace", "apply_patch"]
);
629
/**
 * Count tool usage from the most recent session log of a run.
 *
 * The log is a JSON-lines file. `tool_call_end` events are counted as tool
 * calls (and as edit calls / edit errors when the tool is in `EDIT_TOOLS`);
 * `provider_retry` and `provider_error` events are both counted as
 * rate-limit retries.
 *
 * @param {{homeDir: string, workdir: string}} opts - Used to locate the
 *   project's sessions directory.
 * @returns {Promise<{totalCalls: number, editCalls: number,
 *   editErrors: number, rateLimitRetries: number}>} All zeros when no log
 *   can be found or read. Never rejects because of log content.
 */
async function readToolMetrics(opts) {
  const empty = {
    totalCalls: 0,
    editCalls: 0,
    editErrors: 0,
    rateLimitRetries: 0
  };

  let jsonlPath;
  try {
    const sessionsDir = resolveWstackPaths({
      projectRoot: opts.workdir,
      globalRoot: opts.homeDir
    }).projectSessions;
    jsonlPath = await newestJsonl(sessionsDir);
  } catch {
    return empty;
  }
  if (!jsonlPath) return empty;

  let raw;
  try {
    raw = await fs3.readFile(jsonlPath, "utf8");
  } catch {
    return empty;
  }

  const metrics = { ...empty };
  for (const line of raw.split("\n")) {
    const trimmed = line.trim();
    if (trimmed.length === 0) continue;

    let event;
    try {
      event = JSON.parse(trimmed);
    } catch {
      continue;
    }
    // A line can be valid JSON without being an event (`null`, a number, a
    // string). Skip it like a malformed line instead of failing on
    // property access.
    if (event === null || typeof event !== "object") continue;

    const type = event["type"];
    if (type === "tool_call_end") {
      metrics.totalCalls++;
      const name = typeof event["name"] === "string" ? event["name"].toLowerCase() : "";
      if (EDIT_TOOLS.has(name)) {
        metrics.editCalls++;
        // Only an explicit `ok: false` counts as a failed edit.
        if (event["ok"] === false) metrics.editErrors++;
      }
    } else if (type === "provider_retry" || type === "provider_error") {
      metrics.rateLimitRetries++;
    }
  }
  return metrics;
}
677
/**
 * Find the most recently modified `.jsonl` file directly inside a directory.
 *
 * @param {string} dir
 * @returns {Promise<string|undefined>} Full path of the newest file, or
 *   `undefined` when the directory cannot be listed or has no readable
 *   `.jsonl` file. Among equal timestamps the first one listed wins.
 */
async function newestJsonl(dir) {
  let names;
  try {
    names = await fs3.readdir(dir);
  } catch {
    return void 0;
  }

  let bestPath;
  let bestMtime;
  for (const name of names) {
    if (!name.endsWith(".jsonl")) continue;
    const candidate = path.join(dir, name);
    let mtimeMs;
    try {
      ({ mtimeMs } = await fs3.stat(candidate));
    } catch {
      // The file vanished or is unreadable; ignore it.
      continue;
    }
    if (bestPath === void 0 || mtimeMs > bestMtime) {
      bestPath = candidate;
      bestMtime = mtimeMs;
    }
  }
  return bestPath;
}
699
+
700
+ // src/orchestrate.ts
701
/**
 * Run every task of a suite against every configured cell and aggregate the
 * outcome.
 *
 * @param {object} opts
 * @param {object} opts.suite - Provides `id`, `loadTasks({limit})` and `subsetId(tasks)`.
 * @param {object} opts.config - Validated bench config (`cells`,
 *   `maxIterations`, `concurrency`, `timeoutMs`).
 * @param {Function} opts.grade - Called with `{workdir, task, cell, timeoutMs}`;
 *   a thrown error is recorded as a failed grade.
 * @param {string} opts.nodeBin - Node executable used to start the CLI.
 * @param {string} opts.wstackEntry - CLI entry script.
 * @param {*} opts.cliVersion - Recorded in the fingerprint.
 * @param {Iterable<string>} opts.toolNames - Recorded in the fingerprint.
 * @param {number} [opts.limit] - Forwarded to `suite.loadTasks`.
 * @param {object} [opts.env] - Extra environment for the CLI process.
 * @param {string} [opts.sandboxBaseDir] - Sandbox root; a temp dir when omitted.
 * @param {boolean} [opts.keepSandbox] - Skip sandbox removal when truthy.
 * @param {(line: string) => void} [opts.onProgress] - Progress sink.
 * @param {() => string} [opts.now] - Supplies `finishedAt`; defaults to the
 *   current time as an ISO string.
 * @returns {Promise<{suite: *, finishedAt: string, fingerprint: object,
 *   cells: Array<object>, results: Array<object>}>}
 * @throws {Error} When the suite yields no tasks.
 */
async function runBenchmark(opts) {
  const progress = opts.onProgress ?? (() => {});
  const nowFn = opts.now ?? (() => new Date().toISOString());

  const tasks = await opts.suite.loadTasks({ limit: opts.limit });
  if (tasks.length === 0) {
    throw new Error(`suite "${opts.suite.id}" produced no tasks (check the data directory)`);
  }

  const fingerprint = computeHarnessFingerprint({
    cliVersion: opts.cliVersion,
    toolNames: opts.toolNames,
    maxIterations: opts.config.maxIterations,
    yolo: true,
    subsetId: opts.suite.subsetId(tasks)
  });
  progress(
    `suite=${opts.suite.id} tasks=${tasks.length} cells=${opts.config.cells.length} fp=${fingerprint.hash}`
  );

  const sandbox = await createSandbox({
    baseDir: opts.sandboxBaseDir,
    maxIterations: opts.config.maxIterations,
    yolo: true
  });

  // One unit of work per (task, cell) pair, tasks outermost.
  const units = tasks.flatMap((task) => opts.config.cells.map((cell) => ({ task, cell })));

  // Prepare a workdir, run the agent, read its tool metrics, then grade.
  const runUnit = async ({ task, cell }) => {
    const workdir = await prepareWorkdir(
      sandbox,
      task.templateDir,
      task.id,
      cell.label,
      task.templateExclude
    );
    const run = await runWstack({
      nodeBin: opts.nodeBin,
      wstackEntry: opts.wstackEntry,
      homeDir: sandbox.homeDir,
      workdir,
      cell,
      prompt: task.prompt,
      timeoutMs: opts.config.timeoutMs,
      env: opts.env
    });
    const tools = await readToolMetrics({ homeDir: sandbox.homeDir, workdir });

    let grade;
    try {
      grade = await opts.grade({ workdir, task, cell, timeoutMs: opts.config.timeoutMs });
    } catch (err) {
      // A failing grader marks the task as failed instead of aborting the run.
      const message = err instanceof Error ? err.message : String(err);
      grade = { passed: false, detail: `grader error: ${message}` };
    }

    const verdict = grade.passed ? "PASS" : "fail";
    progress(
      ` ${cell.label} \xB7 ${task.id} \u2192 ${verdict} (${run.status}, ${run.iterations} it, $${run.costUsd.toFixed(3)})`
    );
    return { taskId: task.id, cell, run, grade, tools };
  };

  try {
    const results = await mapWithConcurrency(units, opts.config.concurrency, (unit) => runUnit(unit));
    const cells = aggregateAll(opts.config.cells, results);
    return {
      suite: opts.suite.id,
      finishedAt: nowFn(),
      fingerprint,
      cells,
      results
    };
  } finally {
    // The sandbox is removed on success and on failure unless asked to keep it.
    if (!opts.keepSandbox) {
      await cleanupSandbox(sandbox);
    }
  }
}
779
/**
 * Write the machine-readable artifacts of a run: `results.jsonl` (one result
 * per line) and `summary.json` (suite, finish time, fingerprint, cells).
 *
 * @param {string} outDir - Output directory (created when missing).
 * @param {object} report - Report returned by `runBenchmark`.
 * @returns {Promise<void>}
 */
async function writeJsonArtifacts(outDir, report) {
  await fs3.mkdir(outDir, { recursive: true });

  const serialized = report.results.map((r) => JSON.stringify(r));
  const jsonl = serialized.join("\n");
  // A trailing newline is added only when there is any content at all.
  const resultsBody = jsonl ? jsonl + "\n" : jsonl + "";
  await fs3.writeFile(path.join(outDir, "results.jsonl"), resultsBody, "utf8");

  const { suite, finishedAt, fingerprint, cells } = report;
  const summaryBody = JSON.stringify({ suite, finishedAt, fingerprint, cells }, null, 2);
  await fs3.writeFile(path.join(outDir, "summary.json"), summaryBody, "utf8");
}
791
/**
 * Load `summary.json` from an output directory.
 *
 * @param {string} outDir
 * @returns {Promise<*>} The decoded summary. Read and parse errors propagate.
 */
async function readSummary(outDir) {
  const summaryFile = path.join(outDir, "summary.json");
  const text = await fs3.readFile(summaryFile, "utf8");
  return JSON.parse(text);
}
795
+
796
+ // src/report/markdown.ts
797
/**
 * Render a benchmark report as a Markdown document with one table row per
 * cell, best pass rate first.
 *
 * @param {{suite: *, finishedAt: *, fingerprint: object, cells: Array<object>}} report
 * @returns {string} Markdown text ending with a newline.
 */
function renderMarkdownReport(report) {
  const { suite, finishedAt, fingerprint, cells } = report;

  const header = [
    `# WrongStack benchmark \u2014 ${suite}`,
    "",
    `**Harness:** ${fingerprintLabel(fingerprint)}`,
    `**Finished:** ${finishedAt}`,
    // The task count shown is that of the first cell (0 when there is none).
    `**Tasks/cell:** ${cells[0]?.taskCount ?? 0}`,
    "",
    "Grading is deterministic (the suite's own tests decide pass/fail \u2014 no LLM judge). The only variable across rows is the model; everything else is fixed by the harness fingerprint.",
    ""
  ];

  const ranked = cells.slice();
  ranked.sort((x, y) => y.passRate - x.passRate);
  const table = [
    "| Model | Pass@1 | Edit-apply | $/task | tok in/out | iters (p50) | wall (p50) | timeout | 429s |",
    "|---|---:|---:|---:|---:|---:|---:|---:|---:|",
    ...ranked.map((c) => renderRow(c))
  ];

  const footer = [
    "",
    `_Fingerprint hash: \`${fingerprint.hash}\` \xB7 tools: ${fingerprint.toolNames.length} \xB7 subset: \`${fingerprint.subsetId}\`_`,
    ""
  ];

  return [...header, ...table, ...footer].join("\n");
}
825
/**
 * Render one cell summary as a Markdown table row.
 *
 * @param {object} c - Summary produced by `aggregateCell`.
 * @returns {string} A row of the form `| label | ... |`.
 */
function renderRow(c) {
  // Pass rate: a dash when nothing was graded, and the graded/total counts
  // alongside the percentage when only part of the tasks were graded.
  let passCell;
  if (c.gradedCount === 0) {
    passCell = "\u2014";
  } else if (c.gradedCount < c.taskCount) {
    passCell = `${pct(c.passRate)} (${c.gradedCount}/${c.taskCount})`;
  } else {
    passCell = pct(c.passRate);
  }

  const columns = [
    c.cell.label,
    passCell,
    pct(c.editApplyRate),
    `$${c.avgCostUsd.toFixed(3)}`,
    `${fmtK(c.avgTokensIn)}/${fmtK(c.avgTokensOut)}`,
    String(Math.round(c.p50Iterations)),
    fmtMs(c.p50ElapsedMs),
    pct(c.timeoutRate),
    String(c.totalRateLimitRetries)
  ];
  // Empty first and last entries produce the leading and trailing pipes.
  return ["", ...columns, ""].join(" | ").trim();
}
841
/**
 * Format a ratio as a percentage with one decimal place.
 *
 * @param {number} x - Ratio, where 1 means 100%.
 * @returns {string}
 */
function pct(x) {
  const percent = x * 100;
  return percent.toFixed(1) + "%";
}
844
/**
 * Format a count compactly: values of 1000 and above become thousands with
 * one decimal and a `k` suffix, smaller values are rounded to an integer.
 *
 * @param {number} n
 * @returns {string}
 */
function fmtK(n) {
  if (n < 1e3) {
    return String(Math.round(n));
  }
  const thousands = n / 1e3;
  return thousands.toFixed(1) + "k";
}
848
/**
 * Format a duration: one second and above as seconds with one decimal,
 * shorter durations as whole milliseconds.
 *
 * @param {number} ms - Duration in milliseconds.
 * @returns {string}
 */
function fmtMs(ms) {
  if (ms < 1e3) {
    return String(Math.round(ms)) + "ms";
  }
  const seconds = ms / 1e3;
  return seconds.toFixed(1) + "s";
}
852
/**
 * Header line for a report; currently the fingerprint label itself.
 *
 * @param {object} fp - Harness fingerprint.
 * @returns {string}
 */
function reportHeaderLine(fp) {
  const label = fingerprintLabel(fp);
  return label;
}
855
// Per-language settings for the polyglot suite: the language's sub-directory
// in the polyglot dataset, an optional setup command, and a factory for the
// test command.
const LANGUAGE_RUNNERS = (() => {
  // Test-command factory that ignores the test file list and returns a fresh
  // `{command, args}` object on every call.
  const fixed = (command, args) => () => ({ command, args: [...args] });

  return {
    python: {
      dir: "python",
      // pytest is the only runner that is handed the test files explicitly.
      test: (tests) => ({ command: "python", args: ["-m", "pytest", "-q", ...tests] })
    },
    javascript: {
      dir: "javascript",
      setup: { command: "npm", args: ["install", "--no-audit", "--no-fund"] },
      test: fixed("npm", ["test"])
    },
    go: {
      dir: "go",
      test: fixed("go", ["test", "./..."])
    },
    rust: {
      dir: "rust",
      test: fixed("cargo", ["test", "--", "--include-ignored"])
    },
    cpp: {
      dir: "cpp",
      test: fixed("cmake", ["--build", "build", "--target", "test"])
    },
    java: {
      dir: "java",
      test: fixed("./gradlew", ["test"])
    }
  };
})();
882
/**
 * Create the "polyglot" suite, which loads exercises from
 * `<polyglotDir>/<language dir>/exercises/practice/<exercise>`.
 *
 * @param {object} opts
 * @param {string} opts.polyglotDir - Root of the polyglot dataset.
 * @param {string[]} [opts.languages] - Languages to load; defaults to every
 *   key of `LANGUAGE_RUNNERS`. Unknown languages are skipped.
 * @returns {{id: string, loadTasks: Function, subsetId: Function}}
 */
function createPolyglotSuite(opts) {
  // Names of the exercise directories, sorted for a stable task order.
  const listExercises = async (practiceDir) => {
    const entries = await fs3.readdir(practiceDir, { withFileTypes: true });
    const names = entries.filter((d) => d.isDirectory()).map((d) => d.name);
    return names.sort((a, b) => a.localeCompare(b));
  };

  return {
    id: "polyglot",

    async loadTasks({ limit }) {
      const collected = [];
      const languages = opts.languages ?? Object.keys(LANGUAGE_RUNNERS);
      for (const language of languages) {
        const runner = LANGUAGE_RUNNERS[language];
        if (!runner) continue;

        const practiceDir = path.join(opts.polyglotDir, runner.dir, "exercises", "practice");
        let names;
        try {
          names = await listExercises(practiceDir);
        } catch {
          // A language without a readable practice directory contributes nothing.
          continue;
        }

        for (const name of names) {
          const task = await loadExercise(path.join(practiceDir, name), language, runner, name);
          if (task) collected.push(task);
          // The limit is checked after each exercise, whether or not it loaded.
          if (limit !== void 0 && collected.length >= limit) return collected;
        }
      }
      return collected;
    },

    subsetId(tasks) {
      const ids = tasks.map((t) => t.id);
      ids.sort((a, b) => a.localeCompare(b));
      const digest = createHash("sha256").update(ids.join("\n")).digest("hex");
      return `polyglot:${digest.slice(0, 12)}`;
    }
  };
}
913
/**
 * Turn one exercise directory into a task description.
 *
 * @param {string} exerciseDir - Directory of the exercise.
 * @param {string} language - Language key; part of the task id and metadata.
 * @param {{test: Function, setup?: object}} runner - Entry of `LANGUAGE_RUNNERS`.
 * @param {string} slug3 - Exercise name; last segment of the task id.
 * @returns {Promise<object|undefined>} The task, or `undefined` when
 *   `.meta/config.json` is unreadable, not valid JSON, or lists no solution files.
 */
async function loadExercise(exerciseDir, language, runner, slug3) {
  const manifestFile = path.join(exerciseDir, ".meta", "config.json");
  let manifest;
  try {
    manifest = JSON.parse(await fs3.readFile(manifestFile, "utf8"));
  } catch {
    return void 0;
  }

  const solutionFiles = manifest.files?.solution ?? [];
  const testFiles = manifest.files?.test ?? [];
  if (solutionFiles.length === 0) {
    return void 0;
  }

  const instructions = await readInstructions(exerciseDir);
  return {
    id: `polyglot/${language}/${slug3}`,
    suite: "polyglot",
    prompt: buildPrompt(instructions, solutionFiles, testFiles),
    templateDir: exerciseDir,
    // Never copy the reference solution into the agent's workdir.
    templateExclude: [".meta"],
    meta: {
      language,
      solutionFiles,
      testFiles,
      testCommand: runner.test(testFiles),
      setupCommand: runner.setup
    }
  };
}
942
/**
 * Assemble an exercise's instructions from its `.docs` directory.
 *
 * `introduction.md`, `instructions.md` and `instructions.append.md` are read
 * in that order; files that are missing or empty after trimming are skipped.
 *
 * @param {string} exerciseDir
 * @returns {Promise<string>} The trimmed documents, separated by blank lines
 *   (an empty string when none exist).
 */
async function readInstructions(exerciseDir) {
  const docsDir = path.join(exerciseDir, ".docs");
  const fileNames = ["introduction.md", "instructions.md", "instructions.append.md"];

  const sections = [];
  for (const fileName of fileNames) {
    let text;
    try {
      text = await fs3.readFile(path.join(docsDir, fileName), "utf8");
    } catch {
      continue;
    }
    const trimmed = text.trim();
    if (trimmed) sections.push(trimmed);
  }
  return sections.join("\n\n");
}
953
/**
 * Compose the agent prompt for a polyglot exercise.
 *
 * @param {string} instructions - Exercise instructions.
 * @param {string[]} solutionFiles - Files the agent is allowed to edit.
 * @param {string[]} testFiles - Test files present in the workdir; when the
 *   list is empty the prompt mentions a hidden test suite instead.
 * @returns {string}
 */
function buildPrompt(instructions, solutionFiles, testFiles) {
  const editLine = `Implement the solution by editing **only** these file(s): ${solutionFiles.join(", ")}.`;

  let testLine;
  if (testFiles.length > 0) {
    testLine = `The test suite (${testFiles.join(", ")}) is already present and will be run to grade your work \u2014 do not modify the tests.`;
  } else {
    testLine = "A hidden test suite will be run to grade your work.";
  }

  const closingLine =
    "Make the tests pass. Use the available file tools to read and edit the files. When the implementation is complete, stop.";

  const lines = [instructions, "", "---", "", editLine, testLine, closingLine];
  return lines.join("\n");
}
964
// File name of the pinned SWE-bench subset that ships with the package.
const SUBSET_FILE = "swe-bench-verified-50.json";

/**
 * Locate the bundled subset file relative to this module.
 *
 * Two locations are probed in order: `../subsets/` and `../../subsets/`.
 *
 * @returns {Promise<string>} The first location that is accessible; when
 *   neither is, the first candidate is returned anyway.
 */
async function resolveDefaultSubset() {
  const candidates = ["../subsets/", "../../subsets/"].map((prefix) =>
    fileURLToPath(new URL(`${prefix}${SUBSET_FILE}`, import.meta.url))
  );

  for (const candidate of candidates) {
    const accessible = await fs3.access(candidate).then(
      () => true,
      () => false
    );
    if (accessible) return candidate;
  }
  return candidates[0];
}
979
/**
 * Load the list of instance ids from a subset file.
 *
 * @param {string} [subsetFile] - Path to the subset JSON; the bundled default
 *   (see `resolveDefaultSubset`) is used when omitted.
 * @returns {Promise<string[]>} The string entries of the file's `instances` array.
 * @throws {Error} When the file cannot be read or parsed, or when its content
 *   has no `instances` array. That includes JSON that is not an object at
 *   all, such as `null`.
 */
async function loadSubset(subsetFile) {
  const file = subsetFile ?? (await resolveDefaultSubset());
  const raw = await fs3.readFile(file, "utf8");
  const parsed = JSON.parse(raw);
  // Optional chaining keeps `null` and primitives on the descriptive error
  // path instead of failing on property access.
  const instances = parsed?.instances;
  if (!Array.isArray(instances)) {
    throw new Error(`subset file ${file} is missing an "instances" array`);
  }
  return instances.filter((x) => typeof x === "string");
}
988
/**
 * Create the "swebench" suite, which loads prepared instances from
 * `<datasetDir>/<instance id>/` (`instance.json` plus a `repo` checkout).
 *
 * @param {object} [opts]
 * @param {string} [opts.subsetFile] - Subset file; the bundled one by default.
 * @param {string} [opts.datasetDir] - Prepared dataset directory (required
 *   by `loadTasks`).
 * @returns {{id: string, loadTasks: Function, subsetId: Function}}
 */
function createSwebenchSuite(opts = {}) {
  return {
    id: "swebench",

    async loadTasks({ limit }) {
      // The subset is loaded first so the error below can report its size.
      const instanceIds = await loadSubset(opts.subsetFile);
      if (!opts.datasetDir) {
        const message = [
          "SWE-bench requires a prepared dataset directory (materialized repos).",
          "Run with `--dataset-dir <path>`; add `--docker` to grade inline, or omit it to export predictions.jsonl for the official harness.",
          `The pinned subset (${instanceIds.length} instances) is committed in packages/bench/subsets/swe-bench-verified-50.json. See packages/bench/README.md for the dataset-preparation steps.`
        ].join("\n");
        throw new Error(message);
      }

      const loaded = [];
      for (const id of instanceIds) {
        const instanceDir = path.join(opts.datasetDir, id);
        let meta;
        try {
          meta = JSON.parse(await fs3.readFile(path.join(instanceDir, "instance.json"), "utf8"));
        } catch {
          // Instances that were not prepared (or are unreadable) are skipped.
          continue;
        }

        const statement = typeof meta.problem_statement === "string" ? meta.problem_statement : "";
        loaded.push({
          id: `swebench/${id}`,
          suite: "swebench",
          prompt: buildPrompt2(statement),
          templateDir: path.join(instanceDir, "repo"),
          meta: {
            instanceId: id,
            instanceDir,
            image: meta["image"],
            failToPass: meta["FAIL_TO_PASS"] ?? [],
            passToPass: meta["PASS_TO_PASS"] ?? [],
            testPatch: meta["test_patch"]
          }
        });
        if (limit !== void 0 && loaded.length >= limit) break;
      }
      return loaded;
    },

    subsetId(tasks) {
      const ids = tasks.map((t) => t.id);
      ids.sort((a, b) => a.localeCompare(b));
      const digest = createHash("sha256").update(ids.join("\n")).digest("hex");
      return `swebench:${digest.slice(0, 12)}`;
    }
  };
}
1036
/**
 * Compose the agent prompt for a SWE-bench instance.
 *
 * @param {string} problemStatement - The issue text to resolve.
 * @returns {string} The statement followed by a separator and fixed instructions.
 */
function buildPrompt2(problemStatement) {
  const guidance = [
    "You are working in a checkout of the repository. Resolve the issue above by editing the",
    "source. Do not edit test files. Use the available tools to explore and modify the code.",
    "When the fix is complete, stop."
  ];
  const lines = [problemStatement, "", "---", ""];
  lines.push(...guidance);
  return lines.join("\n");
}
1047
+
1048
+ export { LANGUAGE_RUNNERS, aggregateAll, aggregateCell, cleanupSandbox, collectCellPredictions, computeHarnessFingerprint, createPolyglotSuite, createSandbox, createSwebenchSuite, execCommand, extractModelPatch, extractPatchPaths, filterPatchExcludingPaths, filterPatchSections, fingerprintLabel, gradePolyglot, gradeSwebench, loadBenchConfig, loadSubset, mapWithConcurrency, median, parseBenchConfig, parseResolvedIds, prepareWorkdir, readSummary, readToolMetrics, renderMarkdownReport, reportHeaderLine, runBenchmark, runWstack, writeInstancePrediction, writeJsonArtifacts, writePredictionsJsonl };
1049
+ //# sourceMappingURL=index.js.map
1050
+ //# sourceMappingURL=index.js.map