ada-agent 0.2.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +262 -263
- package/bench/README.md +88 -88
- package/bench/swebench.mjs +242 -242
- package/docs/architecture.md +163 -163
- package/docs/architecture.svg +73 -73
- package/docs/cloudflare.md +81 -81
- package/docs/connectors.md +49 -49
- package/docs/integrations.md +62 -62
- package/package.json +67 -65
- package/skills/aesthetic-direction/SKILL.md +24 -24
- package/skills/color-palette/SKILL.md +24 -24
- package/skills/component-library/SKILL.md +23 -23
- package/skills/dark-mode/SKILL.md +24 -24
- package/skills/dashboard-ui/SKILL.md +23 -23
- package/skills/design-system/SKILL.md +24 -24
- package/skills/design-tokens/SKILL.md +24 -24
- package/skills/empty-states/SKILL.md +23 -23
- package/skills/hero-section/SKILL.md +23 -23
- package/skills/micro-interactions/SKILL.md +23 -23
- package/skills/motion-design/SKILL.md +23 -23
- package/skills/page-transitions/SKILL.md +23 -23
- package/skills/pricing-page/SKILL.md +23 -23
- package/skills/scroll-animation/SKILL.md +23 -23
- package/skills/skeleton-loader/SKILL.md +23 -23
- package/skills/tailwind-theme/SKILL.md +24 -24
- package/skills/typography/SKILL.md +24 -24
- package/skills/ui-polish/SKILL.md +24 -24
- package/skills/ui-review/SKILL.md +24 -24
- package/skills/web-fonts/SKILL.md +24 -24
- package/src/client/autostart.ts +93 -0
- package/src/client/catalog.json +1 -1
- package/src/client/cli.ts +1275 -1262
- package/src/client/models-dev.ts +106 -106
- package/src/selfcheck.ts +404 -390
- package/src/server/config.ts +65 -65
- package/src/server/providers/openai-compat.ts +78 -78
- package/src/server/providers/registry.ts +32 -32
- package/src/server/router.ts +33 -33
- package/src/shared/types.ts +21 -21
package/bench/swebench.mjs
CHANGED
|
@@ -1,242 +1,242 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
// SWE-bench (Verified) prediction generator, driven by ada.
|
|
3
|
-
//
|
|
4
|
-
// This produces an **official-format** predictions.jsonl. It does NOT score — scoring is done by the
|
|
5
|
-
// official `swebench` harness in Docker (the only way to get correct, comparable numbers). See
|
|
6
|
-
// bench/README.md for the full flow (dataset, prereqs, the scoring command).
|
|
7
|
-
//
|
|
8
|
-
// For each instance: clone the task repo at its base commit into an isolated dir, hand ada the issue
|
|
9
|
-
// text (headless `ada -p --json`, auto-approve), then capture `git diff` as the model patch.
|
|
10
|
-
//
|
|
11
|
-
// node bench/swebench.mjs --dataset swe-bench-verified.jsonl --model claude-opus-4-8 \
|
|
12
|
-
// --out runs/opus [--limit 5] [--instances id1,id2] [--concurrency 2] [--timeout 1200]
|
|
13
|
-
// node bench/swebench.mjs --selftest # offline checks of the pure helpers
|
|
14
|
-
//
|
|
15
|
-
// Prereqs: a running `ada-server` with provider keys, `git`, network (clones the task repos).
|
|
16
|
-
|
|
17
|
-
import { spawn, spawnSync } from "node:child_process";
|
|
18
|
-
import { appendFileSync, existsSync, mkdirSync, readFileSync, rmSync } from "node:fs";
|
|
19
|
-
import { homedir } from "node:os";
|
|
20
|
-
import { dirname, join, resolve } from "node:path";
|
|
21
|
-
import { fileURLToPath } from "node:url";
|
|
22
|
-
import assert from "node:assert/strict";
|
|
23
|
-
|
|
24
|
-
const HERE = dirname(fileURLToPath(import.meta.url));
|
|
25
|
-
const ADA_BIN = resolve(HERE, "..", "bin", "ada.mjs");
|
|
26
|
-
const CACHE = process.env.ADA_SWEBENCH_CACHE || join(homedir(), ".cache", "ada-swebench");
|
|
27
|
-
|
|
28
|
-
// ---------- pure helpers (covered by --selftest) ----------
|
|
29
|
-
|
|
30
|
-
export function parseArgs(argv) {
|
|
31
|
-
const f = { concurrency: 2, timeout: 1200, out: "runs/ada" };
|
|
32
|
-
for (let i = 0; i < argv.length; i++) {
|
|
33
|
-
const a = argv[i];
|
|
34
|
-
if (a === "--selftest") f.selftest = true;
|
|
35
|
-
else if (a === "--dataset") f.dataset = argv[++i];
|
|
36
|
-
else if (a === "--model") f.model = argv[++i];
|
|
37
|
-
else if (a === "--out") f.out = argv[++i];
|
|
38
|
-
else if (a === "--limit") f.limit = Number(argv[++i]);
|
|
39
|
-
else if (a === "--instances") f.instances = String(argv[++i]).split(",").map((s) => s.trim()).filter(Boolean);
|
|
40
|
-
else if (a === "--concurrency") f.concurrency = Math.max(1, Number(argv[++i]) || 1);
|
|
41
|
-
else if (a === "--timeout") f.timeout = Number(argv[++i]);
|
|
42
|
-
else if (a === "--ada") f.ada = argv[++i];
|
|
43
|
-
}
|
|
44
|
-
return f;
|
|
45
|
-
}
|
|
46
|
-
|
|
47
|
-
export function buildPrompt(repo, problemStatement) {
|
|
48
|
-
return `The repository \`${repo}\` is checked out in the current directory at the commit where this issue was filed. Resolve the issue by editing the source code.
|
|
49
|
-
|
|
50
|
-
ISSUE:
|
|
51
|
-
${problemStatement}
|
|
52
|
-
|
|
53
|
-
Guidelines:
|
|
54
|
-
- Make the smallest change that fixes the issue.
|
|
55
|
-
- Edit only library/source files. Do NOT add or modify tests — the grader supplies its own.
|
|
56
|
-
- When the fix is complete and self-consistent, stop.`;
|
|
57
|
-
}
|
|
58
|
-
|
|
59
|
-
export function predictionLine(instanceId, model, patch) {
|
|
60
|
-
return JSON.stringify({ instance_id: instanceId, model_name_or_path: model, model_patch: patch });
|
|
61
|
-
}
|
|
62
|
-
|
|
63
|
-
function loadJsonl(path) {
|
|
64
|
-
return readFileSync(path, "utf8")
|
|
65
|
-
.split("\n")
|
|
66
|
-
.map((l) => l.trim())
|
|
67
|
-
.filter(Boolean)
|
|
68
|
-
.map((l) => JSON.parse(l));
|
|
69
|
-
}
|
|
70
|
-
|
|
71
|
-
export function doneIds(predPath) {
|
|
72
|
-
if (!existsSync(predPath)) return new Set();
|
|
73
|
-
const ids = new Set();
|
|
74
|
-
for (const row of loadJsonl(predPath)) if (row.instance_id) ids.add(row.instance_id);
|
|
75
|
-
return ids;
|
|
76
|
-
}
|
|
77
|
-
|
|
78
|
-
export function selectInstances(all, { instances, limit }) {
|
|
79
|
-
let xs = all;
|
|
80
|
-
if (instances?.length) {
|
|
81
|
-
const want = new Set(instances);
|
|
82
|
-
xs = xs.filter((x) => want.has(x.instance_id));
|
|
83
|
-
}
|
|
84
|
-
if (limit && limit > 0) xs = xs.slice(0, limit);
|
|
85
|
-
return xs;
|
|
86
|
-
}
|
|
87
|
-
|
|
88
|
-
// ---------- git + ada (impure) ----------
|
|
89
|
-
|
|
90
|
-
const cloneLocks = new Map(); // repo → in-flight clone promise (don't clone the same repo twice)
|
|
91
|
-
function git(args, opts = {}) {
|
|
92
|
-
return spawnSync("git", args, { encoding: "utf8", ...opts });
|
|
93
|
-
}
|
|
94
|
-
|
|
95
|
-
async function ensureCache(repo) {
|
|
96
|
-
const bare = join(CACHE, `${repo.replace("/", "__")}.git`);
|
|
97
|
-
if (existsSync(bare)) return bare;
|
|
98
|
-
if (!cloneLocks.has(repo)) {
|
|
99
|
-
mkdirSync(CACHE, { recursive: true });
|
|
100
|
-
cloneLocks.set(
|
|
101
|
-
repo,
|
|
102
|
-
new Promise((res, rej) => {
|
|
103
|
-
const p = spawn("git", ["clone", "--bare", `https://github.com/${repo}.git`, bare], { stdio: "inherit" });
|
|
104
|
-
p.on("exit", (code) => (code === 0 ? res(bare) : rej(new Error(`clone ${repo} failed (${code})`))));
|
|
105
|
-
p.on("error", rej);
|
|
106
|
-
}),
|
|
107
|
-
);
|
|
108
|
-
}
|
|
109
|
-
return cloneLocks.get(repo);
|
|
110
|
-
}
|
|
111
|
-
|
|
112
|
-
async function prepInstance(repo, baseCommit, dir) {
|
|
113
|
-
const bare = await ensureCache(repo);
|
|
114
|
-
rmSync(dir, { recursive: true, force: true });
|
|
115
|
-
// --shared: instance dirs reuse the cache's objects (cheap, isolated working trees). Safe because
|
|
116
|
-
// we delete each dir before the cache is ever pruned.
|
|
117
|
-
let r = git(["clone", "--shared", "--no-checkout", bare, dir]);
|
|
118
|
-
if (r.status !== 0) throw new Error(`clone --shared failed: ${r.stderr}`);
|
|
119
|
-
r = git(["-C", dir, "checkout", "--detach", baseCommit]);
|
|
120
|
-
if (r.status !== 0) throw new Error(`checkout ${baseCommit.slice(0, 8)} failed: ${r.stderr}`);
|
|
121
|
-
}
|
|
122
|
-
|
|
123
|
-
function diffPatch(dir) {
|
|
124
|
-
git(["-C", dir, "add", "-A"]);
|
|
125
|
-
const r = git(["-C", dir, "diff", "--cached", "--no-color"]);
|
|
126
|
-
return r.stdout ?? "";
|
|
127
|
-
}
|
|
128
|
-
|
|
129
|
-
function runAda(adaBin, prompt, cwd, model, timeoutMs) {
|
|
130
|
-
return new Promise((res) => {
|
|
131
|
-
const child = spawn(process.execPath, [adaBin, "-p", prompt, "--model", model, "--json"], { cwd, env: process.env });
|
|
132
|
-
let out = "";
|
|
133
|
-
let err = "";
|
|
134
|
-
let timedOut = false;
|
|
135
|
-
const timer = setTimeout(() => {
|
|
136
|
-
timedOut = true;
|
|
137
|
-
child.kill("SIGKILL");
|
|
138
|
-
}, timeoutMs);
|
|
139
|
-
child.stdout.on("data", (d) => (out += d));
|
|
140
|
-
child.stderr.on("data", (d) => (err += d));
|
|
141
|
-
child.on("exit", (code) => {
|
|
142
|
-
clearTimeout(timer);
|
|
143
|
-
let usage = "";
|
|
144
|
-
const line = out.split("\n").reverse().find((l) => l.trim().startsWith("{"));
|
|
145
|
-
try {
|
|
146
|
-
usage = line ? JSON.parse(line).usage ?? "" : "";
|
|
147
|
-
} catch {
|
|
148
|
-
/* ignore */
|
|
149
|
-
}
|
|
150
|
-
res({ code, timedOut, usage, err: err.slice(-500) });
|
|
151
|
-
});
|
|
152
|
-
child.on("error", (e) => {
|
|
153
|
-
clearTimeout(timer);
|
|
154
|
-
res({ code: -1, timedOut, usage: "", err: String(e) });
|
|
155
|
-
});
|
|
156
|
-
});
|
|
157
|
-
}
|
|
158
|
-
|
|
159
|
-
// ---------- run ----------
|
|
160
|
-
|
|
161
|
-
async function pool(items, n, worker) {
|
|
162
|
-
const q = [...items.entries()];
|
|
163
|
-
const runners = Array.from({ length: Math.min(n, q.length) }, async () => {
|
|
164
|
-
for (;;) {
|
|
165
|
-
const next = q.shift();
|
|
166
|
-
if (!next) return;
|
|
167
|
-
await worker(next[1], next[0]);
|
|
168
|
-
}
|
|
169
|
-
});
|
|
170
|
-
await Promise.all(runners);
|
|
171
|
-
}
|
|
172
|
-
|
|
173
|
-
async function main(f) {
|
|
174
|
-
if (!f.dataset || !f.model) {
|
|
175
|
-
console.error("usage: node bench/swebench.mjs --dataset <verified.jsonl> --model <id> [--out dir] [--limit N] [--instances a,b] [--concurrency 2] [--timeout 1200]");
|
|
176
|
-
process.exit(2);
|
|
177
|
-
}
|
|
178
|
-
const adaBin = f.ada || ADA_BIN;
|
|
179
|
-
const outDir = resolve(f.out);
|
|
180
|
-
mkdirSync(outDir, { recursive: true });
|
|
181
|
-
const predPath = join(outDir, "predictions.jsonl");
|
|
182
|
-
const metaPath = join(outDir, "meta.jsonl");
|
|
183
|
-
|
|
184
|
-
const already = doneIds(predPath);
|
|
185
|
-
const todo = selectInstances(loadJsonl(f.dataset), f).filter((x) => !already.has(x.instance_id));
|
|
186
|
-
console.error(`ada SWE-bench · model=${f.model} · ${todo.length} instances (${already.size} already done) · concurrency=${f.concurrency} → ${outDir}`);
|
|
187
|
-
|
|
188
|
-
let done = 0;
|
|
189
|
-
let nonEmpty = 0;
|
|
190
|
-
await pool(todo, f.concurrency, async (inst) => {
|
|
191
|
-
const dir = join(CACHE, "wt", inst.instance_id);
|
|
192
|
-
const t0 = Date.now();
|
|
193
|
-
let patch = "";
|
|
194
|
-
let note = "";
|
|
195
|
-
try {
|
|
196
|
-
await prepInstance(inst.repo, inst.base_commit, dir);
|
|
197
|
-
const r = await runAda(adaBin, buildPrompt(inst.repo, inst.problem_statement), dir, f.model, f.timeout * 1000);
|
|
198
|
-
patch = diffPatch(dir);
|
|
199
|
-
note = r.timedOut ? "timeout" : r.code === 0 ? `usage:${r.usage}` : `exit ${r.code}: ${r.err}`;
|
|
200
|
-
} catch (e) {
|
|
201
|
-
note = `error: ${e instanceof Error ? e.message : e}`;
|
|
202
|
-
} finally {
|
|
203
|
-
rmSync(dir, { recursive: true, force: true });
|
|
204
|
-
}
|
|
205
|
-
appendFileSync(predPath, `${predictionLine(inst.instance_id, f.model, patch)}\n`);
|
|
206
|
-
appendFileSync(metaPath, `${JSON.stringify({ instance_id: inst.instance_id, seconds: Math.round((Date.now() - t0) / 1000), patch_bytes: patch.length, note })}\n`);
|
|
207
|
-
done++;
|
|
208
|
-
if (patch.trim()) nonEmpty++;
|
|
209
|
-
console.error(` [${done}/${todo.length}] ${inst.instance_id} · ${patch.length}B patch · ${note.slice(0, 60)}`);
|
|
210
|
-
});
|
|
211
|
-
|
|
212
|
-
console.error(`\nwrote ${predPath}\n${done} run, ${nonEmpty} produced a non-empty patch. Score with the official harness — see bench/README.md.`);
|
|
213
|
-
}
|
|
214
|
-
|
|
215
|
-
// ---------- selftest ----------
|
|
216
|
-
|
|
217
|
-
function runSelftest() {
|
|
218
|
-
const a = parseArgs(["--dataset", "d.jsonl", "--model", "m", "--limit", "3", "--instances", "x,y", "--concurrency", "4"]);
|
|
219
|
-
assert.equal(a.dataset, "d.jsonl");
|
|
220
|
-
assert.equal(a.model, "m");
|
|
221
|
-
assert.equal(a.limit, 3);
|
|
222
|
-
assert.deepEqual(a.instances, ["x", "y"]);
|
|
223
|
-
assert.equal(a.concurrency, 4);
|
|
224
|
-
|
|
225
|
-
const p = buildPrompt("django/django", "Boom on empty queryset.");
|
|
226
|
-
assert.ok(p.includes("django/django") && p.includes("Boom on empty queryset.") && /do not add or modify tests/i.test(p), "prompt includes repo, issue, no-tests rule");
|
|
227
|
-
|
|
228
|
-
const line = predictionLine("django__django-123", "claude-opus-4-8", "diff --git a b");
|
|
229
|
-
const obj = JSON.parse(line);
|
|
230
|
-
assert.deepEqual(Object.keys(obj).sort(), ["instance_id", "model_name_or_path", "model_patch"]);
|
|
231
|
-
assert.equal(obj.instance_id, "django__django-123");
|
|
232
|
-
|
|
233
|
-
const all = [{ instance_id: "a" }, { instance_id: "b" }, { instance_id: "c" }];
|
|
234
|
-
assert.deepEqual(selectInstances(all, { instances: ["b", "c"], limit: 1 }).map((x) => x.instance_id), ["b"]);
|
|
235
|
-
assert.deepEqual(selectInstances(all, { limit: 2 }).map((x) => x.instance_id), ["a", "b"]);
|
|
236
|
-
|
|
237
|
-
console.log("swebench selftest OK");
|
|
238
|
-
}
|
|
239
|
-
|
|
240
|
-
const flags = parseArgs(process.argv.slice(2));
|
|
241
|
-
if (flags.selftest) runSelftest();
|
|
242
|
-
else await main(flags);
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
// SWE-bench (Verified) prediction generator, driven by ada.
|
|
3
|
+
//
|
|
4
|
+
// This produces an **official-format** predictions.jsonl. It does NOT score — scoring is done by the
|
|
5
|
+
// official `swebench` harness in Docker (the only way to get correct, comparable numbers). See
|
|
6
|
+
// bench/README.md for the full flow (dataset, prereqs, the scoring command).
|
|
7
|
+
//
|
|
8
|
+
// For each instance: clone the task repo at its base commit into an isolated dir, hand ada the issue
|
|
9
|
+
// text (headless `ada -p --json`, auto-approve), then capture `git diff` as the model patch.
|
|
10
|
+
//
|
|
11
|
+
// node bench/swebench.mjs --dataset swe-bench-verified.jsonl --model claude-opus-4-8 \
|
|
12
|
+
// --out runs/opus [--limit 5] [--instances id1,id2] [--concurrency 2] [--timeout 1200]
|
|
13
|
+
// node bench/swebench.mjs --selftest # offline checks of the pure helpers
|
|
14
|
+
//
|
|
15
|
+
// Prereqs: a running `ada-server` with provider keys, `git`, network (clones the task repos).
|
|
16
|
+
|
|
17
|
+
import { spawn, spawnSync } from "node:child_process";
|
|
18
|
+
import { appendFileSync, existsSync, mkdirSync, readFileSync, rmSync } from "node:fs";
|
|
19
|
+
import { homedir } from "node:os";
|
|
20
|
+
import { dirname, join, resolve } from "node:path";
|
|
21
|
+
import { fileURLToPath } from "node:url";
|
|
22
|
+
import assert from "node:assert/strict";
|
|
23
|
+
|
|
24
|
+
const HERE = dirname(fileURLToPath(import.meta.url));
|
|
25
|
+
const ADA_BIN = resolve(HERE, "..", "bin", "ada.mjs");
|
|
26
|
+
const CACHE = process.env.ADA_SWEBENCH_CACHE || join(homedir(), ".cache", "ada-swebench");
|
|
27
|
+
|
|
28
|
+
// ---------- pure helpers (covered by --selftest) ----------
|
|
29
|
+
|
|
30
|
+
/**
 * Parse the benchmark's command-line flags into an options object.
 *
 * Defaults: `concurrency` 2, `timeout` 1200 (seconds), `out` "runs/ada".
 * Unknown arguments are ignored.
 *
 * @param {string[]} argv - Arguments after the script name.
 * @returns {object} Parsed flags (`selftest`, `dataset`, `model`, `out`, `limit`,
 *   `instances`, `concurrency`, `timeout`, `ada` — only those that were supplied or defaulted).
 */
export function parseArgs(argv) {
  // setTimeout treats NaN, values < 1 and values > 2^31-1 ms as a 1 ms delay, which would
  // kill every run immediately. Keep the timeout (in seconds) inside the usable range.
  const MAX_TIMEOUT_SECONDS = Math.floor(2147483647 / 1000);
  const f = { concurrency: 2, timeout: 1200, out: "runs/ada" };

  // Positive finite number, or undefined when the raw value is missing/invalid.
  const positive = (raw) => {
    const n = Number(raw);
    return Number.isFinite(n) && n > 0 ? n : undefined;
  };

  for (let i = 0; i < argv.length; i++) {
    switch (argv[i]) {
      case "--selftest":
        f.selftest = true;
        break;
      case "--dataset":
        f.dataset = argv[++i];
        break;
      case "--model":
        f.model = argv[++i];
        break;
      case "--out":
        f.out = argv[++i];
        break;
      case "--limit": {
        // An invalid or non-positive limit means "no limit", so leave it unset.
        const limit = positive(argv[++i]);
        if (limit !== undefined) f.limit = limit;
        break;
      }
      case "--instances":
        f.instances = String(argv[++i])
          .split(",")
          .map((s) => s.trim())
          .filter(Boolean);
        break;
      case "--concurrency":
        // Whole number of workers, at least one.
        f.concurrency = Math.max(1, Math.floor(Number(argv[++i])) || 1);
        break;
      case "--timeout": {
        // Fall back to the default rather than passing an unusable delay to setTimeout.
        const seconds = positive(argv[++i]);
        if (seconds !== undefined) f.timeout = Math.min(seconds, MAX_TIMEOUT_SECONDS);
        break;
      }
      case "--ada":
        f.ada = argv[++i];
        break;
      default:
        break;
    }
  }
  return f;
}
|
|
46
|
+
|
|
47
|
+
/**
 * Build the task prompt handed to ada for one instance.
 *
 * @param {string} repo - Repository name, e.g. "owner/name".
 * @param {string} problemStatement - The issue text to resolve.
 * @returns {string} Prompt containing the repo, the issue and the editing guidelines.
 */
export function buildPrompt(repo, problemStatement) {
  const intro = `The repository \`${repo}\` is checked out in the current directory at the commit where this issue was filed. Resolve the issue by editing the source code.`;
  const guidelines = [
    "Guidelines:",
    "- Make the smallest change that fixes the issue.",
    "- Edit only library/source files. Do NOT add or modify tests — the grader supplies its own.",
    "- When the fix is complete and self-consistent, stop.",
  ];
  // Interpolate the issue through a template so it is stringified exactly as before.
  const sections = [intro, "", "ISSUE:", `${problemStatement}`, "", ...guidelines];
  return sections.join("\n");
}
|
|
58
|
+
|
|
59
|
+
/**
 * Serialise one prediction as a single JSON line (no trailing newline).
 *
 * @param {string} instanceId - Value for `instance_id`.
 * @param {string} model - Value for `model_name_or_path`.
 * @param {string} patch - Value for `model_patch`.
 * @returns {string} JSON text with exactly those three keys, in that order.
 */
export function predictionLine(instanceId, model, patch) {
  const record = {
    instance_id: instanceId,
    model_name_or_path: model,
    model_patch: patch,
  };
  return JSON.stringify(record);
}
|
|
62
|
+
|
|
63
|
+
/**
 * Read a JSON Lines file: one JSON value per non-blank line.
 *
 * Lines are trimmed first, so CRLF endings and surrounding whitespace are tolerated.
 *
 * @param {string} path - File to read (UTF-8).
 * @returns {unknown[]} Parsed values in file order.
 * @throws {SyntaxError} When a line is not valid JSON; the message names the file and
 *   1-based line number, and the original parse error is attached as `cause`.
 */
function loadJsonl(path) {
  const rows = [];
  const lines = readFileSync(path, "utf8").split("\n");
  for (let i = 0; i < lines.length; i++) {
    const text = lines[i].trim();
    if (!text) continue;
    try {
      rows.push(JSON.parse(text));
    } catch (cause) {
      // A bare JSON.parse error gives no hint which row of a large dataset is broken.
      const detail = cause instanceof Error ? cause.message : String(cause);
      throw new SyntaxError(`${path}:${i + 1}: invalid JSON (${detail})`, { cause });
    }
  }
  return rows;
}
|
|
70
|
+
|
|
71
|
+
/**
 * Collect the instance ids already present in a predictions file.
 *
 * @param {string} predPath - Path to a predictions JSONL file; it may not exist yet.
 * @returns {Set<string>} Every truthy `instance_id` found (empty when the file is absent).
 */
export function doneIds(predPath) {
  const seen = new Set();
  if (existsSync(predPath)) {
    loadJsonl(predPath).forEach((row) => {
      if (row.instance_id) seen.add(row.instance_id);
    });
  }
  return seen;
}
|
|
77
|
+
|
|
78
|
+
/**
 * Narrow the dataset to the requested instances, then cap the count.
 *
 * @param {object[]} all - Dataset rows, each with an `instance_id`.
 * @param {object} opts
 * @param {string[]} [opts.instances] - Ids to keep; empty or absent keeps everything.
 * @param {number} [opts.limit] - Maximum rows to return; falsy or non-positive means no cap.
 * @returns {object[]} Selected rows in dataset order (`all` itself when nothing applies).
 */
export function selectInstances(all, { instances, limit }) {
  const wanted = instances?.length ? new Set(instances) : null;
  const matching = wanted ? all.filter((row) => wanted.has(row.instance_id)) : all;
  return limit && limit > 0 ? matching.slice(0, limit) : matching;
}
|
|
87
|
+
|
|
88
|
+
// ---------- git + ada (impure) ----------
|
|
89
|
+
|
|
90
|
+
const cloneLocks = new Map(); // repo → in-flight clone promise (don't clone the same repo twice)
|
|
91
|
+
// spawnSync caps captured output at 1 MiB by default and truncates beyond that; diffs of
// whole working trees can be larger, so allow considerably more.
const GIT_MAX_BUFFER = 256 * 1024 * 1024;

/**
 * Run `git` synchronously and capture its output as UTF-8 text.
 *
 * @param {string[]} args - Arguments passed to git.
 * @param {object} [opts] - Extra spawnSync options; these override the defaults set here.
 * @returns {object} The spawnSync result (`status`, `stdout`, `stderr`, `error`, ...).
 */
function git(args, opts = {}) {
  return spawnSync("git", args, { encoding: "utf8", maxBuffer: GIT_MAX_BUFFER, ...opts });
}
|
|
94
|
+
|
|
95
|
+
/**
 * Make sure a bare clone of `repo` exists in the cache directory and return its path.
 *
 * Concurrent callers for the same repo share one clone via `cloneLocks`.
 *
 * @param {string} repo - GitHub repository as "owner/name".
 * @returns {Promise<string>} Path of the bare clone.
 * @throws {Error} (as a rejection) when git exits non-zero or cannot be started.
 */
async function ensureCache(repo) {
  const bare = join(CACHE, `${repo.replace("/", "__")}.git`);

  // Consult the lock BEFORE the filesystem: git creates the target directory as soon as the
  // clone starts, so an existence check alone would hand out a half-cloned repository.
  const inFlight = cloneLocks.get(repo);
  if (inFlight) return inFlight;
  if (existsSync(bare)) return bare;

  mkdirSync(CACHE, { recursive: true });
  const cloning = new Promise((res, rej) => {
    const fail = (reason) => {
      // Remove any partial clone so a later run does not mistake it for a complete cache.
      try {
        rmSync(bare, { recursive: true, force: true });
      } catch {
        /* best effort — the clone failure below is the error worth reporting */
      }
      rej(reason);
    };
    const p = spawn("git", ["clone", "--bare", `https://github.com/${repo}.git`, bare], { stdio: "inherit" });
    p.on("exit", (code) => (code === 0 ? res(bare) : fail(new Error(`clone ${repo} failed (${code})`))));
    p.on("error", fail);
  });
  cloneLocks.set(repo, cloning);
  return cloning;
}
|
|
111
|
+
|
|
112
|
+
/**
 * Create a fresh working tree for one instance at its base commit.
 *
 * @param {string} repo - Repository as "owner/name".
 * @param {string} baseCommit - Commit to check out (detached).
 * @param {string} dir - Working-tree directory; any existing content is removed first.
 * @returns {Promise<void>}
 * @throws {Error} When the shared clone or the checkout exits non-zero.
 */
async function prepInstance(repo, baseCommit, dir) {
  const cachePath = await ensureCache(repo);
  rmSync(dir, { recursive: true, force: true });

  // --shared makes the working tree borrow the cache's object store instead of copying it,
  // which keeps per-instance clones cheap while each still has its own working tree.
  const cloned = git(["clone", "--shared", "--no-checkout", cachePath, dir]);
  if (cloned.status !== 0) {
    throw new Error(`clone --shared failed: ${cloned.stderr}`);
  }

  const checkedOut = git(["-C", dir, "checkout", "--detach", baseCommit]);
  if (checkedOut.status !== 0) {
    throw new Error(`checkout ${baseCommit.slice(0, 8)} failed: ${checkedOut.stderr}`);
  }
}
|
|
122
|
+
|
|
123
|
+
/**
 * Capture every change in a working tree (including new files) as a unified diff.
 *
 * @param {string} dir - Working tree to inspect.
 * @returns {string} Output of `git diff --cached` after staging everything; "" when unchanged.
 * @throws {Error} When staging or diffing fails, so a truncated or missing diff is never
 *   mistaken for a genuine (possibly empty) patch.
 */
function diffPatch(dir) {
  // Describe a failed git invocation, or return undefined when it succeeded.
  const failure = (result) => {
    if (result.error) return result.error.message;
    if (result.status !== 0) return `exit ${result.status}: ${result.stderr ?? ""}`;
    return undefined;
  };

  // Stage everything first so that newly created files show up in the diff.
  const staged = git(["-C", dir, "add", "-A"]);
  const addProblem = failure(staged);
  if (addProblem !== undefined) throw new Error(`git add failed: ${addProblem}`);

  const diff = git(["-C", dir, "diff", "--cached", "--no-color"]);
  const diffProblem = failure(diff);
  if (diffProblem !== undefined) throw new Error(`git diff failed: ${diffProblem}`);

  return diff.stdout ?? "";
}
|
|
128
|
+
|
|
129
|
+
/**
 * Run ada headlessly on one prompt and summarise the outcome.
 *
 * The child is started as `node <adaBin> -p <prompt> --model <model> --json` in `cwd` and is
 * killed with SIGKILL once `timeoutMs` elapses. The returned promise never rejects.
 *
 * @param {string} adaBin - Path to the ada entry script.
 * @param {string} prompt - Prompt text passed via `-p`.
 * @param {string} cwd - Working directory for the child.
 * @param {string} model - Model id passed via `--model`.
 * @param {number} timeoutMs - Kill deadline in milliseconds.
 * @returns {Promise<{code: number|null, timedOut: boolean, usage: unknown, err: string}>}
 *   `code` is the exit code (-1 when the process could not be started), `usage` is the
 *   `usage` field of the last stdout line that starts with "{" ("" when absent or unparsable),
 *   and `err` is the last 500 characters of stderr (or the spawn error).
 */
function runAda(adaBin, prompt, cwd, model, timeoutMs) {
  // How long to wait for stdio to drain after the process has exited.
  const DRAIN_GRACE_MS = 1000;

  return new Promise((res) => {
    const child = spawn(process.execPath, [adaBin, "-p", prompt, "--model", model, "--json"], { cwd, env: process.env });
    let out = "";
    let err = "";
    let timedOut = false;
    let settled = false;
    let grace;

    // Decode as streams so a multi-byte character split across chunks is not corrupted.
    child.stdout.setEncoding("utf8");
    child.stderr.setEncoding("utf8");

    const timer = setTimeout(() => {
      timedOut = true;
      child.kill("SIGKILL");
    }, timeoutMs);

    const finish = (result) => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      clearTimeout(grace);
      res(result);
    };

    const summarize = (code) => {
      let usage = "";
      const line = out.split("\n").reverse().find((l) => l.trim().startsWith("{"));
      try {
        usage = line ? JSON.parse(line).usage ?? "" : "";
      } catch {
        /* the last "{" line was not JSON — report no usage */
      }
      return { code, timedOut, usage, err: err.slice(-500) };
    };

    child.stdout.on("data", (d) => (out += d));
    child.stderr.on("data", (d) => (err += d));

    // "exit" can fire while stdout still holds unread data, which would lose the trailing
    // usage line. Prefer "close" (stdio fully drained), but do not wait forever: a
    // descendant that inherited the pipes could keep them open indefinitely.
    child.on("exit", (code) => {
      clearTimeout(timer);
      grace = setTimeout(() => finish(summarize(code)), DRAIN_GRACE_MS);
    });
    child.on("close", (code) => finish(summarize(code)));
    child.on("error", (e) => finish({ code: -1, timedOut, usage: "", err: String(e) }));
  });
}
|
|
158
|
+
|
|
159
|
+
// ---------- run ----------
|
|
160
|
+
|
|
161
|
+
/**
 * Run `worker` over every item with at most `n` calls in flight.
 *
 * @param {Array} items - Work items; a snapshot is taken up front.
 * @param {number} n - Maximum number of concurrent workers.
 * @param {(item: any, index: number) => Promise<void>} worker - Called as `worker(item, index)`.
 * @returns {Promise<void>} Resolves when all items are processed; rejects if a worker rejects.
 */
async function pool(items, n, worker) {
  const pending = Array.from(items.entries());
  let cursor = 0;

  // Each lane keeps claiming the next unclaimed entry until none are left.
  const drain = async () => {
    while (cursor < pending.length) {
      const [index, item] = pending[cursor++];
      await worker(item, index);
    }
  };

  const lanes = Math.min(n, pending.length);
  await Promise.all(Array.from({ length: lanes }, () => drain()));
}
|
|
172
|
+
|
|
173
|
+
/**
 * Generate predictions for every selected instance that is not already recorded.
 *
 * For each instance: prepare a working tree, run ada on the issue, capture the diff, then
 * append one row to `predictions.jsonl` and one to `meta.jsonl` in the output directory.
 * A row is written even when a step fails (with an empty patch and an explanatory note).
 * Exits the process with status 2 when `dataset` or `model` is missing.
 *
 * @param {object} f - Flags as produced by `parseArgs`.
 * @returns {Promise<void>}
 */
async function main(f) {
  if (!f.dataset || !f.model) {
    console.error("usage: node bench/swebench.mjs --dataset <verified.jsonl> --model <id> [--out dir] [--limit N] [--instances a,b] [--concurrency 2] [--timeout 1200]");
    process.exit(2);
  }
  const adaBin = f.ada || ADA_BIN;
  const outDir = resolve(f.out);
  mkdirSync(outDir, { recursive: true });
  const predPath = join(outDir, "predictions.jsonl");
  const metaPath = join(outDir, "meta.jsonl");

  // Skip instances that already have a row, so an interrupted run can be resumed.
  const already = doneIds(predPath);
  const todo = selectInstances(loadJsonl(f.dataset), f).filter((x) => !already.has(x.instance_id));
  console.error(`ada SWE-bench · model=${f.model} · ${todo.length} instances (${already.size} already done) · concurrency=${f.concurrency} → ${outDir}`);

  let done = 0;
  let nonEmpty = 0;
  await pool(todo, f.concurrency, async (inst) => {
    const dir = join(CACHE, "wt", inst.instance_id);
    const t0 = Date.now();
    let patch = "";
    let note = "";
    try {
      await prepInstance(inst.repo, inst.base_commit, dir);
      const r = await runAda(adaBin, buildPrompt(inst.repo, inst.problem_statement), dir, f.model, f.timeout * 1000);
      // The diff is captured even after a timeout or non-zero exit: partial work may still help.
      patch = diffPatch(dir);
      // `usage` comes from parsed JSON and may not be a string; avoid "[object Object]".
      const usageText = typeof r.usage === "string" ? r.usage : JSON.stringify(r.usage);
      note = r.timedOut ? "timeout" : r.code === 0 ? `usage:${usageText}` : `exit ${r.code}: ${r.err}`;
    } catch (e) {
      note = `error: ${e instanceof Error ? e.message : e}`;
    } finally {
      rmSync(dir, { recursive: true, force: true });
    }
    // Real UTF-8 byte count; String.length counts UTF-16 code units and under-reports
    // patches that contain non-ASCII text.
    const patchBytes = Buffer.byteLength(patch, "utf8");
    appendFileSync(predPath, `${predictionLine(inst.instance_id, f.model, patch)}\n`);
    appendFileSync(metaPath, `${JSON.stringify({ instance_id: inst.instance_id, seconds: Math.round((Date.now() - t0) / 1000), patch_bytes: patchBytes, note })}\n`);
    done++;
    if (patch.trim()) nonEmpty++;
    console.error(`  [${done}/${todo.length}] ${inst.instance_id} · ${patchBytes}B patch · ${note.slice(0, 60)}`);
  });

  console.error(`\nwrote ${predPath}\n${done} run, ${nonEmpty} produced a non-empty patch. Score with the official harness — see bench/README.md.`);
}
|
|
214
|
+
|
|
215
|
+
// ---------- selftest ----------
|
|
216
|
+
|
|
217
|
+
/**
 * Offline checks of the pure helpers (`parseArgs`, `buildPrompt`, `predictionLine`,
 * `selectInstances`). Throws an assertion error on the first failure and prints a
 * confirmation line on success.
 */
function runSelftest() {
  // parseArgs: string and numeric flags are picked up.
  const parsed = parseArgs(["--dataset", "d.jsonl", "--model", "m", "--limit", "3", "--instances", "x,y", "--concurrency", "4"]);
  assert.equal(parsed.dataset, "d.jsonl");
  assert.equal(parsed.model, "m");
  assert.equal(parsed.limit, 3);
  assert.deepEqual(parsed.instances, ["x", "y"]);
  assert.equal(parsed.concurrency, 4);

  // buildPrompt: mentions the repo and the issue, and carries the no-tests rule.
  const prompt = buildPrompt("django/django", "Boom on empty queryset.");
  const hasRepo = prompt.includes("django/django");
  const hasIssue = prompt.includes("Boom on empty queryset.");
  const hasNoTestsRule = /do not add or modify tests/i.test(prompt);
  assert.ok(hasRepo && hasIssue && hasNoTestsRule, "prompt includes repo, issue, no-tests rule");

  // predictionLine: exactly the three official keys.
  const record = JSON.parse(predictionLine("django__django-123", "claude-opus-4-8", "diff --git a b"));
  assert.deepEqual(Object.keys(record).sort(), ["instance_id", "model_name_or_path", "model_patch"]);
  assert.equal(record.instance_id, "django__django-123");

  // selectInstances: filter first, then limit.
  const dataset = ["a", "b", "c"].map((id) => ({ instance_id: id }));
  const idsOf = (rows) => rows.map((row) => row.instance_id);
  assert.deepEqual(idsOf(selectInstances(dataset, { instances: ["b", "c"], limit: 1 })), ["b"]);
  assert.deepEqual(idsOf(selectInstances(dataset, { limit: 2 })), ["a", "b"]);

  console.log("swebench selftest OK");
}
|
|
239
|
+
|
|
240
|
+
// Entry point: `--selftest` runs the offline helper checks; anything else runs the benchmark.
const flags = parseArgs(process.argv.slice(2));
if (flags.selftest) {
  runSelftest();
} else {
  await main(flags);
}
|