@chllming/wave-orchestration 0.6.3 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +82 -1
- package/README.md +40 -7
- package/docs/agents/wave-orchestrator-role.md +50 -0
- package/docs/agents/wave-planner-role.md +39 -0
- package/docs/context7/bundles.json +9 -0
- package/docs/context7/planner-agent/README.md +25 -0
- package/docs/context7/planner-agent/manifest.json +83 -0
- package/docs/context7/planner-agent/papers/cooperbench-why-coding-agents-cannot-be-your-teammates-yet.md +3283 -0
- package/docs/context7/planner-agent/papers/dova-deliberation-first-multi-agent-orchestration-for-autonomous-research-automation.md +1699 -0
- package/docs/context7/planner-agent/papers/dpbench-large-language-models-struggle-with-simultaneous-coordination.md +2251 -0
- package/docs/context7/planner-agent/papers/incremental-planning-to-control-a-blackboard-based-problem-solver.md +1729 -0
- package/docs/context7/planner-agent/papers/silo-bench-a-scalable-environment-for-evaluating-distributed-coordination-in-multi-agent-llm-systems.md +3747 -0
- package/docs/context7/planner-agent/papers/todoevolve-learning-to-architect-agent-planning-systems.md +1675 -0
- package/docs/context7/planner-agent/papers/verified-multi-agent-orchestration-a-plan-execute-verify-replan-framework-for-complex-query-resolution.md +1173 -0
- package/docs/context7/planner-agent/papers/why-do-multi-agent-llm-systems-fail.md +5211 -0
- package/docs/context7/planner-agent/topics/planning-and-orchestration.md +24 -0
- package/docs/evals/README.md +96 -1
- package/docs/evals/arm-templates/README.md +13 -0
- package/docs/evals/arm-templates/full-wave.json +15 -0
- package/docs/evals/arm-templates/single-agent.json +15 -0
- package/docs/evals/benchmark-catalog.json +7 -0
- package/docs/evals/cases/README.md +47 -0
- package/docs/evals/cases/wave-blackboard-inbox-targeting.json +73 -0
- package/docs/evals/cases/wave-contradiction-conflict.json +104 -0
- package/docs/evals/cases/wave-expert-routing-preservation.json +69 -0
- package/docs/evals/cases/wave-hidden-profile-private-evidence.json +81 -0
- package/docs/evals/cases/wave-premature-closure-guard.json +71 -0
- package/docs/evals/cases/wave-silo-cross-agent-state.json +77 -0
- package/docs/evals/cases/wave-simultaneous-lockstep.json +92 -0
- package/docs/evals/cooperbench/real-world-mitigation.md +341 -0
- package/docs/evals/external-benchmarks.json +85 -0
- package/docs/evals/external-command-config.sample.json +9 -0
- package/docs/evals/external-command-config.swe-bench-pro.json +8 -0
- package/docs/evals/pilots/README.md +47 -0
- package/docs/evals/pilots/swe-bench-pro-public-full-wave-review-10.json +64 -0
- package/docs/evals/pilots/swe-bench-pro-public-pilot.json +111 -0
- package/docs/evals/wave-benchmark-program.md +302 -0
- package/docs/guides/planner.md +67 -11
- package/docs/guides/terminal-surfaces.md +12 -0
- package/docs/plans/context7-wave-orchestrator.md +20 -0
- package/docs/plans/current-state.md +8 -1
- package/docs/plans/examples/wave-benchmark-improvement.md +108 -0
- package/docs/plans/examples/wave-example-live-proof.md +1 -1
- package/docs/plans/examples/wave-example-rollout-fidelity.md +340 -0
- package/docs/plans/migration.md +26 -0
- package/docs/plans/wave-orchestrator.md +60 -12
- package/docs/plans/waves/reviews/wave-1-benchmark-operator.md +118 -0
- package/docs/reference/cli-reference.md +547 -0
- package/docs/reference/coordination-and-closure.md +436 -0
- package/docs/reference/live-proof-waves.md +25 -3
- package/docs/reference/npmjs-trusted-publishing.md +3 -3
- package/docs/reference/proof-metrics.md +90 -0
- package/docs/reference/runtime-config/README.md +63 -2
- package/docs/reference/runtime-config/codex.md +2 -1
- package/docs/reference/sample-waves.md +29 -18
- package/docs/reference/wave-control.md +164 -0
- package/docs/reference/wave-planning-lessons.md +131 -0
- package/package.json +5 -4
- package/releases/manifest.json +40 -0
- package/scripts/research/agent-context-archive.mjs +18 -0
- package/scripts/research/manifests/agent-context-expanded-2026-03-22.mjs +17 -0
- package/scripts/research/sync-planner-context7-bundle.mjs +133 -0
- package/scripts/wave-orchestrator/agent-state.mjs +11 -2
- package/scripts/wave-orchestrator/artifact-schemas.mjs +232 -0
- package/scripts/wave-orchestrator/autonomous.mjs +7 -0
- package/scripts/wave-orchestrator/benchmark-cases.mjs +374 -0
- package/scripts/wave-orchestrator/benchmark-external.mjs +1384 -0
- package/scripts/wave-orchestrator/benchmark.mjs +972 -0
- package/scripts/wave-orchestrator/clarification-triage.mjs +78 -12
- package/scripts/wave-orchestrator/config.mjs +175 -0
- package/scripts/wave-orchestrator/control-cli.mjs +1216 -0
- package/scripts/wave-orchestrator/control-plane.mjs +697 -0
- package/scripts/wave-orchestrator/coord-cli.mjs +360 -2
- package/scripts/wave-orchestrator/coordination-store.mjs +211 -9
- package/scripts/wave-orchestrator/coordination.mjs +84 -0
- package/scripts/wave-orchestrator/dashboard-renderer.mjs +120 -5
- package/scripts/wave-orchestrator/dashboard-state.mjs +22 -0
- package/scripts/wave-orchestrator/evals.mjs +23 -0
- package/scripts/wave-orchestrator/executors.mjs +3 -2
- package/scripts/wave-orchestrator/feedback.mjs +55 -0
- package/scripts/wave-orchestrator/install.mjs +151 -2
- package/scripts/wave-orchestrator/launcher-closure.mjs +4 -1
- package/scripts/wave-orchestrator/launcher-runtime.mjs +33 -30
- package/scripts/wave-orchestrator/launcher.mjs +884 -36
- package/scripts/wave-orchestrator/planner-context.mjs +75 -0
- package/scripts/wave-orchestrator/planner.mjs +2270 -136
- package/scripts/wave-orchestrator/proof-cli.mjs +195 -0
- package/scripts/wave-orchestrator/proof-registry.mjs +317 -0
- package/scripts/wave-orchestrator/replay.mjs +10 -4
- package/scripts/wave-orchestrator/retry-cli.mjs +184 -0
- package/scripts/wave-orchestrator/retry-control.mjs +225 -0
- package/scripts/wave-orchestrator/shared.mjs +26 -0
- package/scripts/wave-orchestrator/swe-bench-pro-task.mjs +1004 -0
- package/scripts/wave-orchestrator/terminals.mjs +1 -1
- package/scripts/wave-orchestrator/traces.mjs +157 -2
- package/scripts/wave-orchestrator/wave-control-client.mjs +532 -0
- package/scripts/wave-orchestrator/wave-control-schema.mjs +309 -0
- package/scripts/wave-orchestrator/wave-files.mjs +144 -23
- package/scripts/wave.mjs +27 -0
- package/skills/repo-coding-rules/SKILL.md +1 -0
- package/skills/role-cont-eval/SKILL.md +1 -0
- package/skills/role-cont-qa/SKILL.md +13 -6
- package/skills/role-deploy/SKILL.md +1 -0
- package/skills/role-documentation/SKILL.md +4 -0
- package/skills/role-implementation/SKILL.md +4 -0
- package/skills/role-infra/SKILL.md +2 -1
- package/skills/role-integration/SKILL.md +15 -8
- package/skills/role-planner/SKILL.md +39 -0
- package/skills/role-planner/skill.json +21 -0
- package/skills/role-research/SKILL.md +1 -0
- package/skills/role-security/SKILL.md +2 -2
- package/skills/runtime-claude/SKILL.md +2 -1
- package/skills/runtime-codex/SKILL.md +1 -0
- package/skills/runtime-local/SKILL.md +2 -0
- package/skills/runtime-opencode/SKILL.md +1 -0
- package/skills/wave-core/SKILL.md +25 -6
- package/skills/wave-core/references/marker-syntax.md +16 -8
- package/wave.config.json +45 -0
|
@@ -0,0 +1,1004 @@
|
|
|
1
|
+
import crypto from "node:crypto";
|
|
2
|
+
import fs from "node:fs";
|
|
3
|
+
import path from "node:path";
|
|
4
|
+
import { spawnSync } from "node:child_process";
|
|
5
|
+
import { buildCodexExecInvocation } from "./executors.mjs";
|
|
6
|
+
import { REPO_ROOT, ensureDirectory, shellQuote, toIsoTimestamp, writeJsonAtomic, writeTextAtomic } from "./shared.mjs";
|
|
7
|
+
|
|
8
|
+
const DEFAULT_PYTHON_BIN = path.join(REPO_ROOT, ".tmp", "bench-tools", "swe-bench-pro-venv", "bin", "python");
|
|
9
|
+
const DEFAULT_SWE_BENCH_ROOT = path.join(REPO_ROOT, ".tmp", "bench-tools", "SWE-bench_Pro-os");
|
|
10
|
+
const DEFAULT_OUTPUT_ROOT = path.join(REPO_ROOT, ".tmp", "wave-benchmarks", "swe-bench-pro-live");
|
|
11
|
+
const WAVE_ENTRY = path.join(REPO_ROOT, "scripts", "wave.mjs");
|
|
12
|
+
|
|
13
|
+
/**
 * Coerce any value to a string and strip surrounding whitespace.
 * `null` and `undefined` become the empty string.
 *
 * @param {unknown} value - Raw value to normalize.
 * @returns {string} Trimmed string form of `value`.
 */
function cleanText(value) {
  const text = value === null || value === undefined ? "" : String(value);
  return text.trim();
}
|
|
16
|
+
|
|
17
|
+
/**
 * Report whether `detail` contains at least one of the given patterns.
 *
 * @param {string} detail - Text to search (must expose `includes`).
 * @param {string[]} patterns - Substrings to look for.
 * @returns {boolean} True when any pattern is included in `detail`.
 */
function matchesFailurePattern(detail, patterns) {
  // "Some pattern matches" is the negation of "every pattern misses".
  const everyPatternMisses = patterns.every((candidate) => !detail.includes(candidate));
  return !everyPatternMisses;
}
|
|
20
|
+
|
|
21
|
+
/**
 * Decide whether a failure detail string points at a verifier image problem,
 * by checking it against a fixed list of image/registry related substrings.
 *
 * @param {string} detail - Failure detail text to classify.
 * @returns {boolean} True when any verifier-image pattern occurs in `detail`.
 */
function isVerifierImageFailureDetail(detail) {
  const verifierImagePatterns = [
    "failed to pull",
    "manifest unknown",
    "no matching manifest",
    "pull access denied",
    "jefzda/sweap-images",
    "docker image",
    "dockerhub_username",
  ];
  return matchesFailurePattern(detail, verifierImagePatterns);
}
|
|
32
|
+
|
|
33
|
+
/**
 * Decide whether a failure detail string points at a setup/harness problem,
 * by checking it against a fixed list of setup related substrings.
 *
 * @param {string} detail - Failure detail text to classify.
 * @returns {boolean} True when any setup/harness pattern occurs in `detail`.
 */
function isSetupHarnessFailureDetail(detail) {
  const setupHarnessPatterns = [
    "wave init failed",
    "wave doctor failed",
    "wave launch failed",
    "git diff failed",
    "git add -n failed",
    "patch extraction failed",
    "repository preparation failed",
    "repo already contained wave bootstrap files",
    "already contained wave bootstrap files",
    "could not parse object",
    "fatal: could not parse object",
    "bootstrap",
    "harness",
    "workspace",
    "task workspace",
    "setup failed",
  ];
  return matchesFailurePattern(detail, setupHarnessPatterns);
}
|
|
53
|
+
|
|
54
|
+
/**
 * Validate and normalize a benchmark arm name.
 *
 * @param {unknown} value - Raw arm value (trimmed before comparison).
 * @returns {string} Either "single-agent" or "full-wave".
 * @throws {Error} When the trimmed value is not a supported arm.
 */
function normalizeArm(value) {
  const candidate = cleanText(value);
  switch (candidate) {
    case "single-agent":
    case "full-wave":
      return candidate;
    default:
      throw new Error(`Unsupported SWE-bench Pro arm: ${value}`);
  }
}
|
|
61
|
+
|
|
62
|
+
/**
 * Parse the CLI argument vector for the SWE-bench Pro task runner.
 *
 * The first argument becomes the command; every following argument must be
 * one of the known `--flag value` pairs.
 *
 * @param {string[]} argv - Arguments after the script name.
 * @returns {object} Parsed options, with defaults for anything not supplied.
 * @throws {Error} On an unknown flag, or when no command was given.
 */
function parseArgs(argv) {
  const parsed = {
    command: "",
    instanceId: "",
    arm: "",
    modelId: "",
    reasoningEffort: "high",
    maxWallClockMinutes: 45,
    maxTurns: 250,
    pythonBin: DEFAULT_PYTHON_BIN,
    sweBenchRoot: DEFAULT_SWE_BENCH_ROOT,
    outputRoot: DEFAULT_OUTPUT_ROOT,
  };
  let cursor = 0;
  // Advance to the slot after the current flag and return its value.
  const takeValue = () => {
    cursor += 1;
    return argv[cursor];
  };
  while (cursor < argv.length) {
    const token = argv[cursor];
    if (!parsed.command) {
      parsed.command = cleanText(token);
    } else {
      switch (token) {
        case "--instance":
          parsed.instanceId = cleanText(takeValue());
          break;
        case "--arm":
          parsed.arm = cleanText(takeValue());
          break;
        case "--model":
          parsed.modelId = cleanText(takeValue());
          break;
        case "--reasoning-effort":
          parsed.reasoningEffort = cleanText(takeValue()) || "high";
          break;
        case "--max-wall-clock-minutes":
          parsed.maxWallClockMinutes = Number.parseInt(String(takeValue() || "45"), 10) || 45;
          break;
        case "--max-turns":
          parsed.maxTurns = Number.parseInt(String(takeValue() || "250"), 10) || 250;
          break;
        case "--python-bin":
          parsed.pythonBin = cleanText(takeValue()) || DEFAULT_PYTHON_BIN;
          break;
        case "--swe-bench-root":
          parsed.sweBenchRoot = cleanText(takeValue()) || DEFAULT_SWE_BENCH_ROOT;
          break;
        case "--output-root":
          parsed.outputRoot = cleanText(takeValue()) || DEFAULT_OUTPUT_ROOT;
          break;
        default:
          throw new Error(`Unknown argument: ${token}`);
      }
    }
    cursor += 1;
  }
  if (!parsed.command) {
    throw new Error("Usage: node scripts/wave-orchestrator/swe-bench-pro-task.mjs run --instance <id> --arm <single-agent|full-wave> --model <id>");
  }
  return parsed;
}
|
|
108
|
+
|
|
109
|
+
/**
 * Run a command synchronously through `bash -lc` with `pipefail` enabled and
 * return a plain summary of the outcome.
 *
 * @param {string} command - Shell command text.
 * @param {object} [options]
 * @param {string} [options.cwd] - Working directory for the child process.
 * @param {number} [options.timeoutMs=0] - Timeout in ms; values <= 0 disable it.
 * @param {object} [options.env={}] - Extra variables layered over `process.env`.
 * @returns {{command: string, cwd: (string|undefined), exitCode: number,
 *   signal: (string|null), stdout: string, stderr: string,
 *   error: (Error|null), wallClockMs: number}} Outcome summary. `exitCode`
 *   falls back to 1 when the child reported no integer status.
 */
function runShellCommand(command, { cwd, timeoutMs = 0, env = {} } = {}) {
  const launchedAt = Date.now();
  const spawnOptions = {
    cwd,
    encoding: "utf8",
    env: { ...process.env, ...env },
    timeout: timeoutMs > 0 ? timeoutMs : undefined,
  };
  const { status, signal, stdout, stderr, error } = spawnSync(
    "bash",
    ["-lc", `set -o pipefail; ${command}`],
    spawnOptions,
  );
  const exitCode = Number.isInteger(status) ? status : 1;
  return {
    command,
    cwd,
    exitCode,
    signal: signal || null,
    stdout: stdout || "",
    stderr: stderr || "",
    error: error || null,
    wallClockMs: Date.now() - launchedAt,
  };
}
|
|
128
|
+
|
|
129
|
+
/**
 * Throw unless a command result (as returned by `runShellCommand`) succeeded.
 *
 * @param {object} result - Command outcome with `error`, `exitCode`,
 *   `stderr`, `stdout` and `wallClockMs` fields.
 * @param {string} label - Human-readable step name used in error messages.
 * @throws {Error} On timeout (`error.code === "ETIMEDOUT"`) or non-zero exit.
 */
function assertSuccess(result, label) {
  const timedOut = result.error?.code === "ETIMEDOUT";
  if (timedOut) {
    throw new Error(`${label} timed out after ${result.wallClockMs}ms`);
  }
  if (result.exitCode === 0) {
    return;
  }
  // Prefer stderr; fall back to stdout, then to a fixed placeholder.
  const output = cleanText(result.stderr || result.stdout);
  const detail = output || "no output";
  throw new Error(`${label} failed (${result.exitCode}): ${detail}`);
}
|
|
138
|
+
|
|
139
|
+
/**
 * Load one SWE-bench Pro dataset row by running an inline Python script that
 * reads the "ScaleAI/SWE-bench_Pro" test split via the `datasets` package and
 * prints the matching row as JSON.
 *
 * @param {string} instanceId - Dataset `instance_id` to look up.
 * @param {string} pythonBin - Python interpreter used to run the script.
 * @returns {object} The parsed dataset row.
 * @throws {Error} When the interpreter cannot be spawned, exits non-zero
 *   (including an unknown instance id), or prints output that is not JSON.
 */
function loadDatasetRow(instanceId, pythonBin) {
  const pythonScript = `
import json
import sys
from datasets import load_dataset

instance_id = sys.argv[1]
dataset = load_dataset("ScaleAI/SWE-bench_Pro", split="test")
row = next((entry for entry in dataset if entry["instance_id"] == instance_id), None)
if row is None:
    raise SystemExit(f"unknown instance: {instance_id}")
print(json.dumps(row))
`;
  const result = spawnSync(pythonBin, ["-c", pythonScript, instanceId], {
    cwd: REPO_ROOT,
    encoding: "utf8",
    env: process.env,
  });
  if (result.status !== 0) {
    // A spawn-level failure (e.g. missing interpreter) leaves stderr/stdout
    // empty, so fall back to the spawn error message to keep the detail useful.
    const detail = cleanText(result.stderr || result.stdout) || cleanText(result.error?.message);
    throw new Error(`Failed to load SWE-bench Pro row for ${instanceId}: ${detail}`);
  }
  try {
    return JSON.parse(result.stdout);
  } catch (parseError) {
    // Surface which instance produced unparseable output instead of a bare SyntaxError.
    throw new Error(
      `Failed to load SWE-bench Pro row for ${instanceId}: could not parse JSON output (${parseError.message})`,
    );
  }
}
|
|
162
|
+
|
|
163
|
+
/**
 * Normalize a list-like dataset field into its serialized string form.
 *
 * @param {unknown} value - Raw field value.
 * @returns {string} "[]" for null/undefined/""/"None", JSON text for arrays,
 *   otherwise the value converted with `String`.
 */
function normalizeSerializedList(value) {
  const isBlank = value === null || value === undefined || value === "" || value === "None";
  if (isBlank) {
    return "[]";
  }
  return Array.isArray(value) ? JSON.stringify(value) : String(value);
}
|
|
172
|
+
|
|
173
|
+
/**
 * Project a raw dataset row onto the fixed set of fields used downstream.
 * Identifier-like fields are trimmed, free-text fields are stringified as-is,
 * and list fields accept either lower-case or upper-case source keys.
 *
 * @param {object} row - Raw dataset row.
 * @returns {object} Row with normalized string fields.
 */
function normalizeRawSampleRow(row) {
  const trimmed = (field) => cleanText(row[field]);
  const verbatim = (field) => String(row[field] || "");
  // List fields fall back to the upper-case key only when the lower-case one is nullish.
  const serializedList = (lowerKey, upperKey) => normalizeSerializedList(row[lowerKey] ?? row[upperKey]);
  return {
    instance_id: trimmed("instance_id"),
    repo: trimmed("repo"),
    problem_statement: verbatim("problem_statement"),
    base_commit: trimmed("base_commit"),
    before_repo_set_cmd: verbatim("before_repo_set_cmd"),
    selected_test_files_to_run: serializedList("selected_test_files_to_run", "SELECTED_TEST_FILES_TO_RUN"),
    fail_to_pass: serializedList("fail_to_pass", "FAIL_TO_PASS"),
    pass_to_pass: serializedList("pass_to_pass", "PASS_TO_PASS"),
    base_dockerfile: verbatim("base_dockerfile"),
    instance_dockerfile: verbatim("instance_dockerfile"),
  };
}
|
|
189
|
+
|
|
190
|
+
/**
 * Replace `dirPath` with a new empty directory: anything already at that
 * path is removed recursively, then the directory (and parents) is created.
 *
 * @param {string} dirPath - Directory to reset.
 */
function ensureFreshDir(dirPath) {
  const wipeOptions = { recursive: true, force: true };
  const createOptions = { recursive: true };
  fs.rmSync(dirPath, wipeOptions);
  fs.mkdirSync(dirPath, createOptions);
}
|
|
194
|
+
|
|
195
|
+
/**
 * Create a fresh, uniquely named workspace directory for one task run and
 * return the paths used inside it.
 *
 * @param {object} row - Dataset row; `row.instance_id` seeds the run id.
 * @param {string} arm - Benchmark arm name, embedded in the run id.
 * @param {string} outputRoot - Output root, resolved against `REPO_ROOT`.
 * @returns {{runId: string, taskRoot: string, repoDir: string,
 *   artifactsDir: string, logsDir: string, evalDir: string}} Only `taskRoot`
 *   is created here; the other paths are just computed.
 */
function prepareTaskWorkspace(row, arm, outputRoot) {
  // Random suffix keeps repeated runs of the same instance/arm apart.
  const suffix = crypto.randomBytes(4).toString("hex");
  const runId = `${row.instance_id}-${arm}-${suffix}`;
  const taskRoot = path.resolve(REPO_ROOT, outputRoot, runId);
  const inTaskRoot = (name) => path.join(taskRoot, name);
  const workspace = {
    runId,
    taskRoot,
    repoDir: inTaskRoot("repo"),
    artifactsDir: inTaskRoot("artifacts"),
    logsDir: inTaskRoot("logs"),
    evalDir: inTaskRoot("eval"),
  };
  ensureFreshDir(taskRoot);
  return workspace;
}
|
|
209
|
+
|
|
210
|
+
/**
 * Clone the task repository from GitHub into `repoDir`, then run the row's
 * `before_repo_set_cmd` inside the checkout.
 *
 * @param {object} row - Dataset row providing `repo`, `before_repo_set_cmd`
 *   and `instance_id`.
 * @param {string} repoDir - Destination directory for the clone.
 * @throws {Error} When the clone or the preparation command fails or times out.
 */
function cloneRepo(row, repoDir) {
  ensureDirectory(path.dirname(repoDir));
  // Never let git block on an interactive credential prompt; fail fast instead
  // of stalling until the timeout.
  const nonInteractiveGit = { GIT_TERMINAL_PROMPT: "0" };
  // The repo slug comes from dataset data, so quote the URL like the target path.
  const repoUrl = `https://github.com/${row.repo}.git`;
  const clone = runShellCommand(
    `git clone ${shellQuote(repoUrl)} ${shellQuote(repoDir)}`,
    { cwd: REPO_ROOT, timeoutMs: 20 * 60 * 1000, env: nonInteractiveGit },
  );
  assertSuccess(clone, `clone ${row.repo}`);
  const prep = runShellCommand(String(row.before_repo_set_cmd || ""), {
    cwd: repoDir,
    timeoutMs: 10 * 60 * 1000,
    env: nonInteractiveGit,
  });
  assertSuccess(prep, `prepare ${row.instance_id}`);
}
|
|
224
|
+
|
|
225
|
+
/**
 * Sum token usage from a log file containing one JSON event per line.
 * Only lines that parse as JSON objects with `type === "turn.completed"` and
 * an object `usage` field contribute; everything else is skipped.
 *
 * @param {string} logPath - Path to the log file.
 * @returns {{input_tokens: number, cached_input_tokens: number,
 *   output_tokens: number}} Totals (all zero when the file does not exist).
 */
function parseCodexUsageFromLog(logPath) {
  const totals = { input_tokens: 0, cached_input_tokens: 0, output_tokens: 0 };
  if (!fs.existsSync(logPath)) {
    return totals;
  }
  const candidates = fs
    .readFileSync(logPath, "utf8")
    .split(/\r?\n/)
    .map((rawLine) => rawLine.trim())
    .filter((text) => text.startsWith("{"));
  for (const candidate of candidates) {
    let event;
    try {
      event = JSON.parse(candidate);
    } catch {
      // Ignore non-JSON or partial lines.
      continue;
    }
    const turnUsage = event.usage;
    if (event.type !== "turn.completed" || !turnUsage || typeof turnUsage !== "object") {
      continue;
    }
    for (const counter of Object.keys(totals)) {
      totals[counter] += Number(turnUsage[counter] || 0);
    }
  }
  return totals;
}
|
|
253
|
+
|
|
254
|
+
/**
 * Add up a list of usage records into a single totals record.
 * Missing or falsy counters on an entry count as zero.
 *
 * @param {Array<object>} list - Usage records with optional `input_tokens`,
 *   `cached_input_tokens` and `output_tokens` fields.
 * @returns {{input_tokens: number, cached_input_tokens: number,
 *   output_tokens: number}} Summed counters (all zero for an empty list).
 */
function mergeUsageTotals(list) {
  const totals = { input_tokens: 0, cached_input_tokens: 0, output_tokens: 0 };
  list.forEach((entry) => {
    totals.input_tokens += Number(entry.input_tokens || 0);
    totals.cached_input_tokens += Number(entry.cached_input_tokens || 0);
    totals.output_tokens += Number(entry.output_tokens || 0);
  });
  return totals;
}
|
|
264
|
+
|
|
265
|
+
/**
 * Build the plain-text prompt for the single-agent arm: a fixed set of ground
 * rules, the issue statement, the target test list and the final-response
 * requirements, joined with newlines.
 *
 * @param {object} row - Dataset row providing `repo`, `problem_statement`
 *   and `selected_test_files_to_run`.
 * @returns {string} The assembled prompt.
 */
function buildSingleAgentPrompt(row) {
  const intro = `You are solving one SWE-bench Pro task in the repository ${row.repo}.`;
  const groundRules = [
    "Solve the issue using only the repository checkout and the issue text below.",
    "Do not use gold patches, benchmark answers, evaluator outputs, or any external answer source.",
    "Prefer the smallest correct patch that fixes the described bug.",
    "You may inspect and edit files and run lightweight local checks if helpful, but do not spend most of your budget on heavyweight environment reconstruction.",
    "Leave your code changes in the working tree and do not create commits.",
  ];
  const issueText = String(row.problem_statement || "");
  const targetTests = normalizeSerializedList(row.selected_test_files_to_run);
  const responseRequirements = [
    "Final response requirements:",
    "- summarize the root cause and files changed",
    "- mention any local checks you ran, or state that you relied on static reasoning only",
  ];
  const promptLines = [
    intro,
    "",
    ...groundRules,
    "",
    "Issue statement:",
    issueText,
    "",
    `Official target tests: ${targetTests}`,
    "",
    ...responseRequirements,
  ];
  return promptLines.join("\n");
}
|
|
285
|
+
|
|
286
|
+
/**
 * Render the markdown definition of "Wave 1" for the full-wave arm.
 *
 * The document contains the wave-level sections (sequencing note, reference
 * rule, component promotions, Context7 defaults, eval targets) followed by six
 * agent sections: A0 (cont-QA), E0 (cont-EVAL), A8 (Integration Steward),
 * A9 (Documentation Steward), A1 (Root Cause And Patch) and A2 (Regression
 * Tests And Acceptance). Every agent gets the same codex executor settings.
 * The issue statement and target test list are embedded in the E0, A1 and A2
 * prompts.
 *
 * @param {object} row - Dataset row providing `instance_id`,
 *   `problem_statement` and `selected_test_files_to_run`.
 * @param {string} modelId - Value written to each executor's `model` entry.
 * @param {string} reasoningEffort - Value written to `model_reasoning_effort`.
 * @param {number} maxWallClockMinutes - Value written to `budget.minutes`.
 * @param {number} maxTurns - Value written to `budget.turns`.
 * @returns {string} The wave markdown text.
 */
function buildFullWaveMarkdown(row, modelId, reasoningEffort, maxWallClockMinutes, maxTurns) {
  // Serialized once and reused in each prompt that lists the target tests.
  const testList = normalizeSerializedList(row.selected_test_files_to_run);
  return `# Wave 1 - SWE-bench Pro Task Solve

**Commit message**: \`Feat: solve ${row.instance_id}\`

## Sequencing note

- This is a frozen benchmark solve attempt for \`${row.instance_id}\`. Use only the issue statement, repository checkout, and your own reasoning. Do not use gold patches, verifier outputs, or benchmark answer sources.

## Reference rule

- The benchmark contract is fixed before launch. Agents may solve the task, validate locally when practical, and close the wave, but they must not tune against hidden verifier feedback.

## Component promotions

- benchmark-program-and-evals: baseline-proved

## Context7 defaults

- bundle: none

## Eval targets

- id: issue-acceptance-review | selection: pinned | benchmarks: manual-session-review | objective: Re-check the landed diff against the issue statement and the official target tests before closure | threshold: The landed diff addresses the issue requirements without obvious unresolved gaps

## Agent A0: cont-QA

### Role prompts

- docs/agents/wave-cont-qa-role.md

### Executor

- id: codex
- model: ${modelId}
- budget.minutes: ${maxWallClockMinutes}
- budget.turns: ${maxTurns}
- codex.json: true
- codex.config: model_reasoning_effort=${reasoningEffort}

### Context7

- bundle: none

### Prompt

\`\`\`text
Primary goal:
- Close this benchmark solve attempt fail-closed.

Required context before coding:
- Read docs/reference/repository-guidance.md.
- Read docs/research/agent-context-sources.md.
- Read docs/plans/current-state.md, docs/plans/master-plan.md, and docs/plans/migration.md.

Specific expectations:
- do not treat effort or plausible narration as proof
- do not use any benchmark answer source outside this repository checkout and the issue statement
- BLOCKED is acceptable if the landed evidence is not strong enough

File ownership (only touch these paths):
- docs/plans/waves/reviews/wave-1-cont-qa.md
\`\`\`

## Agent E0: cont-EVAL

### Role prompts

- docs/agents/wave-cont-eval-role.md

### Executor

- id: codex
- model: ${modelId}
- budget.minutes: ${maxWallClockMinutes}
- budget.turns: ${maxTurns}
- codex.json: true
- codex.config: model_reasoning_effort=${reasoningEffort}

### Context7

- bundle: none

### Prompt

\`\`\`text
Primary goal:
- Review the landed implementation against the issue statement and the official target test scope without changing source files directly.

Required context before coding:
- Read docs/reference/repository-guidance.md.
- Read docs/research/agent-context-sources.md.
- Read docs/evals/README.md.

Specific expectations:
- stay report-only for this wave
- use the issue statement and target test scope below as the acceptance contract
- do not use verifier output or hidden benchmark answers as solve hints

Issue statement:
${String(row.problem_statement || "")}

Official target tests:
- ${testList}

File ownership (only touch these paths):
- docs/plans/waves/reviews/wave-1-cont-eval.md
\`\`\`

## Agent A8: Integration Steward

### Role prompts

- docs/agents/wave-integration-role.md

### Executor

- id: codex
- model: ${modelId}
- budget.minutes: ${maxWallClockMinutes}
- budget.turns: ${maxTurns}
- codex.json: true
- codex.config: model_reasoning_effort=${reasoningEffort}

### Context7

- bundle: none

### Prompt

\`\`\`text
Primary goal:
- Integrate the implementation and review evidence into one closure-ready judgment.

Required context before coding:
- Read docs/reference/repository-guidance.md.
- Read docs/research/agent-context-sources.md.
- Read docs/plans/current-state.md and docs/plans/master-plan.md.

Specific expectations:
- keep benchmark fairness explicit
- name blockers instead of smoothing them over

File ownership (only touch these paths):
- .tmp/main-wave-launcher/integration/wave-1.md
- .tmp/main-wave-launcher/integration/wave-1.json
\`\`\`

## Agent A9: Documentation Steward

### Role prompts

- docs/agents/wave-documentation-role.md

### Executor

- id: codex
- model: ${modelId}
- budget.minutes: ${maxWallClockMinutes}
- budget.turns: ${maxTurns}
- codex.json: true
- codex.config: model_reasoning_effort=${reasoningEffort}

### Context7

- bundle: none

### Prompt

\`\`\`text
Primary goal:
- Close the documentation surface without polluting the benchmark patch with Wave scaffolding changes.

Required context before coding:
- Read docs/reference/repository-guidance.md.
- Read docs/research/agent-context-sources.md.
- Read docs/plans/current-state.md, docs/plans/master-plan.md, and docs/plans/migration.md.

Specific expectations:
- prefer no-change when shared-plan docs are unrelated to the repository bug fix
- do not treat Wave scaffold changes as part of the benchmark patch

File ownership (only touch these paths):
- docs/plans/current-state.md
- docs/plans/master-plan.md
- docs/plans/migration.md
- docs/plans/component-cutover-matrix.md
- docs/plans/component-cutover-matrix.json
\`\`\`

## Agent A1: Root Cause And Patch

### Executor

- id: codex
- model: ${modelId}
- budget.minutes: ${maxWallClockMinutes}
- budget.turns: ${maxTurns}
- codex.json: true
- codex.config: model_reasoning_effort=${reasoningEffort}

### Context7

- bundle: none

### Components

- benchmark-program-and-evals

### Exit contract

- completion: integrated
- durability: none
- proof: integration
- doc-impact: owned

### Prompt

\`\`\`text
Primary goal:
- Diagnose the bug and land the smallest correct source patch.

Required context before coding:
- Read docs/reference/repository-guidance.md.
- Read docs/research/agent-context-sources.md.
- Read README.md if it helps orient the repository.

Specific expectations:
- use only this issue statement and the repository checkout
- do not use gold patches, evaluator outputs, or hidden benchmark answers
- coordinate with A2 when a regression test should change
- prefer a minimal diff that fixes the root cause

Issue statement:
${String(row.problem_statement || "")}

Official target tests:
- ${testList}

File ownership (only touch these paths):
- src/
- source/
- lib/
- server/
- client/
- public/
- package.json
- pnpm-lock.yaml
- package-lock.json
- yarn.lock
- README.md
\`\`\`

## Agent A2: Regression Tests And Acceptance

### Executor

- id: codex
- model: ${modelId}
- budget.minutes: ${maxWallClockMinutes}
- budget.turns: ${maxTurns}
- codex.json: true
- codex.config: model_reasoning_effort=${reasoningEffort}

### Context7

- bundle: none

### Components

- benchmark-program-and-evals

### Exit contract

- completion: integrated
- durability: none
- proof: integration
- doc-impact: owned

### Prompt

\`\`\`text
Primary goal:
- Add or adjust the narrowest regression coverage needed and independently check that the patch matches the issue requirements.

Required context before coding:
- Read docs/reference/repository-guidance.md.
- Read docs/research/agent-context-sources.md.
- Read the issue statement and the files A1 changes before editing.

Specific expectations:
- keep tests tightly scoped to the bug
- do not broaden the patch unless the issue requires it
- if a reliable local test run is not practical in this environment, say so explicitly rather than fabricating proof

Issue statement:
${String(row.problem_statement || "")}

Official target tests:
- ${testList}

File ownership (only touch these paths):
- test/
- tests/
- __tests__/
- spec/
\`\`\`
`;
}
|
|
596
|
+
|
|
597
|
+
/**
 * Produce the markdown text of the repository guidance note placed in a
 * benchmark task workspace.
 *
 * @returns {string} A heading, a blank line, four bullet points and a
 *   trailing newline.
 */
function renderWaveRepoGuide() {
  const guideLines = [
    "# Repository Guidance",
    "",
    "- This repository is being used as a benchmark task workspace.",
    "- Only edit source files needed for the task and the Wave-owned closure reports.",
    "- Do not use benchmark gold patches, hidden answers, or verifier outputs as solve hints.",
    "- Keep changes minimal and reviewable.",
    "", // yields the trailing newline
  ];
  return guideLines.join("\n");
}
|
|
606
|
+
|
|
607
|
+
/**
 * Rewrite `<repoDir>/wave.config.json` (when present) so that codex is the
 * default executor and the `main` lane's runtime policy routes every listed
 * role to codex. Other existing settings are preserved.
 *
 * @param {string} repoDir - Repository directory containing the config file.
 */
function normalizeBenchmarkWaveConfig(repoDir) {
  const configPath = path.join(repoDir, "wave.config.json");
  if (!fs.existsSync(configPath)) {
    return;
  }
  const config = JSON.parse(fs.readFileSync(configPath, "utf8"));
  const existingLanes = config.lanes || {};
  const existingMain = config.lanes?.main || {};
  const codexRoles = [
    "implementation",
    "integration",
    "documentation",
    "cont-qa",
    "cont-eval",
    "security",
    "research",
    "infra",
    "deploy",
  ];
  // Existing policy keys survive unless overridden by the three keys below.
  const runtimePolicy = {
    ...(existingMain.runtimePolicy || {}),
    runtimeMixTargets: { codex: 10, claude: 0, opencode: 0 },
    defaultExecutorByRole: Object.fromEntries(codexRoles.map((role) => [role, "codex"])),
    fallbackExecutorOrder: ["codex", "claude", "opencode"],
  };
  config.executors = { ...(config.executors || {}), default: "codex" };
  config.lanes = {
    ...existingLanes,
    main: { ...existingMain, runtimePolicy },
  };
  writeJsonAtomic(configPath, config);
}
|
|
646
|
+
|
|
647
|
+
/**
 * Write the files a full-wave run needs into `repoDir`: the repository
 * guidance document (only if it is not already there) and the wave-1 plan.
 *
 * @param {object} row - Benchmark dataset row, forwarded to `buildFullWaveMarkdown`.
 * @param {string} repoDir - Root of the task repository checkout.
 * @param {string} modelId - Forwarded to `buildFullWaveMarkdown`.
 * @param {string} reasoningEffort - Forwarded to `buildFullWaveMarkdown`.
 * @param {number} maxWallClockMinutes - Forwarded to `buildFullWaveMarkdown`.
 * @param {number} maxTurns - Forwarded to `buildFullWaveMarkdown`.
 * @returns {string} Path of the written `docs/plans/waves/wave-1.md`.
 */
function writeFullWaveScaffold(row, repoDir, modelId, reasoningEffort, maxWallClockMinutes, maxTurns) {
  const referenceDir = path.join(repoDir, "docs", "reference");
  ensureDirectory(referenceDir);

  // An existing guidance file is left untouched.
  const guidePath = path.join(referenceDir, "repository-guidance.md");
  const guideAlreadyPresent = fs.existsSync(guidePath);
  if (!guideAlreadyPresent) {
    writeTextAtomic(guidePath, `${renderWaveRepoGuide()}\n`);
  }

  const wavePath = path.join(repoDir, "docs", "plans", "waves", "wave-1.md");
  const waveMarkdown = buildFullWaveMarkdown(row, modelId, reasoningEffort, maxWallClockMinutes, maxTurns);
  writeTextAtomic(wavePath, `${waveMarkdown}\n`);
  return wavePath;
}
|
|
661
|
+
|
|
662
|
+
/**
 * Delete the wave-0 plan and its spec from `repoDir`.
 * Files that do not exist are ignored (`force: true`).
 *
 * @param {string} repoDir - Root of the task repository checkout.
 * @returns {void}
 */
function removeSeededStarterWave(repoDir) {
  const wavesDir = path.join(repoDir, "docs", "plans", "waves");
  const starterArtifacts = [
    path.join(wavesDir, "wave-0.md"),
    path.join(wavesDir, "specs", "wave-0.json"),
  ];
  for (const artifactPath of starterArtifacts) {
    fs.rmSync(artifactPath, { force: true });
  }
}
|
|
666
|
+
|
|
667
|
+
// Single-character escapes git emits inside C-quoted paths, mapped to byte values.
const GIT_QUOTED_PATH_ESCAPES = new Map([
  ["a", 0x07],
  ["b", 0x08],
  ["t", 0x09],
  ["n", 0x0a],
  ["v", 0x0b],
  ["f", 0x0c],
  ["r", 0x0d],
  ['"', 0x22],
  ["\\", 0x5c],
]);

/**
 * Decode a path that git printed in C-string-quoted form (`"a b.txt"`,
 * `"caf\303\251.txt"`). Anything that is not one well-formed quoted string,
 * for example a plain path or a rename entry `"old" -> "new"`, is returned
 * unchanged.
 *
 * @param {string} rawPath - Path field taken from a status line.
 * @returns {string} The decoded path, or `rawPath` when it is not quoted.
 */
function decodeGitQuotedPath(rawPath) {
  if (rawPath.length < 2 || !rawPath.startsWith('"') || !rawPath.endsWith('"')) {
    return rawPath;
  }
  const inner = rawPath.slice(1, -1);
  const bytes = [];
  let index = 0;
  while (index < inner.length) {
    const char = inner[index];
    if (char === '"') {
      // An unescaped quote means this is not a single quoted path.
      return rawPath;
    }
    if (char !== "\\") {
      // Copy the whole literal run at once so surrogate pairs stay intact.
      let end = index + 1;
      while (end < inner.length && inner[end] !== "\\" && inner[end] !== '"') {
        end += 1;
      }
      bytes.push(...Buffer.from(inner.slice(index, end), "utf8"));
      index = end;
      continue;
    }
    // Octal escapes carry raw bytes; multi-byte UTF-8 spans several escapes.
    const octal = /^[0-7]{1,3}/.exec(inner.slice(index + 1, index + 4));
    if (octal) {
      bytes.push(Number.parseInt(octal[0], 8) & 0xff);
      index += 1 + octal[0].length;
      continue;
    }
    const escaped = GIT_QUOTED_PATH_ESCAPES.get(inner[index + 1]);
    if (escaped === undefined) {
      return rawPath;
    }
    bytes.push(escaped);
    index += 2;
  }
  return Buffer.from(bytes).toString("utf8");
}

/**
 * Parse `git status --porcelain=v1` output into `{ code, path }` entries.
 *
 * `code` is the two-character status field and `path` is the rest of the
 * line. Git C-quotes paths containing spaces, quotes, control characters or
 * (by default) non-ASCII bytes; such paths are decoded so callers can hand
 * them straight back to git. Rename/copy entries (`old -> new`) are returned
 * as a single undecoded string.
 *
 * @param {string} output - Raw stdout of the status command; nullish is treated as empty.
 * @returns {{ code: string, path: string }[]} One entry per non-empty line.
 */
function parseGitStatusPorcelain(output) {
  return String(output || "")
    .split(/\r?\n/)
    .map((line) => line.trimEnd())
    .filter(Boolean)
    .map((line) => ({
      code: line.slice(0, 2),
      path: decodeGitQuotedPath(line.slice(3).trim()),
    }));
}
|
|
677
|
+
|
|
678
|
+
/**
 * Decide whether a repository-relative path must be left out of the
 * benchmark patch.
 *
 * A path is excluded when it is empty, lives under `.wave/` or `.tmp/`, is
 * listed in `seededFiles`, lives under one of the listed `docs/` or `skills/`
 * trees, or is `wave.config.json`.
 *
 * @param {string} relPath - Path relative to the repository root.
 * @param {Set<string>} seededFiles - Seeded file paths using `/` separators.
 * @returns {boolean} `true` when the path should be excluded.
 */
function shouldExcludeFromBenchmarkPatch(relPath, seededFiles) {
  const posixPath = relPath.replaceAll(path.sep, "/");
  const isUnderAny = (prefixes) => prefixes.some((prefix) => posixPath.startsWith(prefix));

  if (!posixPath) {
    return true;
  }
  if (isUnderAny([".wave/", ".tmp/"])) {
    return true;
  }
  if (seededFiles.has(posixPath)) {
    return true;
  }
  if (posixPath === "wave.config.json") {
    return true;
  }
  return isUnderAny([
    "docs/agents/",
    "docs/context7/",
    "docs/evals/",
    "docs/guides/",
    "docs/plans/",
    "docs/reference/",
    "docs/research/",
    "skills/",
  ]);
}
|
|
700
|
+
|
|
701
|
+
/**
 * Build the list of exclusion pathspecs for the benchmark diff.
 *
 * The list has one `:(exclude)` entry per seeded file, then
 * `wave.config.json`, then one `:(glob,exclude)` entry per excluded tree.
 * Duplicates are dropped, keeping the first occurrence.
 *
 * @param {Iterable<string>} seededFiles - Seeded file paths to exclude individually.
 * @returns {string[]} Pathspec strings in the order described above.
 */
function buildDiffPathspecs(seededFiles) {
  const excludedTrees = [
    ".wave",
    ".tmp",
    "docs/agents",
    "docs/context7",
    "docs/evals",
    "docs/guides",
    "docs/plans",
    "docs/reference",
    "docs/research",
    "skills",
  ];

  // A Set preserves insertion order while removing repeated entries.
  const pathspecs = new Set();
  for (const filePath of Array.from(seededFiles)) {
    pathspecs.add(`:(exclude)${filePath}`);
  }
  pathspecs.add(":(exclude)wave.config.json");
  for (const tree of excludedTrees) {
    pathspecs.add(`:(glob,exclude)${tree}/**`);
  }
  return [...pathspecs];
}
|
|
717
|
+
|
|
718
|
+
/**
 * Produce the diff of `repoDir` against `HEAD`, including new files and
 * leaving out benchmark scaffolding.
 *
 * Untracked files that are not excluded are first registered with
 * `git add -N` so they appear in `git diff`.
 *
 * @param {string} repoDir - Root of the task repository checkout.
 * @param {Set<string>} [seededFiles] - Seeded file paths to keep out of the patch.
 * @returns {string} Stdout of `git diff --binary HEAD`.
 * @throws Whatever `assertSuccess` throws when a git command fails.
 */
function buildPatch(repoDir, seededFiles = new Set()) {
  const statusResult = runShellCommand("git status --porcelain=v1 -uall", { cwd: repoDir });
  assertSuccess(statusResult, "git status");

  const newFiles = [];
  for (const entry of parseGitStatusPorcelain(statusResult.stdout)) {
    if (entry.code === "??" && !shouldExcludeFromBenchmarkPatch(entry.path, seededFiles)) {
      newFiles.push(entry.path);
    }
  }

  if (newFiles.length > 0) {
    const quotedFiles = newFiles.map((filePath) => shellQuote(filePath)).join(" ");
    const intentResult = runShellCommand(`git add -N -- ${quotedFiles}`, { cwd: repoDir });
    assertSuccess(intentResult, "git add -N");
  }

  const quotedPathspecs = buildDiffPathspecs(seededFiles)
    .map((entry) => shellQuote(entry))
    .join(" ");
  const diffResult = runShellCommand(`git diff --binary HEAD -- . ${quotedPathspecs}`.trim(), {
    cwd: repoDir,
  });
  assertSuccess(diffResult, "git diff");
  return diffResult.stdout;
}
|
|
738
|
+
|
|
739
|
+
/**
 * Sum the token usage recorded in the wave-1 launcher logs of `repoDir`.
 *
 * Reads `.tmp/main-wave-launcher/logs`, takes every file named
 * `wave-1-*.log`, and merges the per-log usage. When the logs directory is
 * missing, an all-zero usage record is returned.
 *
 * @param {string} repoDir - Root of the task repository checkout.
 * @returns {object} Merged usage as produced by `mergeUsageTotals`, or zeros.
 */
function parseWaveCodexUsage(repoDir) {
  const logsDir = path.join(repoDir, ".tmp", "main-wave-launcher", "logs");
  if (!fs.existsSync(logsDir)) {
    return { input_tokens: 0, cached_input_tokens: 0, output_tokens: 0 };
  }

  const perLogUsage = [];
  for (const fileName of fs.readdirSync(logsDir)) {
    const isWaveOneLog = fileName.startsWith("wave-1-") && fileName.endsWith(".log");
    if (isWaveOneLog) {
      perLogUsage.push(parseCodexUsageFromLog(path.join(logsDir, fileName)));
    }
  }
  return mergeUsageTotals(perLogUsage);
}
|
|
750
|
+
|
|
751
|
+
/**
 * Run the single-agent arm: write the prompt, invoke Codex once in the task
 * repository, and report the execution result with token usage.
 *
 * @param {object} row - Benchmark dataset row used to build the prompt.
 * @param {object} taskWorkspace - Provides `logsDir` and `repoDir`.
 * @param {object} options - Provides `modelId`, `reasoningEffort` and `maxWallClockMinutes`.
 * @returns {{ execution: object, tokenUsage: object, tracePath: null, summaryPath: string }}
 *   The shell result, parsed usage, and the log path relative to `REPO_ROOT`.
 */
function buildSingleAgentSolve(row, taskWorkspace, options) {
  const { logsDir } = taskWorkspace;
  ensureDirectory(logsDir);

  const promptPath = path.join(logsDir, "single-agent-prompt.txt");
  const logPath = path.join(logsDir, "single-agent-codex.jsonl");
  writeTextAtomic(promptPath, `${buildSingleAgentPrompt(row)}\n`);

  const codexSettings = {
    model: options.modelId,
    config: [`model_reasoning_effort=${options.reasoningEffort}`],
    search: false,
    json: true,
    ephemeral: false,
  };
  const command = buildCodexExecInvocation(promptPath, logPath, "danger-full-access", "codex", codexSettings);

  const wallClockLimitMs = options.maxWallClockMinutes * 60 * 1000;
  const execution = runShellCommand(command, {
    cwd: taskWorkspace.repoDir,
    timeoutMs: wallClockLimitMs,
  });

  const tokenUsage = parseCodexUsageFromLog(logPath);
  const summaryPath = path.relative(REPO_ROOT, logPath).replaceAll(path.sep, "/");
  return { execution, tokenUsage, tracePath: null, summaryPath };
}
|
|
774
|
+
|
|
775
|
+
/**
 * Run the full-wave arm: initialise Wave in the task repository, write the
 * benchmark configuration and wave-1 plan, run `doctor`, then launch wave 1.
 *
 * `init` and `doctor` must succeed; the `launch` result is returned without
 * being asserted.
 *
 * @param {object} row - Benchmark dataset row used to build the wave plan.
 * @param {object} taskWorkspace - Provides `repoDir`.
 * @param {object} options - Provides `modelId`, `reasoningEffort`,
 *   `maxWallClockMinutes` and `maxTurns`.
 * @returns {{ execution: object, tokenUsage: object, seededFiles: Set<string>,
 *   tracePath: (string|null), summaryPath: (string|null) }}
 *   Paths are relative to `REPO_ROOT`, or `null` when the artifact is absent.
 */
function buildFullWaveSolve(row, taskWorkspace, options) {
  const { repoDir } = taskWorkspace;
  const waveCli = `node ${shellQuote(WAVE_ENTRY)}`;
  const setupTimeoutMs = 2 * 60 * 1000;
  const relativeIfExists = (absolutePath) =>
    fs.existsSync(absolutePath) ? path.relative(REPO_ROOT, absolutePath).replaceAll(path.sep, "/") : null;

  const init = runShellCommand(`${waveCli} init --json`, { cwd: repoDir, timeoutMs: setupTimeoutMs });
  assertSuccess(init, "wave init");
  const seededList = JSON.parse(init.stdout).seededFiles || [];
  // Seeded paths are stored with forward slashes regardless of platform.
  const seededFiles = new Set(seededList.map((filePath) => String(filePath).replaceAll("\\", "/")));

  normalizeBenchmarkWaveConfig(repoDir);
  removeSeededStarterWave(repoDir);
  writeFullWaveScaffold(
    row,
    repoDir,
    options.modelId,
    options.reasoningEffort,
    options.maxWallClockMinutes,
    options.maxTurns,
  );

  const doctor = runShellCommand(`${waveCli} doctor --json`, { cwd: repoDir, timeoutMs: setupTimeoutMs });
  assertSuccess(doctor, "wave doctor");

  const launchCommand = `${waveCli} launch --lane main --start-wave 1 --end-wave 1 --no-dashboard --terminal-surface tmux`;
  const launch = runShellCommand(launchCommand, {
    cwd: repoDir,
    timeoutMs: options.maxWallClockMinutes * 60 * 1000,
  });

  const integrationSummaryPath = path.join(repoDir, ".tmp", "main-wave-launcher", "integration", "wave-1.md");
  const tracePath = path.join(repoDir, "traces", "wave-1");
  return {
    execution: launch,
    tokenUsage: parseWaveCodexUsage(repoDir),
    seededFiles,
    tracePath: relativeIfExists(tracePath),
    summaryPath: relativeIfExists(integrationSummaryPath),
  };
}
|
|
823
|
+
|
|
824
|
+
/**
 * Run the SWE-bench Pro evaluator script on `patch` and report the outcome.
 *
 * Writes the raw sample, the patch list, the command line, and the
 * evaluator's stdout/stderr into `taskWorkspace.evalDir`, then reads
 * `output/eval_results.json`.
 *
 * Failure modes are returned rather than thrown:
 * - evaluator timeout gives `reviewCategory: "timeout"`;
 * - non-zero exit gives `"verifier-image"` or `"setup-harness"`;
 * - exit 0 with a missing or malformed `eval_results.json` gives `"setup-harness"`.
 *
 * @param {object} row - Benchmark dataset row; `row.instance_id` keys the results.
 * @param {string} patch - Diff text to evaluate.
 * @param {object} taskWorkspace - Provides `evalDir`.
 * @param {object} options - Provides `pythonBin` and `sweBenchRoot`.
 * @param {string} arm - Arm name, written as the patch `prefix`.
 * @returns {object} `success`, `artifactPath`, the three `verification*`
 *   paths (relative to `REPO_ROOT`), `detail`, and `reviewCategory` on failure.
 */
function evaluatePatch(row, patch, taskWorkspace, options, arm) {
  const { evalDir } = taskWorkspace;
  ensureDirectory(evalDir);
  const rawSamplePath = path.join(evalDir, "raw-sample.jsonl");
  const patchPath = path.join(evalDir, "patches.json");
  const outputDir = path.join(evalDir, "output");
  const stdoutPath = path.join(evalDir, "official-eval.stdout.log");
  const stderrPath = path.join(evalDir, "official-eval.stderr.log");
  const commandPath = path.join(evalDir, "official-eval.command.txt");
  ensureDirectory(outputDir);

  const toRepoRelative = (absolutePath) =>
    path.relative(REPO_ROOT, absolutePath).replaceAll(path.sep, "/");

  const rawRow = normalizeRawSampleRow(row);
  fs.writeFileSync(rawSamplePath, `${JSON.stringify(rawRow)}\n`, "utf8");
  fs.writeFileSync(
    patchPath,
    `${JSON.stringify([{ instance_id: row.instance_id, patch, prefix: arm }], null, 2)}\n`,
    "utf8",
  );

  const evalCommand = [
    shellQuote(options.pythonBin),
    shellQuote(path.join(options.sweBenchRoot, "swe_bench_pro_eval.py")),
    `--raw_sample_path=${shellQuote(rawSamplePath)}`,
    `--patch_path=${shellQuote(patchPath)}`,
    `--output_dir=${shellQuote(outputDir)}`,
    `--scripts_dir=${shellQuote(path.join(options.sweBenchRoot, "run_scripts"))}`,
    "--num_workers=1",
    // NOTE(review): presumably the Docker Hub namespace hosting the benchmark images — confirm.
    "--dockerhub_username=jefzda",
    "--use_local_docker",
  ].join(" ");
  // Persist the exact command so a failed evaluation can be re-run by hand.
  fs.writeFileSync(commandPath, `${evalCommand}\n`, "utf8");

  const result = runShellCommand(evalCommand, {
    cwd: options.sweBenchRoot,
    timeoutMs: 60 * 60 * 1000,
  });
  fs.writeFileSync(stdoutPath, result.stdout || "", "utf8");
  fs.writeFileSync(stderrPath, result.stderr || "", "utf8");

  const evalResultsPath = path.join(outputDir, "eval_results.json");
  const verificationPaths = {
    verificationStdoutPath: toRepoRelative(stdoutPath),
    verificationStderrPath: toRepoRelative(stderrPath),
    verificationOutputDir: toRepoRelative(outputDir),
  };
  const resultsArtifactIfPresent = () =>
    fs.existsSync(evalResultsPath) ? toRepoRelative(evalResultsPath) : null;

  if (result.error?.code === "ETIMEDOUT") {
    return {
      success: false,
      artifactPath: null,
      ...verificationPaths,
      reviewCategory: "timeout",
      detail: `official SWE-bench Pro evaluation timed out after ${result.wallClockMs}ms`,
    };
  }

  if (result.exitCode !== 0) {
    const detail = cleanText(result.stderr || result.stdout) || "no output";
    return {
      success: false,
      artifactPath: resultsArtifactIfPresent(),
      ...verificationPaths,
      reviewCategory: isVerifierImageFailureDetail(detail.toLowerCase()) ? "verifier-image" : "setup-harness",
      detail: `official SWE-bench Pro evaluation failed (${result.exitCode}): ${detail}`,
    };
  }

  // A zero exit code does not guarantee a usable results file; report that as
  // a harness failure instead of throwing and losing the whole result record.
  let evalResults;
  let readProblem = null;
  try {
    evalResults = JSON.parse(fs.readFileSync(evalResultsPath, "utf8"));
    if (evalResults === null || typeof evalResults !== "object") {
      readProblem = "eval_results.json does not contain a JSON object";
    }
  } catch (error) {
    readProblem = error instanceof Error ? error.message : String(error);
  }
  if (readProblem !== null) {
    return {
      success: false,
      artifactPath: resultsArtifactIfPresent(),
      ...verificationPaths,
      reviewCategory: "setup-harness",
      detail: `official SWE-bench Pro evaluation exited 0 but its results could not be read: ${readProblem}`,
    };
  }

  const success = Boolean(evalResults[row.instance_id]);
  // stdout may be absent; guard it the same way as when it was written to disk.
  const lastStdoutLine = String(result.stdout || "")
    .split(/\r?\n/)
    .filter(Boolean)
    .slice(-1)[0];
  return {
    success,
    artifactPath: toRepoRelative(evalResultsPath),
    ...verificationPaths,
    detail: cleanText(lastStdoutLine) || "evaluation completed",
  };
}
|
|
895
|
+
|
|
896
|
+
/**
 * Pick the review category for one benchmark attempt.
 *
 * Precedence: an explicit `evaluation.reviewCategory`, then `"solved"` when
 * the evaluation succeeded, then keyword matches on the lower-cased
 * evaluation detail combined with the solve execution's error code and exit
 * code. `"incorrect-patch"` is the fallback.
 *
 * @param {{ solve: object, evaluation: object }} attempt - Solve and evaluation results.
 * @returns {string} The review category.
 */
function classifyReviewCategory({ solve, evaluation }) {
  if (evaluation.reviewCategory) {
    return evaluation.reviewCategory;
  }
  if (evaluation.success) {
    return "solved";
  }

  const detail = cleanText(evaluation.detail).toLowerCase();
  const mentionsAny = (...needles) => needles.some((needle) => detail.includes(needle));

  if (mentionsAny("dry-run plan only", "planning only")) {
    return "dry-run-plan";
  }
  const solveTimedOut = solve.execution.error?.code === "ETIMEDOUT";
  if (solveTimedOut || mentionsAny("timed out", "timeout")) {
    return "timeout";
  }
  if (isVerifierImageFailureDetail(detail)) {
    return "verifier-image";
  }
  if (mentionsAny("needs-more-work", "proof gap", "blocked")) {
    return "blocked-proof";
  }
  // Both a recognised harness message and a failed solve command count as setup failures.
  if (isSetupHarnessFailureDetail(detail) || solve.execution.exitCode !== 0) {
    return "setup-harness";
  }
  return "incorrect-patch";
}
|
|
924
|
+
|
|
925
|
+
/**
 * Write the patch and the result record for one benchmark attempt.
 *
 * Writes `<arm>.patch.diff` and `<arm>.result.json` into
 * `taskWorkspace.artifactsDir`.
 *
 * @param {object} params
 * @param {object} params.row - Dataset row; supplies `instance_id` and `repo`.
 * @param {string} params.arm - Arm name used in the artifact file names.
 * @param {object} params.solve - Solve result (execution, token usage, trace and summary paths).
 * @param {object} params.evaluation - Evaluation result returned by `evaluatePatch`.
 * @param {string} params.patch - Diff text written to the patch file.
 * @param {object} params.taskWorkspace - Provides `artifactsDir`.
 * @returns {object} The payload that was written to the result file.
 */
function buildResultPayload({ row, arm, solve, evaluation, patch, taskWorkspace }) {
  const { artifactsDir } = taskWorkspace;
  const patchPath = path.join(artifactsDir, `${arm}.patch.diff`);
  const resultPath = path.join(artifactsDir, `${arm}.result.json`);

  ensureDirectory(artifactsDir);
  fs.writeFileSync(patchPath, patch, "utf8");

  const generatedAt = toIsoTimestamp();
  const relativePatchPath = path.relative(REPO_ROOT, patchPath).replaceAll(path.sep, "/");
  const reviewCategory = classifyReviewCategory({ solve, evaluation });

  const payload = {
    generatedAt,
    instanceId: row.instance_id,
    repo: row.repo,
    arm,
    success: evaluation.success,
    wallClockMs: solve.execution.wallClockMs,
    totalCostUsd: null,
    tokenUsage: solve.tokenUsage,
    tracePath: solve.tracePath,
    summaryPath: solve.summaryPath,
    artifactPath: evaluation.artifactPath,
    patchPath: relativePatchPath,
    verificationStdoutPath: evaluation.verificationStdoutPath,
    verificationStderrPath: evaluation.verificationStderrPath,
    verificationOutputDir: evaluation.verificationOutputDir,
    reviewCategory,
    detail: evaluation.detail,
  };
  writeJsonAtomic(resultPath, payload);
  return payload;
}
|
|
959
|
+
|
|
960
|
+
/**
 * Command-line entry point: validate arguments, prepare the task workspace,
 * run the selected arm, evaluate the resulting patch, and print the result
 * payload as one JSON line on stdout.
 *
 * @returns {void}
 * @throws {Error} When the command is not `run`, a required option is
 *   missing, or the Python runtime / SWE-bench Pro checkout is not found.
 */
function main() {
  const options = parseArgs(process.argv.slice(2));
  if (options.command !== "run") {
    throw new Error(`Unsupported command: ${options.command}`);
  }
  const arm = normalizeArm(options.arm);

  // Checked in order; the first unmet precondition decides the error message.
  const preconditions = [
    [() => Boolean(options.instanceId), "--instance is required"],
    [() => Boolean(options.modelId), "--model is required"],
    [() => fs.existsSync(options.pythonBin), `Python runtime not found: ${options.pythonBin}`],
    [() => fs.existsSync(options.sweBenchRoot), `SWE-bench Pro repo not found: ${options.sweBenchRoot}`],
  ];
  for (const [isMet, message] of preconditions) {
    if (!isMet()) {
      throw new Error(message);
    }
  }

  const row = loadDatasetRow(options.instanceId, options.pythonBin);
  const taskWorkspace = prepareTaskWorkspace(row, arm, options.outputRoot);
  cloneRepo(row, taskWorkspace.repoDir);

  const runArm = arm === "single-agent" ? buildSingleAgentSolve : buildFullWaveSolve;
  const solve = runArm(row, taskWorkspace, options);

  // Only the full-wave arm reports seeded files; default to none otherwise.
  const patch = buildPatch(taskWorkspace.repoDir, solve.seededFiles || new Set());
  const evaluation = evaluatePatch(row, patch, taskWorkspace, options, arm);
  const payload = buildResultPayload({ row, arm, solve, evaluation, patch, taskWorkspace });
  console.log(JSON.stringify(payload));
}
|
|
997
|
+
|
|
998
|
+
// Run the CLI; on any failure print only the message to stderr and exit with status 1.
try {
  main();
} catch (failure) {
  console.error(failure instanceof Error ? failure.message : String(failure));
  process.exit(1);
}
|