@chllming/wave-orchestration 0.6.3 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118) hide show
  1. package/CHANGELOG.md +82 -1
  2. package/README.md +40 -7
  3. package/docs/agents/wave-orchestrator-role.md +50 -0
  4. package/docs/agents/wave-planner-role.md +39 -0
  5. package/docs/context7/bundles.json +9 -0
  6. package/docs/context7/planner-agent/README.md +25 -0
  7. package/docs/context7/planner-agent/manifest.json +83 -0
  8. package/docs/context7/planner-agent/papers/cooperbench-why-coding-agents-cannot-be-your-teammates-yet.md +3283 -0
  9. package/docs/context7/planner-agent/papers/dova-deliberation-first-multi-agent-orchestration-for-autonomous-research-automation.md +1699 -0
  10. package/docs/context7/planner-agent/papers/dpbench-large-language-models-struggle-with-simultaneous-coordination.md +2251 -0
  11. package/docs/context7/planner-agent/papers/incremental-planning-to-control-a-blackboard-based-problem-solver.md +1729 -0
  12. package/docs/context7/planner-agent/papers/silo-bench-a-scalable-environment-for-evaluating-distributed-coordination-in-multi-agent-llm-systems.md +3747 -0
  13. package/docs/context7/planner-agent/papers/todoevolve-learning-to-architect-agent-planning-systems.md +1675 -0
  14. package/docs/context7/planner-agent/papers/verified-multi-agent-orchestration-a-plan-execute-verify-replan-framework-for-complex-query-resolution.md +1173 -0
  15. package/docs/context7/planner-agent/papers/why-do-multi-agent-llm-systems-fail.md +5211 -0
  16. package/docs/context7/planner-agent/topics/planning-and-orchestration.md +24 -0
  17. package/docs/evals/README.md +96 -1
  18. package/docs/evals/arm-templates/README.md +13 -0
  19. package/docs/evals/arm-templates/full-wave.json +15 -0
  20. package/docs/evals/arm-templates/single-agent.json +15 -0
  21. package/docs/evals/benchmark-catalog.json +7 -0
  22. package/docs/evals/cases/README.md +47 -0
  23. package/docs/evals/cases/wave-blackboard-inbox-targeting.json +73 -0
  24. package/docs/evals/cases/wave-contradiction-conflict.json +104 -0
  25. package/docs/evals/cases/wave-expert-routing-preservation.json +69 -0
  26. package/docs/evals/cases/wave-hidden-profile-private-evidence.json +81 -0
  27. package/docs/evals/cases/wave-premature-closure-guard.json +71 -0
  28. package/docs/evals/cases/wave-silo-cross-agent-state.json +77 -0
  29. package/docs/evals/cases/wave-simultaneous-lockstep.json +92 -0
  30. package/docs/evals/cooperbench/real-world-mitigation.md +341 -0
  31. package/docs/evals/external-benchmarks.json +85 -0
  32. package/docs/evals/external-command-config.sample.json +9 -0
  33. package/docs/evals/external-command-config.swe-bench-pro.json +8 -0
  34. package/docs/evals/pilots/README.md +47 -0
  35. package/docs/evals/pilots/swe-bench-pro-public-full-wave-review-10.json +64 -0
  36. package/docs/evals/pilots/swe-bench-pro-public-pilot.json +111 -0
  37. package/docs/evals/wave-benchmark-program.md +302 -0
  38. package/docs/guides/planner.md +67 -11
  39. package/docs/guides/terminal-surfaces.md +12 -0
  40. package/docs/plans/context7-wave-orchestrator.md +20 -0
  41. package/docs/plans/current-state.md +8 -1
  42. package/docs/plans/examples/wave-benchmark-improvement.md +108 -0
  43. package/docs/plans/examples/wave-example-live-proof.md +1 -1
  44. package/docs/plans/examples/wave-example-rollout-fidelity.md +340 -0
  45. package/docs/plans/migration.md +26 -0
  46. package/docs/plans/wave-orchestrator.md +60 -12
  47. package/docs/plans/waves/reviews/wave-1-benchmark-operator.md +118 -0
  48. package/docs/reference/cli-reference.md +547 -0
  49. package/docs/reference/coordination-and-closure.md +436 -0
  50. package/docs/reference/live-proof-waves.md +25 -3
  51. package/docs/reference/npmjs-trusted-publishing.md +3 -3
  52. package/docs/reference/proof-metrics.md +90 -0
  53. package/docs/reference/runtime-config/README.md +63 -2
  54. package/docs/reference/runtime-config/codex.md +2 -1
  55. package/docs/reference/sample-waves.md +29 -18
  56. package/docs/reference/wave-control.md +164 -0
  57. package/docs/reference/wave-planning-lessons.md +131 -0
  58. package/package.json +5 -4
  59. package/releases/manifest.json +40 -0
  60. package/scripts/research/agent-context-archive.mjs +18 -0
  61. package/scripts/research/manifests/agent-context-expanded-2026-03-22.mjs +17 -0
  62. package/scripts/research/sync-planner-context7-bundle.mjs +133 -0
  63. package/scripts/wave-orchestrator/agent-state.mjs +11 -2
  64. package/scripts/wave-orchestrator/artifact-schemas.mjs +232 -0
  65. package/scripts/wave-orchestrator/autonomous.mjs +7 -0
  66. package/scripts/wave-orchestrator/benchmark-cases.mjs +374 -0
  67. package/scripts/wave-orchestrator/benchmark-external.mjs +1384 -0
  68. package/scripts/wave-orchestrator/benchmark.mjs +972 -0
  69. package/scripts/wave-orchestrator/clarification-triage.mjs +78 -12
  70. package/scripts/wave-orchestrator/config.mjs +175 -0
  71. package/scripts/wave-orchestrator/control-cli.mjs +1216 -0
  72. package/scripts/wave-orchestrator/control-plane.mjs +697 -0
  73. package/scripts/wave-orchestrator/coord-cli.mjs +360 -2
  74. package/scripts/wave-orchestrator/coordination-store.mjs +211 -9
  75. package/scripts/wave-orchestrator/coordination.mjs +84 -0
  76. package/scripts/wave-orchestrator/dashboard-renderer.mjs +120 -5
  77. package/scripts/wave-orchestrator/dashboard-state.mjs +22 -0
  78. package/scripts/wave-orchestrator/evals.mjs +23 -0
  79. package/scripts/wave-orchestrator/executors.mjs +3 -2
  80. package/scripts/wave-orchestrator/feedback.mjs +55 -0
  81. package/scripts/wave-orchestrator/install.mjs +151 -2
  82. package/scripts/wave-orchestrator/launcher-closure.mjs +4 -1
  83. package/scripts/wave-orchestrator/launcher-runtime.mjs +33 -30
  84. package/scripts/wave-orchestrator/launcher.mjs +884 -36
  85. package/scripts/wave-orchestrator/planner-context.mjs +75 -0
  86. package/scripts/wave-orchestrator/planner.mjs +2270 -136
  87. package/scripts/wave-orchestrator/proof-cli.mjs +195 -0
  88. package/scripts/wave-orchestrator/proof-registry.mjs +317 -0
  89. package/scripts/wave-orchestrator/replay.mjs +10 -4
  90. package/scripts/wave-orchestrator/retry-cli.mjs +184 -0
  91. package/scripts/wave-orchestrator/retry-control.mjs +225 -0
  92. package/scripts/wave-orchestrator/shared.mjs +26 -0
  93. package/scripts/wave-orchestrator/swe-bench-pro-task.mjs +1004 -0
  94. package/scripts/wave-orchestrator/terminals.mjs +1 -1
  95. package/scripts/wave-orchestrator/traces.mjs +157 -2
  96. package/scripts/wave-orchestrator/wave-control-client.mjs +532 -0
  97. package/scripts/wave-orchestrator/wave-control-schema.mjs +309 -0
  98. package/scripts/wave-orchestrator/wave-files.mjs +144 -23
  99. package/scripts/wave.mjs +27 -0
  100. package/skills/repo-coding-rules/SKILL.md +1 -0
  101. package/skills/role-cont-eval/SKILL.md +1 -0
  102. package/skills/role-cont-qa/SKILL.md +13 -6
  103. package/skills/role-deploy/SKILL.md +1 -0
  104. package/skills/role-documentation/SKILL.md +4 -0
  105. package/skills/role-implementation/SKILL.md +4 -0
  106. package/skills/role-infra/SKILL.md +2 -1
  107. package/skills/role-integration/SKILL.md +15 -8
  108. package/skills/role-planner/SKILL.md +39 -0
  109. package/skills/role-planner/skill.json +21 -0
  110. package/skills/role-research/SKILL.md +1 -0
  111. package/skills/role-security/SKILL.md +2 -2
  112. package/skills/runtime-claude/SKILL.md +2 -1
  113. package/skills/runtime-codex/SKILL.md +1 -0
  114. package/skills/runtime-local/SKILL.md +2 -0
  115. package/skills/runtime-opencode/SKILL.md +1 -0
  116. package/skills/wave-core/SKILL.md +25 -6
  117. package/skills/wave-core/references/marker-syntax.md +16 -8
  118. package/wave.config.json +45 -0
@@ -0,0 +1,1004 @@
1
+ import crypto from "node:crypto";
2
+ import fs from "node:fs";
3
+ import path from "node:path";
4
+ import { spawnSync } from "node:child_process";
5
+ import { buildCodexExecInvocation } from "./executors.mjs";
6
+ import { REPO_ROOT, ensureDirectory, shellQuote, toIsoTimestamp, writeJsonAtomic, writeTextAtomic } from "./shared.mjs";
7
+
8
+ const DEFAULT_PYTHON_BIN = path.join(REPO_ROOT, ".tmp", "bench-tools", "swe-bench-pro-venv", "bin", "python");
9
+ const DEFAULT_SWE_BENCH_ROOT = path.join(REPO_ROOT, ".tmp", "bench-tools", "SWE-bench_Pro-os");
10
+ const DEFAULT_OUTPUT_ROOT = path.join(REPO_ROOT, ".tmp", "wave-benchmarks", "swe-bench-pro-live");
11
+ const WAVE_ENTRY = path.join(REPO_ROOT, "scripts", "wave.mjs");
12
+
13
/**
 * Coerce any value to a string with surrounding whitespace removed.
 *
 * @param {unknown} value - Value to normalize; `null` and `undefined` become "".
 * @returns {string} The trimmed string form of `value`.
 */
function cleanText(value) {
  const text = value ?? "";
  return String(text).trim();
}
16
+
17
/**
 * Report whether `detail` contains at least one of the given substrings.
 *
 * The comparison is a plain case-sensitive `includes` check.
 *
 * @param {string} detail - Failure text to inspect.
 * @param {string[]} patterns - Substrings to look for.
 * @returns {boolean} True when any pattern occurs in `detail`.
 */
function matchesFailurePattern(detail, patterns) {
  for (const needle of patterns) {
    if (detail.includes(needle)) {
      return true;
    }
  }
  return false;
}
20
+
21
/**
 * Decide whether a failure detail string looks like a verifier container
 * image problem (pull failures, missing manifests, registry access).
 *
 * Matching is a case-sensitive substring check against lowercase markers.
 *
 * @param {string} detail - Failure text to classify.
 * @returns {boolean} True when any image-related marker occurs in `detail`.
 */
function isVerifierImageFailureDetail(detail) {
  const imageFailureMarkers = [
    "failed to pull",
    "manifest unknown",
    "no matching manifest",
    "pull access denied",
    "jefzda/sweap-images",
    "docker image",
    "dockerhub_username",
  ];
  return imageFailureMarkers.some((marker) => detail.includes(marker));
}
32
+
33
/**
 * Decide whether a failure detail string points at the benchmark setup or
 * harness (wave bootstrap, git plumbing, workspace preparation) rather than
 * at the solve attempt itself.
 *
 * Matching is a case-sensitive substring check against lowercase markers.
 *
 * @param {string} detail - Failure text to classify.
 * @returns {boolean} True when any setup/harness marker occurs in `detail`.
 */
function isSetupHarnessFailureDetail(detail) {
  const harnessFailureMarkers = [
    "wave init failed",
    "wave doctor failed",
    "wave launch failed",
    "git diff failed",
    "git add -n failed",
    "patch extraction failed",
    "repository preparation failed",
    "repo already contained wave bootstrap files",
    "already contained wave bootstrap files",
    "could not parse object",
    "fatal: could not parse object",
    "bootstrap",
    "harness",
    "workspace",
    "task workspace",
    "setup failed",
  ];
  return harnessFailureMarkers.some((marker) => detail.includes(marker));
}
53
+
54
/**
 * Validate and normalize a benchmark arm name.
 *
 * @param {unknown} value - Raw arm value, e.g. from the command line.
 * @returns {string} Either "single-agent" or "full-wave".
 * @throws {Error} When the trimmed value is not one of the supported arms.
 */
function normalizeArm(value) {
  const candidate = cleanText(value);
  const isSupported = candidate === "single-agent" || candidate === "full-wave";
  if (isSupported) {
    return candidate;
  }
  throw new Error(`Unsupported SWE-bench Pro arm: ${value}`);
}
61
+
62
/**
 * Parse the task runner's command-line arguments.
 *
 * The first argument with non-empty trimmed text becomes the command; every
 * later argument must be a known `--flag` and always consumes the token that
 * follows it as its value. Missing or unparsable values fall back to the
 * option's default.
 *
 * @param {string[]} argv - Arguments after the script name.
 * @returns {object} Parsed options (command, instanceId, arm, modelId,
 *   reasoningEffort, maxWallClockMinutes, maxTurns, pythonBin, sweBenchRoot,
 *   outputRoot).
 * @throws {Error} On an unknown flag, or when no command was given.
 */
function parseArgs(argv) {
  const options = {
    command: "",
    instanceId: "",
    arm: "",
    modelId: "",
    reasoningEffort: "high",
    maxWallClockMinutes: 45,
    maxTurns: 250,
    pythonBin: DEFAULT_PYTHON_BIN,
    sweBenchRoot: DEFAULT_SWE_BENCH_ROOT,
    outputRoot: DEFAULT_OUTPUT_ROOT,
  };

  // Integer flags: an absent value or a parse result of NaN/0 yields the fallback.
  const readInteger = (raw, fallback) =>
    Number.parseInt(String(raw || String(fallback)), 10) || fallback;

  // Each setter receives the raw token following its flag (possibly undefined).
  const flagSetters = new Map([
    ["--instance", (raw) => { options.instanceId = cleanText(raw); }],
    ["--arm", (raw) => { options.arm = cleanText(raw); }],
    ["--model", (raw) => { options.modelId = cleanText(raw); }],
    ["--reasoning-effort", (raw) => { options.reasoningEffort = cleanText(raw) || "high"; }],
    ["--max-wall-clock-minutes", (raw) => { options.maxWallClockMinutes = readInteger(raw, 45); }],
    ["--max-turns", (raw) => { options.maxTurns = readInteger(raw, 250); }],
    ["--python-bin", (raw) => { options.pythonBin = cleanText(raw) || DEFAULT_PYTHON_BIN; }],
    ["--swe-bench-root", (raw) => { options.sweBenchRoot = cleanText(raw) || DEFAULT_SWE_BENCH_ROOT; }],
    ["--output-root", (raw) => { options.outputRoot = cleanText(raw) || DEFAULT_OUTPUT_ROOT; }],
  ]);

  let cursor = 0;
  while (cursor < argv.length) {
    const token = argv[cursor];
    cursor += 1;
    if (!options.command) {
      options.command = cleanText(token);
      continue;
    }
    const applyFlag = flagSetters.get(token);
    if (!applyFlag) {
      throw new Error(`Unknown argument: ${token}`);
    }
    applyFlag(argv[cursor]);
    cursor += 1;
  }

  if (!options.command) {
    throw new Error("Usage: node scripts/wave-orchestrator/swe-bench-pro-task.mjs run --instance <id> --arm <single-agent|full-wave> --model <id>");
  }
  return options;
}
108
+
109
/**
 * Run a shell command synchronously through `bash -lc` with `pipefail` set
 * and capture its outcome.
 *
 * @param {string} command - Shell command line to execute.
 * @param {object} [options]
 * @param {string} [options.cwd] - Working directory for the command.
 * @param {number} [options.timeoutMs=0] - Timeout in milliseconds; 0 or less
 *   disables the timeout.
 * @param {object} [options.env={}] - Extra environment variables layered over
 *   `process.env`.
 * @param {number} [options.maxBufferBytes=268435456] - Largest stdout/stderr
 *   size to capture. Node's own default is 1 MiB, beyond which the child is
 *   killed and output truncated; large `git diff --binary` or clone output
 *   can exceed that, so the default here is much higher.
 * @returns {{command: string, cwd: (string|undefined), exitCode: number,
 *   signal: (string|null), stdout: string, stderr: string,
 *   error: (Error|null), wallClockMs: number}} Captured result. `exitCode` is
 *   1 when the process produced no integer status (spawn failure, signal,
 *   timeout).
 */
function runShellCommand(
  command,
  { cwd, timeoutMs = 0, env = {}, maxBufferBytes = 256 * 1024 * 1024 } = {},
) {
  const startedAt = Date.now();
  const result = spawnSync("bash", ["-lc", `set -o pipefail; ${command}`], {
    cwd,
    encoding: "utf8",
    env: { ...process.env, ...env },
    timeout: timeoutMs > 0 ? timeoutMs : undefined,
    maxBuffer: maxBufferBytes,
  });
  return {
    command,
    cwd,
    exitCode: Number.isInteger(result.status) ? result.status : 1,
    signal: result.signal || null,
    stdout: result.stdout || "",
    stderr: result.stderr || "",
    error: result.error || null,
    wallClockMs: Date.now() - startedAt,
  };
}
128
+
129
/**
 * Throw when a `runShellCommand`-style result did not succeed.
 *
 * @param {object} result - Result with `exitCode`, `stdout`, `stderr`,
 *   `error` and `wallClockMs` fields.
 * @param {string} label - Human-readable step name used in the message.
 * @throws {Error} "<label> timed out after <n>ms" when the spawn error code
 *   is ETIMEDOUT; otherwise "<label> failed (<exitCode>): <detail>" when the
 *   exit code is non-zero. `detail` is the trimmed stderr (or stdout); when
 *   there is no output it falls back to the spawn error message, so failures
 *   such as ENOENT or ENOBUFS are not reported as "no output".
 */
function assertSuccess(result, label) {
  if (result.error?.code === "ETIMEDOUT") {
    throw new Error(`${label} timed out after ${result.wallClockMs}ms`);
  }
  if (result.exitCode === 0) {
    return;
  }
  const output = result.stderr || result.stdout;
  const detail =
    (output ? cleanText(output) : "") ||
    String(result.error?.message ?? "").trim() ||
    "no output";
  const message = `${label} failed (${result.exitCode}): ${detail}`;
  // Keep the underlying spawn error reachable for debugging when there is one.
  throw result.error ? new Error(message, { cause: result.error }) : new Error(message);
}
138
+
139
/**
 * Load one SWE-bench Pro dataset row by instance id.
 *
 * Runs an inline Python script with the given interpreter; the script loads
 * the "ScaleAI/SWE-bench_Pro" test split via `datasets` and prints the
 * matching row as JSON.
 *
 * @param {string} instanceId - Dataset `instance_id` to look up.
 * @param {string} pythonBin - Python interpreter to run the script with.
 * @returns {object} The parsed dataset row.
 * @throws {Error} When the interpreter cannot be started, exits non-zero
 *   (including an unknown instance id), or prints output that is not valid
 *   JSON.
 */
function loadDatasetRow(instanceId, pythonBin) {
  const pythonScript = `
import json
import sys
from datasets import load_dataset

instance_id = sys.argv[1]
dataset = load_dataset("ScaleAI/SWE-bench_Pro", split="test")
row = next((entry for entry in dataset if entry["instance_id"] == instance_id), None)
if row is None:
    raise SystemExit(f"unknown instance: {instance_id}")
print(json.dumps(row))
`;
  const result = spawnSync(pythonBin, ["-c", pythonScript, instanceId], {
    cwd: REPO_ROOT,
    encoding: "utf8",
    env: process.env,
    // Node's 1 MiB default would kill the child on a large row.
    maxBuffer: 64 * 1024 * 1024,
  });
  if (result.status !== 0) {
    // Prefer process output; fall back to the spawn error (e.g. a missing
    // interpreter) so the message is never empty.
    const detail =
      cleanText(result.stderr || result.stdout) ||
      cleanText(result.error?.message) ||
      "no output";
    throw new Error(`Failed to load SWE-bench Pro row for ${instanceId}: ${detail}`);
  }
  try {
    return JSON.parse(result.stdout);
  } catch (parseError) {
    throw new Error(
      `Failed to parse SWE-bench Pro row for ${instanceId}: ${parseError.message}`,
      { cause: parseError },
    );
  }
}
162
+
163
/**
 * Normalize a dataset list field to its serialized string form.
 *
 * @param {unknown} value - Either an array, an already-serialized string, or
 *   a blank marker (`null`, `undefined`, "" or the literal string "None").
 * @returns {string} "[]" for blank markers, JSON for arrays, otherwise
 *   `String(value)`.
 */
function normalizeSerializedList(value) {
  const isBlank = value === null || value === undefined || value === "" || value === "None";
  if (isBlank) {
    return "[]";
  }
  return Array.isArray(value) ? JSON.stringify(value) : String(value);
}
172
+
173
/**
 * Project a raw dataset row onto the fields this runner uses, with
 * normalized value types.
 *
 * Identifier-like fields are trimmed, free-text fields are coerced to
 * strings, and the list fields accept either lowercase or uppercase keys and
 * are returned in serialized form.
 *
 * @param {object} row - Raw dataset row.
 * @returns {object} Normalized row.
 */
function normalizeRawSampleRow(row) {
  const asText = (field) => String(field || "");
  const selectedTests = row.selected_test_files_to_run ?? row.SELECTED_TEST_FILES_TO_RUN;
  const failToPass = row.fail_to_pass ?? row.FAIL_TO_PASS;
  const passToPass = row.pass_to_pass ?? row.PASS_TO_PASS;
  return {
    instance_id: cleanText(row.instance_id),
    repo: cleanText(row.repo),
    problem_statement: asText(row.problem_statement),
    base_commit: cleanText(row.base_commit),
    before_repo_set_cmd: asText(row.before_repo_set_cmd),
    selected_test_files_to_run: normalizeSerializedList(selectedTests),
    fail_to_pass: normalizeSerializedList(failToPass),
    pass_to_pass: normalizeSerializedList(passToPass),
    base_dockerfile: asText(row.base_dockerfile),
    instance_dockerfile: asText(row.instance_dockerfile),
  };
}
189
+
190
/**
 * Make `dirPath` an existing, empty directory, deleting any previous
 * contents first.
 *
 * @param {string} dirPath - Directory to reset.
 */
function ensureFreshDir(dirPath) {
  const removeOptions = { recursive: true, force: true };
  const createOptions = { recursive: true };
  fs.rmSync(dirPath, removeOptions);
  fs.mkdirSync(dirPath, createOptions);
}
194
+
195
/**
 * Create a fresh, uniquely named workspace directory for one task run and
 * return the paths used inside it.
 *
 * The run id is "<instance_id>-<arm>-<8 random hex chars>". Only the task
 * root is created here; the sub-paths are returned but not created.
 *
 * @param {object} row - Dataset row; `instance_id` is read.
 * @param {string} arm - Benchmark arm name.
 * @param {string} outputRoot - Output root, resolved against REPO_ROOT.
 * @returns {{runId: string, taskRoot: string, repoDir: string,
 *   artifactsDir: string, logsDir: string, evalDir: string}}
 */
function prepareTaskWorkspace(row, arm, outputRoot) {
  const randomSuffix = crypto.randomBytes(4).toString("hex");
  const runId = `${row.instance_id}-${arm}-${randomSuffix}`;
  const taskRoot = path.resolve(REPO_ROOT, outputRoot, runId);
  const inTaskRoot = (name) => path.join(taskRoot, name);
  ensureFreshDir(taskRoot);
  return {
    runId,
    taskRoot,
    repoDir: inTaskRoot("repo"),
    artifactsDir: inTaskRoot("artifacts"),
    logsDir: inTaskRoot("logs"),
    evalDir: inTaskRoot("eval"),
  };
}
209
+
210
/**
 * Clone the task's GitHub repository into `repoDir` and run the dataset's
 * preparation command inside it.
 *
 * @param {object} row - Dataset row; `repo` ("owner/name"), `instance_id`
 *   and `before_repo_set_cmd` are read.
 * @param {string} repoDir - Destination directory for the clone.
 * @throws {Error} When the clone or the preparation command fails or times
 *   out (20 and 10 minutes respectively).
 */
function cloneRepo(row, repoDir) {
  ensureDirectory(path.dirname(repoDir));
  // Never let git block on an interactive credential prompt: a missing or
  // private repository should fail fast instead of waiting out the timeout.
  const gitEnv = { GIT_TERMINAL_PROMPT: "0" };
  // Quote the URL as well as the destination so dataset-provided text cannot
  // be interpreted by the shell.
  const repoUrl = `https://github.com/${row.repo}.git`;
  const clone = runShellCommand(
    `git clone ${shellQuote(repoUrl)} ${shellQuote(repoDir)}`,
    { cwd: REPO_ROOT, timeoutMs: 20 * 60 * 1000, env: gitEnv },
  );
  assertSuccess(clone, `clone ${row.repo}`);
  // before_repo_set_cmd is the dataset's own shell snippet and is run as-is.
  const prep = runShellCommand(String(row.before_repo_set_cmd || ""), {
    cwd: repoDir,
    timeoutMs: 10 * 60 * 1000,
    env: gitEnv,
  });
  assertSuccess(prep, `prepare ${row.instance_id}`);
}
224
+
225
/**
 * Sum token usage from a Codex JSONL log.
 *
 * Only lines that parse as JSON objects with `type === "turn.completed"` and
 * an object `usage` contribute; everything else is skipped.
 *
 * @param {string} logPath - Path to the log file.
 * @returns {{input_tokens: number, cached_input_tokens: number,
 *   output_tokens: number}} Totals; all zero when the file does not exist.
 */
function parseCodexUsageFromLog(logPath) {
  const totals = { input_tokens: 0, cached_input_tokens: 0, output_tokens: 0 };
  if (!fs.existsSync(logPath)) {
    return totals;
  }
  const counters = Object.keys(totals);
  for (const rawLine of fs.readFileSync(logPath, "utf8").split(/\r?\n/)) {
    const candidate = rawLine.trim();
    if (!candidate.startsWith("{")) {
      continue;
    }
    let event;
    try {
      event = JSON.parse(candidate);
    } catch {
      // Non-JSON or partially written line: skip it.
      continue;
    }
    const hasTurnUsage =
      event.type === "turn.completed" && event.usage && typeof event.usage === "object";
    if (!hasTurnUsage) {
      continue;
    }
    for (const counter of counters) {
      totals[counter] += Number(event.usage[counter] || 0);
    }
  }
  return totals;
}
253
+
254
/**
 * Add up a list of token usage records.
 *
 * @param {Array<object>} list - Records with optional `input_tokens`,
 *   `cached_input_tokens` and `output_tokens` fields; missing or falsy
 *   fields count as 0.
 * @returns {{input_tokens: number, cached_input_tokens: number,
 *   output_tokens: number}} Field-wise totals; all zero for an empty list.
 */
function mergeUsageTotals(list) {
  let inputTokens = 0;
  let cachedInputTokens = 0;
  let outputTokens = 0;
  list.forEach((entry) => {
    inputTokens += Number(entry.input_tokens || 0);
    cachedInputTokens += Number(entry.cached_input_tokens || 0);
    outputTokens += Number(entry.output_tokens || 0);
  });
  return {
    input_tokens: inputTokens,
    cached_input_tokens: cachedInputTokens,
    output_tokens: outputTokens,
  };
}
264
+
265
/**
 * Build the plain-text prompt for the single-agent arm.
 *
 * The prompt names the repository, lists the solve rules, embeds the issue
 * statement and the official target test list, and ends with the
 * final-response requirements. Lines are joined with "\n".
 *
 * @param {object} row - Dataset row; `repo`, `problem_statement` and
 *   `selected_test_files_to_run` are read.
 * @returns {string} The prompt text.
 */
function buildSingleAgentPrompt(row) {
  const intro = `You are solving one SWE-bench Pro task in the repository ${row.repo}.`;
  const solveRules = [
    "Solve the issue using only the repository checkout and the issue text below.",
    "Do not use gold patches, benchmark answers, evaluator outputs, or any external answer source.",
    "Prefer the smallest correct patch that fixes the described bug.",
    "You may inspect and edit files and run lightweight local checks if helpful, but do not spend most of your budget on heavyweight environment reconstruction.",
    "Leave your code changes in the working tree and do not create commits.",
  ];
  const issueText = String(row.problem_statement || "");
  const targetTests = `Official target tests: ${normalizeSerializedList(row.selected_test_files_to_run)}`;
  const responseRequirements = [
    "Final response requirements:",
    "- summarize the root cause and files changed",
    "- mention any local checks you ran, or state that you relied on static reasoning only",
  ];
  const promptLines = [
    intro,
    "",
    ...solveRules,
    "",
    "Issue statement:",
    issueText,
    "",
    targetTests,
    "",
    ...responseRequirements,
  ];
  return promptLines.join("\n");
}
285
+
286
/**
 * Render the Markdown definition of "Wave 1" for the full-wave arm.
 *
 * The document declares six agents (A0 cont-QA, E0 cont-EVAL, A8 Integration
 * Steward, A9 Documentation Steward, A1 Root Cause And Patch, A2 Regression
 * Tests And Acceptance). Every agent gets the same codex executor settings
 * built from the arguments. The issue statement and the serialized official
 * target test list are embedded in the E0, A1 and A2 prompts.
 *
 * @param {object} row - Dataset row; `instance_id`, `problem_statement` and
 *   `selected_test_files_to_run` are read.
 * @param {string} modelId - Value written to each executor's `model` entry.
 * @param {string} reasoningEffort - Value written to `model_reasoning_effort`.
 * @param {number} maxWallClockMinutes - Value written to `budget.minutes`.
 * @param {number} maxTurns - Value written to `budget.turns`.
 * @returns {string} The wave Markdown, ending with a newline.
 */
function buildFullWaveMarkdown(row, modelId, reasoningEffort, maxWallClockMinutes, maxTurns) {
  // Serialized once and reused in every prompt that lists the target tests.
  const testList = normalizeSerializedList(row.selected_test_files_to_run);
  return `# Wave 1 - SWE-bench Pro Task Solve

**Commit message**: \`Feat: solve ${row.instance_id}\`

## Sequencing note

- This is a frozen benchmark solve attempt for \`${row.instance_id}\`. Use only the issue statement, repository checkout, and your own reasoning. Do not use gold patches, verifier outputs, or benchmark answer sources.

## Reference rule

- The benchmark contract is fixed before launch. Agents may solve the task, validate locally when practical, and close the wave, but they must not tune against hidden verifier feedback.

## Component promotions

- benchmark-program-and-evals: baseline-proved

## Context7 defaults

- bundle: none

## Eval targets

- id: issue-acceptance-review | selection: pinned | benchmarks: manual-session-review | objective: Re-check the landed diff against the issue statement and the official target tests before closure | threshold: The landed diff addresses the issue requirements without obvious unresolved gaps

## Agent A0: cont-QA

### Role prompts

- docs/agents/wave-cont-qa-role.md

### Executor

- id: codex
- model: ${modelId}
- budget.minutes: ${maxWallClockMinutes}
- budget.turns: ${maxTurns}
- codex.json: true
- codex.config: model_reasoning_effort=${reasoningEffort}

### Context7

- bundle: none

### Prompt

\`\`\`text
Primary goal:
- Close this benchmark solve attempt fail-closed.

Required context before coding:
- Read docs/reference/repository-guidance.md.
- Read docs/research/agent-context-sources.md.
- Read docs/plans/current-state.md, docs/plans/master-plan.md, and docs/plans/migration.md.

Specific expectations:
- do not treat effort or plausible narration as proof
- do not use any benchmark answer source outside this repository checkout and the issue statement
- BLOCKED is acceptable if the landed evidence is not strong enough

File ownership (only touch these paths):
- docs/plans/waves/reviews/wave-1-cont-qa.md
\`\`\`

## Agent E0: cont-EVAL

### Role prompts

- docs/agents/wave-cont-eval-role.md

### Executor

- id: codex
- model: ${modelId}
- budget.minutes: ${maxWallClockMinutes}
- budget.turns: ${maxTurns}
- codex.json: true
- codex.config: model_reasoning_effort=${reasoningEffort}

### Context7

- bundle: none

### Prompt

\`\`\`text
Primary goal:
- Review the landed implementation against the issue statement and the official target test scope without changing source files directly.

Required context before coding:
- Read docs/reference/repository-guidance.md.
- Read docs/research/agent-context-sources.md.
- Read docs/evals/README.md.

Specific expectations:
- stay report-only for this wave
- use the issue statement and target test scope below as the acceptance contract
- do not use verifier output or hidden benchmark answers as solve hints

Issue statement:
${String(row.problem_statement || "")}

Official target tests:
- ${testList}

File ownership (only touch these paths):
- docs/plans/waves/reviews/wave-1-cont-eval.md
\`\`\`

## Agent A8: Integration Steward

### Role prompts

- docs/agents/wave-integration-role.md

### Executor

- id: codex
- model: ${modelId}
- budget.minutes: ${maxWallClockMinutes}
- budget.turns: ${maxTurns}
- codex.json: true
- codex.config: model_reasoning_effort=${reasoningEffort}

### Context7

- bundle: none

### Prompt

\`\`\`text
Primary goal:
- Integrate the implementation and review evidence into one closure-ready judgment.

Required context before coding:
- Read docs/reference/repository-guidance.md.
- Read docs/research/agent-context-sources.md.
- Read docs/plans/current-state.md and docs/plans/master-plan.md.

Specific expectations:
- keep benchmark fairness explicit
- name blockers instead of smoothing them over

File ownership (only touch these paths):
- .tmp/main-wave-launcher/integration/wave-1.md
- .tmp/main-wave-launcher/integration/wave-1.json
\`\`\`

## Agent A9: Documentation Steward

### Role prompts

- docs/agents/wave-documentation-role.md

### Executor

- id: codex
- model: ${modelId}
- budget.minutes: ${maxWallClockMinutes}
- budget.turns: ${maxTurns}
- codex.json: true
- codex.config: model_reasoning_effort=${reasoningEffort}

### Context7

- bundle: none

### Prompt

\`\`\`text
Primary goal:
- Close the documentation surface without polluting the benchmark patch with Wave scaffolding changes.

Required context before coding:
- Read docs/reference/repository-guidance.md.
- Read docs/research/agent-context-sources.md.
- Read docs/plans/current-state.md, docs/plans/master-plan.md, and docs/plans/migration.md.

Specific expectations:
- prefer no-change when shared-plan docs are unrelated to the repository bug fix
- do not treat Wave scaffold changes as part of the benchmark patch

File ownership (only touch these paths):
- docs/plans/current-state.md
- docs/plans/master-plan.md
- docs/plans/migration.md
- docs/plans/component-cutover-matrix.md
- docs/plans/component-cutover-matrix.json
\`\`\`

## Agent A1: Root Cause And Patch

### Executor

- id: codex
- model: ${modelId}
- budget.minutes: ${maxWallClockMinutes}
- budget.turns: ${maxTurns}
- codex.json: true
- codex.config: model_reasoning_effort=${reasoningEffort}

### Context7

- bundle: none

### Components

- benchmark-program-and-evals

### Exit contract

- completion: integrated
- durability: none
- proof: integration
- doc-impact: owned

### Prompt

\`\`\`text
Primary goal:
- Diagnose the bug and land the smallest correct source patch.

Required context before coding:
- Read docs/reference/repository-guidance.md.
- Read docs/research/agent-context-sources.md.
- Read README.md if it helps orient the repository.

Specific expectations:
- use only this issue statement and the repository checkout
- do not use gold patches, evaluator outputs, or hidden benchmark answers
- coordinate with A2 when a regression test should change
- prefer a minimal diff that fixes the root cause

Issue statement:
${String(row.problem_statement || "")}

Official target tests:
- ${testList}

File ownership (only touch these paths):
- src/
- source/
- lib/
- server/
- client/
- public/
- package.json
- pnpm-lock.yaml
- package-lock.json
- yarn.lock
- README.md
\`\`\`

## Agent A2: Regression Tests And Acceptance

### Executor

- id: codex
- model: ${modelId}
- budget.minutes: ${maxWallClockMinutes}
- budget.turns: ${maxTurns}
- codex.json: true
- codex.config: model_reasoning_effort=${reasoningEffort}

### Context7

- bundle: none

### Components

- benchmark-program-and-evals

### Exit contract

- completion: integrated
- durability: none
- proof: integration
- doc-impact: owned

### Prompt

\`\`\`text
Primary goal:
- Add or adjust the narrowest regression coverage needed and independently check that the patch matches the issue requirements.

Required context before coding:
- Read docs/reference/repository-guidance.md.
- Read docs/research/agent-context-sources.md.
- Read the issue statement and the files A1 changes before editing.

Specific expectations:
- keep tests tightly scoped to the bug
- do not broaden the patch unless the issue requires it
- if a reliable local test run is not practical in this environment, say so explicitly rather than fabricating proof

Issue statement:
${String(row.problem_statement || "")}

Official target tests:
- ${testList}

File ownership (only touch these paths):
- test/
- tests/
- __tests__/
- spec/
\`\`\`
`;
}
596
+
597
/**
 * Render the Markdown text of the repository guidance file that is seeded
 * into a benchmark task workspace.
 *
 * @returns {string} A heading followed by four bullet rules, ending with a
 *   newline.
 */
function renderWaveRepoGuide() {
  const heading = "# Repository Guidance";
  const rules = [
    "- This repository is being used as a benchmark task workspace.",
    "- Only edit source files needed for the task and the Wave-owned closure reports.",
    "- Do not use benchmark gold patches, hidden answers, or verifier outputs as solve hints.",
    "- Keep changes minimal and reviewable.",
  ];
  // The trailing empty entry yields the final newline.
  return [heading, "", ...rules, ""].join("\n");
}
606
+
607
/**
 * Rewrite the workspace's `wave.config.json`, if present, so that codex is
 * the default executor and the `main` lane's runtime policy routes every
 * role to codex.
 *
 * Existing keys are preserved; only `executors.default` and three
 * `lanes.main.runtimePolicy` entries (`runtimeMixTargets`,
 * `defaultExecutorByRole`, `fallbackExecutorOrder`) are overwritten.
 *
 * @param {string} repoDir - Workspace repository root.
 */
function normalizeBenchmarkWaveConfig(repoDir) {
  const configPath = path.join(repoDir, "wave.config.json");
  if (!fs.existsSync(configPath)) {
    return;
  }
  const config = JSON.parse(fs.readFileSync(configPath, "utf8"));
  const previousLanes = config.lanes || {};
  const previousMain = config.lanes?.main || {};
  const previousPolicy = previousMain.runtimePolicy || {};

  const codexRoles = [
    "implementation",
    "integration",
    "documentation",
    "cont-qa",
    "cont-eval",
    "security",
    "research",
    "infra",
    "deploy",
  ];
  const runtimePolicy = {
    ...previousPolicy,
    runtimeMixTargets: { codex: 10, claude: 0, opencode: 0 },
    defaultExecutorByRole: Object.fromEntries(codexRoles.map((role) => [role, "codex"])),
    fallbackExecutorOrder: ["codex", "claude", "opencode"],
  };

  config.executors = { ...(config.executors || {}), default: "codex" };
  config.lanes = {
    ...previousLanes,
    main: { ...previousMain, runtimePolicy },
  };
  writeJsonAtomic(configPath, config);
}
646
+
647
/**
 * Write the Wave scaffold files for the full-wave arm into the workspace.
 *
 * Seeds `docs/reference/repository-guidance.md` unless it already exists,
 * then always (re)writes `docs/plans/waves/wave-1.md`.
 *
 * @param {object} row - Dataset row passed through to the wave renderer.
 * @param {string} repoDir - Workspace repository root.
 * @param {string} modelId - Model id for every agent executor.
 * @param {string} reasoningEffort - Codex reasoning effort setting.
 * @param {number} maxWallClockMinutes - Per-agent minute budget.
 * @param {number} maxTurns - Per-agent turn budget.
 * @returns {string} Path of the written wave file.
 */
function writeFullWaveScaffold(row, repoDir, modelId, reasoningEffort, maxWallClockMinutes, maxTurns) {
  const referenceDir = path.join(repoDir, "docs", "reference");
  ensureDirectory(referenceDir);

  const guidePath = path.join(referenceDir, "repository-guidance.md");
  const guideMissing = !fs.existsSync(guidePath);
  if (guideMissing) {
    writeTextAtomic(guidePath, `${renderWaveRepoGuide()}\n`);
  }

  const waveMarkdown = buildFullWaveMarkdown(
    row,
    modelId,
    reasoningEffort,
    maxWallClockMinutes,
    maxTurns,
  );
  const wavePath = path.join(repoDir, "docs", "plans", "waves", "wave-1.md");
  writeTextAtomic(wavePath, `${waveMarkdown}\n`);
  return wavePath;
}
661
+
662
/**
 * Delete the starter wave files (`wave-0.md` and `specs/wave-0.json`) under
 * `docs/plans/waves`. Missing files are ignored.
 *
 * @param {string} repoDir - Workspace repository root.
 */
function removeSeededStarterWave(repoDir) {
  const wavesDir = path.join(repoDir, "docs", "plans", "waves");
  const starterFiles = [["wave-0.md"], ["specs", "wave-0.json"]];
  for (const segments of starterFiles) {
    fs.rmSync(path.join(wavesDir, ...segments), { force: true });
  }
}
666
+
667
/**
 * Parse `git status --porcelain=v1` output into status entries.
 *
 * Git C-quotes a path that contains whitespace or special bytes (wrapping it
 * in double quotes and using backslash and octal escapes). Such paths are
 * decoded here, so callers get the real file name rather than the quoted
 * form, which git would reject when passed back as a path argument.
 *
 * Rename/copy entries ("old -> new") are returned with the text as printed;
 * they are not split.
 *
 * @param {string} output - Raw porcelain v1 output; blank input yields [].
 * @returns {Array<{code: string, path: string}>} One entry per non-empty
 *   line: `code` is the two-character XY status, `path` the decoded path.
 */
function parseGitStatusPorcelain(output) {
  const escapeBytes = new Map([
    ["a", 7],
    ["b", 8],
    ["t", 9],
    ["n", 10],
    ["v", 11],
    ["f", 12],
    ["r", 13],
    ['"', 34],
    ["\\", 92],
  ]);

  // Decode one C-quoted path; anything not fully wrapped in a single pair of
  // quotes (including "old" -> "new" rename text) is returned untouched.
  const decodeQuotedPath = (rawPath) => {
    const quoted = /^"((?:[^"\\]|\\.)*)"$/.exec(rawPath);
    if (!quoted) {
      return rawPath;
    }
    const body = quoted[1];
    const bytes = [];
    let index = 0;
    while (index < body.length) {
      if (body[index] !== "\\") {
        const char = String.fromCodePoint(body.codePointAt(index));
        bytes.push(...Buffer.from(char, "utf8"));
        index += char.length;
        continue;
      }
      // Octal escapes carry raw bytes, so collect bytes and decode as UTF-8
      // at the end rather than character by character.
      const octal = /^[0-7]{1,3}/.exec(body.slice(index + 1, index + 4));
      if (octal) {
        bytes.push(Number.parseInt(octal[0], 8) & 0xff);
        index += 1 + octal[0].length;
        continue;
      }
      const escaped = body[index + 1];
      if (escapeBytes.has(escaped)) {
        bytes.push(escapeBytes.get(escaped));
      } else {
        bytes.push(...Buffer.from(escaped, "utf8"));
      }
      index += 2;
    }
    return Buffer.from(bytes).toString("utf8");
  };

  return String(output || "")
    .split(/\r?\n/)
    .map((line) => line.trimEnd())
    .filter(Boolean)
    .map((line) => ({
      code: line.slice(0, 2),
      path: decodeQuotedPath(line.slice(3).trim()),
    }));
}
677
+
678
/**
 * Decide whether a repository-relative path must be kept out of the
 * benchmark patch.
 *
 * Excluded are: empty paths, anything under `.wave/` or `.tmp/`, files that
 * were seeded into the workspace, the Wave documentation and skills
 * directories, and `wave.config.json`.
 *
 * @param {string} relPath - Repository-relative path (platform separators
 *   are accepted).
 * @param {Set<string>} seededFiles - Seeded paths using "/" separators.
 * @returns {boolean} True when the path should be excluded.
 */
function shouldExcludeFromBenchmarkPatch(relPath, seededFiles) {
  const posixPath = relPath.replaceAll(path.sep, "/");
  if (posixPath === "") {
    return true;
  }
  const isRuntimeState = [".wave/", ".tmp/"].some((prefix) => posixPath.startsWith(prefix));
  if (isRuntimeState) {
    return true;
  }
  if (seededFiles.has(posixPath)) {
    return true;
  }
  if (posixPath === "wave.config.json") {
    return true;
  }
  const scaffoldPrefixes = [
    "docs/agents/",
    "docs/context7/",
    "docs/evals/",
    "docs/guides/",
    "docs/plans/",
    "docs/reference/",
    "docs/research/",
    "skills/",
  ];
  for (const prefix of scaffoldPrefixes) {
    if (posixPath.startsWith(prefix)) {
      return true;
    }
  }
  return false;
}
700
+
701
/**
 * Build the git pathspec exclusions applied when extracting the benchmark
 * patch.
 *
 * The list contains, in order and without duplicates: one `:(exclude)` entry
 * per seeded file, the `wave.config.json` exclusion, and `:(glob,exclude)`
 * entries for the Wave runtime, documentation and skills directories.
 *
 * @param {Set<string>} seededFiles - Seeded repository-relative paths.
 * @returns {string[]} Pathspec strings (not shell-quoted).
 */
function buildDiffPathspecs(seededFiles) {
  const scaffoldGlobs = [
    ".wave/**",
    ".tmp/**",
    "docs/agents/**",
    "docs/context7/**",
    "docs/evals/**",
    "docs/guides/**",
    "docs/plans/**",
    "docs/reference/**",
    "docs/research/**",
    "skills/**",
  ];
  // A Set keeps first-seen order while dropping repeated pathspecs.
  const pathspecs = new Set();
  for (const seededPath of Array.from(seededFiles)) {
    pathspecs.add(`:(exclude)${seededPath}`);
  }
  pathspecs.add(":(exclude)wave.config.json");
  for (const pattern of scaffoldGlobs) {
    pathspecs.add(`:(glob,exclude)${pattern}`);
  }
  return [...pathspecs];
}
717
+
718
/**
 * Produce the benchmark patch for a workspace as a binary-safe diff against
 * HEAD, leaving out Wave scaffolding and seeded files.
 *
 * Untracked files that belong in the patch are first marked intent-to-add
 * (`git add -N`) so that `git diff` includes them.
 *
 * @param {string} repoDir - Workspace repository root.
 * @param {Set<string>} [seededFiles] - Seeded paths to exclude.
 * @returns {string} The diff text (stdout of `git diff`).
 * @throws {Error} When `git status`, `git add -N` or `git diff` fails.
 */
function buildPatch(repoDir, seededFiles = new Set()) {
  const runGit = (gitCommand) => runShellCommand(gitCommand, { cwd: repoDir });
  const quoteAll = (items) => items.map((item) => shellQuote(item)).join(" ");

  const status = runGit("git status --porcelain=v1 -uall");
  assertSuccess(status, "git status");

  const newFiles = [];
  for (const entry of parseGitStatusPorcelain(status.stdout)) {
    const isUntracked = entry.code === "??";
    if (isUntracked && !shouldExcludeFromBenchmarkPatch(entry.path, seededFiles)) {
      newFiles.push(entry.path);
    }
  }
  if (newFiles.length > 0) {
    const addIntent = runGit(`git add -N -- ${quoteAll(newFiles)}`);
    assertSuccess(addIntent, "git add -N");
  }

  const excludes = quoteAll(buildDiffPathspecs(seededFiles));
  const diff = runGit(`git diff --binary HEAD -- . ${excludes}`.trim());
  assertSuccess(diff, "git diff");
  return diff.stdout;
}
738
+
739
/**
 * Sums the token usage recorded in the wave-1 launcher logs of a repository.
 *
 * Reads every `wave-1-*.log` file under `.tmp/main-wave-launcher/logs`. When
 * that directory does not exist, an all-zero usage record is returned.
 *
 * @param {string} repoDir - Repository working directory.
 * @returns {object} The merged usage totals from `mergeUsageTotals`, or the zero record.
 */
function parseWaveCodexUsage(repoDir) {
  const logsDir = path.join(repoDir, ".tmp", "main-wave-launcher", "logs");
  if (!fs.existsSync(logsDir)) {
    return { input_tokens: 0, cached_input_tokens: 0, output_tokens: 0 };
  }
  const perLogUsage = [];
  for (const fileName of fs.readdirSync(logsDir)) {
    const isWaveOneLog = fileName.startsWith("wave-1-") && fileName.endsWith(".log");
    if (isWaveOneLog) {
      perLogUsage.push(parseCodexUsageFromLog(path.join(logsDir, fileName)));
    }
  }
  return mergeUsageTotals(perLogUsage);
}
750
+
751
/**
 * Runs the single-agent arm: one Codex invocation against the task repository.
 *
 * Writes the prompt to the workspace logs directory, builds the Codex command,
 * runs it in the repository with the configured wall-clock limit, and reads
 * token usage back from the JSONL log.
 *
 * @param {object} row - Dataset row for the task.
 * @param {object} taskWorkspace - Workspace paths; `logsDir` and `repoDir` are read.
 * @param {object} options - Run options; `modelId`, `reasoningEffort` and `maxWallClockMinutes` are read.
 * @returns {{ execution: object, tokenUsage: object, tracePath: null, summaryPath: string }}
 */
function buildSingleAgentSolve(row, taskWorkspace, options) {
  const { logsDir } = taskWorkspace;
  ensureDirectory(logsDir);

  const promptFile = path.join(logsDir, "single-agent-prompt.txt");
  const codexLogFile = path.join(logsDir, "single-agent-codex.jsonl");
  writeTextAtomic(promptFile, `${buildSingleAgentPrompt(row)}\n`);

  const codexSettings = {
    model: options.modelId,
    config: [`model_reasoning_effort=${options.reasoningEffort}`],
    search: false,
    json: true,
    ephemeral: false,
  };
  const codexCommand = buildCodexExecInvocation(
    promptFile,
    codexLogFile,
    "danger-full-access",
    "codex",
    codexSettings,
  );

  const execution = runShellCommand(codexCommand, {
    cwd: taskWorkspace.repoDir,
    timeoutMs: options.maxWallClockMinutes * 60 * 1000,
  });
  const tokenUsage = parseCodexUsageFromLog(codexLogFile);
  const summaryPath = path.relative(REPO_ROOT, codexLogFile).replaceAll(path.sep, "/");

  return { execution, tokenUsage, tracePath: null, summaryPath };
}
774
+
775
/**
 * Runs the full-wave arm: initialise the wave tooling in the task repository,
 * write the benchmark scaffold, then launch wave 1.
 *
 * `init` and `doctor` must succeed. The launch result is returned as-is
 * without a success check, so callers can inspect a failed or timed-out launch.
 *
 * @param {object} row - Dataset row for the task.
 * @param {object} taskWorkspace - Workspace paths; `repoDir` is read.
 * @param {object} options - Run options; `modelId`, `reasoningEffort`, `maxWallClockMinutes` and `maxTurns` are read.
 * @returns {{ execution: object, tokenUsage: object, seededFiles: Set<string>, tracePath: (string|null), summaryPath: (string|null) }}
 * @throws Whatever `assertSuccess` raises for init/doctor, or a `SyntaxError` when init stdout is not JSON.
 */
function buildFullWaveSolve(row, taskWorkspace, options) {
  const { repoDir } = taskWorkspace;
  const setupTimeoutMs = 2 * 60 * 1000;
  const runWave = (waveArgs, timeoutMs) =>
    runShellCommand(`node ${shellQuote(WAVE_ENTRY)} ${waveArgs}`, { cwd: repoDir, timeoutMs });
  const relativeIfPresent = (absolutePath) =>
    fs.existsSync(absolutePath) ? path.relative(REPO_ROOT, absolutePath).replaceAll(path.sep, "/") : null;

  const initResult = runWave("init --json", setupTimeoutMs);
  assertSuccess(initResult, "wave init");
  const seededList = JSON.parse(initResult.stdout).seededFiles || [];
  // Normalise to forward slashes so the set matches git-style relative paths.
  const seededFiles = new Set(seededList.map((filePath) => String(filePath).replaceAll("\\", "/")));

  normalizeBenchmarkWaveConfig(repoDir);
  removeSeededStarterWave(repoDir);
  writeFullWaveScaffold(
    row,
    repoDir,
    options.modelId,
    options.reasoningEffort,
    options.maxWallClockMinutes,
    options.maxTurns,
  );

  const doctorResult = runWave("doctor --json", setupTimeoutMs);
  assertSuccess(doctorResult, "wave doctor");

  const execution = runWave(
    "launch --lane main --start-wave 1 --end-wave 1 --no-dashboard --terminal-surface tmux",
    options.maxWallClockMinutes * 60 * 1000,
  );

  const summaryFile = path.join(repoDir, ".tmp", "main-wave-launcher", "integration", "wave-1.md");
  const traceDir = path.join(repoDir, "traces", "wave-1");

  return {
    execution,
    tokenUsage: parseWaveCodexUsage(repoDir),
    seededFiles,
    tracePath: relativeIfPresent(traceDir),
    summaryPath: relativeIfPresent(summaryFile),
  };
}
823
+
824
/**
 * Runs the official SWE-bench Pro evaluator on a single patch and summarises the outcome.
 *
 * Writes the raw sample, the patch file and the exact command line into
 * `taskWorkspace.evalDir`, runs the evaluator from `options.sweBenchRoot`,
 * and stores its stdout/stderr next to them. Every failure mode is reported
 * as a structured result rather than an exception: evaluator timeout,
 * non-zero exit, and a missing or unparsable `eval_results.json`.
 *
 * @param {object} row - Dataset row; `instance_id` identifies the task in the results file.
 * @param {string} patch - Patch text to evaluate.
 * @param {object} taskWorkspace - Workspace paths; `evalDir` is read.
 * @param {object} options - Run options; `pythonBin` and `sweBenchRoot` are read.
 * @param {string} arm - Arm name, recorded as the patch `prefix`.
 * @returns {object} Result with `success`, `artifactPath`, the three verification paths, `detail`,
 *   and `reviewCategory` on the failure paths.
 */
function evaluatePatch(row, patch, taskWorkspace, options, arm) {
  const toRepoRelative = (absolutePath) =>
    path.relative(REPO_ROOT, absolutePath).replaceAll(path.sep, "/");

  ensureDirectory(taskWorkspace.evalDir);
  const rawSamplePath = path.join(taskWorkspace.evalDir, "raw-sample.jsonl");
  const patchPath = path.join(taskWorkspace.evalDir, "patches.json");
  const outputDir = path.join(taskWorkspace.evalDir, "output");
  const stdoutPath = path.join(taskWorkspace.evalDir, "official-eval.stdout.log");
  const stderrPath = path.join(taskWorkspace.evalDir, "official-eval.stderr.log");
  const commandPath = path.join(taskWorkspace.evalDir, "official-eval.command.txt");
  ensureDirectory(outputDir);

  const rawRow = normalizeRawSampleRow(row);
  fs.writeFileSync(rawSamplePath, `${JSON.stringify(rawRow)}\n`, "utf8");
  fs.writeFileSync(
    patchPath,
    `${JSON.stringify([{ instance_id: row.instance_id, patch, prefix: arm }], null, 2)}\n`,
    "utf8",
  );

  const evalCommand = [
    shellQuote(options.pythonBin),
    shellQuote(path.join(options.sweBenchRoot, "swe_bench_pro_eval.py")),
    `--raw_sample_path=${shellQuote(rawSamplePath)}`,
    `--patch_path=${shellQuote(patchPath)}`,
    `--output_dir=${shellQuote(outputDir)}`,
    `--scripts_dir=${shellQuote(path.join(options.sweBenchRoot, "run_scripts"))}`,
    "--num_workers=1",
    "--dockerhub_username=jefzda",
    "--use_local_docker",
  ].join(" ");
  // Persist the command so a failed evaluation can be re-run by hand.
  fs.writeFileSync(commandPath, `${evalCommand}\n`, "utf8");

  const result = runShellCommand(evalCommand, {
    cwd: options.sweBenchRoot,
    timeoutMs: 60 * 60 * 1000,
  });
  // stdout/stderr may be absent; normalise once and use the same value everywhere.
  const stdoutText = result.stdout || "";
  const stderrText = result.stderr || "";
  fs.writeFileSync(stdoutPath, stdoutText, "utf8");
  fs.writeFileSync(stderrPath, stderrText, "utf8");

  const evalResultsPath = path.join(outputDir, "eval_results.json");
  const verificationPaths = {
    verificationStdoutPath: toRepoRelative(stdoutPath),
    verificationStderrPath: toRepoRelative(stderrPath),
    verificationOutputDir: toRepoRelative(outputDir),
  };
  const resultsArtifactIfPresent = () =>
    fs.existsSync(evalResultsPath) ? toRepoRelative(evalResultsPath) : null;

  if (result.error?.code === "ETIMEDOUT") {
    return {
      success: false,
      artifactPath: null,
      ...verificationPaths,
      reviewCategory: "timeout",
      detail: `official SWE-bench Pro evaluation timed out after ${result.wallClockMs}ms`,
    };
  }

  if (result.exitCode !== 0) {
    const detail = cleanText(result.stderr || result.stdout) || "no output";
    return {
      success: false,
      artifactPath: resultsArtifactIfPresent(),
      ...verificationPaths,
      reviewCategory: isVerifierImageFailureDetail(detail.toLowerCase()) ? "verifier-image" : "setup-harness",
      detail: `official SWE-bench Pro evaluation failed (${result.exitCode}): ${detail}`,
    };
  }

  let evalResults;
  try {
    evalResults = JSON.parse(fs.readFileSync(evalResultsPath, "utf8"));
  } catch (error) {
    // The evaluator exited cleanly but left no usable results file: report it
    // as a harness failure instead of aborting the whole run.
    const reason = error instanceof Error ? error.message : String(error);
    return {
      success: false,
      artifactPath: resultsArtifactIfPresent(),
      ...verificationPaths,
      reviewCategory: "setup-harness",
      detail: `official SWE-bench Pro evaluation produced no readable eval_results.json: ${reason}`,
    };
  }

  const success = Boolean(evalResults?.[row.instance_id]);
  const lastStdoutLine = stdoutText.split(/\r?\n/).filter(Boolean).slice(-1)[0];
  return {
    success,
    artifactPath: toRepoRelative(evalResultsPath),
    ...verificationPaths,
    detail: cleanText(lastStdoutLine) || "evaluation completed",
  };
}
895
+
896
/**
 * Assigns a review category to a finished benchmark run.
 *
 * A category already set on the evaluation wins, then a successful evaluation
 * is "solved". Otherwise the lower-cased evaluation detail and the solve
 * execution are checked against an ordered list of rules; the first match is
 * returned, and "incorrect-patch" is the fallback.
 *
 * @param {{ solve: object, evaluation: object }} run - `solve.execution` and `evaluation` are read.
 * @returns {string} The review category.
 */
function classifyReviewCategory({ solve, evaluation }) {
  if (evaluation.reviewCategory) {
    return evaluation.reviewCategory;
  }
  if (evaluation.success) {
    return "solved";
  }

  const detail = cleanText(evaluation.detail).toLowerCase();
  const mentions = (...needles) => needles.some((needle) => detail.includes(needle));

  // Order matters: the first rule whose predicate holds decides the category.
  const rules = [
    ["dry-run-plan", () => mentions("dry-run plan only", "planning only")],
    ["timeout", () => solve.execution.error?.code === "ETIMEDOUT" || mentions("timed out", "timeout")],
    ["verifier-image", () => isVerifierImageFailureDetail(detail)],
    ["blocked-proof", () => mentions("needs-more-work", "proof gap", "blocked")],
    ["setup-harness", () => isSetupHarnessFailureDetail(detail)],
    ["setup-harness", () => solve.execution.exitCode !== 0],
  ];
  for (const [category, applies] of rules) {
    if (applies()) {
      return category;
    }
  }
  return "incorrect-patch";
}
924
+
925
/**
 * Writes the patch and the result record for one arm and returns the record.
 *
 * The patch is saved as `<arm>.patch.diff` and the record as
 * `<arm>.result.json`, both in `taskWorkspace.artifactsDir`.
 *
 * @param {object} params
 * @param {object} params.row - Dataset row; `instance_id` and `repo` are recorded.
 * @param {string} params.arm - Arm name, used in the artifact file names.
 * @param {object} params.solve - Solve result; `execution.wallClockMs`, `tokenUsage`, `tracePath`, `summaryPath` are read.
 * @param {object} params.evaluation - Evaluation result from `evaluatePatch`.
 * @param {string} params.patch - Patch text to persist.
 * @param {object} params.taskWorkspace - Workspace paths; `artifactsDir` is read.
 * @returns {object} The payload that was written to the result file.
 */
function buildResultPayload({
  row,
  arm,
  solve,
  evaluation,
  patch,
  taskWorkspace,
}) {
  const { artifactsDir } = taskWorkspace;
  const patchFile = path.join(artifactsDir, `${arm}.patch.diff`);
  const resultFile = path.join(artifactsDir, `${arm}.result.json`);

  ensureDirectory(artifactsDir);
  fs.writeFileSync(patchFile, patch, "utf8");

  const { tokenUsage, tracePath, summaryPath } = solve;
  const {
    success,
    artifactPath,
    verificationStdoutPath,
    verificationStderrPath,
    verificationOutputDir,
    detail,
  } = evaluation;

  const payload = {
    generatedAt: toIsoTimestamp(),
    instanceId: row.instance_id,
    repo: row.repo,
    arm,
    success,
    wallClockMs: solve.execution.wallClockMs,
    totalCostUsd: null,
    tokenUsage,
    tracePath,
    summaryPath,
    artifactPath,
    patchPath: path.relative(REPO_ROOT, patchFile).replaceAll(path.sep, "/"),
    verificationStdoutPath,
    verificationStderrPath,
    verificationOutputDir,
    reviewCategory: classifyReviewCategory({ solve, evaluation }),
    detail,
  };
  writeJsonAtomic(resultFile, payload);
  return payload;
}
959
+
960
/**
 * Command-line entry point: solves one benchmark instance with the selected
 * arm, evaluates the resulting patch, and prints the result payload as JSON.
 *
 * @throws {Error} When the command is not `run`, a required option is missing,
 *   or the Python runtime / SWE-bench Pro checkout path does not exist.
 */
function main() {
  const requireThat = (condition, message) => {
    if (!condition) {
      throw new Error(message);
    }
  };

  const options = parseArgs(process.argv.slice(2));
  requireThat(options.command === "run", `Unsupported command: ${options.command}`);
  const arm = normalizeArm(options.arm);
  requireThat(options.instanceId, "--instance is required");
  requireThat(options.modelId, "--model is required");
  requireThat(fs.existsSync(options.pythonBin), `Python runtime not found: ${options.pythonBin}`);
  requireThat(fs.existsSync(options.sweBenchRoot), `SWE-bench Pro repo not found: ${options.sweBenchRoot}`);

  const row = loadDatasetRow(options.instanceId, options.pythonBin);
  const taskWorkspace = prepareTaskWorkspace(row, arm, options.outputRoot);
  cloneRepo(row, taskWorkspace.repoDir);

  let solve;
  if (arm === "single-agent") {
    solve = buildSingleAgentSolve(row, taskWorkspace, options);
  } else {
    solve = buildFullWaveSolve(row, taskWorkspace, options);
  }

  // Only the full-wave arm reports seeded files; default to none otherwise.
  const patch = buildPatch(taskWorkspace.repoDir, solve.seededFiles || new Set());
  const evaluation = evaluatePatch(row, patch, taskWorkspace, options, arm);
  const payload = buildResultPayload({ row, arm, solve, evaluation, patch, taskWorkspace });
  console.log(JSON.stringify(payload));
}
997
+
998
// Script entry: run main() and turn any thrown value into a one-line error plus exit code 1.
try {
  main();
} catch (failure) {
  console.error(failure instanceof Error ? failure.message : String(failure));
  process.exit(1);
}