@chllming/wave-orchestration 0.6.3 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118) hide show
  1. package/CHANGELOG.md +82 -1
  2. package/README.md +40 -7
  3. package/docs/agents/wave-orchestrator-role.md +50 -0
  4. package/docs/agents/wave-planner-role.md +39 -0
  5. package/docs/context7/bundles.json +9 -0
  6. package/docs/context7/planner-agent/README.md +25 -0
  7. package/docs/context7/planner-agent/manifest.json +83 -0
  8. package/docs/context7/planner-agent/papers/cooperbench-why-coding-agents-cannot-be-your-teammates-yet.md +3283 -0
  9. package/docs/context7/planner-agent/papers/dova-deliberation-first-multi-agent-orchestration-for-autonomous-research-automation.md +1699 -0
  10. package/docs/context7/planner-agent/papers/dpbench-large-language-models-struggle-with-simultaneous-coordination.md +2251 -0
  11. package/docs/context7/planner-agent/papers/incremental-planning-to-control-a-blackboard-based-problem-solver.md +1729 -0
  12. package/docs/context7/planner-agent/papers/silo-bench-a-scalable-environment-for-evaluating-distributed-coordination-in-multi-agent-llm-systems.md +3747 -0
  13. package/docs/context7/planner-agent/papers/todoevolve-learning-to-architect-agent-planning-systems.md +1675 -0
  14. package/docs/context7/planner-agent/papers/verified-multi-agent-orchestration-a-plan-execute-verify-replan-framework-for-complex-query-resolution.md +1173 -0
  15. package/docs/context7/planner-agent/papers/why-do-multi-agent-llm-systems-fail.md +5211 -0
  16. package/docs/context7/planner-agent/topics/planning-and-orchestration.md +24 -0
  17. package/docs/evals/README.md +96 -1
  18. package/docs/evals/arm-templates/README.md +13 -0
  19. package/docs/evals/arm-templates/full-wave.json +15 -0
  20. package/docs/evals/arm-templates/single-agent.json +15 -0
  21. package/docs/evals/benchmark-catalog.json +7 -0
  22. package/docs/evals/cases/README.md +47 -0
  23. package/docs/evals/cases/wave-blackboard-inbox-targeting.json +73 -0
  24. package/docs/evals/cases/wave-contradiction-conflict.json +104 -0
  25. package/docs/evals/cases/wave-expert-routing-preservation.json +69 -0
  26. package/docs/evals/cases/wave-hidden-profile-private-evidence.json +81 -0
  27. package/docs/evals/cases/wave-premature-closure-guard.json +71 -0
  28. package/docs/evals/cases/wave-silo-cross-agent-state.json +77 -0
  29. package/docs/evals/cases/wave-simultaneous-lockstep.json +92 -0
  30. package/docs/evals/cooperbench/real-world-mitigation.md +341 -0
  31. package/docs/evals/external-benchmarks.json +85 -0
  32. package/docs/evals/external-command-config.sample.json +9 -0
  33. package/docs/evals/external-command-config.swe-bench-pro.json +8 -0
  34. package/docs/evals/pilots/README.md +47 -0
  35. package/docs/evals/pilots/swe-bench-pro-public-full-wave-review-10.json +64 -0
  36. package/docs/evals/pilots/swe-bench-pro-public-pilot.json +111 -0
  37. package/docs/evals/wave-benchmark-program.md +302 -0
  38. package/docs/guides/planner.md +67 -11
  39. package/docs/guides/terminal-surfaces.md +12 -0
  40. package/docs/plans/context7-wave-orchestrator.md +20 -0
  41. package/docs/plans/current-state.md +8 -1
  42. package/docs/plans/examples/wave-benchmark-improvement.md +108 -0
  43. package/docs/plans/examples/wave-example-live-proof.md +1 -1
  44. package/docs/plans/examples/wave-example-rollout-fidelity.md +340 -0
  45. package/docs/plans/migration.md +26 -0
  46. package/docs/plans/wave-orchestrator.md +60 -12
  47. package/docs/plans/waves/reviews/wave-1-benchmark-operator.md +118 -0
  48. package/docs/reference/cli-reference.md +547 -0
  49. package/docs/reference/coordination-and-closure.md +436 -0
  50. package/docs/reference/live-proof-waves.md +25 -3
  51. package/docs/reference/npmjs-trusted-publishing.md +3 -3
  52. package/docs/reference/proof-metrics.md +90 -0
  53. package/docs/reference/runtime-config/README.md +63 -2
  54. package/docs/reference/runtime-config/codex.md +2 -1
  55. package/docs/reference/sample-waves.md +29 -18
  56. package/docs/reference/wave-control.md +164 -0
  57. package/docs/reference/wave-planning-lessons.md +131 -0
  58. package/package.json +5 -4
  59. package/releases/manifest.json +40 -0
  60. package/scripts/research/agent-context-archive.mjs +18 -0
  61. package/scripts/research/manifests/agent-context-expanded-2026-03-22.mjs +17 -0
  62. package/scripts/research/sync-planner-context7-bundle.mjs +133 -0
  63. package/scripts/wave-orchestrator/agent-state.mjs +11 -2
  64. package/scripts/wave-orchestrator/artifact-schemas.mjs +232 -0
  65. package/scripts/wave-orchestrator/autonomous.mjs +7 -0
  66. package/scripts/wave-orchestrator/benchmark-cases.mjs +374 -0
  67. package/scripts/wave-orchestrator/benchmark-external.mjs +1384 -0
  68. package/scripts/wave-orchestrator/benchmark.mjs +972 -0
  69. package/scripts/wave-orchestrator/clarification-triage.mjs +78 -12
  70. package/scripts/wave-orchestrator/config.mjs +175 -0
  71. package/scripts/wave-orchestrator/control-cli.mjs +1216 -0
  72. package/scripts/wave-orchestrator/control-plane.mjs +697 -0
  73. package/scripts/wave-orchestrator/coord-cli.mjs +360 -2
  74. package/scripts/wave-orchestrator/coordination-store.mjs +211 -9
  75. package/scripts/wave-orchestrator/coordination.mjs +84 -0
  76. package/scripts/wave-orchestrator/dashboard-renderer.mjs +120 -5
  77. package/scripts/wave-orchestrator/dashboard-state.mjs +22 -0
  78. package/scripts/wave-orchestrator/evals.mjs +23 -0
  79. package/scripts/wave-orchestrator/executors.mjs +3 -2
  80. package/scripts/wave-orchestrator/feedback.mjs +55 -0
  81. package/scripts/wave-orchestrator/install.mjs +151 -2
  82. package/scripts/wave-orchestrator/launcher-closure.mjs +4 -1
  83. package/scripts/wave-orchestrator/launcher-runtime.mjs +33 -30
  84. package/scripts/wave-orchestrator/launcher.mjs +884 -36
  85. package/scripts/wave-orchestrator/planner-context.mjs +75 -0
  86. package/scripts/wave-orchestrator/planner.mjs +2270 -136
  87. package/scripts/wave-orchestrator/proof-cli.mjs +195 -0
  88. package/scripts/wave-orchestrator/proof-registry.mjs +317 -0
  89. package/scripts/wave-orchestrator/replay.mjs +10 -4
  90. package/scripts/wave-orchestrator/retry-cli.mjs +184 -0
  91. package/scripts/wave-orchestrator/retry-control.mjs +225 -0
  92. package/scripts/wave-orchestrator/shared.mjs +26 -0
  93. package/scripts/wave-orchestrator/swe-bench-pro-task.mjs +1004 -0
  94. package/scripts/wave-orchestrator/terminals.mjs +1 -1
  95. package/scripts/wave-orchestrator/traces.mjs +157 -2
  96. package/scripts/wave-orchestrator/wave-control-client.mjs +532 -0
  97. package/scripts/wave-orchestrator/wave-control-schema.mjs +309 -0
  98. package/scripts/wave-orchestrator/wave-files.mjs +144 -23
  99. package/scripts/wave.mjs +27 -0
  100. package/skills/repo-coding-rules/SKILL.md +1 -0
  101. package/skills/role-cont-eval/SKILL.md +1 -0
  102. package/skills/role-cont-qa/SKILL.md +13 -6
  103. package/skills/role-deploy/SKILL.md +1 -0
  104. package/skills/role-documentation/SKILL.md +4 -0
  105. package/skills/role-implementation/SKILL.md +4 -0
  106. package/skills/role-infra/SKILL.md +2 -1
  107. package/skills/role-integration/SKILL.md +15 -8
  108. package/skills/role-planner/SKILL.md +39 -0
  109. package/skills/role-planner/skill.json +21 -0
  110. package/skills/role-research/SKILL.md +1 -0
  111. package/skills/role-security/SKILL.md +2 -2
  112. package/skills/runtime-claude/SKILL.md +2 -1
  113. package/skills/runtime-codex/SKILL.md +1 -0
  114. package/skills/runtime-local/SKILL.md +2 -0
  115. package/skills/runtime-opencode/SKILL.md +1 -0
  116. package/skills/wave-core/SKILL.md +25 -6
  117. package/skills/wave-core/references/marker-syntax.md +16 -8
  118. package/wave.config.json +45 -0
@@ -0,0 +1,1384 @@
1
+ import fs from "node:fs";
2
+ import path from "node:path";
3
+ import { spawnSync } from "node:child_process";
4
+ import {
5
+ DEFAULT_WAVE_LANE,
6
+ REPO_ROOT,
7
+ buildLanePaths,
8
+ ensureDirectory,
9
+ readJsonOrNull,
10
+ toIsoTimestamp,
11
+ writeJsonAtomic,
12
+ writeTextAtomic,
13
+ } from "./shared.mjs";
14
+ import { loadExternalBenchmarkAdapters } from "./benchmark-cases.mjs";
15
+ import {
16
+ buildWaveControlArtifactFromPath,
17
+ flushWaveControlQueue,
18
+ safeQueueWaveControlEvent,
19
+ } from "./wave-control-client.mjs";
20
+ import { buildWaveControlConfigAttestationHash } from "./wave-control-schema.mjs";
21
+
22
+ const DEFAULT_EXTERNAL_PILOTS_DIR = "docs/evals/pilots";
23
+ const DEFAULT_EXTERNAL_ARM_TEMPLATES_DIR = "docs/evals/arm-templates";
24
+ const EXTERNAL_BENCHMARK_ARMS = ["single-agent", "full-wave"];
25
+
26
/**
 * Coerces any value to a trimmed string.
 *
 * @param {unknown} value - Value to stringify; `null`/`undefined` become "".
 * @returns {string} The trimmed string form of `value`.
 */
function cleanText(value) {
  const text = value ?? "";
  return String(text).trim();
}
29
+
30
/**
 * Reports whether `detail` contains at least one of the given substrings.
 *
 * @param {string} detail - Text to search.
 * @param {string[]} patterns - Literal substrings to look for.
 * @returns {boolean} True as soon as one pattern is found, otherwise false.
 */
function matchesFailurePattern(detail, patterns) {
  for (const needle of patterns) {
    if (detail.includes(needle)) {
      return true;
    }
  }
  return false;
}
33
+
34
/**
 * Reports whether a failure detail contains one of the verifier-image
 * failure markers listed below (image pull / manifest / registry strings).
 *
 * Matching is a plain case-sensitive substring test.
 *
 * @param {string} detail - Failure detail text to classify.
 * @returns {boolean} True when any marker occurs in `detail`.
 */
function isVerifierImageFailureDetail(detail) {
  const imageFailureMarkers = [
    "failed to pull",
    "manifest unknown",
    "no matching manifest",
    "pull access denied",
    "jefzda/sweap-images",
    "docker image",
    "dockerhub_username",
  ];
  return matchesFailurePattern(detail, imageFailureMarkers);
}
45
+
46
/**
 * Reports whether a failure detail contains one of the setup/harness
 * failure markers listed below.
 *
 * Matching is a plain case-sensitive substring test. Several markers are
 * substrings of others (e.g. "bootstrap", "workspace"); the list is kept
 * as-is so the matched vocabulary stays explicit.
 *
 * @param {string} detail - Failure detail text to classify.
 * @returns {boolean} True when any marker occurs in `detail`.
 */
function isSetupHarnessFailureDetail(detail) {
  const setupFailureMarkers = [
    "wave init failed",
    "wave doctor failed",
    "wave launch failed",
    "git diff failed",
    "git add -n failed",
    "patch extraction failed",
    "repository preparation failed",
    "repo already contained wave bootstrap files",
    "already contained wave bootstrap files",
    "could not parse object",
    "fatal: could not parse object",
    "bootstrap",
    "harness",
    "workspace",
    "task workspace",
    "setup failed",
  ];
  return matchesFailurePattern(detail, setupFailureMarkers);
}
66
+
67
/**
 * Maps a review category to its review disposition.
 *
 * The category is stringified and trimmed before lookup; anything that is
 * not a known category maps to "unknown".
 *
 * @param {unknown} category - Review category label.
 * @returns {string} The disposition for the category, or "unknown".
 */
function reviewDispositionForCategory(category) {
  // A Map (rather than a plain object) keeps prototype keys such as
  // "constructor" from ever matching.
  const dispositions = new Map([
    ["solved", "solved"],
    ["dry-run-plan", "dry-run"],
    ["verifier-image", "invalidated"],
    ["setup-harness", "setup-failure"],
    ["harness-env", "setup-failure"],
    ["incorrect-patch", "scored-failure"],
    ["timeout", "timeout"],
    ["blocked-proof", "blocked-proof"],
  ]);
  const disposition = dispositions.get(cleanText(category));
  return disposition === undefined ? "unknown" : disposition;
}
88
+
89
/**
 * Turns a `{ key: count }` object into `[key, count]` pairs ordered by
 * descending count, with ties broken by `localeCompare` on the key.
 *
 * @param {Record<string, number>|null|undefined} counts - Count table; nullish is treated as empty.
 * @returns {Array<[string, number]>} Sorted entries.
 */
function sortCountEntries(counts) {
  const entries = Object.entries(counts || {});
  entries.sort(([leftKey, leftCount], [rightKey, rightCount]) => {
    const byCount = rightCount - leftCount;
    return rightCount !== leftCount ? byCount : leftKey.localeCompare(rightKey);
  });
  return entries;
}
97
+
98
/**
 * Renders a count table as space-separated `key=value` pairs, in the order
 * produced by `sortCountEntries`.
 *
 * @param {Record<string, number>|null|undefined} counts - Count table.
 * @returns {string} Summary such as "timeout=3 solved=1"; "" for an empty table.
 */
function formatCountSummary(counts) {
  const parts = [];
  for (const [key, value] of sortCountEntries(counts)) {
    parts.push(`${key}=${value}`);
  }
  return parts.join(" ");
}
103
+
104
/**
 * Makes a value safe to place inside a Markdown table cell: line breaks
 * (LF or CRLF) become single spaces and `|` is backslash-escaped.
 *
 * @param {unknown} value - Cell content; `null`/`undefined` become "".
 * @returns {string} Single-line, pipe-escaped text.
 */
function escapeMarkdownCell(value) {
  const text = String(value ?? "");
  const singleLine = text.replace(/\r?\n/g, " ");
  return singleLine.replaceAll("|", "\\|");
}
109
+
110
/**
 * Resolves the lane paths for the default wave lane.
 *
 * Any error thrown by `buildLanePaths` is deliberately absorbed and
 * reported as `null`, so the caller can treat "no lane paths" as a normal
 * outcome.
 *
 * @returns {object|null} Result of `buildLanePaths(DEFAULT_WAVE_LANE)`, or null if it threw.
 */
function benchmarkTelemetryLanePaths() {
  let lanePaths = null;
  try {
    lanePaths = buildLanePaths(DEFAULT_WAVE_LANE);
  } catch {
    lanePaths = null;
  }
  return lanePaths;
}
117
+
118
/**
 * Builds the benchmark run identifier
 * `bench-<adapterId>-<manifestId>-<timestamp>`.
 *
 * The timestamp is `output.generatedAt` (or `toIsoTimestamp()` when that is
 * falsy) with the characters `-`, `:`, `.`, `T`, `Z` removed and truncated
 * to 14 characters.
 *
 * @param {{adapter: {id: string}, manifest: {id: string}, generatedAt?: string}} output - Benchmark run output.
 * @returns {string} The run identifier.
 */
function benchmarkRunId(output) {
  const adapterId = output.adapter.id;
  const manifestId = output.manifest.id;
  const timestamp = String(output.generatedAt || toIsoTimestamp());
  const compactTimestamp = timestamp.replace(/[-:.TZ]/g, "").slice(0, 14);
  return `bench-${adapterId}-${manifestId}-${compactTimestamp}`;
}
121
+
122
/**
 * Derives the review-validity label for one task result.
 *
 * A successful result in a comparison-ready run is "comparison-valid".
 * Otherwise the label follows `result.reviewDisposition`, falling back to
 * "review-only" for any disposition not listed below.
 *
 * @param {{success: boolean, reviewDisposition?: string}} result - Task result.
 * @param {{comparisonReady: boolean}} output - Benchmark run output.
 * @returns {string} The review-validity label.
 */
function reviewValidityForResult(result, output) {
  if (result.success && output.comparisonReady) {
    return "comparison-valid";
  }
  switch (result.reviewDisposition) {
    case "invalidated":
      return "benchmark-invalid";
    case "setup-failure":
      return "harness-setup-failure";
    case "blocked-proof":
      return "proof-blocked";
    case "scored-failure":
      return "trustworthy-model-failure";
    default:
      return "review-only";
  }
}
140
+
141
/**
 * Collects the wave-control artifact descriptors for one task result.
 *
 * For each of the result's path fields that is truthy, the path is resolved
 * against `REPO_ROOT`, described via `buildWaveControlArtifactFromPath`
 * (upload policy "selected"), and tagged with its absolute `sourcePath`.
 * Output order is always: patch, summary, verification stdout, verification
 * stderr.
 *
 * @param {object} result - Task result carrying optional `patchPath`,
 *   `summaryPath`, `verificationStdoutPath`, `verificationStderrPath`.
 * @returns {object[]} Artifact descriptors for the fields that were present.
 */
function externalTaskArtifacts(result) {
  // [result field, artifact kind], in emission order.
  const artifactSpecs = [
    ["patchPath", "benchmark-patch-manifest"],
    ["summaryPath", "benchmark-task-summary"],
    ["verificationStdoutPath", "verification-stdout"],
    ["verificationStderrPath", "verification-stderr"],
  ];
  const artifacts = [];
  for (const [field, kind] of artifactSpecs) {
    const relativePath = result[field];
    if (!relativePath) {
      continue;
    }
    const sourcePath = path.resolve(REPO_ROOT, relativePath);
    artifacts.push({
      ...buildWaveControlArtifactFromPath(sourcePath, { kind, uploadPolicy: "selected" }),
      sourcePath,
    });
  }
  return artifacts;
}
181
+
182
/**
 * Queues wave-control telemetry for an external benchmark run and triggers
 * a flush of the queue.
 *
 * One `benchmark_run` event is queued for the run, followed by three events
 * per task result (`benchmark_item`, `verification`, `review`). Nothing is
 * queued when lane paths cannot be resolved or when
 * `lanePaths.waveControl.captureBenchmarkRuns` is explicitly `false`.
 *
 * The flush is fire-and-forget: its result is not awaited, and a rejected
 * flush is absorbed here so that a telemetry failure cannot surface as an
 * unhandled promise rejection in the benchmark process.
 *
 * @param {object} params
 * @param {object} params.output - Benchmark run output (adapter, manifest, runConfig, summary, tasks, ...).
 * @param {string} params.outputDir - Directory containing results.json / results.md / failure-review.*.
 * @param {{summary: unknown}} params.failureReview - Failure review whose summary is attached to the run event.
 * @returns {string|null} The benchmark run id, or null when telemetry capture is skipped.
 */
function publishExternalBenchmarkTelemetry({ output, outputDir, failureReview }) {
  const lanePaths = benchmarkTelemetryLanePaths();
  if (!lanePaths || lanePaths.waveControl?.captureBenchmarkRuns === false) {
    return null;
  }

  // Describes one file in outputDir as a wave-control artifact.
  const outputArtifact = (fileName, kind, uploadPolicy) => {
    const sourcePath = path.join(outputDir, fileName);
    return {
      ...buildWaveControlArtifactFromPath(sourcePath, { kind, uploadPolicy }),
      sourcePath,
    };
  };

  const benchmarkRunIdValue = benchmarkRunId(output);
  const attestation = {
    adapterId: output.adapter.id,
    manifestId: output.manifest.id,
    manifestPath: output.manifest.path,
    selectedArms: output.selectedArms,
    comparisonReady: output.comparisonReady,
    comparisonMode: output.comparisonMode,
    runConfig: output.runConfig,
    summary: {
      tasks: output.summary.tasks,
      solved: output.summary.solved,
      successRate: output.summary.successRate,
    },
  };

  safeQueueWaveControlEvent(lanePaths, {
    category: "benchmark",
    entityType: "benchmark_run",
    entityId: benchmarkRunIdValue,
    action: output.dryRun ? "planned" : "completed",
    source: "benchmark-runner",
    actor: "wave benchmark external-run",
    recordedAt: output.generatedAt,
    identity: {
      runKind: "benchmark",
      benchmarkRunId: benchmarkRunIdValue,
    },
    tags: [output.adapter.id, output.comparisonMode],
    attestation,
    data: {
      adapter: output.adapter,
      manifest: {
        id: output.manifest.id,
        path: output.manifest.path,
        reviewOnly: output.manifest.reviewOnly,
      },
      comparisonReady: output.comparisonReady,
      comparisonMode: output.comparisonMode,
      selectedArms: output.selectedArms,
      summary: output.summary,
      review: failureReview.summary,
      configHash: buildWaveControlConfigAttestationHash(attestation),
    },
    artifacts: [
      outputArtifact("results.json", "benchmark-results", "selected"),
      outputArtifact("results.md", "benchmark-results-markdown", "metadata-only"),
      outputArtifact("failure-review.json", "benchmark-failure-review", "selected"),
      outputArtifact("failure-review.md", "benchmark-failure-review-markdown", "metadata-only"),
    ],
  });

  for (const result of output.tasks || []) {
    const reviewValidity = reviewValidityForResult(result, output);
    const itemId = `${result.taskId}:${result.arm}`;
    const identity = {
      runKind: "benchmark",
      benchmarkRunId: benchmarkRunIdValue,
      benchmarkItemId: itemId,
    };
    const taskArtifacts = externalTaskArtifacts(result);

    safeQueueWaveControlEvent(lanePaths, {
      category: "benchmark",
      entityType: "benchmark_item",
      entityId: itemId,
      action: result.success ? "passed" : "failed",
      source: "benchmark-runner",
      actor: "wave benchmark external-run",
      recordedAt: output.generatedAt,
      identity,
      tags: [output.adapter.id, result.arm, reviewValidity],
      data: {
        benchmarkId: result.benchmarkId,
        benchmarkTitle: result.benchmarkTitle,
        taskId: result.taskId,
        repo: result.repo,
        repoLanguage: result.repoLanguage,
        arm: result.arm,
        modelId: result.modelId,
        executorId: result.executorId,
        success: result.success,
        wallClockMs: result.wallClockMs,
        totalCostUsd: result.totalCostUsd,
        tokenUsage: result.tokenUsage,
        reviewCategory: result.reviewCategory,
        reviewDisposition: result.reviewDisposition,
        reviewValidity,
        detail: result.detail,
        tracePath: result.tracePath || null,
      },
      artifacts: taskArtifacts,
    });

    safeQueueWaveControlEvent(lanePaths, {
      category: "benchmark",
      entityType: "verification",
      entityId: `${itemId}:verification`,
      action: result.success ? "passed" : "failed",
      source: "benchmark-runner",
      actor: output.runConfig.verificationHarness || "benchmark-verifier",
      recordedAt: output.generatedAt,
      identity,
      tags: [output.adapter.id, result.arm, "verification"],
      data: {
        verificationHarness: output.runConfig.verificationHarness || null,
        officialScore: result.success ? 1 : 0,
        reviewCategory: result.reviewCategory,
        reviewDisposition: result.reviewDisposition,
        verificationOutputDir: result.verificationOutputDir || null,
      },
      // Only the verifier's own output streams belong to the verification event.
      artifacts: taskArtifacts.filter((artifact) =>
        ["verification-stdout", "verification-stderr"].includes(artifact.kind),
      ),
    });

    safeQueueWaveControlEvent(lanePaths, {
      category: "benchmark",
      entityType: "review",
      entityId: `${itemId}:review`,
      action: reviewValidity,
      source: "benchmark-runner",
      actor: "wave benchmark external-run",
      recordedAt: output.generatedAt,
      identity,
      tags: [output.adapter.id, result.arm, reviewValidity],
      data: {
        reviewCategory: result.reviewCategory,
        reviewDisposition: result.reviewDisposition,
        reviewValidity,
        comparisonReady: output.comparisonReady,
        comparisonMode: output.comparisonMode,
        detail: result.detail,
      },
    });
  }

  // Fire-and-forget flush. Promise.resolve() tolerates a non-promise return,
  // and the catch keeps a failed upload from becoming an unhandled rejection.
  Promise.resolve(flushWaveControlQueue(lanePaths)).catch(() => {
    // Telemetry delivery is best-effort; whether queued events are retained
    // for a later flush depends on the queue implementation.
  });
  return benchmarkRunIdValue;
}
344
+
345
/**
 * Normalizes a repository-relative path and rejects anything that could
 * point outside the repository.
 *
 * Normalization: trim, convert backslashes to `/`, drop a leading `./`,
 * collapse repeated slashes, drop one trailing slash.
 *
 * Rejected as "must stay within the repository":
 * - absolute paths (leading `/`),
 * - Windows drive-qualified paths (`C:...`), which `path.resolve` treats as
 *   absolute on Windows,
 * - any path containing a `..` segment. This includes a bare `..`, `../`
 *   and a trailing `foo/..`, none of which match a check for `../` at the
 *   start or `/../` in the middle.
 *
 * @param {unknown} value - Path supplied by config or CLI.
 * @param {string} label - Name used in error messages.
 * @returns {string} The normalized, `/`-separated relative path.
 * @throws {Error} When the path is empty or escapes the repository.
 */
function normalizeRepoRelativePath(value, label) {
  const normalized = cleanText(value)
    .replaceAll("\\", "/")
    .replace(/^\.\/+/, "")
    .replace(/\/+/g, "/")
    .replace(/\/$/, "");
  if (!normalized) {
    throw new Error(`${label} is required`);
  }
  const isAbsolute = normalized.startsWith("/") || /^[a-zA-Z]:/.test(normalized);
  // Segment-wise check: names such as "..foo" stay legal, real parent hops do not.
  const hasParentSegment = normalized.split("/").includes("..");
  if (isAbsolute || hasParentSegment) {
    throw new Error(`${label} must stay within the repository`);
  }
  return normalized;
}
359
+
360
/**
 * Normalizes an identifier: trims it, lower-cases it, and requires it to
 * match `/^[a-z0-9][a-z0-9._-]*$/`.
 *
 * @param {unknown} value - Raw identifier.
 * @param {string} label - Name used in the error message.
 * @returns {string} The normalized identifier.
 * @throws {Error} When the normalized value does not match the pattern.
 */
function normalizeId(value, label) {
  const candidate = cleanText(value).toLowerCase();
  const isValid = /^[a-z0-9][a-z0-9._-]*$/.test(candidate);
  if (isValid) {
    return candidate;
  }
  throw new Error(`${label} must match /^[a-z0-9][a-z0-9._-]*$/`);
}
367
+
368
/**
 * Normalizes a requested list of external benchmark arms.
 *
 * A missing, non-array or empty selection yields a copy of all supported
 * arms. Otherwise each entry is normalized with `normalizeId`, validated
 * against `EXTERNAL_BENCHMARK_ARMS`, and de-duplicated in first-seen order.
 *
 * @param {unknown} arms - Requested arm ids.
 * @returns {string[]} Normalized, unique arm ids.
 * @throws {Error} When an entry is not a valid id or not a supported arm.
 */
function normalizeSelectedExternalArms(arms) {
  const hasSelection = Array.isArray(arms) && arms.length > 0;
  if (!hasSelection) {
    return [...EXTERNAL_BENCHMARK_ARMS];
  }
  // A Set keeps insertion order, which gives first-seen de-duplication.
  const unique = new Set();
  for (const requested of arms) {
    const armId = normalizeId(requested, "arm");
    if (EXTERNAL_BENCHMARK_ARMS.indexOf(armId) === -1) {
      throw new Error(
        `Unsupported external benchmark arm: ${requested}. Allowed arms: ${EXTERNAL_BENCHMARK_ARMS.join(", ")}`,
      );
    }
    unique.add(armId);
  }
  return Array.from(unique);
}
386
+
387
/**
 * Normalizes an optional array of text entries.
 *
 * `null`/`undefined` yields an empty array. Each entry is passed through
 * `cleanText` (so non-string entries are stringified) and must be non-empty
 * afterwards.
 *
 * @param {unknown} value - Candidate array.
 * @param {string} label - Name used in error messages.
 * @returns {string[]} Trimmed entries.
 * @throws {Error} When `value` is not an array or an entry is empty.
 */
function normalizeStringArray(value, label) {
  if (value === null || value === undefined) {
    return [];
  }
  if (!Array.isArray(value)) {
    throw new Error(`${label} must be an array`);
  }
  const toCleanEntry = (entry, index) => {
    const text = cleanText(entry);
    if (text === "") {
      throw new Error(`${label}[${index}] must be a non-empty string`);
    }
    return text;
  };
  return value.map(toCleanEntry);
}
402
+
403
/**
 * Reads a JSON file via `readJsonOrNull` and requires the result to be a
 * non-array object.
 *
 * @param {string} filePath - File to read.
 * @param {string} [label="JSON file"] - Description used in the error message.
 * @returns {object} The parsed object.
 * @throws {Error} When the read yields a falsy value, a non-object, or an
 *   array; the message names the file relative to `REPO_ROOT`.
 */
function readJsonFile(filePath, label = "JSON file") {
  const parsed = readJsonOrNull(filePath);
  const isPlainObject = Boolean(parsed) && typeof parsed === "object" && !Array.isArray(parsed);
  if (isPlainObject) {
    return parsed;
  }
  throw new Error(`Invalid ${label}: ${path.relative(REPO_ROOT, filePath)}`);
}
410
+
411
/**
 * Normalizes one task entry from a pilot manifest.
 *
 * `taskId` falls back to `instanceId`; `protocol` falls back to the
 * manifest's protocol; `complexityLevel` falls back to `level`. Blank text
 * fields become `null`. `teamSize` is parsed as a base-10 integer unless it
 * is nullish or "" (no further validation is applied to the parsed value).
 * `metadata` defaults to `{}` and `smoke` to `null` unless they are
 * non-array objects.
 *
 * @param {unknown} rawTask - Task entry as read from the manifest.
 * @param {number} index - Position in the manifest's `tasks` array (for error messages).
 * @param {{path: string, protocol?: unknown}} manifest - Owning manifest context.
 * @returns {object} The normalized task.
 * @throws {Error} When the entry is not an object or has no task id.
 */
function normalizeExternalTask(rawTask, index, manifest) {
  const isPlainObject = (candidate) =>
    Boolean(candidate) && typeof candidate === "object" && !Array.isArray(candidate);
  const textOrNull = (raw) => cleanText(raw) || null;

  if (!isPlainObject(rawTask)) {
    throw new Error(`tasks[${index}] in ${manifest.path} must be an object`);
  }
  const taskId = cleanText(rawTask.taskId || rawTask.instanceId);
  if (taskId === "") {
    throw new Error(`tasks[${index}].taskId is required in ${manifest.path}`);
  }

  const hasTeamSize = rawTask.teamSize != null && rawTask.teamSize !== "";
  return {
    taskId,
    repo: textOrNull(rawTask.repo),
    repoLanguage: textOrNull(rawTask.repoLanguage),
    level: textOrNull(rawTask.level),
    title: textOrNull(rawTask.title),
    protocol: textOrNull(rawTask.protocol || manifest.protocol),
    teamSize: hasTeamSize ? Number.parseInt(String(rawTask.teamSize), 10) : null,
    complexityLevel: textOrNull(rawTask.complexityLevel || rawTask.level),
    metadata: isPlainObject(rawTask.metadata) ? rawTask.metadata : {},
    smoke: isPlainObject(rawTask.smoke) ? rawTask.smoke : null,
  };
}
439
+
440
/**
 * Loads and normalizes a single external pilot manifest.
 *
 * The path is normalized as repository-relative and resolved against
 * `REPO_ROOT`. `id` and `benchmarkId` must be valid ids; `title` falls back
 * to the id; `version` falls back to 1 when it does not parse to a non-zero
 * integer; `teamSizes` entries are parsed as base-10 integers; a non-array
 * `tasks` value is treated as an empty list.
 *
 * @param {string} manifestPath - Repository-relative path to the manifest JSON.
 * @returns {object} The normalized manifest, including `path`, `absolutePath` and `tasks`.
 * @throws {Error} When the path, the JSON payload, an id, `teamSizes` or a task is invalid.
 */
export function loadExternalPilotManifest(manifestPath) {
  const normalizedPath = normalizeRepoRelativePath(manifestPath, "manifestPath");
  const absolutePath = path.resolve(REPO_ROOT, normalizedPath);
  const payload = readJsonFile(absolutePath, "pilot manifest");
  const rawTasks = Array.isArray(payload.tasks) ? payload.tasks : [];
  const textOrNull = (raw) => cleanText(raw) || null;

  const parsedVersion = Number.parseInt(String(payload.version ?? "1"), 10);
  const id = normalizeId(payload.id, `${normalizedPath}: id`);
  const taskContext = { path: normalizedPath, protocol: payload.protocol };

  return {
    version: parsedVersion || 1,
    id,
    benchmarkId: normalizeId(payload.benchmarkId, `${normalizedPath}: benchmarkId`),
    title: cleanText(payload.title) || id,
    split: textOrNull(payload.split),
    sampleStrategy: textOrNull(payload.sampleStrategy),
    sampleSource: textOrNull(payload.sampleSource),
    derivedFromManifestPath: textOrNull(payload.derivedFromManifestPath),
    reviewOnly: Boolean(payload.reviewOnly),
    reviewScope: textOrNull(payload.reviewScope),
    protocol: textOrNull(payload.protocol),
    teamSizes: normalizeStringArray(payload.teamSizes, `${normalizedPath}: teamSizes`).map((size) =>
      Number.parseInt(size, 10),
    ),
    path: normalizedPath,
    absolutePath,
    // A fresh context object per task, so tasks never share one reference.
    tasks: rawTasks.map((task, index) => normalizeExternalTask(task, index, { ...taskContext })),
  };
}
467
+
468
/**
 * Loads every `*.json` arm template from the arm-templates directory.
 *
 * Files are read in sorted path order and keyed by their normalized
 * `armId` (a later file with the same id replaces an earlier one).
 * `tracesRequired` defaults to true only for the "full-wave" arm when the
 * template leaves it undefined. Every arm in `EXTERNAL_BENCHMARK_ARMS` must
 * have a template.
 *
 * @param {{armTemplatesDir?: string}} [options] - Optional repository-relative directory override.
 * @returns {{templatesDir: string, absoluteTemplatesDir: string, templates: Map<string, object>}}
 * @throws {Error} When a template is invalid or a required arm has no template.
 */
export function loadExternalArmTemplates(options = {}) {
  const relativeDir = normalizeRepoRelativePath(
    options.armTemplatesDir || DEFAULT_EXTERNAL_ARM_TEMPLATES_DIR,
    "armTemplatesDir",
  );
  const templatesDir = path.resolve(REPO_ROOT, relativeDir);
  const toRepoRelative = (target) => path.relative(REPO_ROOT, target).replaceAll(path.sep, "/");

  const templateFiles = fs
    .readdirSync(templatesDir, { withFileTypes: true })
    .filter((entry) => entry.isFile() && entry.name.endsWith(".json"))
    .map((entry) => path.join(templatesDir, entry.name))
    .toSorted();

  const templates = new Map();
  for (const filePath of templateFiles) {
    const payload = readJsonFile(filePath, "external arm template");
    const armId = normalizeId(payload.armId, `${filePath}: armId`);
    const tracesRequired =
      payload.tracesRequired === undefined ? armId === "full-wave" : Boolean(payload.tracesRequired);
    templates.set(armId, {
      armId,
      title: cleanText(payload.title) || armId,
      roles: normalizeStringArray(payload.roles, `${filePath}: roles`),
      includeContEval: Boolean(payload.includeContEval),
      includeIntegrationSteward: Boolean(payload.includeIntegrationSteward),
      includeDocumentationSteward: Boolean(payload.includeDocumentationSteward),
      tracesRequired,
      notes: normalizeStringArray(payload.notes, `${filePath}: notes`),
      path: toRepoRelative(filePath),
    });
  }

  const missingArm = EXTERNAL_BENCHMARK_ARMS.find((armId) => !templates.has(armId));
  if (missingArm !== undefined) {
    throw new Error(`Missing external arm template for ${missingArm}`);
  }

  return {
    templatesDir: toRepoRelative(templatesDir),
    absoluteTemplatesDir: templatesDir,
    templates,
  };
}
509
+
510
/**
 * Loads every `*.json` pilot manifest from the pilots directory.
 *
 * Files are loaded in sorted path order through `loadExternalPilotManifest`.
 * The `byId` map keeps the last manifest seen for a given id.
 *
 * @param {{pilotsDir?: string}} [options] - Optional repository-relative directory override.
 * @returns {{pilotsDir: string, absolutePilotsDir: string, manifests: object[], byId: Map<string, object>}}
 * @throws {Error} When the directory path or any manifest is invalid.
 */
export function loadExternalPilotManifests(options = {}) {
  const relativeDir = normalizeRepoRelativePath(
    options.pilotsDir || DEFAULT_EXTERNAL_PILOTS_DIR,
    "pilotsDir",
  );
  const pilotsDir = path.resolve(REPO_ROOT, relativeDir);
  const toRepoRelative = (target) => path.relative(REPO_ROOT, target).replaceAll(path.sep, "/");

  const manifestFiles = fs
    .readdirSync(pilotsDir, { withFileTypes: true })
    .filter((entry) => entry.isFile() && entry.name.endsWith(".json"))
    .map((entry) => path.join(pilotsDir, entry.name))
    .toSorted();

  const manifests = [];
  const byId = new Map();
  for (const filePath of manifestFiles) {
    manifests.push(loadExternalPilotManifest(toRepoRelative(filePath)));
  }
  for (const manifest of manifests) {
    byId.set(manifest.id, manifest);
  }

  return {
    pilotsDir: toRepoRelative(pilotsDir),
    absolutePilotsDir: pilotsDir,
    manifests,
    byId,
  };
}
530
+
531
/**
 * Loads an external command config file.
 *
 * Only the `adapters` property is surfaced; it defaults to `{}` unless the
 * payload carries a non-array object there.
 *
 * @param {string} configPath - Repository-relative path to the config JSON.
 * @returns {{path: string, absolutePath: string, adapters: object}}
 * @throws {Error} When the path or the JSON payload is invalid.
 */
export function loadExternalCommandConfig(configPath) {
  const normalizedPath = normalizeRepoRelativePath(configPath, "commandConfigPath");
  const absolutePath = path.resolve(REPO_ROOT, normalizedPath);
  const { adapters: rawAdapters } = readJsonFile(absolutePath, "external command config");
  const hasAdapterTable =
    Boolean(rawAdapters) && typeof rawAdapters === "object" && !Array.isArray(rawAdapters);
  return {
    path: normalizedPath,
    absolutePath,
    adapters: hasAdapterTable ? rawAdapters : {},
  };
}
545
+
546
/**
 * Asserts that a run config is usable for an arm-vs-arm comparison.
 *
 * Two rules are enforced, in this order:
 * 1. every comparable field (plus `benchmarkId`) is present, i.e. does not
 *    stringify to blank text;
 * 2. neither the "single-agent" nor the "full-wave" arm override defines
 *    any comparable field.
 *
 * @param {object} runConfig - Run configuration to validate.
 * @returns {void}
 * @throws {Error} Naming the first missing field or the first overridden comparable field.
 */
export function assertComparableExternalRunConfig(runConfig) {
  // Fields that must be identical across arms for a fair comparison.
  const comparableFields = [
    "modelId",
    "executorId",
    "executorCommand",
    "toolPermissions",
    "temperature",
    "reasoningEffort",
    "maxWallClockMinutes",
    "maxTurns",
    "retryLimit",
    "verificationHarness",
    "datasetVersion",
  ];
  const requiredFields = ["benchmarkId", ...comparableFields];

  const missingField = requiredFields.find((field) => cleanText(runConfig?.[field]) === "");
  if (missingField !== undefined) {
    throw new Error(`Comparable external run config requires ${missingField}`);
  }

  const baseline = runConfig.armOverrides?.["single-agent"] || {};
  const fullWave = runConfig.armOverrides?.["full-wave"] || {};
  const overriddenField = comparableFields.find(
    (field) => baseline[field] !== undefined || fullWave[field] !== undefined,
  );
  if (overriddenField !== undefined) {
    throw new Error(`Arm overrides must not change comparable field ${overriddenField}`);
  }
}
587
+
588
/**
 * Substitutes `{name}` placeholders in a template string.
 *
 * A placeholder is replaced by the stringified value of the matching OWN
 * property of `variables`. It is replaced by "" when the property is
 * missing, when its value is `null`/`undefined`, or when `variables` itself
 * is nullish. Inherited properties are never consulted, so placeholders
 * such as `{constructor}` or `{toString}` cannot leak built-in function
 * source into the rendered text.
 *
 * @param {unknown} template - Template text; falsy values render as "".
 * @param {Record<string, unknown>|null|undefined} variables - Placeholder values.
 * @returns {string} The rendered text.
 */
function renderTemplate(template, variables) {
  const values = variables ?? {};
  return String(template || "").replace(/\{([a-zA-Z0-9_]+)\}/g, (_match, key) => {
    if (!Object.hasOwn(values, key)) {
      return "";
    }
    const value = values[key];
    return value == null ? "" : String(value);
  });
}
596
+
597
/**
 * Normalizes a smoke-test outcome object.
 *
 * Numeric fields are parsed from their string form (`wallClockMs` as a
 * base-10 integer, the others as floats) and are `null` when absent; the
 * parsed values are not range-checked here. `tokenUsage` is kept only when
 * it is a non-array object.
 *
 * @param {unknown} outcome - Raw outcome.
 * @param {string} label - Name used in the error message.
 * @returns {object} The normalized outcome.
 * @throws {Error} When `outcome` is not a non-array object.
 */
function normalizeSmokeOutcome(outcome, label) {
  const isPlainObject = (candidate) =>
    Boolean(candidate) && typeof candidate === "object" && !Array.isArray(candidate);
  if (!isPlainObject(outcome)) {
    throw new Error(`${label} must be an object`);
  }
  const optionalInt = (raw) => (raw == null ? null : Number.parseInt(String(raw), 10));
  const optionalFloat = (raw) => (raw == null ? null : Number.parseFloat(String(raw)));
  return {
    success: Boolean(outcome.success),
    wallClockMs: optionalInt(outcome.wallClockMs),
    totalCostUsd: optionalFloat(outcome.totalCostUsd),
    tokenUsage: isPlainObject(outcome.tokenUsage) ? outcome.tokenUsage : null,
    partialCorrectness: optionalFloat(outcome.partialCorrectness),
    communicationDensity: optionalFloat(outcome.communicationDensity),
    detail: cleanText(outcome.detail) || "",
  };
}
618
+
619
/**
 * Runs a shell command synchronously and captures its outcome.
 *
 * NOTE(review): the command string is executed through the shell
 * (`shell: true`) with the full parent environment, so it must come from
 * trusted configuration only.
 *
 * Output capture is raised well above `spawnSync`'s 1 MiB default so that a
 * verbose command is not killed mid-run and reported as a plain failure.
 * When the process could not be spawned, exceeded the buffer, or was
 * terminated by a signal, a one-line explanation is appended to `stderr`;
 * `exitCode` is then 1 because no integer exit status is available.
 *
 * @param {string} command - Shell command line.
 * @param {string} workingDirectory - Directory to run the command in.
 * @returns {{exitCode: number, stdout: string, stderr: string, wallClockMs: number}}
 */
function executeCommand(command, workingDirectory) {
  const maxBufferBytes = 64 * 1024 * 1024;
  const startedAt = Date.now();
  const result = spawnSync(command, {
    cwd: workingDirectory,
    shell: true,
    encoding: "utf8",
    env: process.env,
    maxBuffer: maxBufferBytes,
  });
  const wallClockMs = Date.now() - startedAt;

  const stderrParts = [result.stderr || ""];
  if (result.error) {
    // Spawn-level failure (bad cwd, buffer overflow, ...): no exit status exists.
    stderrParts.push(`[executeCommand] command did not complete: ${result.error.message}`);
  } else if (result.signal) {
    stderrParts.push(`[executeCommand] command terminated by signal ${result.signal}`);
  }

  return {
    exitCode: Number.isInteger(result.status) ? result.status : 1,
    stdout: result.stdout || "",
    stderr: stderrParts.filter((part) => part !== "").join("\n"),
    wallClockMs,
  };
}
634
+
635
/**
 * Extracts a JSON object from command output.
 *
 * The whole trimmed text is tried first; if the output has more than one
 * non-empty line, the last non-empty line is tried next. Only a JSON value
 * that is a non-array object counts as a match.
 *
 * @param {unknown} text - Raw command output.
 * @returns {object|null} The parsed object, or null when nothing usable was found.
 */
function parseStructuredCommandOutput(text) {
  const trimmed = cleanText(text);
  if (trimmed === "") {
    return null;
  }

  // Returns the parsed object, or null for invalid JSON / non-object JSON.
  const parseObjectOrNull = (candidate) => {
    let parsed;
    try {
      parsed = JSON.parse(candidate);
    } catch {
      return null;
    }
    const isPlainObject = Boolean(parsed) && typeof parsed === "object" && !Array.isArray(parsed);
    return isPlainObject ? parsed : null;
  };

  const nonEmptyLines = trimmed
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter((line) => line.length > 0);
  const attempts =
    nonEmptyLines.length > 1 ? [trimmed, nonEmptyLines[nonEmptyLines.length - 1]] : [trimmed];

  for (const attempt of attempts) {
    const parsed = parseObjectOrNull(attempt);
    if (parsed !== null) {
      return parsed;
    }
  }
  return null;
}
660
+
661
/**
 * Normalizes the structured JSON outcome reported by an external command.
 *
 * Returns `null` when `outcome` is not a non-array object. Numeric fields
 * are parsed from their string form and must be finite and non-negative
 * when provided; blank path/category fields become `null`; `success` stays
 * `null` when the command did not report it.
 *
 * @param {unknown} outcome - Parsed command output.
 * @param {string} label - Prefix used in validation error messages.
 * @returns {object|null} The normalized outcome, or null for unusable input.
 * @throws {Error} When a provided numeric field is not finite or is negative.
 */
function normalizeStructuredCommandOutcome(outcome, label) {
  const isPlainObject = (candidate) =>
    Boolean(candidate) && typeof candidate === "object" && !Array.isArray(candidate);
  if (!isPlainObject(outcome)) {
    return null;
  }

  const optionalInt = (raw) => (raw == null ? null : Number.parseInt(String(raw), 10));
  const optionalFloat = (raw) => (raw == null ? null : Number.parseFloat(String(raw)));
  const textOrNull = (raw) => cleanText(raw) || null;

  const payload = {
    success: outcome.success == null ? null : Boolean(outcome.success),
    wallClockMs: optionalInt(outcome.wallClockMs),
    totalCostUsd: optionalFloat(outcome.totalCostUsd),
    tokenUsage: isPlainObject(outcome.tokenUsage) ? outcome.tokenUsage : null,
    partialCorrectness: optionalFloat(outcome.partialCorrectness),
    communicationDensity: optionalFloat(outcome.communicationDensity),
    artifactPath: textOrNull(outcome.artifactPath),
    patchPath: textOrNull(outcome.patchPath),
    tracePath: textOrNull(outcome.tracePath),
    summaryPath: textOrNull(outcome.summaryPath),
    verificationStdoutPath: textOrNull(outcome.verificationStdoutPath),
    verificationStderrPath: textOrNull(outcome.verificationStderrPath),
    verificationOutputDir: textOrNull(outcome.verificationOutputDir),
    reviewCategory: textOrNull(outcome.reviewCategory),
    detail: cleanText(outcome.detail) || "",
  };

  // [field, noun used in the error message], checked in this order.
  const numericRules = [
    ["wallClockMs", "integer"],
    ["totalCostUsd", "number"],
    ["partialCorrectness", "number"],
    ["communicationDensity", "number"],
  ];
  for (const [field, noun] of numericRules) {
    const numeric = payload[field];
    if (numeric == null) {
      continue;
    }
    if (!Number.isFinite(numeric) || numeric < 0) {
      throw new Error(`${label}.${field} must be a non-negative ${noun} when provided`);
    }
  }
  return payload;
}
715
+
716
/**
 * Sums per-key token counts across task results.
 *
 * Results whose `tokenUsage` is missing or is not a plain object are skipped.
 * Each value is parsed as a base-10 integer; values that do not parse to a
 * finite number are ignored.
 *
 * @param {Array<object>} taskResults - Task results, each optionally carrying
 *   a `tokenUsage` object of counter name to count.
 * @returns {Object<string, number>|null} Totals per counter name, or `null`
 *   when no result carried a usable `tokenUsage` object.
 */
function aggregateTokenUsage(taskResults) {
  const totals = {};
  let hasUsage = false;
  for (const result of taskResults) {
    const usage = result.tokenUsage;
    if (!usage || typeof usage !== "object" || Array.isArray(usage)) {
      continue;
    }
    hasUsage = true;
    for (const [key, value] of Object.entries(usage)) {
      const count = Number.parseInt(String(value), 10);
      if (!Number.isFinite(count)) {
        continue;
      }
      // Only own properties count as a running total; a bare `totals[key]`
      // lookup would pick up inherited members such as `constructor` or
      // `toString` and turn the sum into a concatenated string.
      const previous = Object.prototype.hasOwnProperty.call(totals, key) ? totals[key] : 0;
      totals[key] = previous + count;
    }
  }
  return hasUsage ? totals : null;
}
734
+
735
/**
 * Picks the review category for a task result.
 *
 * A category reported by the command is used as-is (lowercased) unless it is
 * `"harness-env"`, in which case the heuristics below decide. Otherwise the
 * first matching rule wins, in this order: success, dry-run wording, timeout
 * wording, verifier-image failure, blocked-proof wording, setup/harness
 * failure (by detail or by non-zero exit code), and finally
 * `"incorrect-patch"`.
 *
 * @param {object} params
 * @param {boolean} params.success - Whether the task was verified as solved.
 * @param {number} params.commandExitCode - Exit code of the execution command.
 * @param {*} params.detail - Free-form detail text, matched case-insensitively.
 * @param {*} params.structuredCategory - Category reported in structured output, if any.
 * @returns {string} The review category.
 */
function classifyTaskReviewCategory({ success, commandExitCode, detail, structuredCategory }) {
  const detailText = cleanText(detail).toLowerCase();
  const reportedCategory = cleanText(structuredCategory).toLowerCase();
  const mentionsAny = (...needles) => needles.some((needle) => detailText.includes(needle));

  if (reportedCategory && reportedCategory !== "harness-env") {
    return reportedCategory;
  }
  if (success) {
    return "solved";
  }
  if (mentionsAny("dry-run plan only", "planning only")) {
    return "dry-run-plan";
  }
  if (mentionsAny("timed out", "timeout")) {
    return "timeout";
  }
  if (isVerifierImageFailureDetail(detailText)) {
    return "verifier-image";
  }
  if (mentionsAny("needs-more-work", "proof gap", "missing execution summary", "blocked")) {
    return "blocked-proof";
  }
  // A failing command with no more specific signal is treated as a harness problem.
  if (isSetupHarnessFailureDetail(detailText) || commandExitCode !== 0) {
    return "setup-harness";
  }
  return "incorrect-patch";
}
769
+
770
/**
 * Builds the flat per-task, per-arm result record from the execution and
 * verification outcomes plus the run configuration.
 *
 * The detail text prefers the verification detail over the execution detail.
 * The review category and disposition are derived via
 * `classifyTaskReviewCategory` and `reviewDispositionForCategory`. The trace
 * path is only reported when the arm template requires traces.
 *
 * @param {object} params
 * @param {string} params.benchmarkId
 * @param {object} params.manifest - Pilot manifest; `split` is read.
 * @param {object} params.adapter - Benchmark adapter; `title` is read.
 * @param {object} params.task - Task descriptor.
 * @param {string} params.arm
 * @param {object} params.runConfig - Run configuration; limits are parsed as base-10 integers.
 * @param {object} params.armTemplate - Arm template; `path` and `tracesRequired` are read.
 * @param {object} params.execution - Execution outcome.
 * @param {object} params.verification - Verification outcome.
 * @returns {object} The normalized task result.
 */
function normalizeTaskArmResult({
  benchmarkId,
  manifest,
  adapter,
  task,
  arm,
  runConfig,
  armTemplate,
  execution,
  verification,
}) {
  const solved = Boolean(verification.success);
  const detail = verification.detail || execution.detail || "";
  const reviewCategory = classifyTaskReviewCategory({
    success: solved,
    commandExitCode: execution.exitCode,
    detail,
    structuredCategory: verification.reviewCategory || null,
  });
  const reviewDisposition = reviewDispositionForCategory(reviewCategory);
  const toInteger = (value) => Number.parseInt(String(value), 10);

  return {
    benchmarkId,
    benchmarkTitle: adapter.title,
    split: manifest.split,
    taskId: task.taskId,
    repo: task.repo,
    repoLanguage: task.repoLanguage,
    complexityLevel: task.complexityLevel,
    protocol: task.protocol,
    teamSize: task.teamSize,
    arm,
    armTemplate: armTemplate.path,
    modelId: runConfig.modelId,
    executorId: runConfig.executorId,
    executorCommand: runConfig.executorCommand,
    toolPermissions: runConfig.toolPermissions,
    temperature: runConfig.temperature,
    reasoningEffort: runConfig.reasoningEffort,
    maxWallClockMinutes: toInteger(runConfig.maxWallClockMinutes),
    maxTurns: toInteger(runConfig.maxTurns),
    retryLimit: toInteger(runConfig.retryLimit),
    verificationHarness: runConfig.verificationHarness,
    datasetVersion: runConfig.datasetVersion,
    success: solved,
    wallClockMs: verification.wallClockMs ?? execution.wallClockMs ?? null,
    totalCostUsd: verification.totalCostUsd ?? null,
    tokenUsage: verification.tokenUsage ?? null,
    partialCorrectness: verification.partialCorrectness ?? null,
    communicationDensity: verification.communicationDensity ?? null,
    verificationArtifactPath: verification.artifactPath || null,
    patchPath: verification.patchPath || null,
    // Traces are only surfaced for arms whose template asks for them.
    tracePath: armTemplate.tracesRequired && execution.tracePath ? execution.tracePath : null,
    summaryPath: execution.summaryPath || null,
    verificationStdoutPath: verification.verificationStdoutPath || null,
    verificationStderrPath: verification.verificationStderrPath || null,
    verificationOutputDir: verification.verificationOutputDir || null,
    command: execution.command || null,
    commandExitCode: execution.exitCode,
    reviewCategory,
    reviewDisposition,
    detail,
  };
}
833
+
834
/**
 * Aggregates task results into a per-arm overall summary plus breakdowns by
 * complexity level and by team size.
 *
 * Cost totals are only reported for an arm when every one of its results
 * carries a cost; otherwise `totalCostUsd` and `costPerSolvedTask` are `null`.
 * Per-solved metrics are `null` when the arm solved nothing.
 *
 * @param {Array<object>} taskResults - Normalized task results.
 * @param {object} adapter - Benchmark adapter; `id` and `title` are read.
 * @param {object} runConfig - Run configuration; `datasetVersion` and
 *   `verificationHarness` are read.
 * @param {Array<string>} selectedArms - Arms to summarize.
 * @returns {object} Summary with `overall`, `breakdownByLevel` and
 *   `breakdownByTeamSize`.
 */
function aggregateExternalResults(taskResults, adapter, runConfig, selectedArms) {
  const percentage = (part, whole) => (whole === 0 ? 0 : Number(((part / whole) * 100).toFixed(2)));
  const successStats = (results) => {
    const solved = results.filter((entry) => entry.success).length;
    return { tasks: results.length, solved, successRate: percentage(solved, results.length) };
  };
  const tallyBy = (results, field) => {
    const counts = {};
    for (const entry of results) {
      const key = entry[field] || "unknown";
      counts[key] = (counts[key] || 0) + 1;
    }
    return counts;
  };
  const distinct = (field, keep) =>
    Array.from(new Set(taskResults.map((entry) => entry[field]).filter(keep)));
  const breakdownBy = (field, groupKeys) => {
    const table = {};
    for (const groupKey of groupKeys) {
      table[groupKey] = {};
      for (const arm of selectedArms) {
        table[groupKey][arm] = successStats(
          taskResults.filter((entry) => entry.arm === arm && entry[field] === groupKey),
        );
      }
    }
    return table;
  };

  const overall = {};
  for (const arm of selectedArms) {
    const armResults = taskResults.filter((entry) => entry.arm === arm);
    const stats = successStats(armResults);
    // A partial cost total would understate spend, so it is all-or-nothing.
    const everyResultHasCost =
      armResults.length > 0 && armResults.every((entry) => entry.totalCostUsd != null);
    const totalCost = everyResultHasCost
      ? armResults.reduce((sum, entry) => sum + (entry.totalCostUsd || 0), 0)
      : null;
    const totalWallClock = armResults.reduce((sum, entry) => sum + (entry.wallClockMs || 0), 0);

    overall[arm] = {
      ...stats,
      totalCostUsd: totalCost == null ? null : Number(totalCost.toFixed(4)),
      totalWallClockMs: totalWallClock,
      tokenUsageTotals: aggregateTokenUsage(armResults),
      reviewBuckets: tallyBy(armResults, "reviewCategory"),
      reviewDispositions: tallyBy(armResults, "reviewDisposition"),
      costPerSolvedTask:
        stats.solved === 0 || totalCost == null ? null : Number((totalCost / stats.solved).toFixed(4)),
      wallClockPerSolvedTaskMs: stats.solved === 0 ? null : Math.round(totalWallClock / stats.solved),
    };
  }

  return {
    benchmarkId: adapter.id,
    benchmarkTitle: adapter.title,
    datasetVersion: runConfig.datasetVersion,
    verificationHarness: runConfig.verificationHarness,
    overall,
    breakdownByLevel: breakdownBy("complexityLevel", distinct("complexityLevel", Boolean)),
    breakdownByTeamSize: breakdownBy("teamSize", distinct("teamSize", (value) => Number.isFinite(value))),
  };
}
909
+
910
/**
 * Renders the pilot run output as a Markdown report.
 *
 * The report has a header with run metadata followed by an "Overall" section.
 * The review bucket/disposition sections and the complexity and team-size
 * breakdowns are only included when the summary has entries for them.
 *
 * @param {object} output - Pilot run output (adapter, manifest, selected arms,
 *   comparison flags and `summary`).
 * @returns {string} Markdown text joined with `\n` (no trailing newline).
 */
function renderExternalResultsMarkdown(output) {
  const yesNo = (flag) => (flag ? "yes" : "no");
  const describeTokens = (tokenUsage) => (tokenUsage ? formatCountSummary(tokenUsage) : "n/a");
  const scoreLine = (arm, stats) =>
    `- ${arm}: solved=${stats.solved}/${stats.tasks} success_rate=${stats.successRate}%`;

  const lines = [
    `# External Benchmark Pilot Results`,
    "",
    `- Benchmark: \`${output.adapter.id}\``,
    `- Manifest: \`${output.manifest.path}\``,
    `- Arms: \`${output.selectedArms.join("`, `")}\``,
    `- Comparison-ready: ${yesNo(output.comparisonReady)}`,
    `- Comparison mode: \`${output.comparisonMode}\``,
    `- Review-only manifest: ${yesNo(output.manifest.reviewOnly)}`,
    `- Review scope: \`${output.manifest.reviewScope || "n/a"}\``,
    `- Generated: ${output.generatedAt}`,
    `- Mode: ${output.mode}`,
    `- Dry run: ${yesNo(output.dryRun)}`,
    `- Failure review: \`${output.failureReviewPath}\``,
    `- Failure review JSON: \`${output.failureReviewJsonPath}\``,
    "",
    "## Overall",
  ];

  const overallEntries = Object.entries(output.summary.overall || {});
  for (const [arm, stats] of overallEntries) {
    lines.push(
      `${scoreLine(arm, stats)} cost_per_solved=${stats.costPerSolvedTask ?? "n/a"} wall_clock_per_solved_ms=${stats.wallClockPerSolvedTaskMs ?? "n/a"} token_usage=${describeTokens(stats.tokenUsageTotals)}`,
    );
  }

  if (overallEntries.length > 0) {
    lines.push("", "## Review Buckets");
    for (const [arm, stats] of overallEntries) {
      lines.push(`- ${arm}: ${formatCountSummary(stats.reviewBuckets || {}) || "n/a"}`);
    }
    lines.push("", "## Review Dispositions");
    for (const [arm, stats] of overallEntries) {
      lines.push(`- ${arm}: ${formatCountSummary(stats.reviewDispositions || {}) || "n/a"}`);
    }
  }

  const levelEntries = Object.entries(output.summary.breakdownByLevel || {});
  if (levelEntries.length > 0) {
    lines.push("", "## Complexity Breakdown");
    for (const [level, armStats] of levelEntries) {
      lines.push(`### ${level}`);
      for (const [arm, stats] of Object.entries(armStats)) {
        lines.push(scoreLine(arm, stats));
      }
    }
  }

  const teamSizeEntries = Object.entries(output.summary.breakdownByTeamSize || {});
  if (teamSizeEntries.length > 0) {
    lines.push("", "## Team Size Breakdown");
    for (const [teamSize, armStats] of teamSizeEntries) {
      lines.push(`### team_size_${teamSize}`);
      for (const [arm, stats] of Object.entries(armStats)) {
        lines.push(scoreLine(arm, stats));
      }
    }
  }

  return lines.join("\n");
}
967
+
968
/**
 * Builds the failure-review document for a pilot run: one entry per selected
 * arm with its score, verdict, review bucket/disposition counts and a
 * trimmed-down list of its task results.
 *
 * An arm is "blocked" when the run is a dry run or any of its results has a
 * `dry-run`, `invalidated`, `setup-failure`, `timeout` or `blocked-proof`
 * disposition. The verdict is decided in this order: `planning-only` (dry
 * run), `blocked`, `clean` (every task solved), `scored-failure`, `mixed`.
 *
 * @param {object} output - Pilot run output; reads `selectedArms`, `tasks`,
 *   `dryRun`, `comparisonMode`, `comparisonReady`, `mode`, `generatedAt`,
 *   `adapter` and `manifest`.
 * @returns {object} The failure review, keyed per arm under `byArm`.
 */
function buildExternalFailureReview(output) {
  const blockingDispositions = ["dry-run", "invalidated", "setup-failure", "timeout", "blocked-proof"];
  const tallyBy = (results, field) => {
    const counts = {};
    for (const entry of results) {
      const key = entry[field] || "unknown";
      counts[key] = (counts[key] || 0) + 1;
    }
    return counts;
  };

  const byArm = {};
  for (const arm of output.selectedArms) {
    const armResults = output.tasks.filter((entry) => entry.arm === arm);
    const reviewBuckets = tallyBy(armResults, "reviewCategory");
    const reviewDispositions = tallyBy(armResults, "reviewDisposition");
    const countOf = (disposition) => reviewDispositions[disposition] || 0;
    const solved = armResults.filter((entry) => entry.success).length;
    const blocked = output.dryRun || blockingDispositions.some((name) => countOf(name) > 0);

    let verdict;
    if (output.dryRun) {
      verdict = "planning-only";
    } else if (blocked) {
      verdict = "blocked";
    } else if (solved === armResults.length) {
      verdict = "clean";
    } else if (countOf("scored-failure") > 0) {
      verdict = "scored-failure";
    } else {
      verdict = "mixed";
    }

    byArm[arm] = {
      taskCount: armResults.length,
      solved,
      officialScore: `${solved}/${armResults.length}`,
      verdict,
      reviewBuckets,
      reviewDispositions,
      invalidatesExternalComparison: blocked || output.comparisonMode !== "pairwise-comparison",
      tasksByDisposition: {
        invalidated: countOf("invalidated"),
        setupFailure: countOf("setup-failure"),
        trustworthyPatchFailure: countOf("scored-failure"),
        timeout: countOf("timeout"),
        blockedProof: countOf("blocked-proof"),
        dryRun: countOf("dry-run"),
      },
      taskResults: armResults.map((entry) => ({
        arm: entry.arm,
        taskId: entry.taskId,
        repo: entry.repo,
        officialScore: entry.success ? 1 : 0,
        reviewCategory: entry.reviewCategory,
        reviewDisposition: entry.reviewDisposition,
        wallClockMs: entry.wallClockMs,
        detail: entry.detail,
        patchPath: entry.patchPath,
        summaryPath: entry.summaryPath,
        tracePath: entry.tracePath,
        verificationArtifactPath: entry.verificationArtifactPath,
        verificationStdoutPath: entry.verificationStdoutPath,
        verificationStderrPath: entry.verificationStderrPath,
        verificationOutputDir: entry.verificationOutputDir,
      })),
    };
  }

  const { adapter, manifest } = output;
  return {
    generatedAt: output.generatedAt,
    benchmarkId: adapter.id,
    benchmarkTitle: adapter.title,
    manifestPath: manifest.path,
    manifestId: manifest.id,
    comparisonMode: output.comparisonMode,
    comparisonReady: output.comparisonReady,
    reviewOnlyManifest: manifest.reviewOnly,
    reviewScope: manifest.reviewScope || null,
    dryRun: output.dryRun,
    mode: output.mode,
    selectedArms: output.selectedArms,
    byArm,
  };
}
1048
+
1049
/**
 * Renders a failure review as Markdown: run metadata, a per-arm verdict line,
 * and, when at least one arm is present, failure buckets, review dispositions
 * and a per-task scorecard table.
 *
 * @param {object} review - Failure review as produced by `buildExternalFailureReview`.
 * @returns {string} Markdown text joined with `\n` (no trailing newline).
 */
function renderExternalFailureReviewMarkdown(review) {
  const yesNo = (flag) => (flag ? "yes" : "no");
  const lines = [
    "# External Benchmark Failure Review",
    "",
    `- Benchmark: \`${review.benchmarkId}\``,
    `- Manifest: \`${review.manifestPath}\``,
    `- Comparison-ready: ${yesNo(review.comparisonReady)}`,
    `- Comparison mode: \`${review.comparisonMode}\``,
    `- Review-only manifest: ${yesNo(review.reviewOnlyManifest)}`,
    `- Review scope: \`${review.reviewScope || "n/a"}\``,
    `- Generated: ${review.generatedAt}`,
    `- Mode: ${review.mode}`,
    `- Dry run: ${yesNo(review.dryRun)}`,
    "",
    "## Verdict",
  ];

  const armEntries = Object.entries(review.byArm || {});
  for (const [arm, summary] of armEntries) {
    const counts = summary.tasksByDisposition;
    lines.push(
      `- ${arm}: verdict=${summary.verdict} official_score=${summary.officialScore} invalidated=${counts.invalidated} setup_failure=${counts.setupFailure} trustworthy_patch_failure=${counts.trustworthyPatchFailure} timeout=${counts.timeout} blocked_proof=${counts.blockedProof} dry_run=${counts.dryRun}`,
    );
  }
  if (armEntries.length === 0) {
    return lines.join("\n");
  }

  lines.push("", "## Failure Buckets");
  for (const [arm, summary] of armEntries) {
    lines.push(`- ${arm}: ${formatCountSummary(summary.reviewBuckets) || "n/a"}`);
  }
  lines.push("", "## Review Dispositions");
  for (const [arm, summary] of armEntries) {
    lines.push(`- ${arm}: ${formatCountSummary(summary.reviewDispositions) || "n/a"}`);
  }

  lines.push("", "## Task Scorecard");
  lines.push("| Arm | Task | Repo | Official | Review bucket | Review disposition | Wall clock ms | Notes |");
  lines.push("| --- | --- | --- | ---: | --- | --- | ---: | --- |");
  for (const [arm, summary] of armEntries) {
    for (const task of summary.taskResults || []) {
      const cells = [
        escapeMarkdownCell(arm),
        `\`${escapeMarkdownCell(task.taskId)}\``,
        `\`${escapeMarkdownCell(task.repo || "n/a")}\``,
        task.officialScore,
        `\`${escapeMarkdownCell(task.reviewCategory || "unknown")}\``,
        `\`${escapeMarkdownCell(task.reviewDisposition || "unknown")}\``,
        task.wallClockMs ?? 0,
        escapeMarkdownCell(task.detail || ""),
      ];
      lines.push(`| ${cells.join(" | ")} |`);
    }
  }
  return lines.join("\n");
}
1092
+
1093
/**
 * Produces the normalized result for one task under one arm.
 *
 * Three paths, checked in this order:
 *  1. The task carries a smoke outcome for the arm: the outcome is written
 *     under `<outputDir>/smoke/` and reported without running anything.
 *  2. Dry run: a failed "dry-run plan only" placeholder result is returned.
 *  3. Real execution: the rendered arm command is run, its stdout is parsed
 *     for a structured outcome, and, when a verify template exists, the
 *     verify command is run and its outcome layered on top.
 *
 * @param {object} params
 * @param {object} params.adapter - Benchmark adapter; `id` and `title` are read.
 * @param {object} params.manifest - Pilot manifest.
 * @param {object} params.task - Task descriptor.
 * @param {string} params.arm - Arm name.
 * @param {object} params.armTemplate - Arm template; `tracesRequired` is read.
 * @param {object} params.runConfig - Run configuration including `commandTemplates`.
 * @param {string} params.outputDir - Directory for smoke artifacts.
 * @param {boolean} params.dryRun - Whether to skip execution.
 * @returns {object} Result of `normalizeTaskArmResult`.
 * @throws {Error} When execution is requested but no command template exists
 *   for the adapter/arm pair.
 */
function executeTaskArm({
  adapter,
  manifest,
  task,
  arm,
  armTemplate,
  runConfig,
  outputDir,
  dryRun,
}) {
  const variables = {
    benchmark_id: adapter.id,
    benchmark_title: adapter.title,
    split: manifest.split || "",
    task_id: task.taskId,
    repo: task.repo || "",
    repo_language: task.repoLanguage || "",
    level: task.level || "",
    complexity_level: task.complexityLevel || "",
    protocol: task.protocol || "",
    team_size: task.teamSize ?? "",
    arm,
    model_id: runConfig.modelId,
    executor_id: runConfig.executorId,
    executor_command: runConfig.executorCommand,
    temperature: runConfig.temperature,
    reasoning_effort: runConfig.reasoningEffort,
    max_wall_clock_minutes: runConfig.maxWallClockMinutes,
    max_turns: runConfig.maxTurns,
    retry_limit: runConfig.retryLimit,
    verification_harness: runConfig.verificationHarness,
    dataset_version: runConfig.datasetVersion,
  };
  const adapterCommands = (runConfig.commandTemplates || {})[adapter.id];
  const commandTemplate = adapterCommands?.[arm] || "";
  const verifyTemplate = adapterCommands?.verify || "";
  const command = commandTemplate ? renderTemplate(commandTemplate, variables) : null;
  const verifyCommand = verifyTemplate ? renderTemplate(verifyTemplate, variables) : null;

  // Arguments common to every normalizeTaskArmResult call below.
  const shared = { benchmarkId: adapter.id, manifest, adapter, task, arm, runConfig, armTemplate };
  // Conventional trace location, reported only when the arm requires traces.
  const conventionalTracePath = () =>
    armTemplate.tracesRequired ? `traces/${task.taskId}/${arm}` : null;

  const smokeSpec = task.smoke?.[arm];
  if (smokeSpec) {
    const smokeOutcome = normalizeSmokeOutcome(smokeSpec, `smoke.${arm}`);
    const artifactPath = path.join(outputDir, "smoke", `${adapter.id}-${arm}-${task.taskId}.json`);
    writeJsonAtomic(artifactPath, smokeOutcome);
    return normalizeTaskArmResult({
      ...shared,
      execution: {
        command,
        exitCode: 0,
        wallClockMs: smokeOutcome.wallClockMs,
        tracePath: conventionalTracePath(),
        summaryPath: `summaries/${task.taskId}/${arm}.json`,
      },
      verification: {
        ...smokeOutcome,
        artifactPath: path.relative(REPO_ROOT, artifactPath).replaceAll(path.sep, "/"),
      },
    });
  }

  if (dryRun) {
    return normalizeTaskArmResult({
      ...shared,
      execution: {
        command,
        exitCode: 0,
        wallClockMs: 0,
        tracePath: conventionalTracePath(),
        summaryPath: `plans/${task.taskId}/${arm}.json`,
        detail: "dry-run plan only",
      },
      verification: {
        success: false,
        wallClockMs: 0,
        totalCostUsd: null,
        tokenUsage: null,
        partialCorrectness: null,
        communicationDensity: null,
        artifactPath: null,
        detail: "dry-run plan only",
      },
    });
  }

  if (!command) {
    throw new Error(
      `Missing execution command template for adapter ${adapter.id} arm ${arm}; use --dry-run or provide commandTemplates.`,
    );
  }

  const execution = executeCommand(command, REPO_ROOT);
  const executionStructured = normalizeStructuredCommandOutcome(
    parseStructuredCommandOutput(execution.stdout),
    "execution output",
  );
  const commandSucceeded = execution.exitCode === 0;

  // Outcome as reported by the execution command alone.
  const executionVerdict = {
    success: executionStructured?.success ?? commandSucceeded,
    wallClockMs: executionStructured?.wallClockMs ?? execution.wallClockMs,
    totalCostUsd: executionStructured?.totalCostUsd ?? null,
    tokenUsage: executionStructured?.tokenUsage ?? null,
    partialCorrectness: executionStructured?.partialCorrectness ?? null,
    communicationDensity: executionStructured?.communicationDensity ?? null,
    artifactPath: executionStructured?.artifactPath || null,
    patchPath: executionStructured?.patchPath || null,
    verificationStdoutPath: executionStructured?.verificationStdoutPath || null,
    verificationStderrPath: executionStructured?.verificationStderrPath || null,
    verificationOutputDir: executionStructured?.verificationOutputDir || null,
    reviewCategory: executionStructured?.reviewCategory || null,
    detail:
      executionStructured?.detail ||
      (commandSucceeded ? "command completed" : execution.stderr || execution.stdout),
  };

  let verification = executionVerdict;
  if (verifyCommand) {
    const verifyResult = executeCommand(verifyCommand, REPO_ROOT);
    const verifyStructured = normalizeStructuredCommandOutcome(
      parseStructuredCommandOutput(verifyResult.stdout),
      "verification output",
    );
    const verifySucceeded = verifyResult.exitCode === 0;
    // Metrics fall back to the execution value only when the verifier left
    // them null/undefined; paths and category fall back on any falsy value.
    const metricFields = [
      "wallClockMs",
      "totalCostUsd",
      "tokenUsage",
      "partialCorrectness",
      "communicationDensity",
    ];
    const referenceFields = [
      "artifactPath",
      "patchPath",
      "verificationStdoutPath",
      "verificationStderrPath",
      "verificationOutputDir",
      "reviewCategory",
    ];
    const layered = {
      ...executionVerdict,
      success: verifyStructured?.success ?? verifySucceeded,
    };
    for (const field of metricFields) {
      layered[field] = verifyStructured?.[field] ?? executionVerdict[field];
    }
    for (const field of referenceFields) {
      layered[field] = verifyStructured?.[field] || executionVerdict[field];
    }
    layered.detail =
      verifyStructured?.detail ||
      (verifySucceeded ? verifyResult.stdout.trim() : verifyResult.stderr.trim());
    verification = layered;
  }

  return normalizeTaskArmResult({
    ...shared,
    execution: {
      command,
      exitCode: execution.exitCode,
      wallClockMs: execution.wallClockMs,
      tracePath: armTemplate.tracesRequired
        ? executionStructured?.tracePath || `traces/${task.taskId}/${arm}`
        : null,
      summaryPath: executionStructured?.summaryPath || null,
      detail: executionStructured?.detail || execution.stderr || execution.stdout,
    },
    verification,
  });
}
1260
+
1261
/**
 * Runs (or plans, in dry-run mode) an external benchmark pilot for one adapter
 * and writes the reports to the output directory.
 *
 * Steps: resolve the adapter and its pilot manifest, assemble and validate the
 * run configuration, expand tasks across team sizes, execute every task under
 * every selected arm, then write `results.json`, `results.md`,
 * `failure-review.json` and `failure-review.md` and publish telemetry.
 *
 * @param {object} [options]
 * @param {string} options.adapterId - Id of the benchmark adapter to run.
 * @param {string} [options.manifestPath] - Pilot manifest path; defaults to the
 *   adapter's `pilotManifestPath`, then to the default pilots directory.
 * @param {Array<string>} [options.arms] - Arms to run.
 * @param {Array<string>} [options.taskIds] - Restrict the run to these task ids.
 * @param {boolean} [options.dryRun] - Plan only; no commands are executed.
 * @param {string} [options.commandConfigPath] - Command config file; when set,
 *   its adapters replace `options.commandTemplates`.
 * @param {object} [options.commandTemplates] - Inline command templates.
 * @param {string} [options.outputDir] - Repo-relative output directory.
 * @returns {object} The run output plus `outputDir` as a repo-relative path.
 * @throws {Error} When the adapter is unknown, the manifest belongs to another
 *   benchmark, or a selected arm has no template.
 */
export function runExternalBenchmarkPilot(options = {}) {
  const adapters = loadExternalBenchmarkAdapters(options);
  const adapterId = normalizeId(options.adapterId, "adapterId");
  const adapter = adapters.adapters.find((entry) => entry.id === adapterId);
  if (!adapter) {
    throw new Error(`Unknown external benchmark adapter: ${adapterId}`);
  }
  const manifest = loadExternalPilotManifest(
    options.manifestPath || adapter.pilotManifestPath || `${DEFAULT_EXTERNAL_PILOTS_DIR}/${adapterId}.json`,
  );
  if (manifest.benchmarkId !== adapter.id) {
    throw new Error(`Pilot manifest ${manifest.path} is for ${manifest.benchmarkId}, not ${adapter.id}`);
  }
  const templates = loadExternalArmTemplates(options);
  const selectedArms = normalizeSelectedExternalArms(options.arms);
  const dryRun = Boolean(options.dryRun);
  const plainObjectOrEmpty = (value) =>
    value && typeof value === "object" && !Array.isArray(value) ? value : {};
  const runConfig = {
    benchmarkId: adapter.id,
    modelId: cleanText(options.modelId),
    executorId: cleanText(options.executorId),
    executorCommand: cleanText(options.executorCommand),
    toolPermissions: cleanText(options.toolPermissions),
    temperature: cleanText(options.temperature),
    reasoningEffort: cleanText(options.reasoningEffort),
    maxWallClockMinutes: cleanText(options.maxWallClockMinutes),
    maxTurns: cleanText(options.maxTurns),
    retryLimit: cleanText(options.retryLimit),
    verificationHarness: cleanText(options.verificationHarness),
    datasetVersion: cleanText(options.datasetVersion),
    armOverrides: plainObjectOrEmpty(options.armOverrides),
    commandTemplates: plainObjectOrEmpty(options.commandTemplates),
  };
  if (options.commandConfigPath) {
    const commandConfig = loadExternalCommandConfig(options.commandConfigPath);
    runConfig.commandTemplates = commandConfig.adapters;
  }
  assertComparableExternalRunConfig(runConfig);
  const outputDir = path.resolve(
    REPO_ROOT,
    normalizeRepoRelativePath(
      options.outputDir || `.tmp/wave-benchmarks/external/${adapter.id}`,
      "outputDir",
    ),
  );
  ensureDirectory(outputDir);
  const selectedTaskIds = options.taskIds?.length
    ? new Set(options.taskIds.map((taskId) => cleanText(taskId)))
    : null;
  const expandedTasks = manifest.tasks
    .filter((task) => !selectedTaskIds || selectedTaskIds.has(task.taskId))
    .flatMap((task) => {
      // Applied on both paths so tasks without team sizes keep their level
      // in results and in the complexity breakdown.
      const complexityLevel = task.complexityLevel || task.level;
      const teamSizes = task.teamSize != null ? [task.teamSize] : manifest.teamSizes;
      if (!Array.isArray(teamSizes) || teamSizes.length === 0) {
        return [{ ...task, complexityLevel }];
      }
      return teamSizes.map((teamSize) => ({
        ...task,
        teamSize,
        complexityLevel,
      }));
    });
  // Resolve every arm template up front: a missing template would otherwise
  // only surface as a TypeError, in the execution path after the benchmark
  // command has already been run.
  const armTemplateEntries = selectedArms.map((arm) => {
    const armTemplate = templates.templates.get(arm);
    if (!armTemplate) {
      throw new Error(`Missing external arm template for arm: ${arm}`);
    }
    return [arm, armTemplate];
  });
  const taskResults = [];
  for (const task of expandedTasks) {
    for (const [arm, armTemplate] of armTemplateEntries) {
      taskResults.push(
        executeTaskArm({
          adapter,
          manifest,
          task,
          arm,
          armTemplate,
          runConfig,
          outputDir,
          dryRun,
        }),
      );
    }
  }
  // Pairwise comparison needs a scored manifest and exactly the full arm set.
  const comparisonReady =
    !manifest.reviewOnly &&
    selectedArms.length === EXTERNAL_BENCHMARK_ARMS.length &&
    EXTERNAL_BENCHMARK_ARMS.every((arm) => selectedArms.includes(arm));
  const summary = aggregateExternalResults(taskResults, adapter, runConfig, selectedArms);
  const toRepoRelative = (target) => path.relative(REPO_ROOT, target).replaceAll(path.sep, "/");
  const output = {
    generatedAt: toIsoTimestamp(),
    dryRun,
    mode: dryRun ? "plan" : "execution",
    selectedArms,
    comparisonReady,
    comparisonMode: comparisonReady ? "pairwise-comparison" : "review-only",
    adapter,
    manifest,
    armTemplates: Object.fromEntries(armTemplateEntries),
    failureReviewPath: toRepoRelative(path.join(outputDir, "failure-review.md")),
    failureReviewJsonPath: toRepoRelative(path.join(outputDir, "failure-review.json")),
    runConfig: {
      ...runConfig,
      // Set to undefined so these two keys are dropped when serialized to JSON.
      commandTemplates: undefined,
      armOverrides: undefined,
    },
    tasks: taskResults,
    summary,
  };
  const failureReview = buildExternalFailureReview(output);
  writeJsonAtomic(path.join(outputDir, "results.json"), output);
  writeTextAtomic(path.join(outputDir, "results.md"), `${renderExternalResultsMarkdown(output)}\n`);
  writeJsonAtomic(path.join(outputDir, "failure-review.json"), failureReview);
  writeTextAtomic(path.join(outputDir, "failure-review.md"), `${renderExternalFailureReviewMarkdown(failureReview)}\n`);
  publishExternalBenchmarkTelemetry({ output, outputDir, failureReview });
  return {
    ...output,
    outputDir: toRepoRelative(outputDir),
  };
}