@chllming/wave-orchestration 0.6.3 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118) hide show
  1. package/CHANGELOG.md +82 -1
  2. package/README.md +40 -7
  3. package/docs/agents/wave-orchestrator-role.md +50 -0
  4. package/docs/agents/wave-planner-role.md +39 -0
  5. package/docs/context7/bundles.json +9 -0
  6. package/docs/context7/planner-agent/README.md +25 -0
  7. package/docs/context7/planner-agent/manifest.json +83 -0
  8. package/docs/context7/planner-agent/papers/cooperbench-why-coding-agents-cannot-be-your-teammates-yet.md +3283 -0
  9. package/docs/context7/planner-agent/papers/dova-deliberation-first-multi-agent-orchestration-for-autonomous-research-automation.md +1699 -0
  10. package/docs/context7/planner-agent/papers/dpbench-large-language-models-struggle-with-simultaneous-coordination.md +2251 -0
  11. package/docs/context7/planner-agent/papers/incremental-planning-to-control-a-blackboard-based-problem-solver.md +1729 -0
  12. package/docs/context7/planner-agent/papers/silo-bench-a-scalable-environment-for-evaluating-distributed-coordination-in-multi-agent-llm-systems.md +3747 -0
  13. package/docs/context7/planner-agent/papers/todoevolve-learning-to-architect-agent-planning-systems.md +1675 -0
  14. package/docs/context7/planner-agent/papers/verified-multi-agent-orchestration-a-plan-execute-verify-replan-framework-for-complex-query-resolution.md +1173 -0
  15. package/docs/context7/planner-agent/papers/why-do-multi-agent-llm-systems-fail.md +5211 -0
  16. package/docs/context7/planner-agent/topics/planning-and-orchestration.md +24 -0
  17. package/docs/evals/README.md +96 -1
  18. package/docs/evals/arm-templates/README.md +13 -0
  19. package/docs/evals/arm-templates/full-wave.json +15 -0
  20. package/docs/evals/arm-templates/single-agent.json +15 -0
  21. package/docs/evals/benchmark-catalog.json +7 -0
  22. package/docs/evals/cases/README.md +47 -0
  23. package/docs/evals/cases/wave-blackboard-inbox-targeting.json +73 -0
  24. package/docs/evals/cases/wave-contradiction-conflict.json +104 -0
  25. package/docs/evals/cases/wave-expert-routing-preservation.json +69 -0
  26. package/docs/evals/cases/wave-hidden-profile-private-evidence.json +81 -0
  27. package/docs/evals/cases/wave-premature-closure-guard.json +71 -0
  28. package/docs/evals/cases/wave-silo-cross-agent-state.json +77 -0
  29. package/docs/evals/cases/wave-simultaneous-lockstep.json +92 -0
  30. package/docs/evals/cooperbench/real-world-mitigation.md +341 -0
  31. package/docs/evals/external-benchmarks.json +85 -0
  32. package/docs/evals/external-command-config.sample.json +9 -0
  33. package/docs/evals/external-command-config.swe-bench-pro.json +8 -0
  34. package/docs/evals/pilots/README.md +47 -0
  35. package/docs/evals/pilots/swe-bench-pro-public-full-wave-review-10.json +64 -0
  36. package/docs/evals/pilots/swe-bench-pro-public-pilot.json +111 -0
  37. package/docs/evals/wave-benchmark-program.md +302 -0
  38. package/docs/guides/planner.md +67 -11
  39. package/docs/guides/terminal-surfaces.md +12 -0
  40. package/docs/plans/context7-wave-orchestrator.md +20 -0
  41. package/docs/plans/current-state.md +8 -1
  42. package/docs/plans/examples/wave-benchmark-improvement.md +108 -0
  43. package/docs/plans/examples/wave-example-live-proof.md +1 -1
  44. package/docs/plans/examples/wave-example-rollout-fidelity.md +340 -0
  45. package/docs/plans/migration.md +26 -0
  46. package/docs/plans/wave-orchestrator.md +60 -12
  47. package/docs/plans/waves/reviews/wave-1-benchmark-operator.md +118 -0
  48. package/docs/reference/cli-reference.md +547 -0
  49. package/docs/reference/coordination-and-closure.md +436 -0
  50. package/docs/reference/live-proof-waves.md +25 -3
  51. package/docs/reference/npmjs-trusted-publishing.md +3 -3
  52. package/docs/reference/proof-metrics.md +90 -0
  53. package/docs/reference/runtime-config/README.md +63 -2
  54. package/docs/reference/runtime-config/codex.md +2 -1
  55. package/docs/reference/sample-waves.md +29 -18
  56. package/docs/reference/wave-control.md +164 -0
  57. package/docs/reference/wave-planning-lessons.md +131 -0
  58. package/package.json +5 -4
  59. package/releases/manifest.json +40 -0
  60. package/scripts/research/agent-context-archive.mjs +18 -0
  61. package/scripts/research/manifests/agent-context-expanded-2026-03-22.mjs +17 -0
  62. package/scripts/research/sync-planner-context7-bundle.mjs +133 -0
  63. package/scripts/wave-orchestrator/agent-state.mjs +11 -2
  64. package/scripts/wave-orchestrator/artifact-schemas.mjs +232 -0
  65. package/scripts/wave-orchestrator/autonomous.mjs +7 -0
  66. package/scripts/wave-orchestrator/benchmark-cases.mjs +374 -0
  67. package/scripts/wave-orchestrator/benchmark-external.mjs +1384 -0
  68. package/scripts/wave-orchestrator/benchmark.mjs +972 -0
  69. package/scripts/wave-orchestrator/clarification-triage.mjs +78 -12
  70. package/scripts/wave-orchestrator/config.mjs +175 -0
  71. package/scripts/wave-orchestrator/control-cli.mjs +1216 -0
  72. package/scripts/wave-orchestrator/control-plane.mjs +697 -0
  73. package/scripts/wave-orchestrator/coord-cli.mjs +360 -2
  74. package/scripts/wave-orchestrator/coordination-store.mjs +211 -9
  75. package/scripts/wave-orchestrator/coordination.mjs +84 -0
  76. package/scripts/wave-orchestrator/dashboard-renderer.mjs +120 -5
  77. package/scripts/wave-orchestrator/dashboard-state.mjs +22 -0
  78. package/scripts/wave-orchestrator/evals.mjs +23 -0
  79. package/scripts/wave-orchestrator/executors.mjs +3 -2
  80. package/scripts/wave-orchestrator/feedback.mjs +55 -0
  81. package/scripts/wave-orchestrator/install.mjs +151 -2
  82. package/scripts/wave-orchestrator/launcher-closure.mjs +4 -1
  83. package/scripts/wave-orchestrator/launcher-runtime.mjs +33 -30
  84. package/scripts/wave-orchestrator/launcher.mjs +884 -36
  85. package/scripts/wave-orchestrator/planner-context.mjs +75 -0
  86. package/scripts/wave-orchestrator/planner.mjs +2270 -136
  87. package/scripts/wave-orchestrator/proof-cli.mjs +195 -0
  88. package/scripts/wave-orchestrator/proof-registry.mjs +317 -0
  89. package/scripts/wave-orchestrator/replay.mjs +10 -4
  90. package/scripts/wave-orchestrator/retry-cli.mjs +184 -0
  91. package/scripts/wave-orchestrator/retry-control.mjs +225 -0
  92. package/scripts/wave-orchestrator/shared.mjs +26 -0
  93. package/scripts/wave-orchestrator/swe-bench-pro-task.mjs +1004 -0
  94. package/scripts/wave-orchestrator/terminals.mjs +1 -1
  95. package/scripts/wave-orchestrator/traces.mjs +157 -2
  96. package/scripts/wave-orchestrator/wave-control-client.mjs +532 -0
  97. package/scripts/wave-orchestrator/wave-control-schema.mjs +309 -0
  98. package/scripts/wave-orchestrator/wave-files.mjs +144 -23
  99. package/scripts/wave.mjs +27 -0
  100. package/skills/repo-coding-rules/SKILL.md +1 -0
  101. package/skills/role-cont-eval/SKILL.md +1 -0
  102. package/skills/role-cont-qa/SKILL.md +13 -6
  103. package/skills/role-deploy/SKILL.md +1 -0
  104. package/skills/role-documentation/SKILL.md +4 -0
  105. package/skills/role-implementation/SKILL.md +4 -0
  106. package/skills/role-infra/SKILL.md +2 -1
  107. package/skills/role-integration/SKILL.md +15 -8
  108. package/skills/role-planner/SKILL.md +39 -0
  109. package/skills/role-planner/skill.json +21 -0
  110. package/skills/role-research/SKILL.md +1 -0
  111. package/skills/role-security/SKILL.md +2 -2
  112. package/skills/runtime-claude/SKILL.md +2 -1
  113. package/skills/runtime-codex/SKILL.md +1 -0
  114. package/skills/runtime-local/SKILL.md +2 -0
  115. package/skills/runtime-opencode/SKILL.md +1 -0
  116. package/skills/wave-core/SKILL.md +25 -6
  117. package/skills/wave-core/references/marker-syntax.md +16 -8
  118. package/wave.config.json +45 -0
@@ -0,0 +1,972 @@
1
+ import path from "node:path";
2
+ import {
3
+ compileAgentInbox,
4
+ compileSharedSummary,
5
+ openClarificationLinkedRequests,
6
+ } from "./coordination-store.mjs";
7
+ import { buildRequestAssignments } from "./routing-state.mjs";
8
+ import { loadBenchmarkCases, loadExternalBenchmarkAdapters } from "./benchmark-cases.mjs";
9
+ import {
10
+ DEFAULT_WAVE_LANE,
11
+ REPO_ROOT,
12
+ buildLanePaths,
13
+ ensureDirectory,
14
+ toIsoTimestamp,
15
+ writeJsonAtomic,
16
+ writeTextAtomic,
17
+ } from "./shared.mjs";
18
+ import {
19
+ loadExternalArmTemplates,
20
+ loadExternalCommandConfig,
21
+ loadExternalPilotManifest,
22
+ loadExternalPilotManifests,
23
+ runExternalBenchmarkPilot,
24
+ } from "./benchmark-external.mjs";
25
+ import {
26
+ buildWaveControlArtifactFromPath,
27
+ flushWaveControlQueue,
28
+ safeQueueWaveControlEvent,
29
+ } from "./wave-control-client.mjs";
30
+ import { buildWaveControlConfigAttestationHash } from "./wave-control-schema.mjs";
31
+
32
// Fallback output directory used by runBenchmarkSuite when options.outputDir is blank; resolved against REPO_ROOT there.
+ const DEFAULT_OUTPUT_DIR = ".tmp/wave-benchmarks/latest";
33
// Arm id that every other arm is compared against when building comparisons.
+ const BASELINE_ARM = "single-agent";
34
+
35
/**
 * Converts any value to a whitespace-trimmed string.
 *
 * @param {*} value - Value to normalize; `null` and `undefined` become "".
 * @returns {string} The trimmed string form of `value`.
 */
function cleanText(value) {
  const text = value ?? "";
  return String(text).trim();
}
38
+
39
/**
 * Normalizes an identifier to trimmed lower case and validates its shape.
 *
 * @param {*} value - Raw identifier.
 * @param {string} label - Name used in the error message when validation fails.
 * @returns {string} The normalized identifier.
 * @throws {Error} When the normalized value does not start with `[a-z0-9]`
 *   or contains characters outside `[a-z0-9._-]`.
 */
function normalizeId(value, label) {
  const candidate = cleanText(value).toLowerCase();
  const isValidId = /^[a-z0-9][a-z0-9._-]*$/.test(candidate);
  if (isValidId) {
    return candidate;
  }
  throw new Error(`${label} must match /^[a-z0-9][a-z0-9._-]*$/`);
}
46
+
47
/**
 * Resolves lane paths for the default wave lane, treating any failure as
 * "telemetry unavailable".
 *
 * @returns {object|null} The value returned by `buildLanePaths(DEFAULT_WAVE_LANE)`,
 *   or `null` when that call throws.
 */
function benchmarkTelemetryLanePaths() {
  let lanePaths = null;
  try {
    lanePaths = buildLanePaths(DEFAULT_WAVE_LANE);
  } catch {
    // Best effort: a lane that cannot be resolved simply disables telemetry.
    lanePaths = null;
  }
  return lanePaths;
}
54
+
55
/**
 * Derives a local benchmark run id from the output timestamp.
 *
 * The timestamp has `-`, `:`, `.`, `T` and `Z` characters removed and is cut
 * to its first 14 characters before being prefixed with `bench-local-`.
 *
 * @param {{ generatedAt?: string }} output - Suite output; when `generatedAt`
 *   is falsy the current `toIsoTimestamp()` value is used instead.
 * @returns {string} The run id.
 */
function localBenchmarkRunId(output) {
  const timestamp = String(output.generatedAt || toIsoTimestamp());
  const compact = timestamp.replace(/[-:.TZ]/g, "").slice(0, 14);
  return `bench-local-${compact}`;
}
58
+
59
/**
 * Queues wave-control telemetry for a finished local benchmark run: one
 * `benchmark_run` event for the suite, then one `benchmark_item` event per
 * case/arm pair, and finally triggers a queue flush without waiting for it.
 *
 * @param {{ output: object, outputDir: string }} params - The suite output and
 *   the directory holding `results.json` and `results.md`.
 * @returns {string|null} The generated benchmark run id, or `null` when lane
 *   paths are unavailable or `waveControl.captureBenchmarkRuns` is exactly `false`.
 */
function publishLocalBenchmarkTelemetry({ output, outputDir }) {
  const lanePaths = benchmarkTelemetryLanePaths();
  const captureDisabled = lanePaths?.waveControl?.captureBenchmarkRuns === false;
  if (!lanePaths || captureDisabled) {
    return null;
  }

  const runId = localBenchmarkRunId(output);
  const attestation = {
    suite: output.suite,
    cases: output.cases.map(({ id }) => id),
    familySummary: output.familySummary,
    comparisons: output.comparisons,
  };

  // Describes one result file as an artifact and records where it came from.
  const describeArtifact = (fileName, kind, uploadPolicy) => {
    const sourcePath = path.join(outputDir, fileName);
    return {
      ...buildWaveControlArtifactFromPath(sourcePath, { kind, uploadPolicy }),
      sourcePath,
    };
  };

  const runEvent = {
    category: "benchmark",
    entityType: "benchmark_run",
    entityId: runId,
    action: "completed",
    source: "benchmark-runner",
    actor: "wave benchmark run",
    recordedAt: output.generatedAt,
    identity: {
      runKind: "benchmark",
      benchmarkRunId: runId,
    },
    tags: ["local-benchmark-suite"],
    attestation,
    data: {
      suite: output.suite,
      familySummary: output.familySummary,
      comparisons: output.comparisons,
      configHash: buildWaveControlConfigAttestationHash(attestation),
    },
    artifacts: [
      describeArtifact("results.json", "benchmark-results", "selected"),
      describeArtifact("results.md", "benchmark-results-markdown", "metadata-only"),
    ],
  };
  safeQueueWaveControlEvent(lanePaths, runEvent);

  // Queues the per-item event for a single case/arm result.
  const queueItemEvent = (benchmarkCase, arm, armResult) => {
    const itemId = `${benchmarkCase.id}:${arm}`;
    safeQueueWaveControlEvent(lanePaths, {
      category: "benchmark",
      entityType: "benchmark_item",
      entityId: itemId,
      action: armResult.passed ? "passed" : "failed",
      source: "benchmark-runner",
      actor: "wave benchmark run",
      recordedAt: output.generatedAt,
      identity: {
        runKind: "benchmark",
        benchmarkRunId: runId,
        benchmarkItemId: itemId,
      },
      tags: [benchmarkCase.familyId, benchmarkCase.benchmarkId, arm],
      data: {
        id: benchmarkCase.id,
        title: benchmarkCase.title,
        familyId: benchmarkCase.familyId,
        benchmarkId: benchmarkCase.benchmarkId,
        primaryMetric: benchmarkCase.primaryMetric,
        arm,
        score: armResult.score,
        alignedScore: armResult.alignedScore,
        passed: armResult.passed,
        direction: armResult.direction,
        threshold: armResult.threshold,
        metrics: armResult.metrics,
        details: armResult.details,
        artifacts: armResult.artifacts,
      },
    });
  };

  (output.cases || []).forEach((benchmarkCase) => {
    Object.entries(benchmarkCase.arms || {}).forEach(([arm, armResult]) => {
      queueItemEvent(benchmarkCase, arm, armResult);
    });
  });

  // Fire-and-forget: the flush result is intentionally not awaited.
  void flushWaveControlQueue(lanePaths);
  return runId;
}
146
+
147
/**
 * Case-insensitive substring check of a fact inside a text.
 *
 * @param {*} text - Text to search; falsy values are treated as "".
 * @param {*} fact - Fact to look for; it is trimmed first, and falsy values
 *   are treated as "" (which always matches).
 * @returns {boolean} Whether the lower-cased text contains the lower-cased fact.
 */
function containsFact(text, fact) {
  const haystack = String(text || "").toLowerCase();
  const needle = String(fact || "").trim().toLowerCase();
  return haystack.includes(needle);
}
150
+
151
/**
 * Expresses `numerator / denominator` as a percentage rounded to two decimals.
 *
 * @param {number} numerator - Matched count.
 * @param {number} denominator - Total count.
 * @returns {number} The rounded percentage, or 100 when the denominator is
 *   falsy (nothing was expected, so nothing was missed).
 */
function percent(numerator, denominator) {
  if (denominator) {
    const ratio = (numerator / denominator) * 100;
    return Number(ratio.toFixed(2));
  }
  return 100;
}
157
+
158
/**
 * Counts how many expected facts appear in a text.
 *
 * @param {*} text - Text to search.
 * @param {Array} [facts=[]] - Expected facts.
 * @returns {{ matched: number, total: number, percent: number }} Match tally
 *   and its percentage.
 */
function scoreFactRecall(text, facts = []) {
  const matched = facts.reduce(
    (count, fact) => (containsFact(text, fact) ? count + 1 : count),
    0,
  );
  const total = facts.length;
  return { matched, total, percent: percent(matched, total) };
}
166
+
167
/**
 * Scores how many expected facts landed in the inbox of the agent they were
 * expected for.
 *
 * @param {Object<string, string>} inboxes - Inbox text keyed by agent id.
 * @param {Object<string, Array>} expectedInboxes - Expected facts keyed by
 *   agent id; may be null/undefined.
 * @returns {{ matched: number, total: number, percent: number }} Match tally;
 *   a perfect score when nothing is expected.
 */
function scoreTargetedInboxes(inboxes, expectedInboxes) {
  const expectations = Object.entries(expectedInboxes || {});
  if (expectations.length === 0) {
    return { matched: 0, total: 0, percent: 100 };
  }
  const tally = { matched: 0, total: 0 };
  expectations.forEach(([agentId, facts]) => {
    // An agent without an inbox is scored against empty text.
    const inboxText = inboxes[agentId] || "";
    for (const fact of facts) {
      tally.total += 1;
      tally.matched += containsFact(inboxText, fact) ? 1 : 0;
    }
  });
  return {
    matched: tally.matched,
    total: tally.total,
    percent: percent(tally.matched, tally.total),
  };
}
185
+
186
/**
 * Scores how many expected request assignments were routed to the expected agent.
 *
 * @param {Array<{ requestId: string, assignedAgentId: string }>} assignments -
 *   Actual assignments.
 * @param {Array<{ requestId: string, assignedAgentId: string }>} expectedAssignments -
 *   Expected assignments; may be null/undefined or empty.
 * @returns {{ matched: number, total: number, percent: number }} Match tally;
 *   a perfect score when nothing is expected.
 */
function scoreAssignments(assignments, expectedAssignments) {
  const expectedList = expectedAssignments || [];
  if (expectedList.length === 0) {
    return { matched: 0, total: 0, percent: 100 };
  }
  const assignedAgentByRequest = new Map();
  assignments.forEach((assignment) => {
    assignedAgentByRequest.set(assignment.requestId, assignment.assignedAgentId);
  });
  let matched = 0;
  for (const expected of expectedList) {
    const actualAgentId = assignedAgentByRequest.get(expected.requestId);
    if (actualAgentId === expected.assignedAgentId) {
      matched += 1;
    }
  }
  const total = expectedList.length;
  return { matched, total, percent: percent(matched, total) };
}
203
+
204
/**
 * Renders one coordination record as a markdown bullet.
 *
 * @param {{ kind: string, id: string, summary?: string, detail?: string }} record -
 *   Coordination record.
 * @returns {string} `- <kind> <id>: <text>`, where text is the summary, else
 *   the detail, else the id.
 */
function renderCoordinationLine(record) {
  const { kind, id, summary, detail } = record;
  const description = summary || detail || id;
  return `- ${kind} ${id}: ${description}`;
}
207
+
208
/**
 * Selects the latest coordination records authored by the case's primary agent.
 *
 * @param {object} benchmarkCase - Case whose `fixture` provides
 *   `primaryAgentId` and `state.latestRecords`.
 * @returns {Array<object>} Records whose `agentId` equals the primary agent id.
 */
function singleAgentVisibleRecords(benchmarkCase) {
  const { primaryAgentId, state } = benchmarkCase.fixture;
  return state.latestRecords.filter(({ agentId }) => agentId === primaryAgentId);
}
212
+
213
/**
 * Renders the shared summary for the single-agent arm, which only contains
 * the primary agent's own records.
 *
 * @param {object} benchmarkCase - Benchmark case with a `fixture`.
 * @returns {string} Markdown text ending with a newline.
 */
function renderSingleAgentSummary(benchmarkCase) {
  const { primaryAgentId } = benchmarkCase.fixture;
  const recordLines = singleAgentVisibleRecords(benchmarkCase).map((record) =>
    renderCoordinationLine(record),
  );
  const coordinationLines = recordLines.length > 0 ? recordLines : ["- None."];
  const header = [
    `# Single Agent Local View`,
    "",
    `- Agent: ${primaryAgentId}`,
    `- Visible records: ${recordLines.length}`,
    "",
    "## Local coordination",
  ];
  return header.concat(coordinationLines, [""]).join("\n");
}
227
+
228
/**
 * Renders the shared summary for the multi-agent-minimal arm: only counts of
 * coordination records, with no record content.
 *
 * @param {object} benchmarkCase - Case whose `fixture.state` provides the
 *   `openRecords`, `requests`, `blockers` and `clarifications` arrays.
 * @returns {string} Markdown text ending with a newline.
 */
function renderMinimalSharedSummary(benchmarkCase) {
  const { openRecords, requests, blockers, clarifications } = benchmarkCase.fixture.state;
  const countLines = [
    `- Open records: ${openRecords.length}`,
    `- Requests: ${requests.length}`,
    `- Blockers: ${blockers.length}`,
    `- Clarifications: ${clarifications.length}`,
  ];
  const sections = [
    `# Minimal Shared Summary`,
    "",
    ...countLines,
    "",
    "## Coordination shape",
    "- This baseline keeps only coarse counts and does not preserve detailed targeted facts.",
    "",
  ];
  return sections.join("\n");
}
243
+
244
/**
 * Renders the inbox a given agent sees in the single-agent arm.
 *
 * @param {object} benchmarkCase - Benchmark case with a `fixture`.
 * @param {{ agentId: string }} agent - Agent whose inbox is rendered.
 * @returns {string} The primary agent's own records as markdown, or an
 *   "unavailable" placeholder for every other agent.
 */
function renderSingleAgentInbox(benchmarkCase, agent) {
  const { agentId } = agent;
  const isPrimaryAgent = agentId === benchmarkCase.fixture.primaryAgentId;
  if (!isPrimaryAgent) {
    return `# Inbox unavailable for ${agentId}\n\n- This arm does not compile targeted inboxes.\n`;
  }
  const recordLines = singleAgentVisibleRecords(benchmarkCase).map((record) =>
    renderCoordinationLine(record),
  );
  const bodyLines = recordLines.length > 0 ? recordLines : ["- None."];
  return [`# Local Inbox For ${agentId}`, "", ...bodyLines, ""].join("\n");
}
256
+
257
/**
 * Builds the coordination artifacts an arm would expose for a benchmark case.
 *
 * - `single-agent`: summary and inboxes rendered from the primary agent's own
 *   records only; no assignments, no blocking guard.
 * - `multi-agent-minimal`: every agent receives the same count-only summary;
 *   no assignments, no blocking guard.
 * - any other arm: request assignments, compiled shared summary and compiled
 *   per-agent inboxes from the coordination store.
 *
 * @param {object} benchmarkCase - Case with `fixture` (agents, state,
 *   waveNumber, capabilityRouting) and `expectations`.
 * @param {string} arm - Arm id.
 * @returns {{ sharedSummary: string, inboxes: Object<string, string>,
 *   assignments: Array, blockingGuard: boolean }} Arm artifacts. `blockingGuard`
 *   is always a boolean, so every arm serializes the same shape.
 */
function buildArmArtifacts(benchmarkCase, arm) {
  const { agents, state } = benchmarkCase.fixture;

  if (arm === "single-agent") {
    return {
      sharedSummary: renderSingleAgentSummary(benchmarkCase),
      inboxes: Object.fromEntries(
        agents.map((agent) => [agent.agentId, renderSingleAgentInbox(benchmarkCase, agent)]),
      ),
      assignments: [],
      blockingGuard: false,
    };
  }

  if (arm === "multi-agent-minimal") {
    const sharedSummary = renderMinimalSharedSummary(benchmarkCase);
    return {
      sharedSummary,
      inboxes: Object.fromEntries(agents.map((agent) => [agent.agentId, sharedSummary])),
      assignments: [],
      blockingGuard: false,
    };
  }

  const wave = {
    wave: benchmarkCase.fixture.waveNumber,
    agents,
  };
  const assignments = buildRequestAssignments({
    coordinationState: state,
    agents,
    capabilityRouting: benchmarkCase.fixture.capabilityRouting,
    ledger: { tasks: [] },
  });
  const sharedSummary = compileSharedSummary({
    wave,
    state,
    capabilityAssignments: assignments,
  }).text;
  const inboxes = Object.fromEntries(
    agents.map((agent) => [
      agent.agentId,
      compileAgentInbox({
        wave,
        agent,
        state,
        capabilityAssignments: assignments,
      }).text,
    ]),
  );
  // Coerce to boolean: a missing `requireBlockingGuard` used to leak
  // `undefined` here, which dropped the key from the JSON output while the
  // other arms always report `false`. The store is still only consulted when
  // the guard is required.
  const guardRequired = Boolean(benchmarkCase.expectations.requireBlockingGuard);
  const blockingGuard = guardRequired && openClarificationLinkedRequests(state).length > 0;
  return {
    sharedSummary,
    inboxes,
    assignments,
    blockingGuard,
  };
}
313
+
314
/**
 * Flattens an assignment into one space-separated line for fact matching.
 *
 * @param {object} assignment - Assignment with `requestId`, `summary`,
 *   `target`, `assignedAgentId`, `assignmentReason` and `assignmentDetail`.
 * @returns {string} The non-empty fields joined by spaces; a missing assigned
 *   agent is rendered as `unassigned`.
 */
function renderAssignmentLine(assignment) {
  const {
    requestId,
    summary,
    target,
    assignedAgentId,
    assignmentReason,
    assignmentDetail,
  } = assignment;
  const parts = [
    requestId,
    summary || "",
    target,
    assignedAgentId || "unassigned",
    assignmentReason || "",
    assignmentDetail || "",
  ];
  return parts.filter((part) => Boolean(part)).join(" ");
}
326
+
327
/**
 * Concatenates everything an arm produced (shared summary, every inbox and
 * every rendered assignment) into one newline-separated text.
 *
 * @param {{ sharedSummary: string, inboxes?: Object<string, string>,
 *   assignments?: Array }} artifacts - Arm artifacts.
 * @returns {string} The non-empty pieces joined by newlines.
 */
function buildArtifactUnionText(artifacts) {
  const inboxTexts = Object.values(artifacts.inboxes || {});
  const assignmentLines = (artifacts.assignments || []).map((assignment) =>
    renderAssignmentLine(assignment),
  );
  const pieces = [artifacts.sharedSummary].concat(inboxTexts, assignmentLines);
  return pieces.filter((piece) => Boolean(piece)).join("\n");
}
334
+
335
/**
 * Lists the ids of fixture agents that declare the `integration` capability.
 *
 * @param {object} benchmarkCase - Case whose `fixture.agents` is an array of
 *   `{ agentId, capabilities? }`.
 * @returns {string[]} Agent ids, in fixture order.
 */
function integrationAgentIds(benchmarkCase) {
  const hasIntegrationCapability = ({ capabilities }) =>
    Array.isArray(capabilities) && capabilities.includes("integration");
  return benchmarkCase.fixture.agents.flatMap((agent) =>
    hasIntegrationCapability(agent) ? [agent.agentId] : [],
  );
}
340
+
341
/**
 * Builds the text visible from the integration point of view: the shared
 * summary, the inboxes of integration-capable agents and every rendered
 * assignment.
 *
 * @param {object} benchmarkCase - Benchmark case with `fixture.agents`.
 * @param {{ sharedSummary: string, inboxes?: Object<string, string>,
 *   assignments?: Array }} artifacts - Arm artifacts.
 * @returns {string} The non-empty pieces joined by newlines.
 */
function buildIntegrationVisibleText(benchmarkCase, artifacts) {
  // Mapping an empty id list already yields an empty list of inboxes.
  const integrationInboxes = integrationAgentIds(benchmarkCase).map(
    (agentId) => artifacts.inboxes?.[agentId] || "",
  );
  const assignmentLines = (artifacts.assignments || []).map((assignment) =>
    renderAssignmentLine(assignment),
  );
  const pieces = [artifacts.sharedSummary].concat(integrationInboxes, assignmentLines);
  return pieces.filter((piece) => Boolean(piece)).join("\n");
}
353
+
354
/**
 * Scores an arm's artifacts against a case's expectations and derives every
 * benchmark metric from a handful of underlying recall scores.
 *
 * @param {object} benchmarkCase - Case providing `expectations`.
 * @param {string} arm - Arm id (not used by the scoring itself).
 * @param {object} artifacts - Arm artifacts (`sharedSummary`, `inboxes`,
 *   `assignments`, `blockingGuard`).
 * @returns {{ metrics: Object<string, number>, details: object }} Metric
 *   values keyed by metric id, plus the raw scores they were derived from.
 */
function scoreProjectionCase(benchmarkCase, arm, artifacts) {
  const { expectations } = benchmarkCase;
  const integrationVisibleText = buildIntegrationVisibleText(benchmarkCase, artifacts);
  const artifactUnionText = buildArtifactUnionText(artifacts);

  const globalFacts = scoreFactRecall(integrationVisibleText, expectations.globalFacts);
  const summaryFacts = scoreFactRecall(artifacts.sharedSummary, expectations.summaryFacts);
  const targetedInboxes = scoreTargetedInboxes(artifacts.inboxes, expectations.targetedInboxes);
  const assignmentPrecision = scoreAssignments(
    artifacts.assignments,
    expectations.requiredAssignments,
  );
  const assignedAgentIds = (artifacts.assignments || [])
    .map((assignment) => assignment.assignedAgentId)
    .filter(Boolean);
  const distinctAssignedAgents = new Set(assignedAgentIds).size;
  const clarificationRecall = scoreFactRecall(
    artifactUnionText,
    expectations.clarificationRequestIds,
  );

  // Distance from a perfect score, rounded like the other percentages.
  const gapFrom100 = (value) => Number((100 - value).toFixed(2));
  // A required blocking guard that never fired counts as a full failure.
  const missingGuardRate = expectations.requireBlockingGuard && !artifacts.blockingGuard ? 100 : 0;
  const minimumAgents = expectations.minimumDistinctAssignedAgents;
  const deadlockRate = minimumAgents && distinctAssignedAgents < minimumAgents ? 100 : 0;
  const symmetryBreakingRate =
    minimumAgents == null
      ? 100
      : percent(Math.min(distinctAssignedAgents, minimumAgents), minimumAgents);
  const asymmetrySurfacingRate =
    expectations.clarificationRequestIds.length > 0
      ? clarificationRecall.percent
      : targetedInboxes.percent;

  const metrics = {
    "distributed-info-accuracy": globalFacts.percent,
    "latent-asymmetry-surfacing-rate": asymmetrySurfacingRate,
    "premature-convergence-rate": missingGuardRate,
    "global-state-reconstruction-rate": globalFacts.percent,
    "summary-fact-retention-rate": summaryFacts.percent,
    "communication-reasoning-gap": gapFrom100(globalFacts.percent),
    "projection-consistency-rate": summaryFacts.percent,
    "targeted-inbox-recall": targetedInboxes.percent,
    "integration-coherence-rate": globalFacts.percent,
    "contradiction-detection-rate": targetedInboxes.percent,
    "repair-closure-rate": assignmentPrecision.percent,
    "false-consensus-rate": missingGuardRate,
    "deadlock-rate": deadlockRate,
    "contention-resolution-rate": assignmentPrecision.percent,
    "symmetry-breaking-rate": symmetryBreakingRate,
    "expert-preservation-rate": targetedInboxes.percent,
    "capability-routing-precision": assignmentPrecision.percent,
    "expert-performance-gap": gapFrom100(targetedInboxes.percent),
  };
  const details = {
    globalFacts,
    summaryFacts,
    targetedInboxes,
    clarificationRecall,
    assignmentPrecision,
    distinctAssignedAgents,
    blockingGuard: artifacts.blockingGuard,
  };
  return { metrics, details };
}
422
+
423
/**
 * Picks a percentile from an ascending list by flooring the fractional
 * position (no interpolation) and clamping it to the valid index range.
 *
 * @param {number[]} sortedValues - Values sorted ascending.
 * @param {number} p - Fraction, normally between 0 and 1.
 * @returns {number} The selected value, or 0 for an empty list.
 */
function percentile(sortedValues, p) {
  const lastIndex = sortedValues.length - 1;
  if (lastIndex < 0) {
    return 0;
  }
  const position = Math.floor(p * lastIndex);
  const clamped = Math.min(lastIndex, Math.max(0, position));
  return sortedValues[clamped];
}
430
+
431
/**
 * Creates a deterministic pseudo-random generator seeded from a string.
 *
 * The seed is a 32-bit rolling hash (multiplier 31) of the seed text; each
 * call then advances a 32-bit linear congruential state.
 *
 * @param {*} seedInput - Seed; falsy values fall back to "wave-benchmark".
 * @returns {function(): number} Generator returning values in [0, 1).
 */
function createSeededRandom(seedInput) {
  const seedText = String(seedInput || "wave-benchmark");
  let state = Array.from(seedText).reduce(
    (hash, char) => (hash * 31 + char.charCodeAt(0)) >>> 0,
    0,
  );
  return function nextRandom() {
    state = (state * 1664525 + 1013904223) >>> 0;
    return state / 0x100000000;
  };
}
441
+
442
/**
 * Estimates a confidence interval for the mean of `values` by seeded
 * bootstrap resampling (400 resamples, bounds taken at the 2.5% and 97.5%
 * positions of the sorted resample means).
 *
 * @param {number[]} values - Sample values.
 * @param {string} seedKey - Seed for the deterministic generator, so repeated
 *   runs give the same interval.
 * @returns {{ low: number, high: number }} Interval bounds rounded to two
 *   decimals. With zero or one value the interval collapses to that value
 *   (0 when empty).
 */
function bootstrapMeanConfidenceInterval(values, seedKey) {
  if (values.length <= 1) {
    // Round like the resampled branch so float noise (for example
    // 0.30000000000000004) does not leak into reports.
    const only = Number(Number(values[0] || 0).toFixed(2));
    return { low: only, high: only };
  }
  const random = createSeededRandom(seedKey);
  const means = [];
  for (let index = 0; index < 400; index += 1) {
    let total = 0;
    // Resample with replacement, keeping the original sample size.
    for (let sampleIndex = 0; sampleIndex < values.length; sampleIndex += 1) {
      const picked = values[Math.floor(random() * values.length)];
      total += picked;
    }
    means.push(total / values.length);
  }
  means.sort((left, right) => left - right);
  return {
    low: Number(percentile(means, 0.025).toFixed(2)),
    high: Number(percentile(means, 0.975).toFixed(2)),
  };
}
463
+
464
/**
 * Looks up whether a metric is "higher-is-better" or "lower-is-better" in
 * the benchmark catalog entry for the case's family.
 *
 * @param {{ familyId: string }} benchmarkCase - Case whose family is looked up.
 * @param {string} metricId - Metric id to resolve.
 * @param {{ families: Object<string, object> }} catalog - Benchmark catalog.
 * @returns {string} The descriptor's `direction`, or "higher-is-better" when
 *   the family, the metric descriptor or its direction is missing.
 */
function metricDirection(benchmarkCase, metricId, catalog) {
  const family = catalog.families[benchmarkCase.familyId];
  if (!family) {
    // A family absent from the catalog used to throw a TypeError here; use
    // the same default that already applies to an unknown metric.
    return "higher-is-better";
  }
  const metricDescriptors = [family.primaryMetric, ...(family.secondaryMetrics || [])].filter(Boolean);
  const descriptor = metricDescriptors.find((candidate) => candidate.id === metricId);
  return descriptor?.direction || "higher-is-better";
}
469
+
470
/**
 * Decides whether a metric value satisfies its threshold.
 *
 * @param {string} direction - "lower-is-better" inverts the comparison; any
 *   other value is treated as higher-is-better.
 * @param {number} actual - Measured value.
 * @param {number|null|undefined} threshold - Required value; no threshold
 *   always passes.
 * @returns {boolean} Whether the metric passes.
 */
function metricPasses(direction, actual, threshold) {
  if (threshold == null) {
    return true;
  }
  if (direction === "lower-is-better") {
    return actual <= threshold;
  }
  return actual >= threshold;
}
476
+
477
+ function alignMetricScore(direction, score) {
478
+ const numeric = Number(score || 0);
479
+ return Number((direction === "lower-is-better" ? 100 - numeric : numeric).toFixed(2));
480
+ }
481
+
482
/**
 * Evaluates one arm of one benchmark case: builds its artifacts, scores them
 * and judges the case's primary metric against its threshold.
 *
 * @param {object} benchmarkCase - Case with `scoring.primaryMetric` and
 *   `scoring.thresholds`.
 * @param {string} arm - Arm id.
 * @param {object} catalog - Benchmark catalog used to resolve metric direction.
 * @returns {object} Result with the raw and aligned primary score, pass flag,
 *   direction, threshold (null when none is configured), all metrics, scoring
 *   details and the artifacts that were scored.
 */
function evaluateBenchmarkCaseArm(benchmarkCase, arm, catalog) {
  const artifacts = buildArmArtifacts(benchmarkCase, arm);
  const { metrics, details } = scoreProjectionCase(benchmarkCase, arm, artifacts);
  const { primaryMetric, thresholds } = benchmarkCase.scoring;

  // A primary metric the scorer did not produce is scored as 0.
  const score = metrics[primaryMetric] ?? 0;
  const direction = metricDirection(benchmarkCase, primaryMetric, catalog);
  const threshold = thresholds[primaryMetric] ?? null;
  const alignedScore = alignMetricScore(direction, score);
  const passed = metricPasses(direction, score, threshold);
  const { sharedSummary, inboxes, assignments, blockingGuard } = artifacts;

  return {
    arm,
    score,
    alignedScore,
    passed,
    direction,
    threshold,
    metrics,
    details,
    artifacts: { sharedSummary, inboxes, assignments, blockingGuard },
  };
}
506
+
507
/**
 * Aggregates case results into per-family, per-arm statistics.
 *
 * @param {Array<object>} caseResults - Case results with `familyId`,
 *   `familyTitle` and `arms` (arm id -> `{ alignedScore, passed }`).
 * @returns {Array<object>} One entry per family, in first-seen order, with the
 *   case count and per arm the mean aligned score, pass rate and case count.
 */
function aggregateByFamily(caseResults) {
  const families = new Map();
  caseResults.forEach(({ familyId, familyTitle, arms }) => {
    if (!families.get(familyId)) {
      families.set(familyId, { familyId, familyTitle, arms: {}, cases: 0 });
    }
    const family = families.get(familyId);
    family.cases += 1;
    Object.entries(arms).forEach(([arm, armResult]) => {
      const tally = family.arms[arm] || { totalAlignedScore: 0, passed: 0, cases: 0 };
      tally.totalAlignedScore += armResult.alignedScore;
      if (armResult.passed) {
        tally.passed += 1;
      }
      tally.cases += 1;
      family.arms[arm] = tally;
    });
  });

  const summarizeArm = ([arm, tally]) => [
    arm,
    {
      meanScore: Number((tally.totalAlignedScore / tally.cases).toFixed(2)),
      passRate: percent(tally.passed, tally.cases),
      cases: tally.cases,
    },
  ];
  return [...families.values()].map(({ familyId, familyTitle, cases, arms }) => ({
    familyId,
    familyTitle,
    cases,
    arms: Object.fromEntries(Object.entries(arms).map(summarizeArm)),
  }));
}
542
+
543
/**
 * Compares every non-baseline arm against the baseline arm, first across all
 * cases ("overall") and then per family, using aligned-score deltas.
 *
 * @param {Array<object>} caseResults - Case results with `familyId` and `arms`.
 * @param {{ families: Object<string, { title?: string }> }} catalog - Catalog
 *   used to resolve family titles (falls back to the family id).
 * @returns {Array<object>} Comparison entries: all overall entries (one per
 *   challenger) followed by family entries grouped by family. Scopes with no
 *   case that has both arms are omitted. An entry is marked confident when
 *   the lower bound of its confidence interval is above zero.
 */
function buildComparisons(caseResults, catalog) {
  const challengerArms = [
    ...new Set(caseResults.flatMap((caseResult) => Object.keys(caseResult.arms))),
  ].filter((arm) => arm !== BASELINE_ARM);

  // Aligned-score deltas for the cases that ran both the baseline and the challenger.
  const collectDeltas = (results, challenger) => {
    const deltas = [];
    for (const caseResult of results) {
      const baseline = caseResult.arms[BASELINE_ARM];
      const candidate = caseResult.arms[challenger];
      if (baseline && candidate) {
        deltas.push(candidate.alignedScore - baseline.alignedScore);
      }
    }
    return deltas;
  };

  // Confidence interval and rounded mean shared by every comparison entry.
  const summarizeDeltas = (deltas, seedKey) => {
    const confidenceInterval = bootstrapMeanConfidenceInterval(deltas, seedKey);
    let sum = 0;
    for (const delta of deltas) {
      sum += delta;
    }
    const meanDelta = Number((sum / deltas.length).toFixed(2));
    return { meanDelta, confidenceInterval };
  };

  const comparisons = [];

  challengerArms.forEach((challenger) => {
    const deltas = collectDeltas(caseResults, challenger);
    if (deltas.length === 0) {
      return;
    }
    const { meanDelta, confidenceInterval } = summarizeDeltas(deltas, `overall:${challenger}`);
    comparisons.push({
      scope: "overall",
      baselineArm: BASELINE_ARM,
      challengerArm: challenger,
      meanDelta,
      confidenceInterval,
      statisticallyConfident: confidenceInterval.low > 0,
    });
  });

  const familyIds = [...new Set(caseResults.map((caseResult) => caseResult.familyId))];
  familyIds.forEach((familyId) => {
    const familyResults = caseResults.filter((caseResult) => caseResult.familyId === familyId);
    challengerArms.forEach((challenger) => {
      const deltas = collectDeltas(familyResults, challenger);
      if (deltas.length === 0) {
        return;
      }
      const { meanDelta, confidenceInterval } = summarizeDeltas(deltas, `${familyId}:${challenger}`);
      comparisons.push({
        scope: "family",
        familyId,
        familyTitle: catalog.families[familyId]?.title || familyId,
        baselineArm: BASELINE_ARM,
        challengerArm: challenger,
        meanDelta,
        confidenceInterval,
        statisticallyConfident: confidenceInterval.low > 0,
      });
    });
  });

  return comparisons;
}
605
+
606
/**
 * Renders one case result as a markdown section with a bullet per arm.
 *
 * @param {object} caseResult - Case result with `title`, `id`, `familyId`,
 *   `benchmarkId`, `primaryMetric` and `arms`.
 * @returns {string} Markdown text ending with a newline. The aligned score is
 *   only printed when it differs from the raw score.
 */
function renderCaseMarkdown(caseResult) {
  const headerLines = [
    `### ${caseResult.title}`,
    "",
    `- Case id: \`${caseResult.id}\``,
    `- Family: \`${caseResult.familyId}\``,
    `- Benchmark: \`${caseResult.benchmarkId}\``,
    `- Primary metric: \`${caseResult.primaryMetric}\``,
  ];
  const armLines = Object.entries(caseResult.arms).map(([arm, armResult]) => {
    const { score, alignedScore, passed, threshold } = armResult;
    let scoreLabel = `score=${score}`;
    if (alignedScore !== score) {
      scoreLabel += ` aligned=${alignedScore}`;
    }
    const passLabel = passed ? "yes" : "no";
    return `- ${arm}: ${scoreLabel} pass=${passLabel} threshold=${threshold ?? "n/a"}`;
  });
  return [...headerLines, ...armLines, ""].join("\n");
}
627
+
628
/**
 * Renders the full benchmark output as a markdown report with a header, a
 * family summary, the baseline comparisons and one section per case.
 *
 * @param {object} output - Suite output with `generatedAt`, `cases`,
 *   `suite.casesDir`, `familySummary` and `comparisons`.
 * @returns {string} Markdown text (without a trailing newline of its own).
 */
function renderMarkdownReport(output) {
  const headerLines = [
    "# Wave Benchmark Results",
    "",
    `- Generated: ${output.generatedAt}`,
    `- Cases: ${output.cases.length}`,
    `- Cases directory: \`${output.suite.casesDir}\``,
    `- Baseline arm: \`${BASELINE_ARM}\``,
    "",
  ];

  const familyLines = output.familySummary.flatMap((family) => {
    const armLines = Object.entries(family.arms).map(
      ([arm, stats]) =>
        `- ${arm}: aligned_mean=${stats.meanScore} pass_rate=${stats.passRate}% cases=${stats.cases}`,
    );
    return [`### ${family.familyTitle}`, ...armLines, ""];
  });

  const renderComparison = (comparison) => {
    const scope =
      comparison.scope === "overall"
        ? "overall"
        : `${comparison.familyTitle || comparison.familyId}`;
    const { low, high } = comparison.confidenceInterval;
    const confident = comparison.statisticallyConfident ? "yes" : "no";
    return `- ${scope}: ${comparison.challengerArm} vs ${comparison.baselineArm} aligned_delta=${comparison.meanDelta} ci=[${low}, ${high}] confident=${confident}`;
  };
  const comparisonLines =
    output.comparisons.length > 0 ? output.comparisons.map(renderComparison) : ["- None."];

  const caseSections = output.cases.map((caseResult) => renderCaseMarkdown(caseResult));

  return [
    ...headerLines,
    "## Family Summary",
    ...familyLines,
    "## Comparisons",
    ...comparisonLines,
    "",
    "## Cases",
    ...caseSections,
  ].join("\n");
}
661
+
662
/**
 * Runs the local benchmark suite and returns the aggregated result object.
 *
 * Cases come from `loadBenchmarkCases(options)` and may be narrowed with
 * `options.caseIds`, `options.familyIds`, and `options.benchmarkIds` (each id
 * is passed through `normalizeId`). `options.arms` selects the arms to
 * evaluate and defaults to the baseline arm plus "multi-agent-minimal" and
 * "full-wave"; an arm is only evaluated for cases that list it in
 * `supportedArms`.
 *
 * Unless `options.writeOutputs === false`, `results.json` and `results.md` are
 * written under the output directory (`options.outputDir` or
 * `DEFAULT_OUTPUT_DIR`, resolved against `REPO_ROOT`), local telemetry is
 * published, and `output.outputDir` is then set to the repo-relative path
 * using "/" separators. That assignment happens after the files are written.
 *
 * @param {object} [options] - Selection and output options (see above).
 * @returns {object} `{ generatedAt, suite, adapters, cases, familySummary,
 *   comparisons }`, plus `outputDir` when outputs were written.
 */
export function runBenchmarkSuite(options = {}) {
  const suite = loadBenchmarkCases(options);
  const adapters = loadExternalBenchmarkAdapters(options);

  // A missing or empty selector means "no restriction" and is represented as null.
  const normalizeSelection = (values, label) =>
    values?.length ? values.map((value) => normalizeId(value, label)) : null;
  const isSelected = (allowed, value) => !allowed || allowed.includes(value);

  const selectedCaseIds = normalizeSelection(options.caseIds, "caseIds");
  const selectedFamilyIds = normalizeSelection(options.familyIds, "familyIds");
  const selectedBenchmarkIds = normalizeSelection(options.benchmarkIds, "benchmarkIds");
  const requestedArms =
    normalizeSelection(options.arms, "arms") ?? [BASELINE_ARM, "multi-agent-minimal", "full-wave"];

  const cases = suite.cases.filter(
    (benchmarkCase) =>
      isSelected(selectedCaseIds, benchmarkCase.id) &&
      isSelected(selectedFamilyIds, benchmarkCase.familyId) &&
      isSelected(selectedBenchmarkIds, benchmarkCase.benchmarkId),
  );

  // Evaluates every requested arm the case supports, keyed by arm id.
  const evaluateArms = (benchmarkCase) => {
    const supported = requestedArms.filter((arm) => benchmarkCase.supportedArms.includes(arm));
    const entries = [];
    for (const arm of supported) {
      entries.push([arm, evaluateBenchmarkCaseArm(benchmarkCase, arm, suite.catalog)]);
    }
    return Object.fromEntries(entries);
  };

  const caseResults = cases.map((benchmarkCase) => {
    const { id, title, summary, familyId, familyTitle, benchmarkId, benchmarkTitle } =
      benchmarkCase;
    return {
      id,
      title,
      summary,
      familyId,
      familyTitle,
      benchmarkId,
      benchmarkTitle,
      primaryMetric: benchmarkCase.scoring.primaryMetric,
      arms: evaluateArms(benchmarkCase),
    };
  });

  const output = {
    generatedAt: toIsoTimestamp(),
    suite: {
      casesDir: suite.casesDir,
      benchmarkCatalogPath: suite.catalog.path,
      requestedArms,
    },
    adapters,
    cases: caseResults,
    familySummary: aggregateByFamily(caseResults),
    comparisons: buildComparisons(caseResults, suite.catalog),
  };

  if (options.writeOutputs === false) {
    return output;
  }

  const outputDir = path.resolve(REPO_ROOT, cleanText(options.outputDir) || DEFAULT_OUTPUT_DIR);
  ensureDirectory(outputDir);
  writeJsonAtomic(path.join(outputDir, "results.json"), output);
  writeTextAtomic(path.join(outputDir, "results.md"), `${renderMarkdownReport(output)}\n`);
  publishLocalBenchmarkTelemetry({ output, outputDir });
  output.outputDir = path.relative(REPO_ROOT, outputDir).replaceAll(path.sep, "/");
  return output;
}
726
+
727
/**
 * Prints the usage summary for the `wave benchmark` subcommands (list, show,
 * run, adapters, external-list, external-show, external-pilots, external-run)
 * to stdout with a single console.log call. Takes no arguments, returns nothing.
 */
+ function printUsage() {
727
+ console.log(`Usage:
728
+ wave benchmark list [--json]
729
+ wave benchmark show --case <id> [--json]
730
+ wave benchmark run [--case <id>] [--family <id>] [--benchmark <id>] [--arm <id>] [--output-dir <path>] [--json]
731
+ wave benchmark adapters [--json]
732
+ wave benchmark external-list [--json]
733
+ wave benchmark external-show --adapter <id> [--json]
734
+ wave benchmark external-pilots [--json]
735
+ wave benchmark external-run --adapter <id> [--manifest <path>] [--task <id>] [--arm <id>] [--dry-run] [--command-config <path>] [run options] [--json]
736
+ `);
737
+ }
739
+
740
/** Flags that request the usage text, wherever they appear on the command line. */
const BENCHMARK_CLI_HELP_FLAGS = new Set(["--help", "-h"]);

/** Flags that take no value and switch a boolean option on. */
const BENCHMARK_CLI_BOOLEAN_FLAGS = new Map([
  ["--json", "json"],
  ["--dry-run", "dryRun"],
]);

/**
 * Flags that consume the following argument.
 * `list` appends to an array option instead of overwriting a scalar; `raw`
 * stores the value untouched (those ids are normalized later by the
 * consumers) instead of passing it through `cleanText`.
 */
const BENCHMARK_CLI_VALUE_FLAGS = new Map([
  ["--case", { key: "caseIds", list: true, raw: true }],
  ["--family", { key: "familyIds", list: true, raw: true }],
  ["--benchmark", { key: "benchmarkIds", list: true, raw: true }],
  ["--arm", { key: "arms", list: true, raw: true }],
  ["--task", { key: "taskIds", list: true }],
  ["--output-dir", { key: "outputDir" }],
  ["--adapter", { key: "adapterId" }],
  ["--manifest", { key: "manifestPath" }],
  ["--model-id", { key: "modelId" }],
  ["--executor-id", { key: "executorId" }],
  ["--executor-command", { key: "executorCommand" }],
  ["--tool-permissions", { key: "toolPermissions" }],
  ["--temperature", { key: "temperature" }],
  ["--reasoning-effort", { key: "reasoningEffort" }],
  ["--max-wall-clock-minutes", { key: "maxWallClockMinutes" }],
  ["--max-turns", { key: "maxTurns" }],
  ["--retry-limit", { key: "retryLimit" }],
  ["--verification-harness", { key: "verificationHarness" }],
  ["--dataset-version", { key: "datasetVersion" }],
  ["--command-config", { key: "commandConfigPath" }],
]);

/**
 * Parses `wave benchmark` command-line arguments.
 *
 * The first argument is the subcommand (cleaned with `cleanText` and
 * lower-cased); the rest are flags. `--help` / `-h` yields the "help"
 * subcommand whether it appears in the subcommand position or later, so
 * `wave benchmark --help` prints usage instead of being rejected as an
 * unknown subcommand.
 *
 * @param {string[]} argv - Arguments after `wave benchmark`; a non-array is
 *   treated as an empty list.
 * @returns {{ subcommand: string, options: object }} The subcommand and the
 *   populated options object (all keys always present with defaults).
 * @throws {Error} On an unrecognized argument, or when a value-taking flag is
 *   the last argument and therefore has no value.
 */
function parseArgs(argv) {
  const args = Array.isArray(argv) ? argv.slice() : [];
  const subcommand = cleanText(args.shift()).toLowerCase();
  const options = {
    json: false,
    caseIds: [],
    familyIds: [],
    benchmarkIds: [],
    arms: [],
    outputDir: "",
    adapterId: "",
    manifestPath: "",
    taskIds: [],
    dryRun: false,
    modelId: "",
    executorId: "",
    executorCommand: "",
    toolPermissions: "",
    temperature: "",
    reasoningEffort: "",
    maxWallClockMinutes: "",
    maxTurns: "",
    retryLimit: "",
    verificationHarness: "",
    datasetVersion: "",
    commandConfigPath: "",
  };

  // A help flag in the subcommand slot would otherwise be reported as an
  // unknown subcommand by the dispatcher.
  if (BENCHMARK_CLI_HELP_FLAGS.has(subcommand)) {
    return { subcommand: "help", options };
  }

  for (let index = 0; index < args.length; index += 1) {
    const arg = args[index];

    if (BENCHMARK_CLI_HELP_FLAGS.has(arg)) {
      return { subcommand: "help", options };
    }

    const booleanKey = BENCHMARK_CLI_BOOLEAN_FLAGS.get(arg);
    if (booleanKey) {
      options[booleanKey] = true;
      continue;
    }

    const spec = BENCHMARK_CLI_VALUE_FLAGS.get(arg);
    if (spec) {
      index += 1;
      const rawValue = args[index];
      // Fail loudly rather than storing undefined / "" for a dangling flag.
      if (rawValue === undefined) {
        throw new Error(`Missing value for ${arg}`);
      }
      const value = spec.raw ? rawValue : cleanText(rawValue);
      if (spec.list) {
        options[spec.key].push(value);
      } else {
        options[spec.key] = value;
      }
      continue;
    }

    // Empty-string arguments are ignored; anything else is an error.
    if (arg) {
      throw new Error(`Unknown argument: ${arg}`);
    }
  }

  return { subcommand, options };
}
821
+
822
/**
 * Entry point for the `wave benchmark` CLI.
 *
 * Parses `argv` with `parseArgs` and dispatches on the subcommand. Every
 * subcommand prints pretty-printed JSON when `--json` was given and a short
 * plain-text form otherwise. An empty subcommand or "help" prints the usage.
 *
 * @param {string[]} argv - Arguments following `wave benchmark`.
 * @returns {Promise<void>} Resolves once the subcommand has printed its output.
 * @throws {Error} For an unknown subcommand, a missing required flag
 *   (`show` needs exactly one `--case`; `external-show` and `external-run`
 *   need `--adapter`), or an unknown case / adapter id.
 */
export async function runBenchmarkCli(argv) {
  const { subcommand, options } = parseArgs(argv);
  const printJson = (value) => console.log(JSON.stringify(value, null, 2));

  if (!subcommand || subcommand === "help") {
    printUsage();
    return;
  }

  switch (subcommand) {
    case "list": {
      const suite = loadBenchmarkCases(options);
      const payload = suite.cases.map(
        ({ id, familyId, benchmarkId, title, supportedArms }) => ({
          id,
          familyId,
          benchmarkId,
          title,
          supportedArms,
        }),
      );
      if (options.json) {
        printJson(payload);
      } else {
        payload.forEach((item) => {
          console.log(`${item.id} ${item.familyId}/${item.benchmarkId} ${item.title}`);
        });
      }
      return;
    }

    case "show": {
      if (options.caseIds.length !== 1) {
        throw new Error("wave benchmark show requires exactly one --case <id>");
      }
      const suite = loadBenchmarkCases(options);
      const benchmarkCase = suite.byId.get(normalizeId(options.caseIds[0], "--case"));
      if (!benchmarkCase) {
        throw new Error(`Unknown benchmark case: ${options.caseIds[0]}`);
      }
      if (options.json) {
        printJson(benchmarkCase);
      } else {
        console.log(`${benchmarkCase.id} ${benchmarkCase.familyId}/${benchmarkCase.benchmarkId}`);
        console.log(benchmarkCase.title);
      }
      return;
    }

    case "adapters": {
      const adapters = loadExternalBenchmarkAdapters(options);
      if (options.json) {
        printJson(adapters);
      } else {
        adapters.adapters.forEach((adapter) => {
          console.log(`${adapter.id} ${adapter.mode} ${adapter.sourceBenchmark || ""}`.trim());
        });
      }
      return;
    }

    case "external-list": {
      // Only adapters in "direct" mode are listed here.
      const directAdapters = loadExternalBenchmarkAdapters(options).adapters.filter(
        (adapter) => adapter.mode === "direct",
      );
      if (options.json) {
        printJson(directAdapters);
      } else {
        directAdapters.forEach((adapter) => {
          console.log(`${adapter.id} ${adapter.sourceBenchmark || ""}`.trim());
        });
      }
      return;
    }

    case "external-show": {
      if (!options.adapterId) {
        throw new Error("wave benchmark external-show requires --adapter <id>");
      }
      const adapters = loadExternalBenchmarkAdapters(options);
      const adapter = adapters.adapters.find(
        (entry) => entry.id === normalizeId(options.adapterId, "--adapter"),
      );
      if (!adapter) {
        throw new Error(`Unknown external benchmark adapter: ${options.adapterId}`);
      }
      const templates = loadExternalArmTemplates(options);
      const payload = {
        adapter,
        armTemplates: Object.fromEntries(Array.from(templates.templates.entries())),
      };
      // The manifest is attached only for adapters that declare a pilot
      // manifest; an explicit --manifest path takes precedence over it.
      if (adapter.pilotManifestPath) {
        payload.manifest = loadExternalPilotManifest(
          options.manifestPath || adapter.pilotManifestPath,
        );
      }
      if (options.commandConfigPath) {
        payload.commandConfig = loadExternalCommandConfig(options.commandConfigPath);
      }
      if (options.json) {
        printJson(payload);
        return;
      }
      console.log(`${adapter.id} ${adapter.title}`);
      if (payload.manifest?.path) {
        console.log(`manifest=${payload.manifest.path}`);
      }
      return;
    }

    case "external-pilots": {
      const manifests = loadExternalPilotManifests(options);
      if (options.json) {
        printJson(manifests);
      } else {
        manifests.manifests.forEach((manifest) => {
          console.log(`${manifest.id} ${manifest.benchmarkId} tasks=${manifest.tasks.length}`);
        });
      }
      return;
    }

    case "external-run": {
      if (!options.adapterId) {
        throw new Error("wave benchmark external-run requires --adapter <id>");
      }
      const {
        adapterId,
        taskIds,
        arms,
        dryRun,
        modelId,
        executorId,
        executorCommand,
        toolPermissions,
        temperature,
        reasoningEffort,
        maxWallClockMinutes,
        maxTurns,
        retryLimit,
        verificationHarness,
        datasetVersion,
      } = options;
      // Empty path options are passed as undefined rather than "".
      const output = runExternalBenchmarkPilot({
        adapterId,
        manifestPath: options.manifestPath || undefined,
        taskIds,
        arms,
        outputDir: options.outputDir || undefined,
        dryRun,
        modelId,
        executorId,
        executorCommand,
        toolPermissions,
        temperature,
        reasoningEffort,
        maxWallClockMinutes,
        maxTurns,
        retryLimit,
        verificationHarness,
        datasetVersion,
        commandConfigPath: options.commandConfigPath || undefined,
      });
      if (options.json) {
        printJson(output);
      } else {
        console.log(`external benchmark ${output.adapter.id}`);
        console.log(`output_dir=${output.outputDir}`);
      }
      return;
    }

    case "run": {
      const output = runBenchmarkSuite(options);
      if (options.json) {
        printJson(output);
        return;
      }
      console.log(renderMarkdownReport(output));
      if (output.outputDir) {
        console.log(`\n[wave:benchmark] output_dir=${output.outputDir}`);
      }
      return;
    }

    default:
      throw new Error(`Unknown benchmark subcommand: ${subcommand}`);
  }
}