@chllming/wave-orchestration 0.6.2 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116) hide show
  1. package/CHANGELOG.md +64 -1
  2. package/README.md +44 -8
  3. package/docs/agents/wave-orchestrator-role.md +50 -0
  4. package/docs/agents/wave-planner-role.md +39 -0
  5. package/docs/context7/bundles.json +9 -0
  6. package/docs/context7/planner-agent/README.md +25 -0
  7. package/docs/context7/planner-agent/manifest.json +83 -0
  8. package/docs/context7/planner-agent/papers/cooperbench-why-coding-agents-cannot-be-your-teammates-yet.md +3283 -0
  9. package/docs/context7/planner-agent/papers/dova-deliberation-first-multi-agent-orchestration-for-autonomous-research-automation.md +1699 -0
  10. package/docs/context7/planner-agent/papers/dpbench-large-language-models-struggle-with-simultaneous-coordination.md +2251 -0
  11. package/docs/context7/planner-agent/papers/incremental-planning-to-control-a-blackboard-based-problem-solver.md +1729 -0
  12. package/docs/context7/planner-agent/papers/silo-bench-a-scalable-environment-for-evaluating-distributed-coordination-in-multi-agent-llm-systems.md +3747 -0
  13. package/docs/context7/planner-agent/papers/todoevolve-learning-to-architect-agent-planning-systems.md +1675 -0
  14. package/docs/context7/planner-agent/papers/verified-multi-agent-orchestration-a-plan-execute-verify-replan-framework-for-complex-query-resolution.md +1173 -0
  15. package/docs/context7/planner-agent/papers/why-do-multi-agent-llm-systems-fail.md +5211 -0
  16. package/docs/context7/planner-agent/topics/planning-and-orchestration.md +24 -0
  17. package/docs/evals/README.md +96 -1
  18. package/docs/evals/arm-templates/README.md +13 -0
  19. package/docs/evals/arm-templates/full-wave.json +15 -0
  20. package/docs/evals/arm-templates/single-agent.json +15 -0
  21. package/docs/evals/benchmark-catalog.json +7 -0
  22. package/docs/evals/cases/README.md +47 -0
  23. package/docs/evals/cases/wave-blackboard-inbox-targeting.json +73 -0
  24. package/docs/evals/cases/wave-contradiction-conflict.json +104 -0
  25. package/docs/evals/cases/wave-expert-routing-preservation.json +69 -0
  26. package/docs/evals/cases/wave-hidden-profile-private-evidence.json +81 -0
  27. package/docs/evals/cases/wave-premature-closure-guard.json +71 -0
  28. package/docs/evals/cases/wave-silo-cross-agent-state.json +77 -0
  29. package/docs/evals/cases/wave-simultaneous-lockstep.json +92 -0
  30. package/docs/evals/cooperbench/real-world-mitigation.md +341 -0
  31. package/docs/evals/external-benchmarks.json +85 -0
  32. package/docs/evals/external-command-config.sample.json +9 -0
  33. package/docs/evals/external-command-config.swe-bench-pro.json +8 -0
  34. package/docs/evals/pilots/README.md +47 -0
  35. package/docs/evals/pilots/swe-bench-pro-public-full-wave-review-10.json +64 -0
  36. package/docs/evals/pilots/swe-bench-pro-public-pilot.json +111 -0
  37. package/docs/evals/wave-benchmark-program.md +302 -0
  38. package/docs/guides/planner.md +48 -11
  39. package/docs/plans/context7-wave-orchestrator.md +20 -0
  40. package/docs/plans/current-state.md +9 -1
  41. package/docs/plans/examples/wave-benchmark-improvement.md +108 -0
  42. package/docs/plans/examples/wave-example-live-proof.md +1 -1
  43. package/docs/plans/examples/wave-example-rollout-fidelity.md +340 -0
  44. package/docs/plans/wave-orchestrator.md +73 -11
  45. package/docs/plans/waves/reviews/wave-1-benchmark-operator.md +118 -0
  46. package/docs/reference/coordination-and-closure.md +436 -0
  47. package/docs/reference/live-proof-waves.md +25 -3
  48. package/docs/reference/npmjs-trusted-publishing.md +3 -3
  49. package/docs/reference/proof-metrics.md +90 -0
  50. package/docs/reference/runtime-config/README.md +61 -0
  51. package/docs/reference/sample-waves.md +29 -18
  52. package/docs/reference/wave-control.md +164 -0
  53. package/docs/reference/wave-planning-lessons.md +131 -0
  54. package/package.json +5 -4
  55. package/releases/manifest.json +33 -0
  56. package/scripts/research/agent-context-archive.mjs +18 -0
  57. package/scripts/research/manifests/agent-context-expanded-2026-03-22.mjs +17 -0
  58. package/scripts/research/sync-planner-context7-bundle.mjs +133 -0
  59. package/scripts/wave-autonomous.mjs +2 -4
  60. package/scripts/wave-orchestrator/adhoc.mjs +32 -11
  61. package/scripts/wave-orchestrator/artifact-schemas.mjs +232 -0
  62. package/scripts/wave-orchestrator/autonomous.mjs +27 -6
  63. package/scripts/wave-orchestrator/benchmark-cases.mjs +374 -0
  64. package/scripts/wave-orchestrator/benchmark-external.mjs +1384 -0
  65. package/scripts/wave-orchestrator/benchmark.mjs +972 -0
  66. package/scripts/wave-orchestrator/clarification-triage.mjs +78 -12
  67. package/scripts/wave-orchestrator/config.mjs +175 -0
  68. package/scripts/wave-orchestrator/control-cli.mjs +1123 -0
  69. package/scripts/wave-orchestrator/control-plane.mjs +697 -0
  70. package/scripts/wave-orchestrator/coord-cli.mjs +360 -2
  71. package/scripts/wave-orchestrator/coordination-store.mjs +211 -9
  72. package/scripts/wave-orchestrator/coordination.mjs +84 -0
  73. package/scripts/wave-orchestrator/dashboard-renderer.mjs +38 -3
  74. package/scripts/wave-orchestrator/dashboard-state.mjs +22 -0
  75. package/scripts/wave-orchestrator/evals.mjs +23 -0
  76. package/scripts/wave-orchestrator/executors.mjs +3 -2
  77. package/scripts/wave-orchestrator/feedback.mjs +55 -0
  78. package/scripts/wave-orchestrator/install.mjs +253 -26
  79. package/scripts/wave-orchestrator/launcher-closure.mjs +4 -1
  80. package/scripts/wave-orchestrator/launcher-runtime.mjs +24 -21
  81. package/scripts/wave-orchestrator/launcher.mjs +800 -35
  82. package/scripts/wave-orchestrator/package-update-notice.mjs +230 -0
  83. package/scripts/wave-orchestrator/package-version.mjs +32 -0
  84. package/scripts/wave-orchestrator/planner-context.mjs +75 -0
  85. package/scripts/wave-orchestrator/planner.mjs +2270 -136
  86. package/scripts/wave-orchestrator/proof-cli.mjs +195 -0
  87. package/scripts/wave-orchestrator/proof-registry.mjs +317 -0
  88. package/scripts/wave-orchestrator/replay.mjs +10 -4
  89. package/scripts/wave-orchestrator/retry-cli.mjs +184 -0
  90. package/scripts/wave-orchestrator/retry-control.mjs +225 -0
  91. package/scripts/wave-orchestrator/shared.mjs +26 -0
  92. package/scripts/wave-orchestrator/swe-bench-pro-task.mjs +1004 -0
  93. package/scripts/wave-orchestrator/traces.mjs +157 -2
  94. package/scripts/wave-orchestrator/wave-control-client.mjs +532 -0
  95. package/scripts/wave-orchestrator/wave-control-schema.mjs +309 -0
  96. package/scripts/wave-orchestrator/wave-files.mjs +17 -5
  97. package/scripts/wave.mjs +39 -2
  98. package/skills/repo-coding-rules/SKILL.md +1 -0
  99. package/skills/role-cont-eval/SKILL.md +1 -0
  100. package/skills/role-cont-qa/SKILL.md +13 -6
  101. package/skills/role-deploy/SKILL.md +1 -0
  102. package/skills/role-documentation/SKILL.md +4 -0
  103. package/skills/role-implementation/SKILL.md +4 -0
  104. package/skills/role-infra/SKILL.md +2 -1
  105. package/skills/role-integration/SKILL.md +15 -8
  106. package/skills/role-planner/SKILL.md +39 -0
  107. package/skills/role-planner/skill.json +21 -0
  108. package/skills/role-research/SKILL.md +1 -0
  109. package/skills/role-security/SKILL.md +2 -2
  110. package/skills/runtime-claude/SKILL.md +2 -1
  111. package/skills/runtime-codex/SKILL.md +1 -0
  112. package/skills/runtime-local/SKILL.md +2 -0
  113. package/skills/runtime-opencode/SKILL.md +1 -0
  114. package/skills/wave-core/SKILL.md +25 -6
  115. package/skills/wave-core/references/marker-syntax.md +16 -8
  116. package/wave.config.json +45 -0
@@ -0,0 +1,71 @@
1
+ {
2
+ "version": 1,
3
+ "id": "wave-premature-closure-guard",
4
+ "title": "Premature Closure Guard",
5
+ "summary": "A clarification-linked repair request remains open, so the full Wave arm should preserve a blocking guard instead of converging early.",
6
+ "familyId": "hidden-profile-pooling",
7
+ "benchmarkId": "premature-consensus-guard",
8
+ "kind": "projection",
9
+ "supportedArms": ["single-agent", "multi-agent-minimal", "full-wave"],
10
+ "scoring": {
11
+ "kind": "closure-guard",
12
+ "primaryMetric": "premature-convergence-rate",
13
+ "thresholds": {
14
+ "premature-convergence-rate": 0
15
+ },
16
+ "practicalWinThreshold": 50
17
+ },
18
+ "expectations": {
19
+ "clarificationRequestIds": ["clarify-missing-evidence"],
20
+ "requireBlockingGuard": true
21
+ },
22
+ "fixture": {
23
+ "lane": "main",
24
+ "waveNumber": 0,
25
+ "primaryAgentId": "a1",
26
+ "agents": [
27
+ {
28
+ "agentId": "a1",
29
+ "title": "Implementation Owner",
30
+ "ownedPaths": ["src/runtime.ts"],
31
+ "capabilities": ["runtime"]
32
+ },
33
+ {
34
+ "agentId": "a8",
35
+ "title": "Integration Steward",
36
+ "ownedPaths": [".tmp/main-wave-launcher/integration/wave-0.md"],
37
+ "capabilities": ["integration"]
38
+ }
39
+ ],
40
+ "records": [
41
+ {
42
+ "id": "clarify-missing-evidence",
43
+ "kind": "clarification-request",
44
+ "lane": "main",
45
+ "wave": 0,
46
+ "agentId": "a1",
47
+ "targets": ["launcher"],
48
+ "status": "open",
49
+ "priority": "high",
50
+ "artifactRefs": ["src/runtime.ts"],
51
+ "summary": "Missing evidence for restart durability",
52
+ "detail": "The implementation owner still needs proof for restart durability."
53
+ },
54
+ {
55
+ "id": "repair-missing-evidence",
56
+ "kind": "request",
57
+ "lane": "main",
58
+ "wave": 0,
59
+ "agentId": "a8",
60
+ "targets": ["agent:a1"],
61
+ "status": "open",
62
+ "priority": "high",
63
+ "artifactRefs": ["src/runtime.ts"],
64
+ "dependsOn": ["clarify-missing-evidence"],
65
+ "closureCondition": "clarification:clarify-missing-evidence",
66
+ "summary": "Repair request is still open while clarification remains unresolved",
67
+ "detail": "This request must remain blocking until the clarification chain is closed."
68
+ }
69
+ ]
70
+ }
71
+ }
@@ -0,0 +1,77 @@
1
+ {
2
+ "version": 1,
3
+ "id": "wave-silo-cross-agent-state",
4
+ "title": "Cross-Agent State Reconstruction",
5
+ "summary": "The correct diagnosis requires facts from both the API and queue owners and should only reconstruct cleanly in the full Wave arm.",
6
+ "familyId": "silo-escape",
7
+ "benchmarkId": "cross-agent-state-reconstruction",
8
+ "kind": "projection",
9
+ "supportedArms": ["single-agent", "multi-agent-minimal", "full-wave"],
10
+ "scoring": {
11
+ "kind": "state-reconstruction",
12
+ "primaryMetric": "global-state-reconstruction-rate",
13
+ "thresholds": {
14
+ "global-state-reconstruction-rate": 100,
15
+ "summary-fact-retention-rate": 100
16
+ },
17
+ "practicalWinThreshold": 20
18
+ },
19
+ "expectations": {
20
+ "globalFacts": [
21
+ "api retries are saturating the worker queue",
22
+ "queue lag only spikes after the retry fanout begins"
23
+ ],
24
+ "summaryFacts": ["api retries are saturating the worker queue"],
25
+ "targetedInboxes": {
26
+ "a1": ["queue lag only spikes after the retry fanout begins"],
27
+ "a2": ["api retries are saturating the worker queue"]
28
+ }
29
+ },
30
+ "fixture": {
31
+ "lane": "main",
32
+ "waveNumber": 0,
33
+ "primaryAgentId": "a1",
34
+ "agents": [
35
+ {
36
+ "agentId": "a1",
37
+ "title": "API Owner",
38
+ "ownedPaths": ["src/api/retries.ts"],
39
+ "capabilities": ["api"]
40
+ },
41
+ {
42
+ "agentId": "a2",
43
+ "title": "Queue Owner",
44
+ "ownedPaths": ["src/queue/worker.ts"],
45
+ "capabilities": ["queue"]
46
+ }
47
+ ],
48
+ "records": [
49
+ {
50
+ "id": "block-api-fanout",
51
+ "kind": "blocker",
52
+ "lane": "main",
53
+ "wave": 0,
54
+ "agentId": "a1",
55
+ "targets": ["agent:a2"],
56
+ "status": "open",
57
+ "priority": "high",
58
+ "artifactRefs": ["src/api/retries.ts", "src/queue/worker.ts"],
59
+ "summary": "api retries are saturating the worker queue",
60
+ "detail": "The API owner sees retry fanout but needs queue evidence to reconstruct the full state."
61
+ },
62
+ {
63
+ "id": "block-queue-lag",
64
+ "kind": "blocker",
65
+ "lane": "main",
66
+ "wave": 0,
67
+ "agentId": "a2",
68
+ "targets": ["agent:a1"],
69
+ "status": "open",
70
+ "priority": "high",
71
+ "artifactRefs": ["src/queue/worker.ts", "src/api/retries.ts"],
72
+ "summary": "queue lag only spikes after the retry fanout begins",
73
+ "detail": "The queue owner sees the lag pattern but needs the API owner's retry context."
74
+ }
75
+ ]
76
+ }
77
+ }
@@ -0,0 +1,92 @@
1
+ {
2
+ "version": 1,
3
+ "id": "wave-simultaneous-lockstep",
4
+ "title": "Lockstep Resolution",
5
+ "summary": "Two concurrent blocking requests should route to different specialists instead of collapsing into unresolved contention.",
6
+ "familyId": "simultaneous-coordination",
7
+ "benchmarkId": "lockstep-resolution",
8
+ "kind": "projection",
9
+ "supportedArms": ["single-agent", "multi-agent-minimal", "full-wave"],
10
+ "scoring": {
11
+ "kind": "simultaneous-coordination",
12
+ "primaryMetric": "contention-resolution-rate",
13
+ "thresholds": {
14
+ "contention-resolution-rate": 100,
15
+ "symmetry-breaking-rate": 100,
16
+ "deadlock-rate": 0
17
+ },
18
+ "practicalWinThreshold": 30
19
+ },
20
+ "expectations": {
21
+ "requiredAssignments": [
22
+ {
23
+ "requestId": "req-cache-guard",
24
+ "assignedAgentId": "a2"
25
+ },
26
+ {
27
+ "requestId": "req-queue-budget",
28
+ "assignedAgentId": "a3"
29
+ }
30
+ ],
31
+ "minimumDistinctAssignedAgents": 2
32
+ },
33
+ "fixture": {
34
+ "lane": "main",
35
+ "waveNumber": 0,
36
+ "primaryAgentId": "a1",
37
+ "capabilityRouting": {
38
+ "preferredAgents": {
39
+ "cache": ["a2"],
40
+ "queue": ["a3"]
41
+ }
42
+ },
43
+ "agents": [
44
+ {
45
+ "agentId": "a1",
46
+ "title": "Primary Owner",
47
+ "ownedPaths": ["src/runtime.ts"],
48
+ "capabilities": ["runtime"]
49
+ },
50
+ {
51
+ "agentId": "a2",
52
+ "title": "Cache Owner",
53
+ "ownedPaths": ["src/cache/guard.ts"],
54
+ "capabilities": ["cache"]
55
+ },
56
+ {
57
+ "agentId": "a3",
58
+ "title": "Queue Owner",
59
+ "ownedPaths": ["src/queue/budget.ts"],
60
+ "capabilities": ["queue"]
61
+ }
62
+ ],
63
+ "records": [
64
+ {
65
+ "id": "req-cache-guard",
66
+ "kind": "request",
67
+ "lane": "main",
68
+ "wave": 0,
69
+ "agentId": "a8",
70
+ "targets": ["capability:cache"],
71
+ "status": "open",
72
+ "priority": "high",
73
+ "artifactRefs": ["src/cache/guard.ts"],
74
+ "summary": "Concurrent fix one: cache guard must be updated before release",
75
+ "detail": "This blocking request should route to the cache owner."
76
+ },
77
+ {
78
+ "id": "req-queue-budget",
79
+ "kind": "request",
80
+ "lane": "main",
81
+ "wave": 0,
82
+ "agentId": "a8",
83
+ "targets": ["capability:queue"],
84
+ "status": "open",
85
+ "priority": "high",
86
+ "artifactRefs": ["src/queue/budget.ts"],
87
+ "summary": "Concurrent fix two: queue budget must be updated before release",
88
+ "detail": "This blocking request should route to the queue owner."
89
+ }
90
+ ]
91
+ }
92
+ }
@@ -0,0 +1,341 @@
1
+ ---
2
+ summary: "Comparison of CooperBench coordination failure modes against LEAP-Claw Wave 7-10 traces, with concrete examples and the wave-framework countermeasures that helped or still leaked"
3
+ read_when:
4
+ - You want to compare LEAP-Claw wave traces to the coordination failure taxonomy in CooperBench
5
+ - You need exact local message examples instead of a general impression
6
+ - You are deciding whether the wave framework mostly mitigates or still exhibits multi-agent coordination failures
7
+ title: "CooperBench Versus LEAP-Claw Waves"
8
+ ---
9
+
10
+ # CooperBench Versus LEAP-Claw Waves
11
+
12
+ This report compares the failure taxonomy from
13
+ [CooperBench](https://cooperbench.com/static/pdfs/main.pdf) with the concrete
14
+ execution history from LEAP-Claw Waves 7-10.
15
+
16
+ The short conclusion is:
17
+
18
+ - we do still see the same broad classes of coordination failure that
19
+ CooperBench describes
20
+ - the wave framework mitigates many of them by turning them into explicit,
21
+ machine-visible gate failures instead of silent merge-time corruption
22
+ - the remaining gaps are mostly around stale state, retry semantics, and
23
+ escalation timing rather than uncontrolled code conflicts
24
+
25
+ ## Scope and evidence base
26
+
27
+ This comparison uses:
28
+
29
+ - Wave 7 rerun traces and remediation notes
30
+ - Wave 8 execution-gap review
31
+ - Wave 9 and Wave 10 launcher dashboards, summaries, and coordination traces
32
+ - the current wave role prompts and wave-file structure
33
+
34
+ Primary local evidence:
35
+
36
+ - [Wave 7.1 Remediation](/home/coder/slowfast.ai/docs/plans/waves/reviews/wave-7.1-remediation.md)
37
+ - [Wave 8 Execution Gap Review](/home/coder/slowfast.ai/docs/plans/waves/reviews/wave-8-execution-gap-review.md)
38
+ - [Wave Planning Lessons](/home/coder/slowfast.ai/docs/plans/waves/reviews/wave-planning-lessons.md)
39
+ - [Wave 10](/home/coder/slowfast.ai/docs/plans/waves/wave-10.md)
40
+ - [Wave Integration Role](/home/coder/slowfast.ai/docs/agents/wave-integration-role.md)
41
+ - [Wave Documentation Role](/home/coder/slowfast.ai/docs/agents/wave-documentation-role.md)
42
+ - [Wave Evaluator Role](/home/coder/slowfast.ai/docs/agents/wave-evaluator-role.md)
43
+
44
+ ## The paper's three failure buckets
45
+
46
+ CooperBench groups coordination failure into three buckets:
47
+
48
+ 1. communication channels become noisy, late, or inaccurate
49
+ 2. agents fail to carry out or preserve their commitments
50
+ 3. agents form incorrect beliefs about what their partners did, saw, or meant
51
+
52
+ That grouping fits our traces very well.
53
+
54
+ ## 1. Communication failures: still present, but far more legible
55
+
56
+ ### What CooperBench warns about
57
+
58
+ The paper highlights communication that is vague, late, repetitive, or
59
+ incorrect. The practical problem is not merely "too much chat"; it is that
60
+ messages fail to drive timely coordinated action.
61
+
62
+ ### Exact LEAP-Claw example: routed clarification plus immediate human escalation
63
+
64
+ Wave 10 produced the clearest example.
65
+
66
+ In the same coordination chain:
67
+
68
+ - A7 asked for approved rollout drill and rollback commands
69
+ - ownership policy routed that clarification to `A1`
70
+ - the launcher still opened a human escalation immediately
71
+
72
+ The exact records are visible in the archived Wave 10 trace:
73
+
74
+ - clarification moved to `in_progress` with `detail: "Ownership policy resolved this clarification to A1."` in [coordination.raw.jsonl](/home/coder/slowfast.ai/.tmp/retry-archive/wave-10-20260322T195609Z/wave-10-traces/attempt-2/coordination.raw.jsonl#L28)
75
+ - routed follow-up opened for `agent:A1` in [coordination.raw.jsonl](/home/coder/slowfast.ai/.tmp/retry-archive/wave-10-20260322T195609Z/wave-10-traces/attempt-2/coordination.raw.jsonl#L29)
76
+ - explicit assignment to `A1` recorded in [coordination.raw.jsonl](/home/coder/slowfast.ai/.tmp/retry-archive/wave-10-20260322T195609Z/wave-10-traces/attempt-2/coordination.raw.jsonl#L30)
77
+ - a human escalation for the same issue opened immediately afterward in [coordination.raw.jsonl](/home/coder/slowfast.ai/.tmp/retry-archive/wave-10-20260322T195609Z/wave-10-traces/attempt-2/coordination.raw.jsonl#L31)
78
+
79
+ This is a genuine communication failure mode. The framework did not prevent the
80
+ duplication. It created both a machine-routed clarification and a human ticket
81
+ for the same issue before the routed path was exhausted.
82
+
83
+ ### What countered it
84
+
85
+ The wave framework still improved the situation substantially:
86
+
87
+ - the issue was recorded in durable structured logs rather than disappearing in
88
+ chat
89
+ - the queue was inspectable with `pnpm wave:feedback -- list --lane leap-claw --pending`
90
+ - the operator could answer the request with an exact command surface, and the
91
+ request file recorded that answer
92
+
93
+ So the failure was not silent. The framework converted a latent ambiguity into a
94
+ visible triage problem. That is better than raw agent-to-agent chat, but it is
95
+ still an unresolved planner bug.
96
+
97
+ ### Secondary communication example: accurate but late handoff
98
+
99
+ `A1` eventually resolved A7's question very clearly. The archived trace shows:
100
+
101
+ - `A1` handoff: `"A7 clarification answered: approved Wave 10 command surface is on disk"` in [coordination.raw.jsonl](/home/coder/slowfast.ai/.tmp/retry-archive/wave-10-20260322T195609Z/wave-10-traces/attempt-2/coordination.raw.jsonl#L37)
102
+ - `A1` resolved-by-policy note: `"Wave 10 A7 clarification resolved by published command surface and stop rules"` in [coordination.raw.jsonl](/home/coder/slowfast.ai/.tmp/retry-archive/wave-10-20260322T195609Z/wave-10-traces/attempt-2/coordination.raw.jsonl#L40)
103
+
104
+ This is a positive sign: the agents can produce good coordination messages. The
105
+ problem is reliability and timing, not total absence of the capability.
106
+
107
+ ## 2. Commitment drift: heavily mitigated, but still common
108
+
109
+ ### What CooperBench warns about
110
+
111
+ The paper highlights agents making claims they do not operationally cash out,
112
+ or failing to preserve agreed coordination points even after substantive work is
113
+ done.
114
+
115
+ ### Exact LEAP-Claw example: work landed, protocol still failed
116
+
117
+ Wave 10 `A1` shows this cleanly.
118
+
119
+ On attempt 1, the launcher failed `A1` because the final structured proof marker
120
+ was missing:
121
+
122
+ - `"Implementation exit contract blocked wave 10: Missing [wave-proof] marker for A1."` in [wave-10.json](/home/coder/slowfast.ai/.tmp/leap-claw-wave-launcher/dashboards/wave-10.json#L205)
123
+
124
+ But the agent had already landed the owned files:
125
+
126
+ - `go/internal/rollout/apply/pilot_integration_test.go`
127
+ - `go/internal/rollout/apply/rollback_switch.go`
128
+ - `docs/plans/operations/wave-10-rollout-drill.md`
129
+
130
+ Those deliverables appear in the later clean summary in [wave-10-10-a1.summary.json](/home/coder/slowfast.ai/.tmp/leap-claw-wave-launcher/status/wave-10-10-a1.summary.json#L43).
131
+
132
+ This is not "the agent did nothing." It is closer to CooperBench's commitment
133
+ drift pattern:
134
+
135
+ - the substantive implementation commitment was met
136
+ - the wave-protocol commitment was not met
137
+ - the framework therefore refused to infer completion
138
+
139
+ ### Exact LEAP-Claw example: closure agents and formatting discipline
140
+
141
+ Wave 7 exposed the same class of issue at closure level rather than
142
+ implementation level.
143
+
144
+ The remediation record states:
145
+
146
+ - structured marker parsing was too brittle for backtick-wrapped or fenced
147
+ markers in [wave-7.1-remediation.md](/home/coder/slowfast.ai/docs/plans/waves/reviews/wave-7.1-remediation.md#L17)
148
+ - local fixes then required A0, A8, and A9 to emit final markers as plain last
149
+ lines in [wave-7.1-remediation.md](/home/coder/slowfast.ai/docs/plans/waves/reviews/wave-7.1-remediation.md#L42)
150
+
151
+ Again, the framework did not stop the omission. But it did keep the omission
152
+ from becoming a false success.
153
+
154
+ ### What countered it
155
+
156
+ This is where the wave framework helps the most.
157
+
158
+ The repo now explicitly counteracts commitment drift with:
159
+
160
+ - structured marker requirements for A8, A9, and A0 in [wave-10.md](/home/coder/slowfast.ai/docs/plans/waves/wave-10.md#L50)
161
+ - explicit `### Deliverables` and `### Proof artifacts` in [wave-10.md](/home/coder/slowfast.ai/docs/plans/waves/wave-10.md#L171)
162
+ - a standing implementation skill that says landed files without required
163
+ markers are not done
164
+ - A8, A9, and A0 closure gates that refuse to treat intent as closure
165
+
166
+ So yes, we still see commitment drift. But the framework mostly catches it as a
167
+ protocol failure before the lane advances.
168
+
169
+ ## 3. Incorrect expectations: this is our biggest remaining problem
170
+
171
+ ### What CooperBench warns about
172
+
173
+ The paper's third bucket is incorrect expectations about others' plans,
174
+ observations, or communication. In practice, this causes duplicate work,
175
+ mis-sequencing, or reasoning from stale or partial state.
176
+
177
+ ### Exact LEAP-Claw example: stale status reuse in live-proof waves
178
+
179
+ Wave 8 documented this explicitly.
180
+
181
+ The review records:
182
+
183
+ - stale generated state was reused too aggressively in [wave-8-execution-gap-review.md](/home/coder/slowfast.ai/docs/plans/waves/reviews/wave-8-execution-gap-review.md#L122)
184
+ - `A3` had exited `0` without a closure-grade summary, yet that stale status
185
+ was treated as reusable in [wave-8-execution-gap-review.md](/home/coder/slowfast.ai/docs/plans/waves/reviews/wave-8-execution-gap-review.md#L128)
186
+ - `A6` reused an obsolete proof-gap summary after the missing live proof bundle
187
+ already existed in [wave-8-execution-gap-review.md](/home/coder/slowfast.ai/docs/plans/waves/reviews/wave-8-execution-gap-review.md#L130)
188
+
189
+ This maps directly to the paper's "incorrect expectations" bucket. The runtime
190
+ was effectively reasoning as if prior agent observations were still current.
191
+
192
+ ### Exact LEAP-Claw example: shared-component retry stranded sibling owners
193
+
194
+ Wave 10 retry showed an even sharper version.
195
+
196
+ After the second `A1` attempt, the clean summary explicitly said:
197
+
198
+ - `proof.state = met` in [wave-10-10-a1.summary.json](/home/coder/slowfast.ai/.tmp/leap-claw-wave-launcher/status/wave-10-10-a1.summary.json#L6)
199
+ - the remaining component gap was outside A1 and belonged to live pilot
200
+ authority in [wave-10-10-a1.summary.json](/home/coder/slowfast.ai/.tmp/leap-claw-wave-launcher/status/wave-10-10-a1.summary.json#L25)
201
+
202
+ But the dashboard still ended the wave at A1:
203
+
204
+ - `A1` ended `Exit component-gap` in [wave-10.json](/home/coder/slowfast.ai/.tmp/leap-claw-wave-launcher/dashboards/wave-10.json#L116)
205
+ - `A2` stayed pending with `"Stale status=0 ignored due to prompt drift or missing metadata"` in [wave-10.json](/home/coder/slowfast.ai/.tmp/leap-claw-wave-launcher/dashboards/wave-10.json#L139)
206
+ - `A7` stayed pending with the same stale-state message in [wave-10.json](/home/coder/slowfast.ai/.tmp/leap-claw-wave-launcher/dashboards/wave-10.json#L162)
207
+
208
+ This is not a simple code-quality problem. It is a coordination-state problem:
209
+
210
+ - the launcher knew the remaining `pilot-live` gap was sibling-owned
211
+ - the launcher still treated `A1` as the terminal failing point
212
+
213
+ That is very close to the paper's claim that agents or systems form incorrect
214
+ expectations about partner state and then act on the wrong mental model.
215
+
216
+ ### Exact LEAP-Claw example: stale integration and closure artifacts
217
+
218
+ Wave 7 also hit this category. The remediation note records:
219
+
220
+ - final closure artifacts could stay stale or synthesized instead of reflecting
221
+ the authoritative rerun in [wave-7.1-remediation.md](/home/coder/slowfast.ai/docs/plans/waves/reviews/wave-7.1-remediation.md#L21)
222
+
223
+ That is again an expectations problem: the system continued to act as if earlier
224
+ closure state was still authoritative.
225
+
226
+ ### What countered it
227
+
228
+ The wave framework pushes hard against this class of error, but it does not
229
+ eliminate it.
230
+
231
+ The main countermeasures are:
232
+
233
+ - A8 as a dedicated integration steward that checks contradictions and proof gaps
234
+ before docs and evaluation in [wave-integration-role.md](/home/coder/slowfast.ai/docs/agents/wave-integration-role.md#L35)
235
+ - A9 refusing to treat early doc updates as final if integration is not closed
236
+ in [wave-documentation-role.md](/home/coder/slowfast.ai/docs/agents/wave-documentation-role.md#L57)
237
+ - A0 treating the final closure sweep as authoritative in [wave-evaluator-role.md](/home/coder/slowfast.ai/docs/agents/wave-evaluator-role.md#L128)
238
+ - explicit proof-bundle doctrine for `pilot-live` and above in
239
+ [wave-planning-lessons.md](/home/coder/slowfast.ai/docs/plans/waves/reviews/wave-planning-lessons.md#L18)
240
+
241
+ These are real mitigations. They are why stale or wrong expectations usually
242
+ show up as blocked waves rather than false passes.
243
+
244
+ But this is still the area where the runtime leaks most.
245
+
246
+ ## 4. Failure modes we mostly avoid because of the framework
247
+
248
+ CooperBench centers on workspaces with overlapping code and partial observability.
249
+ We do share the partial-observability problem, but the wave framework avoids
250
+ some of the worst merge-time failure modes by design.
251
+
252
+ ### Resource-division failures are much rarer
253
+
254
+ Wave files impose explicit resource division.
255
+
256
+ Wave 10 does this in the open:
257
+
258
+ - A1 owns `go/internal/rollout/apply/` plus one runbook in [wave-10.md](/home/coder/slowfast.ai/docs/plans/waves/wave-10.md#L192)
259
+ - A2 owns `go/internal/rollout/shadow/`, `go/internal/cluster/view/rollout_status_test.go`, and one QA doc in [wave-10.md](/home/coder/slowfast.ai/docs/plans/waves/wave-10.md#L252)
260
+ - A7 owns the live proof bundle and review note in [wave-10.md](/home/coder/slowfast.ai/docs/plans/waves/wave-10.md#L258)
261
+
262
+ This is close to the paper's successful "resource division" pattern. The key
263
+ difference is that our framework makes the split declarative up front instead of
264
+ hoping the agents negotiate it reliably in freeform chat.
265
+
266
+ ### Role division is strong
267
+
268
+ The framework also forces role division:
269
+
270
+ - implementation agents own concrete deliverables
271
+ - A8 owns cross-agent coherence, not code delivery
272
+ - A9 owns shared-plan synchronization
273
+ - A0 owns final gate truth
274
+
275
+ That division is encoded in the wave file and standing role prompts, not only in
276
+ agent memory.
277
+
278
+ In practice, this means many failures that would become destructive code
279
+ overwrites in a looser system instead become:
280
+
281
+ - missing markers
282
+ - unresolved component gaps
283
+ - stale-state reuse bugs
284
+ - over-eager escalations
285
+
286
+ Those are still real problems, but they are safer problems.
287
+
288
+ ## 5. What the framework is actually doing
289
+
290
+ The paper argues that many systems rely on scaffolds and active supervision
291
+ rather than raw cooperative ability. That is also true here.
292
+
293
+ The wave framework is not evidence that the agents have solved social
294
+ intelligence. It is evidence that we have built stronger external scaffolding:
295
+
296
+ - explicit ownership
297
+ - explicit deliverables
298
+ - explicit proof artifacts
299
+ - explicit maturity levels
300
+ - explicit integration and evaluator gates
301
+ - durable coordination records
302
+
303
+ This scaffolding does three useful things:
304
+
305
+ 1. it reduces ambiguous coordination space
306
+ 2. it makes hidden contradictions visible
307
+ 3. it keeps many failures from being mistaken for success
308
+
309
+ That is a meaningful mitigation, but it is not the same as eliminating the
310
+ underlying coordination problem.
311
+
312
+ ## 6. Bottom line
313
+
314
+ The honest comparison is:
315
+
316
+ - yes, we still see the CooperBench failure classes in real wave traces
317
+ - no, they usually do not show up as uncontrolled agent chaos
318
+ - instead, they show up as:
319
+ - duplicated escalation paths
320
+ - missing marker failures
321
+ - stale closure or status reuse
322
+ - shared-component retry bugs
323
+
324
+ So the wave framework mostly mitigates these failures by containing them,
325
+ surfacing them, and refusing to advance the lane on bad coordination state.
326
+
327
+ What it does not yet fully solve:
328
+
329
+ - premature or duplicated escalation
330
+ - stale-state invalidation for high-maturity waves
331
+ - shared-component retry semantics once one owner becomes clean
332
+ - the gap between "agent landed a correct slice" and "the runtime moved the
333
+ whole shared component forward correctly"
334
+
335
+ That means the right claim is not "the framework solves multi-agent
336
+ coordination." The right claim is:
337
+
338
+ - it meaningfully narrows the failure surface
339
+ - it converts many soft coordination mistakes into explicit gate failures
340
+ - it still needs better runtime behavior around retries, stale state, and
341
+ escalation timing
@@ -0,0 +1,85 @@
1
+ {
2
+ "version": 1,
3
+ "adapters": [
4
+ {
5
+ "id": "swe-bench-pro",
6
+ "title": "SWE-bench Pro",
7
+ "mode": "direct",
8
+ "sourceBenchmark": "SWE-bench Pro",
9
+ "split": "public",
10
+ "pilotManifestPath": "docs/evals/pilots/swe-bench-pro-public-pilot.json",
11
+ "officialDocsUrl": "https://scaleapi.github.io/SWE-bench_Pro-os/",
12
+ "officialCodeUrl": "https://github.com/scaleapi/SWE-bench_Pro-os",
13
+ "summary": "Contamination-resistant long-horizon software engineering benchmark for public, held-out, and commercial repositories.",
14
+ "commandTemplate": "",
15
+ "metrics": ["task-success-rate", "cost-per-solved-task", "wall-clock-per-solved-task"],
16
+ "notes": [
17
+ "Use the public split for the first direct external benchmark run and rely on the official verifier for the pass/fail verdict.",
18
+ "Keep the base model, executor, and budget identical across the `single-agent` and `full-wave` arms.",
19
+ "The second direct benchmark slot is intentionally deferred until the later CooperBench pass."
20
+ ]
21
+ },
22
+ {
23
+ "id": "skillsbench-style-ablation",
24
+ "title": "SkillsBench-style Ablation",
25
+ "mode": "adapted",
26
+ "sourceBenchmark": "SkillsBench",
27
+ "summary": "Adapt the SkillsBench methodology to Wave skill bundles by comparing no skills, curated skills, and overbroad skills.",
28
+ "commandTemplate": "wave benchmark run --arm single-agent --arm multi-agent-minimal --arm full-wave",
29
+ "metrics": ["pass-rate-delta", "negative-skill-regression-rate", "runtime-cost"],
30
+ "notes": [
31
+ "This is a local adaptation rather than a direct external suite.",
32
+ "The initial repo benchmark runner ships the local corpus and registry, not the full external execution harness."
33
+ ]
34
+ },
35
+ {
36
+ "id": "evoclaw-style-sequence",
37
+ "title": "EvoClaw-style Sequence",
38
+ "mode": "adapted",
39
+ "sourceBenchmark": "EvoClaw",
40
+ "summary": "Sequence multiple dependent waves to measure long-horizon maintenance and error accumulation.",
41
+ "commandTemplate": "wave benchmark run --arm single-agent --arm full-wave --family silo-escape",
42
+ "metrics": ["milestone-pass-decay", "reopen-rate", "regression-carryover"],
43
+ "notes": [
44
+ "Use the local benchmark harness to define milestone DAGs or ordered wave sequences.",
45
+ "Best used after the deterministic coordination corpus is stable."
46
+ ]
47
+ },
48
+ {
49
+ "id": "silo-bench-style-coordination",
50
+ "title": "Silo-Bench-style Coordination",
51
+ "mode": "adapted",
52
+ "sourceBenchmark": "Silo-Bench",
53
+ "summary": "Distributed-information and communication-reasoning-gap evaluations adapted into Wave-native coordination fixtures.",
54
+ "commandTemplate": "wave benchmark run --family hidden-profile-pooling --family silo-escape",
55
+ "metrics": ["distributed-info-accuracy", "global-state-reconstruction-rate", "communication-reasoning-gap"],
56
+ "notes": [
57
+ "The shipped local cases in docs/evals/cases/ are the first adaptation layer for this family."
58
+ ]
59
+ },
60
+ {
61
+ "id": "hiddenbench-style-pooling",
62
+ "title": "HiddenBench-style Pooling",
63
+ "mode": "adapted",
64
+ "sourceBenchmark": "HiddenBench",
65
+ "summary": "Asymmetric-information tasks that focus specifically on whether decision-changing private evidence reaches shared state before closure.",
66
+ "commandTemplate": "wave benchmark run --family hidden-profile-pooling",
67
+ "metrics": ["distributed-info-accuracy", "premature-convergence-rate"],
68
+ "notes": [
69
+ "This is the recommended next coordination benchmark after the first SWE-bench Pro pilot."
70
+ ]
71
+ },
72
+ {
73
+ "id": "dpbench-style-contention",
74
+ "title": "DPBench-style Contention",
75
+ "mode": "adapted",
76
+ "sourceBenchmark": "DPBench",
77
+ "summary": "Simultaneous coordination and contention cases adapted into capability-routing and helper-assignment fixtures.",
78
+ "commandTemplate": "wave benchmark run --family simultaneous-coordination",
79
+ "metrics": ["deadlock-rate", "contention-resolution-rate", "symmetry-breaking-rate"],
80
+ "notes": [
81
+ "The initial local corpus measures the routing and blocking substrate before live concurrent execution is added."
82
+ ]
83
+ }
84
+ ]
85
+ }
@@ -0,0 +1,9 @@
1
+ {
2
+ "adapters": {
3
+ "swe-bench-pro": {
4
+ "single-agent": "external-harness run --benchmark swe-bench-pro --task {task_id} --arm {arm} --model {model_id} --executor {executor_command}",
5
+ "full-wave": "external-harness run --benchmark swe-bench-pro --task {task_id} --arm {arm} --model {model_id} --executor {executor_command}",
6
+ "verify": "external-harness verify --benchmark swe-bench-pro --task {task_id} --arm {arm}"
7
+ }
8
+ }
9
+ }