@chllming/wave-orchestration 0.5.4 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +46 -3
- package/README.md +33 -5
- package/docs/README.md +18 -4
- package/docs/agents/wave-cont-eval-role.md +36 -0
- package/docs/agents/{wave-evaluator-role.md → wave-cont-qa-role.md} +14 -11
- package/docs/agents/wave-documentation-role.md +1 -1
- package/docs/agents/wave-infra-role.md +1 -1
- package/docs/agents/wave-integration-role.md +3 -3
- package/docs/agents/wave-launcher-role.md +4 -3
- package/docs/agents/wave-security-role.md +40 -0
- package/docs/concepts/context7-vs-skills.md +1 -1
- package/docs/concepts/what-is-a-wave.md +56 -6
- package/docs/evals/README.md +166 -0
- package/docs/evals/benchmark-catalog.json +663 -0
- package/docs/guides/author-and-run-waves.md +135 -0
- package/docs/guides/planner.md +5 -0
- package/docs/guides/terminal-surfaces.md +2 -0
- package/docs/plans/component-cutover-matrix.json +1 -1
- package/docs/plans/component-cutover-matrix.md +1 -1
- package/docs/plans/current-state.md +19 -1
- package/docs/plans/examples/wave-example-live-proof.md +435 -0
- package/docs/plans/migration.md +42 -0
- package/docs/plans/wave-orchestrator.md +46 -7
- package/docs/plans/waves/wave-0.md +4 -4
- package/docs/reference/live-proof-waves.md +177 -0
- package/docs/reference/migration-0.2-to-0.5.md +26 -19
- package/docs/reference/npmjs-trusted-publishing.md +6 -5
- package/docs/reference/runtime-config/README.md +13 -3
- package/docs/reference/sample-waves.md +87 -0
- package/docs/reference/skills.md +110 -42
- package/docs/research/agent-context-sources.md +130 -11
- package/docs/research/coordination-failure-review.md +266 -0
- package/docs/roadmap.md +6 -2
- package/package.json +2 -2
- package/releases/manifest.json +20 -2
- package/scripts/research/agent-context-archive.mjs +83 -1
- package/scripts/research/manifests/agent-context-expanded-2026-03-22.mjs +811 -0
- package/scripts/wave-orchestrator/adhoc.mjs +1331 -0
- package/scripts/wave-orchestrator/agent-state.mjs +358 -6
- package/scripts/wave-orchestrator/artifact-schemas.mjs +173 -0
- package/scripts/wave-orchestrator/clarification-triage.mjs +10 -3
- package/scripts/wave-orchestrator/config.mjs +48 -12
- package/scripts/wave-orchestrator/context7.mjs +2 -0
- package/scripts/wave-orchestrator/coord-cli.mjs +51 -19
- package/scripts/wave-orchestrator/coordination-store.mjs +26 -4
- package/scripts/wave-orchestrator/coordination.mjs +83 -9
- package/scripts/wave-orchestrator/dashboard-state.mjs +20 -8
- package/scripts/wave-orchestrator/dep-cli.mjs +5 -2
- package/scripts/wave-orchestrator/docs-queue.mjs +8 -2
- package/scripts/wave-orchestrator/evals.mjs +451 -0
- package/scripts/wave-orchestrator/feedback.mjs +15 -1
- package/scripts/wave-orchestrator/install.mjs +32 -9
- package/scripts/wave-orchestrator/launcher-closure.mjs +281 -0
- package/scripts/wave-orchestrator/launcher-runtime.mjs +334 -0
- package/scripts/wave-orchestrator/launcher.mjs +709 -601
- package/scripts/wave-orchestrator/ledger.mjs +123 -20
- package/scripts/wave-orchestrator/local-executor.mjs +99 -12
- package/scripts/wave-orchestrator/planner.mjs +177 -42
- package/scripts/wave-orchestrator/replay.mjs +6 -3
- package/scripts/wave-orchestrator/role-helpers.mjs +84 -0
- package/scripts/wave-orchestrator/shared.mjs +75 -11
- package/scripts/wave-orchestrator/skills.mjs +637 -106
- package/scripts/wave-orchestrator/traces.mjs +71 -48
- package/scripts/wave-orchestrator/wave-files.mjs +947 -101
- package/scripts/wave.mjs +9 -0
- package/skills/README.md +202 -0
- package/skills/provider-aws/SKILL.md +111 -0
- package/skills/provider-aws/adapters/claude.md +1 -0
- package/skills/provider-aws/adapters/codex.md +1 -0
- package/skills/provider-aws/references/service-verification.md +39 -0
- package/skills/provider-aws/skill.json +50 -1
- package/skills/provider-custom-deploy/SKILL.md +59 -0
- package/skills/provider-custom-deploy/skill.json +46 -1
- package/skills/provider-docker-compose/SKILL.md +90 -0
- package/skills/provider-docker-compose/adapters/local.md +1 -0
- package/skills/provider-docker-compose/skill.json +49 -1
- package/skills/provider-github-release/SKILL.md +116 -1
- package/skills/provider-github-release/adapters/claude.md +1 -0
- package/skills/provider-github-release/adapters/codex.md +1 -0
- package/skills/provider-github-release/skill.json +51 -1
- package/skills/provider-kubernetes/SKILL.md +137 -0
- package/skills/provider-kubernetes/adapters/claude.md +1 -0
- package/skills/provider-kubernetes/adapters/codex.md +1 -0
- package/skills/provider-kubernetes/references/kubectl-patterns.md +58 -0
- package/skills/provider-kubernetes/skill.json +48 -1
- package/skills/provider-railway/SKILL.md +118 -1
- package/skills/provider-railway/references/verification-commands.md +39 -0
- package/skills/provider-railway/skill.json +67 -1
- package/skills/provider-ssh-manual/SKILL.md +91 -0
- package/skills/provider-ssh-manual/skill.json +50 -1
- package/skills/repo-coding-rules/SKILL.md +84 -0
- package/skills/repo-coding-rules/skill.json +30 -1
- package/skills/role-cont-eval/SKILL.md +90 -0
- package/skills/role-cont-eval/adapters/codex.md +1 -0
- package/skills/role-cont-eval/skill.json +36 -0
- package/skills/role-cont-qa/SKILL.md +93 -0
- package/skills/role-cont-qa/adapters/claude.md +1 -0
- package/skills/role-cont-qa/skill.json +36 -0
- package/skills/role-deploy/SKILL.md +90 -0
- package/skills/role-deploy/skill.json +32 -1
- package/skills/role-documentation/SKILL.md +66 -0
- package/skills/role-documentation/skill.json +32 -1
- package/skills/role-implementation/SKILL.md +62 -0
- package/skills/role-implementation/skill.json +32 -1
- package/skills/role-infra/SKILL.md +74 -0
- package/skills/role-infra/skill.json +32 -1
- package/skills/role-integration/SKILL.md +79 -1
- package/skills/role-integration/skill.json +32 -1
- package/skills/role-research/SKILL.md +58 -0
- package/skills/role-research/skill.json +32 -1
- package/skills/role-security/SKILL.md +60 -0
- package/skills/role-security/skill.json +36 -0
- package/skills/runtime-claude/SKILL.md +60 -1
- package/skills/runtime-claude/skill.json +32 -1
- package/skills/runtime-codex/SKILL.md +52 -1
- package/skills/runtime-codex/skill.json +32 -1
- package/skills/runtime-local/SKILL.md +39 -0
- package/skills/runtime-local/skill.json +32 -1
- package/skills/runtime-opencode/SKILL.md +51 -0
- package/skills/runtime-opencode/skill.json +32 -1
- package/skills/wave-core/SKILL.md +107 -0
- package/skills/wave-core/references/marker-syntax.md +62 -0
- package/skills/wave-core/skill.json +31 -1
- package/wave.config.json +35 -6
- package/skills/role-evaluator/SKILL.md +0 -6
- package/skills/role-evaluator/skill.json +0 -5
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: "Coordination Failure Review"
|
|
3
|
+
summary: "Assessment of whether the Wave orchestrator constructively addresses coordination and blackboard failure modes highlighted by recent multi-agent papers."
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Coordination Failure Review
|
|
7
|
+
|
|
8
|
+
## Bottom Line
|
|
9
|
+
|
|
10
|
+
The Wave orchestrator addresses several coordination failure modes constructively in code, not just in prose. In particular, it has:
|
|
11
|
+
|
|
12
|
+
- a canonical machine-readable coordination log
|
|
13
|
+
- compiled shared summaries plus per-agent inboxes
|
|
14
|
+
- explicit clarification, helper-assignment, dependency, integration, documentation, and cont-QA barriers
|
|
15
|
+
- structured proof and verdict validation
|
|
16
|
+
- replayable trace bundles with coordination-quality metrics
|
|
17
|
+
|
|
18
|
+
That is materially stronger than the common "agents talk in a shared channel and we hope that was enough" pattern criticized by recent multi-agent papers.
|
|
19
|
+
|
|
20
|
+
The main weakness is empirical, not architectural. The repo does not yet contain a benchmark family that proves the blackboard actually helps agents reconstruct distributed state under HiddenBench or Silo-Bench style pressure, or that it handles DPBench-style simultaneous coordination reliably.
|
|
21
|
+
|
|
22
|
+
## What The Papers Warn About
|
|
23
|
+
|
|
24
|
+
### `Why Do Multi-Agent LLM Systems Fail?`
|
|
25
|
+
|
|
26
|
+
This paper is the broadest warning. Its failure taxonomy groups problems into:
|
|
27
|
+
|
|
28
|
+
- system design issues
|
|
29
|
+
- inter-agent misalignment
|
|
30
|
+
- task verification failures
|
|
31
|
+
|
|
32
|
+
Those categories are useful here because they distinguish "we gave agents a shared workspace" from "the workspace is actually enforceable and auditable."
|
|
33
|
+
|
|
34
|
+
### `HiddenBench` / `Systematic Failures in Collective Reasoning under Distributed Information in Multi-Agent LLMs`
|
|
35
|
+
|
|
36
|
+
This is the clearest warning for blackboard-style systems. The central result is that multi-agent groups often fail not because they never communicate, but because they do not notice latent information asymmetry and do not actively surface unshared evidence. They converge on shared evidence too early.
|
|
37
|
+
|
|
38
|
+
For this repo, the key question is therefore not "do agents have a board?" but "does the shared state force enough evidence pooling to avoid premature convergence?"
|
|
39
|
+
|
|
40
|
+
### `Silo-Bench`
|
|
41
|
+
|
|
42
|
+
Silo-Bench sharpens the same point. Agents can exchange information and even form reasonable communication topologies, yet still fail at the reasoning-integration step. Communication volume is not the same thing as distributed-state synthesis.
|
|
43
|
+
|
|
44
|
+
For this repo, the corresponding question is whether summaries, inboxes, and integration passes merely move information around, or actually make the final decision depend on the integrated state.
|
|
45
|
+
|
|
46
|
+
### `DPBench`
|
|
47
|
+
|
|
48
|
+
DPBench shows that LLM teams can look coordinated in serial settings and still collapse in simultaneous coordination settings, with communication often failing to save them. Its practical lesson is that explicit external coordination mechanisms matter when concurrent access or simultaneous action is involved.
|
|
49
|
+
|
|
50
|
+
For this repo, the relevant question is whether coordination is only conversational or whether there are explicit external barriers and tickets that serialize or block unsafe progress.
|
|
51
|
+
|
|
52
|
+
### `Multi-Agent Teams Hold Experts Back`
|
|
53
|
+
|
|
54
|
+
This paper argues that unconstrained teams underuse expertise. Even when the best agent is identifiable, teams often drift toward integrative compromise instead of properly weighting expert judgment.
|
|
55
|
+
|
|
56
|
+
For this repo, the key question is whether the design relies on self-organizing consensus or on explicit role ownership, routing, and gating.
|
|
57
|
+
|
|
58
|
+
## What This Repo Already Does Constructively
|
|
59
|
+
|
|
60
|
+
### Implemented In Code And Tests
|
|
61
|
+
|
|
62
|
+
#### 1. It uses a real canonical shared state, not a cosmetic board
|
|
63
|
+
|
|
64
|
+
The strongest blackboard-like mechanism is the canonical JSONL coordination log plus materialized state in [scripts/wave-orchestrator/coordination-store.mjs](../../scripts/wave-orchestrator/coordination-store.mjs). The markdown board is explicitly a projection for humans, not the scheduler's source of truth, as stated in [docs/plans/wave-orchestrator.md](../plans/wave-orchestrator.md).
|
|
65
|
+
|
|
66
|
+
That state is then compiled into:
|
|
67
|
+
|
|
68
|
+
- a wave-level shared summary via `compileSharedSummary()`
|
|
69
|
+
- targeted per-agent inboxes via `compileAgentInbox()`
|
|
70
|
+
|
|
71
|
+
This is a real mitigation against information silos because agents are not expected to reconstruct the whole wave by rereading raw logs. The inbox compiler also pulls in relevant open coordination through `artifactRefs`, ownership, components, docs items, helper assignments, and dependencies. That behavior is exercised in [test/wave-orchestrator/coordination-store.test.ts](../../test/wave-orchestrator/coordination-store.test.ts).
|
|
72
|
+
|
|
73
|
+
Assessment against the papers:
|
|
74
|
+
|
|
75
|
+
- `HiddenBench`: partially addressed in design
|
|
76
|
+
- `Silo-Bench`: partially addressed in design
|
|
77
|
+
- proof that this works under benchmarked distributed-information pressure: missing
|
|
78
|
+
|
|
79
|
+
#### 2. It makes completion depend on integrated state, not on agent self-report
|
|
80
|
+
|
|
81
|
+
The launcher's gate stack in [scripts/wave-orchestrator/launcher.mjs](../../scripts/wave-orchestrator/launcher.mjs) is the clearest constructive safeguard in the repo. Closure is blocked by:
|
|
82
|
+
|
|
83
|
+
- open clarifications
|
|
84
|
+
- unresolved clarification-linked follow-up requests
|
|
85
|
+
- pending human input
|
|
86
|
+
- unresolved helper assignments
|
|
87
|
+
- open required dependencies
|
|
88
|
+
- integration failures
|
|
89
|
+
- documentation closure failures
|
|
90
|
+
- cont-EVAL failures
|
|
91
|
+
- cont-QA failures
|
|
92
|
+
|
|
93
|
+
This matters because several paper failure modes are really verification failures: agents say they are done, but the system has no hard check that the distributed state was reconciled. Here, the final decision is made by barrier logic rather than informal consensus.
|
|
94
|
+
|
|
95
|
+
Tests in [test/wave-orchestrator/clarification-triage.test.ts](../../test/wave-orchestrator/clarification-triage.test.ts) and [test/wave-orchestrator/launcher.test.ts](../../test/wave-orchestrator/launcher.test.ts) confirm that routed clarification work remains blocking until the linked follow-up is resolved and that integration evidence is derived from coordination, docs, validation, and runtime signals.
|
|
96
|
+
|
|
97
|
+
Assessment against the papers:
|
|
98
|
+
|
|
99
|
+
- `Why Do Multi-Agent LLM Systems Fail?`: strong mitigation of task-verification failures
|
|
100
|
+
- `Silo-Bench`: helps because integrated state has operational consequences
|
|
101
|
+
- `DPBench`: helps by using external barriers instead of relying on emergent coordination alone
|
|
102
|
+
|
|
103
|
+
#### 3. It validates structured evidence instead of trusting narrative summaries
|
|
104
|
+
|
|
105
|
+
[scripts/wave-orchestrator/agent-state.mjs](../../scripts/wave-orchestrator/agent-state.mjs) validates structured markers for implementation proof, integration, cont-EVAL, documentation closure, and cont-QA verdicts. That means the orchestrator can reject:
|
|
106
|
+
|
|
107
|
+
- missing proof markers
|
|
108
|
+
- weaker completion or durability than promised
|
|
109
|
+
- missing doc-delta markers
|
|
110
|
+
- missing component evidence
|
|
111
|
+
- missing deliverables
|
|
112
|
+
- non-ready integration summaries
|
|
113
|
+
- non-satisfied cont-EVAL outcomes
|
|
114
|
+
- non-pass cont-QA gates
|
|
115
|
+
|
|
116
|
+
This directly addresses the "don't kid yourself" critique behind the failure-taxonomy paper. A system that validates explicit proof contracts is much less vulnerable to premature closure than a system that trusts free-form role reports.
|
|
117
|
+
|
|
118
|
+
Assessment against the papers:
|
|
119
|
+
|
|
120
|
+
- `Why Do Multi-Agent LLM Systems Fail?`: strong mitigation for verification and termination failures
|
|
121
|
+
- `Multi-Agent Teams Hold Experts Back`: indirect mitigation, because expert or steward judgment must still be grounded in evidence
|
|
122
|
+
|
|
123
|
+
#### 4. It reduces naive self-organizing compromise through explicit ownership and routing
|
|
124
|
+
|
|
125
|
+
The repo does not rely on free-form team consensus in the way criticized by `Multi-Agent Teams Hold Experts Back`. Instead it uses:
|
|
126
|
+
|
|
127
|
+
- named stewardship roles such as integration and cont-QA in [docs/agents/wave-integration-role.md](../agents/wave-integration-role.md) and [docs/agents/wave-cont-qa-role.md](../agents/wave-cont-qa-role.md)
|
|
128
|
+
- capability-targeted request routing in [scripts/wave-orchestrator/routing-state.mjs](../../scripts/wave-orchestrator/routing-state.mjs)
|
|
129
|
+
- deterministic assignment based on explicit target, preferred agent, or least-busy capability owner
|
|
130
|
+
- staged closure order documented in [docs/plans/current-state.md](../plans/current-state.md) and enforced in the launcher
|
|
131
|
+
|
|
132
|
+
This is a constructive response to the paper's warning about teams averaging expert and non-expert views. The repo favors explicit owner selection and role-specific closure authority over emergent compromise.
|
|
133
|
+
|
|
134
|
+
Assessment against the papers:
|
|
135
|
+
|
|
136
|
+
- `Multi-Agent Teams Hold Experts Back`: partially addressed and better than unconstrained collaboration
|
|
137
|
+
- not fully solved, because routing is based mostly on declared capability and load, not demonstrated expertise quality
|
|
138
|
+
|
|
139
|
+
#### 5. It is unusually observable and replayable
|
|
140
|
+
|
|
141
|
+
[scripts/wave-orchestrator/traces.mjs](../../scripts/wave-orchestrator/traces.mjs) and [scripts/wave-orchestrator/replay.mjs](../../scripts/wave-orchestrator/replay.mjs) give the system an unusually strong postmortem surface. A trace bundle includes:
|
|
142
|
+
|
|
143
|
+
- raw coordination log
|
|
144
|
+
- materialized coordination state
|
|
145
|
+
- ledger
|
|
146
|
+
- docs queue
|
|
147
|
+
- integration summary
|
|
148
|
+
- shared summary
|
|
149
|
+
- copied prompts, logs, status, and inbox artifacts
|
|
150
|
+
- structured signals
|
|
151
|
+
- `quality.json`
|
|
152
|
+
- replay metadata and outcome baseline
|
|
153
|
+
|
|
154
|
+
The quality metrics include unresolved clarifications, contradiction count, capability-assignment timing, dependency-resolution timing, blocker-resolution timing, and fallback counts. Tests in [test/wave-orchestrator/traces.test.ts](../../test/wave-orchestrator/traces.test.ts) verify replay integrity and hash validation.
|
|
155
|
+
|
|
156
|
+
This does not by itself solve coordination failure, but it is a serious safeguard against hidden failure modes because it makes them inspectable and replayable.
|
|
157
|
+
|
|
158
|
+
Assessment against the papers:
|
|
159
|
+
|
|
160
|
+
- `Why Do Multi-Agent LLM Systems Fail?`: strong support for diagnosis and failure analysis
|
|
161
|
+
- `Silo-Bench` and `HiddenBench`: useful observability layer, but not yet a direct capability benchmark
|
|
162
|
+
|
|
163
|
+
### Stated In Docs And Also Reflected In The Software
|
|
164
|
+
|
|
165
|
+
The docs are not purely aspirational here. The main claims in [docs/plans/current-state.md](../plans/current-state.md) and [docs/plans/wave-orchestrator.md](../plans/wave-orchestrator.md) are broadly backed by the code:
|
|
166
|
+
|
|
167
|
+
- canonical coordination log plus generated board
|
|
168
|
+
- compiled shared summaries and per-agent inboxes
|
|
169
|
+
- orchestrator-first clarification triage
|
|
170
|
+
- blocking helper assignments and cross-lane dependencies
|
|
171
|
+
- staged closure order
|
|
172
|
+
- trace bundles and replay validation
|
|
173
|
+
|
|
174
|
+
That alignment matters. In many MAS projects the docs promise a blackboard, but the runtime still reduces to prompt-only coordination. Here the repo's architectural claims are mostly real.
|
|
175
|
+
|
|
176
|
+
## What Is Still Missing To Make The Claim Credible
|
|
177
|
+
|
|
178
|
+
### 1. No distributed-information benchmark family yet
|
|
179
|
+
|
|
180
|
+
The biggest gap is in [docs/evals/benchmark-catalog.json](../evals/benchmark-catalog.json). The current families are:
|
|
181
|
+
|
|
182
|
+
- `service-output`
|
|
183
|
+
- `latency`
|
|
184
|
+
- `quality-regression`
|
|
185
|
+
|
|
186
|
+
There is nothing yet for:
|
|
187
|
+
|
|
188
|
+
- hidden-profile reconstruction
|
|
189
|
+
- silo escape under partial information
|
|
190
|
+
- blackboard consistency across raw log, summary, inboxes, ledger, and integration state
|
|
191
|
+
- contradiction injection and recovery
|
|
192
|
+
- simultaneous coordination under contention
|
|
193
|
+
|
|
194
|
+
So the repo can reasonably claim "we built mechanisms intended to mitigate these failures," but it cannot yet claim "we demonstrated that these mechanisms overcome the failures highlighted by HiddenBench, Silo-Bench, or DPBench."
|
|
195
|
+
|
|
196
|
+
### 2. Information integration is supported, but not measured directly
|
|
197
|
+
|
|
198
|
+
The shared summary, inboxes, and integration pass are all constructive. But there is still no metric that asks:
|
|
199
|
+
|
|
200
|
+
- Did the team reconstruct the globally correct hidden state?
|
|
201
|
+
- Did the summary preserve the critical fact that was originally siloed?
|
|
202
|
+
- Did a wave converge too early on shared evidence while missing private evidence?
|
|
203
|
+
|
|
204
|
+
This is the central failure highlighted by `HiddenBench` and `Silo-Bench`, and the repo does not yet score it directly.
|
|
205
|
+
|
|
206
|
+
### 3. Expertise routing is explicit, but shallow
|
|
207
|
+
|
|
208
|
+
[scripts/wave-orchestrator/routing-state.mjs](../../scripts/wave-orchestrator/routing-state.mjs) is better than unconstrained self-organization, but it still routes mostly by:
|
|
209
|
+
|
|
210
|
+
- explicit target
|
|
211
|
+
- configured preferred agents
|
|
212
|
+
- declared capability ownership
|
|
213
|
+
- least-busy fallback
|
|
214
|
+
|
|
215
|
+
It does not yet weight:
|
|
216
|
+
|
|
217
|
+
- historical success on a capability
|
|
218
|
+
- evidence quality by agent
|
|
219
|
+
- confidence calibration
|
|
220
|
+
- expert-leverage metrics
|
|
221
|
+
|
|
222
|
+
So the repo partially addresses the concern from `Multi-Agent Teams Hold Experts Back`, but it does not yet prove that the best agent's expertise is actually being exploited rather than merely named.
|
|
223
|
+
|
|
224
|
+
### 4. Clarification and contradiction handling are still somewhat heuristic
|
|
225
|
+
|
|
226
|
+
Clarification triage and integration evidence aggregation are real safeguards, but they still lean heavily on:
|
|
227
|
+
|
|
228
|
+
- ownership mappings
|
|
229
|
+
- artifact references
|
|
230
|
+
- structured markers
|
|
231
|
+
- text-level summaries and conflict extraction
|
|
232
|
+
|
|
233
|
+
That is enough to make the runtime operationally safer, but it is not yet a richer semantic evidence-integration layer. Subtle contradictions or latent information asymmetries may still be missed.
|
|
234
|
+
|
|
235
|
+
### 5. DPBench-style simultaneous coordination is only indirectly addressed
|
|
236
|
+
|
|
237
|
+
The repo already uses external coordination mechanisms such as blocking assignments, dependency tickets, and closure barriers. That is directionally aligned with DPBench's lesson that explicit external coordination beats naive emergent coordination.
|
|
238
|
+
|
|
239
|
+
But there is still no direct stress harness for:
|
|
240
|
+
|
|
241
|
+
- simultaneous resource contention
|
|
242
|
+
- many-way concurrent dependencies
|
|
243
|
+
- lock-step coordination failures
|
|
244
|
+
- deadlock-like patterns caused by convergent reasoning
|
|
245
|
+
|
|
246
|
+
So the design points in the right direction, but the claim is not yet validated.
|
|
247
|
+
|
|
248
|
+
## Gap Matrix
|
|
249
|
+
|
|
250
|
+
| Paper | Main warning | Repo response | Assessment |
|
|
251
|
+
| --- | --- | --- | --- |
|
|
252
|
+
| [Why Do Multi-Agent LLM Systems Fail?](https://arxiv.org/abs/2503.13657) | MAS fail through bad system design, misalignment, and weak verification | Canonical coordination state, barrier-based closure, structured evidence validation, replayable traces | Addressed materially in architecture and software |
|
|
253
|
+
| [Systematic Failures in Collective Reasoning under Distributed Information in Multi-Agent LLMs](https://arxiv.org/abs/2505.11556) | Teams miss latent information asymmetry and converge too early on shared evidence | Shared summaries, per-agent inboxes, integration steward, clarification flow | Partially addressed in design, not validated empirically |
|
|
254
|
+
| [Silo-Bench](https://arxiv.org/abs/2603.01045) | Communication is not enough; reasoning integration is the bottleneck | Integration evidence aggregation and barrier-driven closure | Partially addressed in design, but no direct integration-quality benchmark |
|
|
255
|
+
| [DPBench](https://arxiv.org/abs/2602.13255) | Simultaneous coordination can fail badly even with communication | External helper assignments, dependency barriers, explicit blocking workflow | Directionally addressed, but not benchmarked under simultaneous contention |
|
|
256
|
+
| [Multi-Agent Teams Hold Experts Back](https://arxiv.org/abs/2602.01011) | Self-organizing teams underuse experts and drift toward compromise | Named stewards, explicit role authority, capability routing, proof gates | Better than naive teams, but expertise leverage is not measured or optimized deeply |
|
|
257
|
+
|
|
258
|
+
## Final Assessment
|
|
259
|
+
|
|
260
|
+
If the standard is "does this repo merely claim multi-agent coordination," the answer is no. It has real machinery for blackboard-like state sharing, evidence-based closure, clarification handling, and coordination diagnostics.
|
|
261
|
+
|
|
262
|
+
If the standard is "has this repo already demonstrated that its design beats the core failure modes isolated by HiddenBench, Silo-Bench, DPBench, and related work," the answer is also no. The design is substantially more credible than most MAS stacks, but the empirical proof is still missing.
|
|
263
|
+
|
|
264
|
+
The most accurate claim today is:
|
|
265
|
+
|
|
266
|
+
> Wave already implements several constructive anti-failure mechanisms for coordination and blackboard-style orchestration, especially around shared state, gating, and observability. What it still lacks is a benchmark suite that proves those mechanisms actually overcome distributed-information and simultaneous-coordination failures rather than simply organizing them better.
|
package/docs/roadmap.md
CHANGED
|
@@ -12,7 +12,7 @@ The repository already has the right runtime substrate:
|
|
|
12
12
|
|
|
13
13
|
- lane-scoped state under `.tmp/`
|
|
14
14
|
- wave parsing and validation
|
|
15
|
-
- role-based execution with evaluator, integration, and documentation stewards
|
|
15
|
+
- role-based execution with cont-qa, integration, and documentation stewards
|
|
16
16
|
- executor profiles and lane runtime policy
|
|
17
17
|
- compiled inboxes, ledgers, docs queues, dependency snapshots, and trace bundles
|
|
18
18
|
- orchestrator-first clarification handling and human feedback workflows
|
|
@@ -76,14 +76,17 @@ CLI target:
|
|
|
76
76
|
- `wave adhoc run --task "..." [--task "..."]`
|
|
77
77
|
- `wave adhoc list`
|
|
78
78
|
- `wave adhoc show --run <id>`
|
|
79
|
+
- `wave adhoc promote --run <id> --wave <n>`
|
|
79
80
|
|
|
80
81
|
Behavior:
|
|
81
82
|
|
|
82
83
|
- accept one or more free-form task requests
|
|
83
84
|
- normalize them into a single transient plan or spec
|
|
84
|
-
- synthesize the worker roles needed for the request while still preserving evaluator, integration, and documentation closure when relevant
|
|
85
|
+
- synthesize the worker roles needed for the request while still preserving cont-qa, integration, and documentation closure when relevant
|
|
85
86
|
- run that transient plan through the existing launcher, coordination, inbox, ledger, docs queue, integration, and trace machinery
|
|
86
87
|
- keep ad-hoc runs logged, inspectable, and replayable with the same basic operator surfaces as roadmap waves
|
|
88
|
+
- route shared-plan documentation deltas into the canonical shared docs queue, plus an ad-hoc closure report for the run
|
|
89
|
+
- treat only repo-local paths as ownership hints and ignore external references such as URLs
|
|
87
90
|
|
|
88
91
|
Storage model:
|
|
89
92
|
|
|
@@ -98,6 +101,7 @@ Design constraints:
|
|
|
98
101
|
- treat ad-hoc as a transient single-run execution unit, not a fake roadmap wave
|
|
99
102
|
- do not let ad-hoc completion mutate normal `completedWaves` lane state
|
|
100
103
|
- give `wave coord`, `wave feedback`, and future replay or reporting flows a way to target `--run <id>`
|
|
104
|
+
- promote numbered roadmap artifacts from the stored ad-hoc spec instead of recomputing them from the current project profile
|
|
101
105
|
|
|
102
106
|
Why this matters:
|
|
103
107
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@chllming/wave-orchestration",
|
|
3
|
-
"version": "0.5.4",
|
|
3
|
+
"version": "0.6.0",
|
|
4
4
|
"license": "MIT",
|
|
5
5
|
"description": "Generic wave-based multi-agent orchestration for repository work.",
|
|
6
6
|
"repository": {
|
|
@@ -39,7 +39,7 @@
|
|
|
39
39
|
},
|
|
40
40
|
"scripts": {
|
|
41
41
|
"context7:api-check": "bash scripts/context7-export-env.sh run bash scripts/context7-api-check.sh",
|
|
42
|
-
"research:import-agent-context": "node scripts/research/import-agent-context-archive.mjs scripts/research/manifests/
|
|
42
|
+
"research:import-agent-context": "node scripts/research/import-agent-context-archive.mjs scripts/research/manifests/agent-context-expanded-2026-03-22.mjs",
|
|
43
43
|
"research:index-agent-context": "node scripts/research/generate-agent-context-indexes.mjs",
|
|
44
44
|
"research:refresh-agent-context": "pnpm research:import-agent-context && pnpm research:index-agent-context",
|
|
45
45
|
"test": "vitest run --config vitest.config.ts",
|
package/releases/manifest.json
CHANGED
|
@@ -2,6 +2,24 @@
|
|
|
2
2
|
"schemaVersion": 1,
|
|
3
3
|
"packageName": "@chllming/wave-orchestration",
|
|
4
4
|
"releases": [
|
|
5
|
+
{
|
|
6
|
+
"version": "0.6.0",
|
|
7
|
+
"date": "2026-03-22",
|
|
8
|
+
"summary": "Closure-role split, benchmark-governed cont-EVAL, security review, ad-hoc runs, and expanded skills and docs.",
|
|
9
|
+
"features": [
|
|
10
|
+
"The closure model now treats `cont-EVAL` (`E0`) as an optional first-class eval stage before integration and keeps `cont-QA` (`A0`) as the final release verdict owner, with dedicated validation and report requirements for each role.",
|
|
11
|
+
"Wave authoring now includes a benchmark catalog under `docs/evals/benchmark-catalog.json`, wave-level `## Eval targets`, and planner, launcher, and validation support for delegated versus pinned eval contracts.",
|
|
12
|
+
"Optional report-only security review now has its own role prompt, validation path, `[wave-security]` marker, security summaries, and closure-stage sequencing.",
|
|
13
|
+
"Operators can now run transient ad-hoc work through `wave adhoc plan|run|show|promote`, with generated specs, launcher-compatible markdown, isolated runtime state, and promotion back into numbered roadmap artifacts.",
|
|
14
|
+
"The starter docs and skills surface now ship richer role, runtime, deploy-kind, provider, and proof-first guidance across the package-owned `docs/` and `skills/` trees."
|
|
15
|
+
],
|
|
16
|
+
"manualSteps": [
|
|
17
|
+
"After upgrading, update any repo-owned `evaluator` role references to the new `cont-QA` plus optional `cont-EVAL` split before running live waves.",
|
|
18
|
+
"If a wave uses `cont-EVAL`, declare `## Eval targets` at the wave level and keep benchmark ids inside `docs/evals/benchmark-catalog.json`.",
|
|
19
|
+
"Run `pnpm exec wave doctor` and `pnpm exec wave launch --lane main --dry-run --no-dashboard` after upgrading so the stricter closure, skill, and security validation paths can fail fast before live execution."
|
|
20
|
+
],
|
|
21
|
+
"breaking": false
|
|
22
|
+
},
|
|
5
23
|
{
|
|
6
24
|
"version": "0.5.4",
|
|
7
25
|
"date": "2026-03-22",
|
|
@@ -24,7 +42,7 @@
|
|
|
24
42
|
"date": "2026-03-22",
|
|
25
43
|
"summary": "Closure-sweep launch ordering fix for implementation-first wave execution.",
|
|
26
44
|
"features": [
|
|
27
|
-
"The launcher now starts only implementation agents in the initial wave pass when implementation work remains, deferring integration, documentation, and
|
|
45
|
+
"The launcher now starts only implementation agents in the initial wave pass when implementation work remains, deferring integration, documentation, and cont-qa roles until closure sweep order.",
|
|
28
46
|
"Wave waiting, dashboard progress refresh, and human-feedback monitoring now scope to the runs launched in the current pass so deferred closure agents no longer create false pending or missing-status failures.",
|
|
29
47
|
"Regression coverage now exercises both mixed implementation/closure waves and closure-only retry waves directly."
|
|
30
48
|
],
|
|
@@ -136,7 +154,7 @@
|
|
|
136
154
|
"features": [
|
|
137
155
|
"Canonical coordination log materialization, generated board projection, compiled shared summary, per-agent inboxes, and a durable wave ledger.",
|
|
138
156
|
"Planning-time executor profiles, lane runtime policy, hard runtime-mix enforcement, and retry fallback reassignment recorded into ledger, integration, and traces.",
|
|
139
|
-
"Orchestrator-first clarification triage, explicit integration summaries, and staged closure that runs integration before documentation and
|
|
157
|
+
"Orchestrator-first clarification triage, explicit integration summaries, and staged closure that runs integration before documentation and cont-qa closure."
|
|
140
158
|
],
|
|
141
159
|
"manualSteps": [
|
|
142
160
|
"Run `pnpm exec wave init --adopt-existing` in older repos if they do not yet have the newer role prompts and docs surfaces.",
|
|
@@ -8,12 +8,24 @@ export const TOPIC_DEFINITIONS = [
|
|
|
8
8
|
description:
|
|
9
9
|
"Current guidance and recent papers on agent harness design, reviewer loops, terminal-native execution, and practical coding-agent workflows.",
|
|
10
10
|
},
|
|
11
|
+
{
|
|
12
|
+
id: "planning-and-orchestration",
|
|
13
|
+
title: "Planning and Orchestration",
|
|
14
|
+
description:
|
|
15
|
+
"Planning topology, verifier and replanner loops, protocol-driven coordination, and blackboard-aware orchestration patterns for multi-agent systems.",
|
|
16
|
+
},
|
|
11
17
|
{
|
|
12
18
|
id: "long-running-agents-and-compaction",
|
|
13
19
|
title: "Long-Running Agents and Compaction",
|
|
14
20
|
description:
|
|
15
21
|
"Long-horizon execution, resumability, memory systems, compaction, and evolving-task evaluation for agents that span many sessions.",
|
|
16
22
|
},
|
|
23
|
+
{
|
|
24
|
+
id: "skills-and-procedural-memory",
|
|
25
|
+
title: "Skills and Procedural Memory",
|
|
26
|
+
description:
|
|
27
|
+
"Reusable skills, procedural memory, workflow induction, skill libraries, and evaluation patterns for agents that improve through reusable procedures.",
|
|
28
|
+
},
|
|
17
29
|
{
|
|
18
30
|
id: "blackboard-and-shared-workspaces",
|
|
19
31
|
title: "Blackboard and Shared Workspaces",
|
|
@@ -26,6 +38,12 @@ export const TOPIC_DEFINITIONS = [
|
|
|
26
38
|
description:
|
|
27
39
|
"Repository-level context files, harness evaluation methods, and evidence on what improves or harms coding-agent performance.",
|
|
28
40
|
},
|
|
41
|
+
{
|
|
42
|
+
id: "security-and-secure-code-generation",
|
|
43
|
+
title: "Security and Secure Code Generation",
|
|
44
|
+
description:
|
|
45
|
+
"Secure code generation, repair and analyzer loops, repository-grounded security benchmarks, and security/privacy risks in multi-agent systems.",
|
|
46
|
+
},
|
|
29
47
|
];
|
|
30
48
|
|
|
31
49
|
export const PAPER_SECTION_ORDER = [
|
|
@@ -47,6 +65,44 @@ const TOPIC_OVERRIDE_MAP = {
|
|
|
47
65
|
["repo-context-and-evaluation"],
|
|
48
66
|
};
|
|
49
67
|
|
|
68
|
+
const PLANNING_TOPIC_OVERRIDE_SLUGS = new Set([
|
|
69
|
+
"building-effective-ai-coding-agents-for-the-terminal-scaffolding-harness-context-engineering-and-lessons-learned",
|
|
70
|
+
"vero-an-evaluation-harness-for-agents-to-optimize-agents",
|
|
71
|
+
"evoclaw-evaluating-ai-agents-on-continuous-software-evolution",
|
|
72
|
+
"exploring-advanced-llm-multi-agent-systems-based-on-blackboard-architecture",
|
|
73
|
+
"llm-based-multi-agent-blackboard-system-for-information-discovery-in-data-science",
|
|
74
|
+
"dova-deliberation-first-multi-agent-orchestration-for-autonomous-research-automation",
|
|
75
|
+
"symphony-synergistic-multi-agent-planning-with-heterogeneous-language-model-assembly",
|
|
76
|
+
"silo-bench-a-scalable-environment-for-evaluating-distributed-coordination-in-multi-agent-llm-systems",
|
|
77
|
+
"terrarium-revisiting-the-blackboard-for-multi-agent-safety-privacy-and-security-studies",
|
|
78
|
+
"macc-multi-agent-collaborative-competition-for-scientific-exploration",
|
|
79
|
+
"the-orchestration-of-multi-agent-systems-architectures-protocols-and-enterprise-adoption",
|
|
80
|
+
"describing-agentic-ai-systems-with-c4-lessons-from-industry-projects",
|
|
81
|
+
"verified-multi-agent-orchestration-a-plan-execute-verify-replan-framework-for-complex-query-resolution",
|
|
82
|
+
"todoevolve-learning-to-architect-agent-planning-systems",
|
|
83
|
+
"parallelized-planning-acting-for-efficient-llm-based-multi-agent-systems-in-minecraft",
|
|
84
|
+
"orchmas-orchestrated-reasoning-with-multi-collaborative-heterogeneous-scientific-expert-structured-agents",
|
|
85
|
+
"towards-engineering-multi-agent-llms-a-protocol-driven-approach",
|
|
86
|
+
"advancing-multi-agent-systems-through-model-context-protocol-architecture-implementation-and-applications",
|
|
87
|
+
"enhancing-model-context-protocol-mcp-with-context-aware-server-collaboration",
|
|
88
|
+
"why-do-multi-agent-llm-systems-fail",
|
|
89
|
+
"systematic-failures-in-collective-reasoning-under-distributed-information-in-multi-agent-llms",
|
|
90
|
+
"dpbench-large-language-models-struggle-with-simultaneous-coordination",
|
|
91
|
+
"multi-agent-teams-hold-experts-back",
|
|
92
|
+
"a-survey-on-llm-based-multi-agent-systems-workflow-infrastructure-and-challenges",
|
|
93
|
+
"llm-based-multi-agent-systems-for-software-engineering-literature-review-vision-and-the-road-ahead",
|
|
94
|
+
"a-taxonomy-of-hierarchical-multi-agent-systems-design-patterns-coordination-mechanisms-and-industrial-applications",
|
|
95
|
+
"blackboard-systems-part-one-the-blackboard-model-of-problem-solving-and-the-evolution-of-blackboard-architectures",
|
|
96
|
+
"a-blackboard-architecture-for-control",
|
|
97
|
+
"incremental-planning-to-control-a-blackboard-based-problem-solver",
|
|
98
|
+
"blackboard-systems",
|
|
99
|
+
]);
|
|
100
|
+
|
|
101
|
+
const SKILLS_TOPIC_OVERRIDE_SLUGS = new Set([
|
|
102
|
+
"memory-for-autonomous-llm-agents-mechanisms-evaluation-and-emerging-frontiers",
|
|
103
|
+
"meta-context-engineering-via-agentic-skill-evolution",
|
|
104
|
+
]);
|
|
105
|
+
|
|
50
106
|
function escapeInlinePipes(value) {
|
|
51
107
|
return String(value ?? "").replaceAll("|", "\\|");
|
|
52
108
|
}
|
|
@@ -189,8 +245,15 @@ export function inferTopics(entry, section = null) {
|
|
|
189
245
|
if (override) {
|
|
190
246
|
topics.push(...override);
|
|
191
247
|
}
|
|
248
|
+
const hasDeclaredTopics = topics.length > 0;
|
|
249
|
+
if (PLANNING_TOPIC_OVERRIDE_SLUGS.has(entry.slug)) {
|
|
250
|
+
topics.push("planning-and-orchestration");
|
|
251
|
+
}
|
|
252
|
+
if (SKILLS_TOPIC_OVERRIDE_SLUGS.has(entry.slug)) {
|
|
253
|
+
topics.push("skills-and-procedural-memory");
|
|
254
|
+
}
|
|
192
255
|
|
|
193
|
-
if (
|
|
256
|
+
if (hasDeclaredTopics) {
|
|
194
257
|
return unique(topics);
|
|
195
258
|
}
|
|
196
259
|
|
|
@@ -220,12 +283,28 @@ export function inferTopics(entry, section = null) {
|
|
|
220
283
|
topics.push("long-running-agents-and-compaction");
|
|
221
284
|
}
|
|
222
285
|
|
|
286
|
+
if (
|
|
287
|
+
/skill|procedural memory|workflow memory|skill library|voyager|toolformer|tool makers|synapse|expel|reuseit|skillweaver|procmem|memskill|memento-skills|metaclaw/.test(
|
|
288
|
+
haystack,
|
|
289
|
+
)
|
|
290
|
+
) {
|
|
291
|
+
topics.push("skills-and-procedural-memory");
|
|
292
|
+
}
|
|
293
|
+
|
|
223
294
|
if (
|
|
224
295
|
entry.kind === "article" || /harness|codex|terminal|engineering|reviewer|agent-first/.test(haystack)
|
|
225
296
|
) {
|
|
226
297
|
topics.push("harnesses-and-practice");
|
|
227
298
|
}
|
|
228
299
|
|
|
300
|
+
if (
|
|
301
|
+
/security|secure code|secure coding|vulnerability|vulnerabilities|cve|static analyzer|codeql|secureagentbench|secrepobench|secodeplt|tosss|privacy/.test(
|
|
302
|
+
haystack,
|
|
303
|
+
)
|
|
304
|
+
) {
|
|
305
|
+
topics.push("security-and-secure-code-generation");
|
|
306
|
+
}
|
|
307
|
+
|
|
229
308
|
if (topics.length === 0) {
|
|
230
309
|
topics.push(entry.kind === "article" ? "harnesses-and-practice" : "long-running-agents-and-compaction");
|
|
231
310
|
}
|
|
@@ -346,9 +425,12 @@ of the repository docs.
|
|
|
346
425
|
## Coverage
|
|
347
426
|
|
|
348
427
|
- Harnesses and practice
|
|
428
|
+
- Planning and orchestration
|
|
349
429
|
- Long-running agents and compaction
|
|
430
|
+
- Skills and procedural memory
|
|
350
431
|
- Blackboard and shared workspaces
|
|
351
432
|
- Repo context and evaluation
|
|
433
|
+
- Security and secure code generation
|
|
352
434
|
|
|
353
435
|
${sections.join("\n\n")}
|
|
354
436
|
`;
|