@chllming/wave-orchestration 0.5.4 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126) hide show
  1. package/CHANGELOG.md +46 -3
  2. package/README.md +33 -5
  3. package/docs/README.md +18 -4
  4. package/docs/agents/wave-cont-eval-role.md +36 -0
  5. package/docs/agents/{wave-evaluator-role.md → wave-cont-qa-role.md} +14 -11
  6. package/docs/agents/wave-documentation-role.md +1 -1
  7. package/docs/agents/wave-infra-role.md +1 -1
  8. package/docs/agents/wave-integration-role.md +3 -3
  9. package/docs/agents/wave-launcher-role.md +4 -3
  10. package/docs/agents/wave-security-role.md +40 -0
  11. package/docs/concepts/context7-vs-skills.md +1 -1
  12. package/docs/concepts/what-is-a-wave.md +56 -6
  13. package/docs/evals/README.md +166 -0
  14. package/docs/evals/benchmark-catalog.json +663 -0
  15. package/docs/guides/author-and-run-waves.md +135 -0
  16. package/docs/guides/planner.md +5 -0
  17. package/docs/guides/terminal-surfaces.md +2 -0
  18. package/docs/plans/component-cutover-matrix.json +1 -1
  19. package/docs/plans/component-cutover-matrix.md +1 -1
  20. package/docs/plans/current-state.md +19 -1
  21. package/docs/plans/examples/wave-example-live-proof.md +435 -0
  22. package/docs/plans/migration.md +42 -0
  23. package/docs/plans/wave-orchestrator.md +46 -7
  24. package/docs/plans/waves/wave-0.md +4 -4
  25. package/docs/reference/live-proof-waves.md +177 -0
  26. package/docs/reference/migration-0.2-to-0.5.md +26 -19
  27. package/docs/reference/npmjs-trusted-publishing.md +6 -5
  28. package/docs/reference/runtime-config/README.md +13 -3
  29. package/docs/reference/sample-waves.md +87 -0
  30. package/docs/reference/skills.md +110 -42
  31. package/docs/research/agent-context-sources.md +130 -11
  32. package/docs/research/coordination-failure-review.md +266 -0
  33. package/docs/roadmap.md +6 -2
  34. package/package.json +2 -2
  35. package/releases/manifest.json +20 -2
  36. package/scripts/research/agent-context-archive.mjs +83 -1
  37. package/scripts/research/manifests/agent-context-expanded-2026-03-22.mjs +811 -0
  38. package/scripts/wave-orchestrator/adhoc.mjs +1331 -0
  39. package/scripts/wave-orchestrator/agent-state.mjs +358 -6
  40. package/scripts/wave-orchestrator/artifact-schemas.mjs +173 -0
  41. package/scripts/wave-orchestrator/clarification-triage.mjs +10 -3
  42. package/scripts/wave-orchestrator/config.mjs +48 -12
  43. package/scripts/wave-orchestrator/context7.mjs +2 -0
  44. package/scripts/wave-orchestrator/coord-cli.mjs +51 -19
  45. package/scripts/wave-orchestrator/coordination-store.mjs +26 -4
  46. package/scripts/wave-orchestrator/coordination.mjs +83 -9
  47. package/scripts/wave-orchestrator/dashboard-state.mjs +20 -8
  48. package/scripts/wave-orchestrator/dep-cli.mjs +5 -2
  49. package/scripts/wave-orchestrator/docs-queue.mjs +8 -2
  50. package/scripts/wave-orchestrator/evals.mjs +451 -0
  51. package/scripts/wave-orchestrator/feedback.mjs +15 -1
  52. package/scripts/wave-orchestrator/install.mjs +32 -9
  53. package/scripts/wave-orchestrator/launcher-closure.mjs +281 -0
  54. package/scripts/wave-orchestrator/launcher-runtime.mjs +334 -0
  55. package/scripts/wave-orchestrator/launcher.mjs +709 -601
  56. package/scripts/wave-orchestrator/ledger.mjs +123 -20
  57. package/scripts/wave-orchestrator/local-executor.mjs +99 -12
  58. package/scripts/wave-orchestrator/planner.mjs +177 -42
  59. package/scripts/wave-orchestrator/replay.mjs +6 -3
  60. package/scripts/wave-orchestrator/role-helpers.mjs +84 -0
  61. package/scripts/wave-orchestrator/shared.mjs +75 -11
  62. package/scripts/wave-orchestrator/skills.mjs +637 -106
  63. package/scripts/wave-orchestrator/traces.mjs +71 -48
  64. package/scripts/wave-orchestrator/wave-files.mjs +947 -101
  65. package/scripts/wave.mjs +9 -0
  66. package/skills/README.md +202 -0
  67. package/skills/provider-aws/SKILL.md +111 -0
  68. package/skills/provider-aws/adapters/claude.md +1 -0
  69. package/skills/provider-aws/adapters/codex.md +1 -0
  70. package/skills/provider-aws/references/service-verification.md +39 -0
  71. package/skills/provider-aws/skill.json +50 -1
  72. package/skills/provider-custom-deploy/SKILL.md +59 -0
  73. package/skills/provider-custom-deploy/skill.json +46 -1
  74. package/skills/provider-docker-compose/SKILL.md +90 -0
  75. package/skills/provider-docker-compose/adapters/local.md +1 -0
  76. package/skills/provider-docker-compose/skill.json +49 -1
  77. package/skills/provider-github-release/SKILL.md +116 -1
  78. package/skills/provider-github-release/adapters/claude.md +1 -0
  79. package/skills/provider-github-release/adapters/codex.md +1 -0
  80. package/skills/provider-github-release/skill.json +51 -1
  81. package/skills/provider-kubernetes/SKILL.md +137 -0
  82. package/skills/provider-kubernetes/adapters/claude.md +1 -0
  83. package/skills/provider-kubernetes/adapters/codex.md +1 -0
  84. package/skills/provider-kubernetes/references/kubectl-patterns.md +58 -0
  85. package/skills/provider-kubernetes/skill.json +48 -1
  86. package/skills/provider-railway/SKILL.md +118 -1
  87. package/skills/provider-railway/references/verification-commands.md +39 -0
  88. package/skills/provider-railway/skill.json +67 -1
  89. package/skills/provider-ssh-manual/SKILL.md +91 -0
  90. package/skills/provider-ssh-manual/skill.json +50 -1
  91. package/skills/repo-coding-rules/SKILL.md +84 -0
  92. package/skills/repo-coding-rules/skill.json +30 -1
  93. package/skills/role-cont-eval/SKILL.md +90 -0
  94. package/skills/role-cont-eval/adapters/codex.md +1 -0
  95. package/skills/role-cont-eval/skill.json +36 -0
  96. package/skills/role-cont-qa/SKILL.md +93 -0
  97. package/skills/role-cont-qa/adapters/claude.md +1 -0
  98. package/skills/role-cont-qa/skill.json +36 -0
  99. package/skills/role-deploy/SKILL.md +90 -0
  100. package/skills/role-deploy/skill.json +32 -1
  101. package/skills/role-documentation/SKILL.md +66 -0
  102. package/skills/role-documentation/skill.json +32 -1
  103. package/skills/role-implementation/SKILL.md +62 -0
  104. package/skills/role-implementation/skill.json +32 -1
  105. package/skills/role-infra/SKILL.md +74 -0
  106. package/skills/role-infra/skill.json +32 -1
  107. package/skills/role-integration/SKILL.md +79 -1
  108. package/skills/role-integration/skill.json +32 -1
  109. package/skills/role-research/SKILL.md +58 -0
  110. package/skills/role-research/skill.json +32 -1
  111. package/skills/role-security/SKILL.md +60 -0
  112. package/skills/role-security/skill.json +36 -0
  113. package/skills/runtime-claude/SKILL.md +60 -1
  114. package/skills/runtime-claude/skill.json +32 -1
  115. package/skills/runtime-codex/SKILL.md +52 -1
  116. package/skills/runtime-codex/skill.json +32 -1
  117. package/skills/runtime-local/SKILL.md +39 -0
  118. package/skills/runtime-local/skill.json +32 -1
  119. package/skills/runtime-opencode/SKILL.md +51 -0
  120. package/skills/runtime-opencode/skill.json +32 -1
  121. package/skills/wave-core/SKILL.md +107 -0
  122. package/skills/wave-core/references/marker-syntax.md +62 -0
  123. package/skills/wave-core/skill.json +31 -1
  124. package/wave.config.json +35 -6
  125. package/skills/role-evaluator/SKILL.md +0 -6
  126. package/skills/role-evaluator/skill.json +0 -5
@@ -0,0 +1,266 @@
1
+ ---
2
+ title: "Coordination Failure Review"
3
+ summary: "Assessment of whether the Wave orchestrator constructively addresses coordination and blackboard failure modes highlighted by recent multi-agent papers."
4
+ ---
5
+
6
+ # Coordination Failure Review
7
+
8
+ ## Bottom Line
9
+
10
+ The Wave orchestrator addresses several coordination failure modes constructively in code, not just in prose. In particular, it has:
11
+
12
+ - a canonical machine-readable coordination log
13
+ - compiled shared summaries plus per-agent inboxes
14
+ - explicit clarification, helper-assignment, dependency, integration, documentation, and cont-QA barriers
15
+ - structured proof and verdict validation
16
+ - replayable trace bundles with coordination-quality metrics
17
+
18
+ That is materially stronger than the common "agents talk in a shared channel and we hope that was enough" pattern criticized by recent multi-agent papers.
19
+
20
+ The main weakness is empirical, not architectural. The repo does not yet contain a benchmark family that proves the blackboard actually helps agents reconstruct distributed state under HiddenBench- or Silo-Bench-style pressure, or that it handles DPBench-style simultaneous coordination reliably.
21
+
22
+ ## What The Papers Warn About
23
+
24
+ ### `Why Do Multi-Agent LLM Systems Fail?`
25
+
26
+ This paper is the broadest warning. Its failure taxonomy groups problems into:
27
+
28
+ - system design issues
29
+ - inter-agent misalignment
30
+ - task verification failures
31
+
32
+ Those categories are useful here because they distinguish "we gave agents a shared workspace" from "the workspace is actually enforceable and auditable."
33
+
34
+ ### `HiddenBench` / `Systematic Failures in Collective Reasoning under Distributed Information in Multi-Agent LLMs`
35
+
36
+ This is the clearest warning for blackboard-style systems. The central result is that multi-agent groups often fail not because they never communicate, but because they do not notice latent information asymmetry and do not actively surface unshared evidence. They converge on shared evidence too early.
37
+
38
+ For this repo, the key question is therefore not "do agents have a board?" but "does the shared state force enough evidence pooling to avoid premature convergence?"
39
+
40
+ ### `Silo-Bench`
41
+
42
+ Silo-Bench sharpens the same point. Agents can exchange information and even form reasonable communication topologies, yet still fail at the reasoning-integration step. Communication volume is not the same thing as distributed-state synthesis.
43
+
44
+ For this repo, the corresponding question is whether summaries, inboxes, and integration passes merely move information around, or actually make the final decision depend on the integrated state.
45
+
46
+ ### `DPBench`
47
+
48
+ DPBench shows that LLM teams can look coordinated in serial settings and still collapse in simultaneous coordination settings, with communication often failing to save them. Its practical lesson is that explicit external coordination mechanisms matter when concurrent access or simultaneous action is involved.
49
+
50
+ For this repo, the relevant question is whether coordination is only conversational or whether there are explicit external barriers and tickets that serialize or block unsafe progress.
51
+
52
+ ### `Multi-Agent Teams Hold Experts Back`
53
+
54
+ This paper argues that unconstrained teams underuse expertise. Even when the best agent is identifiable, teams often drift toward integrative compromise instead of properly weighting expert judgment.
55
+
56
+ For this repo, the key question is whether the design relies on self-organizing consensus or on explicit role ownership, routing, and gating.
57
+
58
+ ## What This Repo Already Does Constructively
59
+
60
+ ### Implemented In Code And Tests
61
+
62
+ #### 1. It uses a real canonical shared state, not a cosmetic board
63
+
64
+ The strongest blackboard-like mechanism is the canonical JSONL coordination log plus materialized state in [scripts/wave-orchestrator/coordination-store.mjs](../../scripts/wave-orchestrator/coordination-store.mjs). The markdown board is explicitly a projection for humans, not the scheduler's source of truth, as stated in [docs/plans/wave-orchestrator.md](../plans/wave-orchestrator.md).
65
+
66
+ That state is then compiled into:
67
+
68
+ - a wave-level shared summary via `compileSharedSummary()`
69
+ - targeted per-agent inboxes via `compileAgentInbox()`
70
+
71
+ This is a real mitigation against information silos because agents are not expected to reconstruct the whole wave by rereading raw logs. The inbox compiler also pulls in relevant open coordination through `artifactRefs`, ownership, components, docs items, helper assignments, and dependencies. That behavior is exercised in [test/wave-orchestrator/coordination-store.test.ts](../../test/wave-orchestrator/coordination-store.test.ts).
72
+
73
+ Assessment against the papers:
74
+
75
+ - `HiddenBench`: partially addressed in design
76
+ - `Silo-Bench`: partially addressed in design
77
+ - proof that this works under benchmarked distributed-information pressure: missing
78
+
79
+ #### 2. It makes completion depend on integrated state, not on agent self-report
80
+
81
+ The launcher's gate stack in [scripts/wave-orchestrator/launcher.mjs](../../scripts/wave-orchestrator/launcher.mjs) is the clearest constructive safeguard in the repo. Closure is blocked by:
82
+
83
+ - open clarifications
84
+ - unresolved clarification-linked follow-up requests
85
+ - pending human input
86
+ - unresolved helper assignments
87
+ - open required dependencies
88
+ - integration failures
89
+ - documentation closure failures
90
+ - cont-EVAL failures
91
+ - cont-QA failures
92
+
93
+ This matters because several paper failure modes are really verification failures: agents say they are done, but the system has no hard check that the distributed state was reconciled. Here, the final decision is made by barrier logic rather than informal consensus.
94
+
95
+ Tests in [test/wave-orchestrator/clarification-triage.test.ts](../../test/wave-orchestrator/clarification-triage.test.ts) and [test/wave-orchestrator/launcher.test.ts](../../test/wave-orchestrator/launcher.test.ts) confirm that routed clarification work remains blocking until the linked follow-up is resolved and that integration evidence is derived from coordination, docs, validation, and runtime signals.
96
+
97
+ Assessment against the papers:
98
+
99
+ - `Why Do Multi-Agent LLM Systems Fail?`: strong mitigation of task-verification failures
100
+ - `Silo-Bench`: helps because integrated state has operational consequences
101
+ - `DPBench`: helps by using external barriers instead of relying on emergent coordination alone
102
+
103
+ #### 3. It validates structured evidence instead of trusting narrative summaries
104
+
105
+ [scripts/wave-orchestrator/agent-state.mjs](../../scripts/wave-orchestrator/agent-state.mjs) validates structured markers for implementation proof, integration, cont-EVAL, documentation closure, and cont-QA verdicts. That means the orchestrator can reject:
106
+
107
+ - missing proof markers
108
+ - weaker completion or durability than promised
109
+ - missing doc-delta markers
110
+ - missing component evidence
111
+ - missing deliverables
112
+ - non-ready integration summaries
113
+ - non-satisfied cont-EVAL outcomes
114
+ - non-pass cont-QA gates
115
+
116
+ This directly addresses the "don't kid yourself" critique behind the failure-taxonomy paper. A system that validates explicit proof contracts is much less vulnerable to premature closure than a system that trusts free-form role reports.
117
+
118
+ Assessment against the papers:
119
+
120
+ - `Why Do Multi-Agent LLM Systems Fail?`: strong mitigation for verification and termination failures
121
+ - `Multi-Agent Teams Hold Experts Back`: indirect mitigation, because expert or steward judgment must still be grounded in evidence
122
+
123
+ #### 4. It reduces naive self-organizing compromise through explicit ownership and routing
124
+
125
+ The repo does not rely on free-form team consensus in the way criticized by `Multi-Agent Teams Hold Experts Back`. Instead it uses:
126
+
127
+ - named stewardship roles such as integration and cont-QA in [docs/agents/wave-integration-role.md](../agents/wave-integration-role.md) and [docs/agents/wave-cont-qa-role.md](../agents/wave-cont-qa-role.md)
128
+ - capability-targeted request routing in [scripts/wave-orchestrator/routing-state.mjs](../../scripts/wave-orchestrator/routing-state.mjs)
129
+ - deterministic assignment based on explicit target, preferred agent, or least-busy capability owner
130
+ - staged closure order documented in [docs/plans/current-state.md](../plans/current-state.md) and enforced in the launcher
131
+
132
+ This is a constructive response to the paper's warning about teams averaging expert and non-expert views. The repo favors explicit owner selection and role-specific closure authority over emergent compromise.
133
+
134
+ Assessment against the papers:
135
+
136
+ - `Multi-Agent Teams Hold Experts Back`: partially addressed and better than unconstrained collaboration
137
+ - not fully solved, because routing is based mostly on declared capability and load, not demonstrated expertise quality
138
+
139
+ #### 5. It is unusually observable and replayable
140
+
141
+ [scripts/wave-orchestrator/traces.mjs](../../scripts/wave-orchestrator/traces.mjs) and [scripts/wave-orchestrator/replay.mjs](../../scripts/wave-orchestrator/replay.mjs) give the system an unusually strong postmortem surface. A trace bundle includes:
142
+
143
+ - raw coordination log
144
+ - materialized coordination state
145
+ - ledger
146
+ - docs queue
147
+ - integration summary
148
+ - shared summary
149
+ - copied prompts, logs, status, and inbox artifacts
150
+ - structured signals
151
+ - `quality.json`
152
+ - replay metadata and outcome baseline
153
+
154
+ The quality metrics include unresolved clarifications, contradiction count, capability-assignment timing, dependency-resolution timing, blocker-resolution timing, and fallback counts. Tests in [test/wave-orchestrator/traces.test.ts](../../test/wave-orchestrator/traces.test.ts) verify replay integrity and hash validation.
155
+
156
+ This does not by itself solve coordination failure, but it is a serious safeguard against hidden failure modes because it makes them inspectable and replayable.
157
+
158
+ Assessment against the papers:
159
+
160
+ - `Why Do Multi-Agent LLM Systems Fail?`: strong support for diagnosis and failure analysis
161
+ - `Silo-Bench` and `HiddenBench`: useful observability layer, but not yet a direct capability benchmark
162
+
163
+ ### Stated In Docs And Also Reflected In The Software
164
+
165
+ The docs are not purely aspirational here. The main claims in [docs/plans/current-state.md](../plans/current-state.md) and [docs/plans/wave-orchestrator.md](../plans/wave-orchestrator.md) are broadly backed by the code:
166
+
167
+ - canonical coordination log plus generated board
168
+ - compiled shared summaries and per-agent inboxes
169
+ - orchestrator-first clarification triage
170
+ - blocking helper assignments and cross-lane dependencies
171
+ - staged closure order
172
+ - trace bundles and replay validation
173
+
174
+ That alignment matters. In many MAS projects the docs promise a blackboard, but the runtime still reduces to prompt-only coordination. Here the repo's architectural claims are mostly real.
175
+
176
+ ## What Is Still Missing To Make The Claim Credible
177
+
178
+ ### 1. No distributed-information benchmark family yet
179
+
180
+ The biggest gap is in [docs/evals/benchmark-catalog.json](../evals/benchmark-catalog.json). The current families are:
181
+
182
+ - `service-output`
183
+ - `latency`
184
+ - `quality-regression`
185
+
186
+ There is nothing yet for:
187
+
188
+ - hidden-profile reconstruction
189
+ - silo escape under partial information
190
+ - blackboard consistency across raw log, summary, inboxes, ledger, and integration state
191
+ - contradiction injection and recovery
192
+ - simultaneous coordination under contention
193
+
194
+ So the repo can reasonably claim "we built mechanisms intended to mitigate these failures," but it cannot yet claim "we demonstrated that these mechanisms overcome the failures highlighted by HiddenBench, Silo-Bench, or DPBench."
195
+
196
+ ### 2. Information integration is supported, but not measured directly
197
+
198
+ The shared summary, inboxes, and integration pass are all constructive. But there is still no metric that asks:
199
+
200
+ - Did the team reconstruct the globally correct hidden state?
201
+ - Did the summary preserve the critical fact that was originally siloed?
202
+ - Did a wave converge too early on shared evidence while missing private evidence?
203
+
204
+ This is the central failure highlighted by `HiddenBench` and `Silo-Bench`, and the repo does not yet score it directly.
205
+
206
+ ### 3. Expertise routing is explicit, but shallow
207
+
208
+ [scripts/wave-orchestrator/routing-state.mjs](../../scripts/wave-orchestrator/routing-state.mjs) is better than unconstrained self-organization, but it still routes mostly by:
209
+
210
+ - explicit target
211
+ - configured preferred agents
212
+ - declared capability ownership
213
+ - least-busy fallback
214
+
215
+ It does not yet weight:
216
+
217
+ - historical success on a capability
218
+ - evidence quality by agent
219
+ - confidence calibration
220
+ - expert-leverage metrics
221
+
222
+ So the repo partially addresses the concern from `Multi-Agent Teams Hold Experts Back`, but it does not yet prove that the best agent's expertise is actually being exploited rather than merely named.
223
+
224
+ ### 4. Clarification and contradiction handling are still somewhat heuristic
225
+
226
+ Clarification triage and integration evidence aggregation are real safeguards, but they still lean heavily on:
227
+
228
+ - ownership mappings
229
+ - artifact references
230
+ - structured markers
231
+ - text-level summaries and conflict extraction
232
+
233
+ That is enough to make the runtime operationally safer, but it does not yet constitute a richer semantic evidence-integration layer. Subtle contradictions or latent information asymmetries may still be missed.
234
+
235
+ ### 5. DPBench-style simultaneous coordination is only indirectly addressed
236
+
237
+ The repo already uses external coordination mechanisms such as blocking assignments, dependency tickets, and closure barriers. That is directionally aligned with DPBench's lesson that explicit external coordination beats naive emergent coordination.
238
+
239
+ But there is still no direct stress harness for:
240
+
241
+ - simultaneous resource contention
242
+ - many-way concurrent dependencies
243
+ - lock-step coordination failures
244
+ - deadlock-like patterns caused by convergent reasoning
245
+
246
+ So the design points in the right direction, but the claim is not yet validated.
247
+
248
+ ## Gap Matrix
249
+
250
+ | Paper | Main warning | Repo response | Assessment |
251
+ | --- | --- | --- | --- |
252
+ | [Why Do Multi-Agent LLM Systems Fail?](https://arxiv.org/abs/2503.13657) | MAS fail through bad system design, misalignment, and weak verification | Canonical coordination state, barrier-based closure, structured evidence validation, replayable traces | Addressed materially in architecture and software |
253
+ | [Systematic Failures in Collective Reasoning under Distributed Information in Multi-Agent LLMs](https://arxiv.org/abs/2505.11556) | Teams miss latent information asymmetry and converge too early on shared evidence | Shared summaries, per-agent inboxes, integration steward, clarification flow | Partially addressed in design, not validated empirically |
254
+ | [Silo-Bench](https://arxiv.org/abs/2603.01045) | Communication is not enough; reasoning integration is the bottleneck | Integration evidence aggregation and barrier-driven closure | Partially addressed in design, but no direct integration-quality benchmark |
255
+ | [DPBench](https://arxiv.org/abs/2602.13255) | Simultaneous coordination can fail badly even with communication | External helper assignments, dependency barriers, explicit blocking workflow | Directionally addressed, but not benchmarked under simultaneous contention |
256
+ | [Multi-Agent Teams Hold Experts Back](https://arxiv.org/abs/2602.01011) | Self-organizing teams underuse experts and drift toward compromise | Named stewards, explicit role authority, capability routing, proof gates | Better than naive teams, but expertise leverage is not measured or optimized deeply |
257
+
258
+ ## Final Assessment
259
+
260
+ If the standard is "does this repo merely claim multi-agent coordination," the answer is no. It has real machinery for blackboard-like state sharing, evidence-based closure, clarification handling, and coordination diagnostics.
261
+
262
+ If the standard is "has this repo already demonstrated that its design beats the core failure modes isolated by HiddenBench, Silo-Bench, DPBench, and related work," the answer is also no. The design is substantially more credible than most MAS stacks, but the empirical proof is still missing.
263
+
264
+ The most accurate claim today is:
265
+
266
+ > Wave already implements several constructive anti-failure mechanisms for coordination and blackboard-style orchestration, especially around shared state, gating, and observability. What it still lacks is a benchmark suite that proves those mechanisms actually overcome distributed-information and simultaneous-coordination failures rather than simply organizing them better.
package/docs/roadmap.md CHANGED
@@ -12,7 +12,7 @@ The repository already has the right runtime substrate:
12
12
 
13
13
  - lane-scoped state under `.tmp/`
14
14
  - wave parsing and validation
15
- - role-based execution with evaluator, integration, and documentation stewards
15
+ - role-based execution with cont-qa, integration, and documentation stewards
16
16
  - executor profiles and lane runtime policy
17
17
  - compiled inboxes, ledgers, docs queues, dependency snapshots, and trace bundles
18
18
  - orchestrator-first clarification handling and human feedback workflows
@@ -76,14 +76,17 @@ CLI target:
76
76
  - `wave adhoc run --task "..." [--task "..."]`
77
77
  - `wave adhoc list`
78
78
  - `wave adhoc show --run <id>`
79
+ - `wave adhoc promote --run <id> --wave <n>`
79
80
 
80
81
  Behavior:
81
82
 
82
83
  - accept one or more free-form task requests
83
84
  - normalize them into a single transient plan or spec
84
- - synthesize the worker roles needed for the request while still preserving evaluator, integration, and documentation closure when relevant
85
+ - synthesize the worker roles needed for the request while still preserving cont-qa, integration, and documentation closure when relevant
85
86
  - run that transient plan through the existing launcher, coordination, inbox, ledger, docs queue, integration, and trace machinery
86
87
  - keep ad-hoc runs logged, inspectable, and replayable with the same basic operator surfaces as roadmap waves
88
+ - route shared-plan documentation deltas into the canonical shared docs queue, plus an ad-hoc closure report for the run
89
+ - treat only repo-local paths as ownership hints and ignore external references such as URLs
87
90
 
88
91
  Storage model:
89
92
 
@@ -98,6 +101,7 @@ Design constraints:
98
101
  - treat ad-hoc as a transient single-run execution unit, not a fake roadmap wave
99
102
  - do not let ad-hoc completion mutate normal `completedWaves` lane state
100
103
  - give `wave coord`, `wave feedback`, and future replay or reporting flows a way to target `--run <id>`
104
+ - promote numbered roadmap artifacts from the stored ad-hoc spec instead of recomputing them from the current project profile
101
105
 
102
106
  Why this matters:
103
107
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@chllming/wave-orchestration",
3
- "version": "0.5.4",
3
+ "version": "0.6.0",
4
4
  "license": "MIT",
5
5
  "description": "Generic wave-based multi-agent orchestration for repository work.",
6
6
  "repository": {
@@ -39,7 +39,7 @@
39
39
  },
40
40
  "scripts": {
41
41
  "context7:api-check": "bash scripts/context7-export-env.sh run bash scripts/context7-api-check.sh",
42
- "research:import-agent-context": "node scripts/research/import-agent-context-archive.mjs scripts/research/manifests/harness-and-blackboard-2026-03-21.mjs",
42
+ "research:import-agent-context": "node scripts/research/import-agent-context-archive.mjs scripts/research/manifests/agent-context-expanded-2026-03-22.mjs",
43
43
  "research:index-agent-context": "node scripts/research/generate-agent-context-indexes.mjs",
44
44
  "research:refresh-agent-context": "pnpm research:import-agent-context && pnpm research:index-agent-context",
45
45
  "test": "vitest run --config vitest.config.ts",
@@ -2,6 +2,24 @@
2
2
  "schemaVersion": 1,
3
3
  "packageName": "@chllming/wave-orchestration",
4
4
  "releases": [
5
+ {
6
+ "version": "0.6.0",
7
+ "date": "2026-03-22",
8
+ "summary": "Closure-role split, benchmark-governed cont-EVAL, security review, ad-hoc runs, and expanded skills and docs.",
9
+ "features": [
10
+ "The closure model now treats `cont-EVAL` (`E0`) as an optional first-class eval stage before integration and keeps `cont-QA` (`A0`) as the final release verdict owner, with dedicated validation and report requirements for each role.",
11
+ "Wave authoring now includes a benchmark catalog under `docs/evals/benchmark-catalog.json`, wave-level `## Eval targets`, and planner, launcher, and validation support for delegated versus pinned eval contracts.",
12
+ "Optional report-only security review now has its own role prompt, validation path, `[wave-security]` marker, security summaries, and closure-stage sequencing.",
13
+ "Operators can now run transient ad-hoc work through `wave adhoc plan|run|show|promote`, with generated specs, launcher-compatible markdown, isolated runtime state, and promotion back into numbered roadmap artifacts.",
14
+ "The starter docs and skills surface now ship richer role, runtime, deploy-kind, provider, and proof-first guidance across the package-owned `docs/` and `skills/` trees."
15
+ ],
16
+ "manualSteps": [
17
+ "After upgrading, update any repo-owned `evaluator` role references to the new `cont-QA` plus optional `cont-EVAL` split before running live waves.",
18
+ "If a wave uses `cont-EVAL`, declare `## Eval targets` at the wave level and keep benchmark ids inside `docs/evals/benchmark-catalog.json`.",
19
+ "Run `pnpm exec wave doctor` and `pnpm exec wave launch --lane main --dry-run --no-dashboard` after upgrading so the stricter closure, skill, and security validation paths can fail fast before live execution."
20
+ ],
21
+ "breaking": false
22
+ },
5
23
  {
6
24
  "version": "0.5.4",
7
25
  "date": "2026-03-22",
@@ -24,7 +42,7 @@
24
42
  "date": "2026-03-22",
25
43
  "summary": "Closure-sweep launch ordering fix for implementation-first wave execution.",
26
44
  "features": [
27
- "The launcher now starts only implementation agents in the initial wave pass when implementation work remains, deferring integration, documentation, and evaluator roles until closure sweep order.",
45
+ "The launcher now starts only implementation agents in the initial wave pass when implementation work remains, deferring integration, documentation, and cont-qa roles until closure sweep order.",
28
46
  "Wave waiting, dashboard progress refresh, and human-feedback monitoring now scope to the runs launched in the current pass so deferred closure agents no longer create false pending or missing-status failures.",
29
47
  "Regression coverage now exercises both mixed implementation/closure waves and closure-only retry waves directly."
30
48
  ],
@@ -136,7 +154,7 @@
136
154
  "features": [
137
155
  "Canonical coordination log materialization, generated board projection, compiled shared summary, per-agent inboxes, and a durable wave ledger.",
138
156
  "Planning-time executor profiles, lane runtime policy, hard runtime-mix enforcement, and retry fallback reassignment recorded into ledger, integration, and traces.",
139
- "Orchestrator-first clarification triage, explicit integration summaries, and staged closure that runs integration before documentation and evaluator closure."
157
+ "Orchestrator-first clarification triage, explicit integration summaries, and staged closure that runs integration before documentation and cont-qa closure."
140
158
  ],
141
159
  "manualSteps": [
142
160
  "Run `pnpm exec wave init --adopt-existing` in older repos if they do not yet have the newer role prompts and docs surfaces.",
@@ -8,12 +8,24 @@ export const TOPIC_DEFINITIONS = [
8
8
  description:
9
9
  "Current guidance and recent papers on agent harness design, reviewer loops, terminal-native execution, and practical coding-agent workflows.",
10
10
  },
11
+ {
12
+ id: "planning-and-orchestration",
13
+ title: "Planning and Orchestration",
14
+ description:
15
+ "Planning topology, verifier and replanner loops, protocol-driven coordination, and blackboard-aware orchestration patterns for multi-agent systems.",
16
+ },
11
17
  {
12
18
  id: "long-running-agents-and-compaction",
13
19
  title: "Long-Running Agents and Compaction",
14
20
  description:
15
21
  "Long-horizon execution, resumability, memory systems, compaction, and evolving-task evaluation for agents that span many sessions.",
16
22
  },
23
+ {
24
+ id: "skills-and-procedural-memory",
25
+ title: "Skills and Procedural Memory",
26
+ description:
27
+ "Reusable skills, procedural memory, workflow induction, skill libraries, and evaluation patterns for agents that improve through reusable procedures.",
28
+ },
17
29
  {
18
30
  id: "blackboard-and-shared-workspaces",
19
31
  title: "Blackboard and Shared Workspaces",
@@ -26,6 +38,12 @@ export const TOPIC_DEFINITIONS = [
26
38
  description:
27
39
  "Repository-level context files, harness evaluation methods, and evidence on what improves or harms coding-agent performance.",
28
40
  },
41
+ {
42
+ id: "security-and-secure-code-generation",
43
+ title: "Security and Secure Code Generation",
44
+ description:
45
+ "Secure code generation, repair and analyzer loops, repository-grounded security benchmarks, and security/privacy risks in multi-agent systems.",
46
+ },
29
47
  ];
30
48
 
31
49
  export const PAPER_SECTION_ORDER = [
@@ -47,6 +65,44 @@ const TOPIC_OVERRIDE_MAP = {
47
65
  ["repo-context-and-evaluation"],
48
66
  };
49
67
 
68
+ const PLANNING_TOPIC_OVERRIDE_SLUGS = new Set([
69
+ "building-effective-ai-coding-agents-for-the-terminal-scaffolding-harness-context-engineering-and-lessons-learned",
70
+ "vero-an-evaluation-harness-for-agents-to-optimize-agents",
71
+ "evoclaw-evaluating-ai-agents-on-continuous-software-evolution",
72
+ "exploring-advanced-llm-multi-agent-systems-based-on-blackboard-architecture",
73
+ "llm-based-multi-agent-blackboard-system-for-information-discovery-in-data-science",
74
+ "dova-deliberation-first-multi-agent-orchestration-for-autonomous-research-automation",
75
+ "symphony-synergistic-multi-agent-planning-with-heterogeneous-language-model-assembly",
76
+ "silo-bench-a-scalable-environment-for-evaluating-distributed-coordination-in-multi-agent-llm-systems",
77
+ "terrarium-revisiting-the-blackboard-for-multi-agent-safety-privacy-and-security-studies",
78
+ "macc-multi-agent-collaborative-competition-for-scientific-exploration",
79
+ "the-orchestration-of-multi-agent-systems-architectures-protocols-and-enterprise-adoption",
80
+ "describing-agentic-ai-systems-with-c4-lessons-from-industry-projects",
81
+ "verified-multi-agent-orchestration-a-plan-execute-verify-replan-framework-for-complex-query-resolution",
82
+ "todoevolve-learning-to-architect-agent-planning-systems",
83
+ "parallelized-planning-acting-for-efficient-llm-based-multi-agent-systems-in-minecraft",
84
+ "orchmas-orchestrated-reasoning-with-multi-collaborative-heterogeneous-scientific-expert-structured-agents",
85
+ "towards-engineering-multi-agent-llms-a-protocol-driven-approach",
86
+ "advancing-multi-agent-systems-through-model-context-protocol-architecture-implementation-and-applications",
87
+ "enhancing-model-context-protocol-mcp-with-context-aware-server-collaboration",
88
+ "why-do-multi-agent-llm-systems-fail",
89
+ "systematic-failures-in-collective-reasoning-under-distributed-information-in-multi-agent-llms",
90
+ "dpbench-large-language-models-struggle-with-simultaneous-coordination",
91
+ "multi-agent-teams-hold-experts-back",
92
+ "a-survey-on-llm-based-multi-agent-systems-workflow-infrastructure-and-challenges",
93
+ "llm-based-multi-agent-systems-for-software-engineering-literature-review-vision-and-the-road-ahead",
94
+ "a-taxonomy-of-hierarchical-multi-agent-systems-design-patterns-coordination-mechanisms-and-industrial-applications",
95
+ "blackboard-systems-part-one-the-blackboard-model-of-problem-solving-and-the-evolution-of-blackboard-architectures",
96
+ "a-blackboard-architecture-for-control",
97
+ "incremental-planning-to-control-a-blackboard-based-problem-solver",
98
+ "blackboard-systems",
99
+ ]);
100
+
101
+ const SKILLS_TOPIC_OVERRIDE_SLUGS = new Set([
102
+ "memory-for-autonomous-llm-agents-mechanisms-evaluation-and-emerging-frontiers",
103
+ "meta-context-engineering-via-agentic-skill-evolution",
104
+ ]);
105
+
50
106
  function escapeInlinePipes(value) {
51
107
  return String(value ?? "").replaceAll("|", "\\|");
52
108
  }
@@ -189,8 +245,15 @@ export function inferTopics(entry, section = null) {
189
245
  if (override) {
190
246
  topics.push(...override);
191
247
  }
248
+ const hasDeclaredTopics = topics.length > 0;
249
+ if (PLANNING_TOPIC_OVERRIDE_SLUGS.has(entry.slug)) {
250
+ topics.push("planning-and-orchestration");
251
+ }
252
+ if (SKILLS_TOPIC_OVERRIDE_SLUGS.has(entry.slug)) {
253
+ topics.push("skills-and-procedural-memory");
254
+ }
192
255
 
193
- if (topics.length > 0) {
256
+ if (hasDeclaredTopics) {
194
257
  return unique(topics);
195
258
  }
196
259
 
@@ -220,12 +283,28 @@ export function inferTopics(entry, section = null) {
220
283
  topics.push("long-running-agents-and-compaction");
221
284
  }
222
285
 
286
+ if (
287
+ /skill|procedural memory|workflow memory|skill library|voyager|toolformer|tool makers|synapse|expel|reuseit|skillweaver|procmem|memskill|memento-skills|metaclaw/.test(
288
+ haystack,
289
+ )
290
+ ) {
291
+ topics.push("skills-and-procedural-memory");
292
+ }
293
+
223
294
  if (
224
295
  entry.kind === "article" || /harness|codex|terminal|engineering|reviewer|agent-first/.test(haystack)
225
296
  ) {
226
297
  topics.push("harnesses-and-practice");
227
298
  }
228
299
 
300
+ if (
301
+ /security|secure code|secure coding|vulnerability|vulnerabilities|cve|static analyzer|codeql|secureagentbench|secrepobench|secodeplt|tosss|privacy/.test(
302
+ haystack,
303
+ )
304
+ ) {
305
+ topics.push("security-and-secure-code-generation");
306
+ }
307
+
229
308
  if (topics.length === 0) {
230
309
  topics.push(entry.kind === "article" ? "harnesses-and-practice" : "long-running-agents-and-compaction");
231
310
  }
@@ -346,9 +425,12 @@ of the repository docs.
346
425
  ## Coverage
347
426
 
348
427
  - Harnesses and practice
428
+ - Planning and orchestration
349
429
  - Long-running agents and compaction
430
+ - Skills and procedural memory
350
431
  - Blackboard and shared workspaces
351
432
  - Repo context and evaluation
433
+ - Security and secure code generation
352
434
 
353
435
  ${sections.join("\n\n")}
354
436
  `;