@chllming/wave-orchestration 0.5.4 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126) hide show
  1. package/CHANGELOG.md +46 -3
  2. package/README.md +33 -5
  3. package/docs/README.md +18 -4
  4. package/docs/agents/wave-cont-eval-role.md +36 -0
  5. package/docs/agents/{wave-evaluator-role.md → wave-cont-qa-role.md} +14 -11
  6. package/docs/agents/wave-documentation-role.md +1 -1
  7. package/docs/agents/wave-infra-role.md +1 -1
  8. package/docs/agents/wave-integration-role.md +3 -3
  9. package/docs/agents/wave-launcher-role.md +4 -3
  10. package/docs/agents/wave-security-role.md +40 -0
  11. package/docs/concepts/context7-vs-skills.md +1 -1
  12. package/docs/concepts/what-is-a-wave.md +56 -6
  13. package/docs/evals/README.md +166 -0
  14. package/docs/evals/benchmark-catalog.json +663 -0
  15. package/docs/guides/author-and-run-waves.md +135 -0
  16. package/docs/guides/planner.md +5 -0
  17. package/docs/guides/terminal-surfaces.md +2 -0
  18. package/docs/plans/component-cutover-matrix.json +1 -1
  19. package/docs/plans/component-cutover-matrix.md +1 -1
  20. package/docs/plans/current-state.md +19 -1
  21. package/docs/plans/examples/wave-example-live-proof.md +435 -0
  22. package/docs/plans/migration.md +42 -0
  23. package/docs/plans/wave-orchestrator.md +46 -7
  24. package/docs/plans/waves/wave-0.md +4 -4
  25. package/docs/reference/live-proof-waves.md +177 -0
  26. package/docs/reference/migration-0.2-to-0.5.md +26 -19
  27. package/docs/reference/npmjs-trusted-publishing.md +6 -5
  28. package/docs/reference/runtime-config/README.md +13 -3
  29. package/docs/reference/sample-waves.md +87 -0
  30. package/docs/reference/skills.md +110 -42
  31. package/docs/research/agent-context-sources.md +130 -11
  32. package/docs/research/coordination-failure-review.md +266 -0
  33. package/docs/roadmap.md +6 -2
  34. package/package.json +2 -2
  35. package/releases/manifest.json +20 -2
  36. package/scripts/research/agent-context-archive.mjs +83 -1
  37. package/scripts/research/manifests/agent-context-expanded-2026-03-22.mjs +811 -0
  38. package/scripts/wave-orchestrator/adhoc.mjs +1331 -0
  39. package/scripts/wave-orchestrator/agent-state.mjs +358 -6
  40. package/scripts/wave-orchestrator/artifact-schemas.mjs +173 -0
  41. package/scripts/wave-orchestrator/clarification-triage.mjs +10 -3
  42. package/scripts/wave-orchestrator/config.mjs +48 -12
  43. package/scripts/wave-orchestrator/context7.mjs +2 -0
  44. package/scripts/wave-orchestrator/coord-cli.mjs +51 -19
  45. package/scripts/wave-orchestrator/coordination-store.mjs +26 -4
  46. package/scripts/wave-orchestrator/coordination.mjs +83 -9
  47. package/scripts/wave-orchestrator/dashboard-state.mjs +20 -8
  48. package/scripts/wave-orchestrator/dep-cli.mjs +5 -2
  49. package/scripts/wave-orchestrator/docs-queue.mjs +8 -2
  50. package/scripts/wave-orchestrator/evals.mjs +451 -0
  51. package/scripts/wave-orchestrator/feedback.mjs +15 -1
  52. package/scripts/wave-orchestrator/install.mjs +32 -9
  53. package/scripts/wave-orchestrator/launcher-closure.mjs +281 -0
  54. package/scripts/wave-orchestrator/launcher-runtime.mjs +334 -0
  55. package/scripts/wave-orchestrator/launcher.mjs +709 -601
  56. package/scripts/wave-orchestrator/ledger.mjs +123 -20
  57. package/scripts/wave-orchestrator/local-executor.mjs +99 -12
  58. package/scripts/wave-orchestrator/planner.mjs +177 -42
  59. package/scripts/wave-orchestrator/replay.mjs +6 -3
  60. package/scripts/wave-orchestrator/role-helpers.mjs +84 -0
  61. package/scripts/wave-orchestrator/shared.mjs +75 -11
  62. package/scripts/wave-orchestrator/skills.mjs +637 -106
  63. package/scripts/wave-orchestrator/traces.mjs +71 -48
  64. package/scripts/wave-orchestrator/wave-files.mjs +947 -101
  65. package/scripts/wave.mjs +9 -0
  66. package/skills/README.md +202 -0
  67. package/skills/provider-aws/SKILL.md +111 -0
  68. package/skills/provider-aws/adapters/claude.md +1 -0
  69. package/skills/provider-aws/adapters/codex.md +1 -0
  70. package/skills/provider-aws/references/service-verification.md +39 -0
  71. package/skills/provider-aws/skill.json +50 -1
  72. package/skills/provider-custom-deploy/SKILL.md +59 -0
  73. package/skills/provider-custom-deploy/skill.json +46 -1
  74. package/skills/provider-docker-compose/SKILL.md +90 -0
  75. package/skills/provider-docker-compose/adapters/local.md +1 -0
  76. package/skills/provider-docker-compose/skill.json +49 -1
  77. package/skills/provider-github-release/SKILL.md +116 -1
  78. package/skills/provider-github-release/adapters/claude.md +1 -0
  79. package/skills/provider-github-release/adapters/codex.md +1 -0
  80. package/skills/provider-github-release/skill.json +51 -1
  81. package/skills/provider-kubernetes/SKILL.md +137 -0
  82. package/skills/provider-kubernetes/adapters/claude.md +1 -0
  83. package/skills/provider-kubernetes/adapters/codex.md +1 -0
  84. package/skills/provider-kubernetes/references/kubectl-patterns.md +58 -0
  85. package/skills/provider-kubernetes/skill.json +48 -1
  86. package/skills/provider-railway/SKILL.md +118 -1
  87. package/skills/provider-railway/references/verification-commands.md +39 -0
  88. package/skills/provider-railway/skill.json +67 -1
  89. package/skills/provider-ssh-manual/SKILL.md +91 -0
  90. package/skills/provider-ssh-manual/skill.json +50 -1
  91. package/skills/repo-coding-rules/SKILL.md +84 -0
  92. package/skills/repo-coding-rules/skill.json +30 -1
  93. package/skills/role-cont-eval/SKILL.md +90 -0
  94. package/skills/role-cont-eval/adapters/codex.md +1 -0
  95. package/skills/role-cont-eval/skill.json +36 -0
  96. package/skills/role-cont-qa/SKILL.md +93 -0
  97. package/skills/role-cont-qa/adapters/claude.md +1 -0
  98. package/skills/role-cont-qa/skill.json +36 -0
  99. package/skills/role-deploy/SKILL.md +90 -0
  100. package/skills/role-deploy/skill.json +32 -1
  101. package/skills/role-documentation/SKILL.md +66 -0
  102. package/skills/role-documentation/skill.json +32 -1
  103. package/skills/role-implementation/SKILL.md +62 -0
  104. package/skills/role-implementation/skill.json +32 -1
  105. package/skills/role-infra/SKILL.md +74 -0
  106. package/skills/role-infra/skill.json +32 -1
  107. package/skills/role-integration/SKILL.md +79 -1
  108. package/skills/role-integration/skill.json +32 -1
  109. package/skills/role-research/SKILL.md +58 -0
  110. package/skills/role-research/skill.json +32 -1
  111. package/skills/role-security/SKILL.md +60 -0
  112. package/skills/role-security/skill.json +36 -0
  113. package/skills/runtime-claude/SKILL.md +60 -1
  114. package/skills/runtime-claude/skill.json +32 -1
  115. package/skills/runtime-codex/SKILL.md +52 -1
  116. package/skills/runtime-codex/skill.json +32 -1
  117. package/skills/runtime-local/SKILL.md +39 -0
  118. package/skills/runtime-local/skill.json +32 -1
  119. package/skills/runtime-opencode/SKILL.md +51 -0
  120. package/skills/runtime-opencode/skill.json +32 -1
  121. package/skills/wave-core/SKILL.md +107 -0
  122. package/skills/wave-core/references/marker-syntax.md +62 -0
  123. package/skills/wave-core/skill.json +31 -1
  124. package/wave.config.json +35 -6
  125. package/skills/role-evaluator/SKILL.md +0 -6
  126. package/skills/role-evaluator/skill.json +0 -5
@@ -0,0 +1,663 @@
1
+ {
2
+ "version": 2,
3
+ "families": {
4
+ "service-output": {
5
+ "title": "Service Output Quality",
6
+ "summary": "Benchmarks for reviewing runtime output quality, behavior, and correctness.",
7
+ "benchmarks": {
8
+ "golden-response-smoke": {
9
+ "title": "Golden Response Smoke",
10
+ "summary": "Compare representative outputs to known-good expectations."
11
+ },
12
+ "manual-session-review": {
13
+ "title": "Manual Session Review",
14
+ "summary": "Run the service manually and document output gaps from representative user flows."
15
+ }
16
+ }
17
+ },
18
+ "latency": {
19
+ "title": "Latency",
20
+ "summary": "Benchmarks for startup and request latency.",
21
+ "benchmarks": {
22
+ "http-latency-smoke": {
23
+ "title": "HTTP Latency Smoke",
24
+ "summary": "Quick latency pass over the service's core request path."
25
+ },
26
+ "cold-start-smoke": {
27
+ "title": "Cold Start Smoke",
28
+ "summary": "Track cold start behavior for the primary runtime surface."
29
+ }
30
+ }
31
+ },
32
+ "quality-regression": {
33
+ "title": "Quality Regression",
34
+ "summary": "Benchmarks for guarding against regressions during output tuning.",
35
+ "benchmarks": {
36
+ "baseline-diff-review": {
37
+ "title": "Baseline Diff Review",
38
+ "summary": "Compare current outputs to a previous accepted baseline."
39
+ },
40
+ "operator-checklist-review": {
41
+ "title": "Operator Checklist Review",
42
+ "summary": "Run a documented checklist over the tuned output surface."
43
+ }
44
+ }
45
+ },
46
+ "hidden-profile-pooling": {
47
+ "title": "Hidden Profile Pooling",
48
+ "summary": "Benchmarks for whether distributed private evidence is surfaced, pooled, and integrated before the team converges.",
49
+ "category": "coordination",
50
+ "coordinationModel": "blackboard-distributed-information",
51
+ "primaryMetric": {
52
+ "id": "distributed-info-accuracy",
53
+ "title": "Distributed Information Accuracy",
54
+ "unit": "percent",
55
+ "direction": "higher-is-better",
56
+ "summary": "Accuracy when key evidence begins distributed across agents."
57
+ },
58
+ "secondaryMetrics": [
59
+ {
60
+ "id": "latent-asymmetry-surfacing-rate",
61
+ "title": "Latent Asymmetry Surfacing Rate",
62
+ "unit": "percent",
63
+ "direction": "higher-is-better"
64
+ },
65
+ {
66
+ "id": "premature-convergence-rate",
67
+ "title": "Premature Convergence Rate",
68
+ "unit": "percent",
69
+ "direction": "lower-is-better"
70
+ }
71
+ ],
72
+ "paperReferences": [
73
+ {
74
+ "id": "hiddenbench-2025",
75
+ "title": "Systematic Failures in Collective Reasoning under Distributed Information in Multi-Agent LLMs",
76
+ "year": 2025,
77
+ "url": "https://arxiv.org/abs/2505.11556",
78
+ "summary": "HiddenBench benchmark for collective reasoning under distributed information."
79
+ }
80
+ ],
81
+ "sotaBaseline": {
82
+ "source": "paper-static",
83
+ "paper": "HiddenBench",
84
+ "year": 2025,
85
+ "metric": "distributed-info-accuracy",
86
+ "value": 30.1,
87
+ "notes": "Reported multi-agent accuracy under distributed information in the paper abstract.",
88
+ "url": "https://arxiv.org/abs/2505.11556"
89
+ },
90
+ "benchmarks": {
91
+ "latent-asymmetry-surfacing": {
92
+ "title": "Latent Asymmetry Surfacing",
93
+ "summary": "Checks whether agents explicitly seek or expose facts they suspect other agents may hold.",
94
+ "goal": "Measure whether the framework notices unshared evidence before converging.",
95
+ "failureModes": [
96
+ "latent-information-asymmetry",
97
+ "premature-consensus",
98
+ "shared-evidence-overweighting"
99
+ ],
100
+ "signals": [
101
+ "requests-for-missing-evidence",
102
+ "inbox-to-summary-fact-escalation",
103
+ "late-discovered-decision-changing-fact"
104
+ ],
105
+ "scoring": {
106
+ "primaryMetric": "latent-asymmetry-surfacing-rate",
107
+ "successCriterion": "Critical unshared evidence is surfaced before final recommendation.",
108
+ "rubric": "Score higher when agents explicitly search for missing facts instead of relying on already-shared evidence."
109
+ },
110
+ "paperReferences": [
111
+ {
112
+ "id": "hiddenbench-2025",
113
+ "title": "Systematic Failures in Collective Reasoning under Distributed Information in Multi-Agent LLMs",
114
+ "year": 2025,
115
+ "url": "https://arxiv.org/abs/2505.11556"
116
+ }
117
+ ],
118
+ "tuningNotes": "Use when adjusting inbox compilation, clarification prompting, or board-to-summary compression."
119
+ },
120
+ "private-evidence-integration": {
121
+ "title": "Private Evidence Integration",
122
+ "summary": "Checks whether separately observed facts are integrated into the final answer rather than merely repeated in conversation.",
123
+ "goal": "Measure end-to-end integration of distributed evidence into a coherent outcome.",
124
+ "failureModes": [
125
+ "communication-without-integration",
126
+ "evidence-drop",
127
+ "incorrect-global-reconstruction"
128
+ ],
129
+ "signals": [
130
+ "final-answer-uses-private-facts",
131
+ "integration-summary-cites-distributed-evidence",
132
+ "correct-global-state-reconstruction"
133
+ ],
134
+ "scoring": {
135
+ "primaryMetric": "distributed-info-accuracy",
136
+ "successCriterion": "The final answer requires and reflects the distributed facts.",
137
+ "rubric": "Score higher only when the final recommendation depends on the pooled evidence."
138
+ },
139
+ "sotaBaseline": {
140
+ "source": "paper-static",
141
+ "paper": "HiddenBench",
142
+ "year": 2025,
143
+ "metric": "distributed-info-accuracy",
144
+ "value": 30.1,
145
+ "notes": "Paper baseline for multi-agent distributed-information accuracy.",
146
+ "url": "https://arxiv.org/abs/2505.11556"
147
+ }
148
+ },
149
+ "premature-consensus-guard": {
150
+ "title": "Premature Consensus Guard",
151
+ "summary": "Checks whether the system delays closure when important evidence is still siloed.",
152
+ "goal": "Measure resistance to converging early on shared but incomplete evidence.",
153
+ "failureModes": [
154
+ "premature-consensus",
155
+ "closure-before-evidence-pooling"
156
+ ],
157
+ "signals": [
158
+ "clarification-raised-before-close",
159
+ "integration-barrier-trips",
160
+ "documentation-or-cont-qa-blocks"
161
+ ],
162
+ "scoring": {
163
+ "primaryMetric": "premature-convergence-rate",
164
+ "successCriterion": "Final closure is blocked when a decision-relevant fact remains siloed."
165
+ }
166
+ }
167
+ }
168
+ },
169
+ "silo-escape": {
170
+ "title": "Silo Escape",
171
+ "summary": "Benchmarks for whether agents can move from locally sufficient views to globally correct coordinated state.",
172
+ "category": "coordination",
173
+ "coordinationModel": "shared-workspace-integration",
174
+ "primaryMetric": {
175
+ "id": "global-state-reconstruction-rate",
176
+ "title": "Global State Reconstruction Rate",
177
+ "unit": "percent",
178
+ "direction": "higher-is-better",
179
+ "summary": "Rate at which the team reconstructs the correct distributed state."
180
+ },
181
+ "secondaryMetrics": [
182
+ {
183
+ "id": "communication-reasoning-gap",
184
+ "title": "Communication-Reasoning Gap",
185
+ "unit": "rubric",
186
+ "direction": "lower-is-better"
187
+ },
188
+ {
189
+ "id": "summary-fact-retention-rate",
190
+ "title": "Summary Fact Retention Rate",
191
+ "unit": "percent",
192
+ "direction": "higher-is-better"
193
+ }
194
+ ],
195
+ "paperReferences": [
196
+ {
197
+ "id": "silo-bench-2026",
198
+ "title": "Silo-Bench: A Scalable Environment for Evaluating Distributed Coordination in Multi-Agent LLM Systems",
199
+ "year": 2026,
200
+ "url": "https://arxiv.org/abs/2603.01045",
201
+ "summary": "Shows agents often exchange information but fail to integrate it."
202
+ }
203
+ ],
204
+ "benchmarks": {
205
+ "cross-agent-state-reconstruction": {
206
+ "title": "Cross-Agent State Reconstruction",
207
+ "summary": "Checks whether the final shared state reflects facts that no single agent started with alone.",
208
+ "goal": "Measure whether the blackboard can reconstruct a correct global state from distributed local views.",
209
+ "failureModes": [
210
+ "information-silo",
211
+ "global-state-loss",
212
+ "integration-failure"
213
+ ],
214
+ "signals": [
215
+ "integration-summary-mentions-cross-agent-facts",
216
+ "ledger-reflects-distributed-blockers",
217
+ "final-answer-requires-pooled-state"
218
+ ],
219
+ "scoring": {
220
+ "primaryMetric": "global-state-reconstruction-rate",
221
+ "successCriterion": "The correct global state is reconstructed and used in the decision."
222
+ },
223
+ "tuningNotes": "Use when adjusting shared summary size, inbox targeting, or integration evidence aggregation."
224
+ },
225
+ "shared-summary-fact-retention": {
226
+ "title": "Shared Summary Fact Retention",
227
+ "summary": "Checks whether compiled summaries preserve critical facts from the raw coordination log.",
228
+ "goal": "Measure information loss introduced by summary compression.",
229
+ "failureModes": [
230
+ "summary-compression-loss",
231
+ "critical-fact-drop"
232
+ ],
233
+ "signals": [
234
+ "raw-log-fact-present-in-summary",
235
+ "summary-fact-present-in-inbox",
236
+ "decision-uses-retained-facts"
237
+ ],
238
+ "scoring": {
239
+ "primaryMetric": "summary-fact-retention-rate",
240
+ "successCriterion": "Decision-changing facts survive from raw log into the shared summary and targeted inboxes."
241
+ }
242
+ },
243
+ "communication-reasoning-gap-review": {
244
+ "title": "Communication-Reasoning Gap Review",
245
+ "summary": "Checks whether active communication actually improves the final integrated answer.",
246
+ "goal": "Measure the gap between information exchange and correct synthesis.",
247
+ "failureModes": [
248
+ "communication-without-synthesis",
249
+ "false-confidence-after-sharing"
250
+ ],
251
+ "signals": [
252
+ "messages-exchanged",
253
+ "facts-shared",
254
+ "incorrect-integrated-answer"
255
+ ],
256
+ "scoring": {
257
+ "primaryMetric": "communication-reasoning-gap",
258
+ "successCriterion": "Additional communication materially improves the final integrated answer."
259
+ },
260
+ "sotaBaseline": {
261
+ "source": "paper-static",
262
+ "paper": "Silo-Bench",
263
+ "year": 2026,
264
+ "metric": "communication-reasoning-gap",
265
+ "value": "present",
266
+ "notes": "Silo-Bench reports a persistent communication-reasoning gap rather than a single abstracted scalar baseline.",
267
+ "url": "https://arxiv.org/abs/2603.01045"
268
+ }
269
+ }
270
+ }
271
+ },
272
+ "simultaneous-coordination": {
273
+ "title": "Simultaneous Coordination",
274
+ "summary": "Benchmarks for contention, deadlock, and convergent-reasoning failures when decisions must be made concurrently.",
275
+ "category": "coordination",
276
+ "coordinationModel": "simultaneous-resource-contention",
277
+ "primaryMetric": {
278
+ "id": "deadlock-rate",
279
+ "title": "Deadlock Rate",
280
+ "unit": "percent",
281
+ "direction": "lower-is-better",
282
+ "summary": "Frequency of deadlock or coordination collapse under simultaneous decisions."
283
+ },
284
+ "secondaryMetrics": [
285
+ {
286
+ "id": "contention-resolution-rate",
287
+ "title": "Contention Resolution Rate",
288
+ "unit": "percent",
289
+ "direction": "higher-is-better"
290
+ },
291
+ {
292
+ "id": "symmetry-breaking-rate",
293
+ "title": "Symmetry Breaking Rate",
294
+ "unit": "percent",
295
+ "direction": "higher-is-better"
296
+ }
297
+ ],
298
+ "paperReferences": [
299
+ {
300
+ "id": "dpbench-2026",
301
+ "title": "DPBench: Large Language Models Struggle with Simultaneous Coordination",
302
+ "year": 2026,
303
+ "url": "https://arxiv.org/abs/2602.13255",
304
+ "summary": "Shows severe failures under simultaneous coordination and convergent reasoning."
305
+ }
306
+ ],
307
+ "sotaBaseline": {
308
+ "source": "paper-static",
309
+ "paper": "DPBench",
310
+ "year": 2026,
311
+ "metric": "deadlock-rate",
312
+ "value": 95,
313
+ "notes": "Paper abstract reports deadlock rates exceeding 95 percent under some simultaneous settings.",
314
+ "url": "https://arxiv.org/abs/2602.13255"
315
+ },
316
+ "benchmarks": {
317
+ "contention-deadlock-avoidance": {
318
+ "title": "Contention Deadlock Avoidance",
319
+ "summary": "Checks whether the framework avoids deadlock under simultaneous resource contention.",
320
+ "goal": "Measure whether external coordination surfaces reduce deadlock-like failure.",
321
+ "failureModes": [
322
+ "deadlock",
323
+ "convergent-reasoning",
324
+ "resource-contention-collapse"
325
+ ],
326
+ "signals": [
327
+ "deadlock-observed",
328
+ "dependency-barrier-resolution",
329
+ "assignment-reroute-success"
330
+ ],
331
+ "scoring": {
332
+ "primaryMetric": "deadlock-rate",
333
+ "successCriterion": "The team avoids deadlock and reaches a valid allocation or resolution."
334
+ },
335
+ "sotaBaseline": {
336
+ "source": "paper-static",
337
+ "paper": "DPBench",
338
+ "year": 2026,
339
+ "metric": "deadlock-rate",
340
+ "value": 95,
341
+ "notes": "Reference point from the abstract's reported worst-case deadlock rate.",
342
+ "url": "https://arxiv.org/abs/2602.13255"
343
+ }
344
+ },
345
+ "symmetric-action-divergence": {
346
+ "title": "Symmetric Action Divergence",
347
+ "summary": "Checks whether agents can avoid choosing the same locally sensible but globally conflicting action.",
348
+ "goal": "Measure symmetry breaking under pressure.",
349
+ "failureModes": [
350
+ "same-plan-collapse",
351
+ "convergent-reasoning"
352
+ ],
353
+ "signals": [
354
+ "distinct-action-selection",
355
+ "lock-or-ticket-usage",
356
+ "retry-breaks-symmetry"
357
+ ],
358
+ "scoring": {
359
+ "primaryMetric": "symmetry-breaking-rate",
360
+ "successCriterion": "At least one agent changes strategy in time to avoid global conflict."
361
+ }
362
+ },
363
+ "lockstep-resolution": {
364
+ "title": "Lockstep Resolution",
365
+ "summary": "Checks whether the framework resolves many-way concurrent dependencies without circular waiting.",
366
+ "goal": "Measure coordination quality when several blocking tickets must resolve together.",
367
+ "failureModes": [
368
+ "circular-wait",
369
+ "helper-assignment-stall",
370
+ "dependency-stall"
371
+ ],
372
+ "signals": [
373
+ "required-dependencies-close",
374
+ "clarification-chain-completes",
375
+ "no-indefinite-retry-loop"
376
+ ],
377
+ "scoring": {
378
+ "primaryMetric": "contention-resolution-rate",
379
+ "successCriterion": "Concurrent blockers close without repeated deadlock-like relaunch cycles."
380
+ }
381
+ }
382
+ }
383
+ },
384
+ "expertise-leverage": {
385
+ "title": "Expertise Leverage",
386
+ "summary": "Benchmarks for whether the framework routes work toward experts and preserves expert signals instead of averaging them away.",
387
+ "category": "coordination",
388
+ "coordinationModel": "role-and-capability-routing",
389
+ "primaryMetric": {
390
+ "id": "expert-preservation-rate",
391
+ "title": "Expert Preservation Rate",
392
+ "unit": "percent",
393
+ "direction": "higher-is-better",
394
+ "summary": "Rate at which expert-advantaged decisions survive to the final recommendation."
395
+ },
396
+ "secondaryMetrics": [
397
+ {
398
+ "id": "expert-performance-gap",
399
+ "title": "Expert Performance Gap",
400
+ "unit": "percent",
401
+ "direction": "lower-is-better"
402
+ },
403
+ {
404
+ "id": "capability-routing-precision",
405
+ "title": "Capability Routing Precision",
406
+ "unit": "percent",
407
+ "direction": "higher-is-better"
408
+ }
409
+ ],
410
+ "paperReferences": [
411
+ {
412
+ "id": "experts-back-2026",
413
+ "title": "Multi-Agent Teams Hold Experts Back",
414
+ "year": 2026,
415
+ "url": "https://arxiv.org/abs/2602.01011",
416
+ "summary": "Shows unconstrained teams often underuse their best expert."
417
+ }
418
+ ],
419
+ "sotaBaseline": {
420
+ "source": "paper-static",
421
+ "paper": "Multi-Agent Teams Hold Experts Back",
422
+ "year": 2026,
423
+ "metric": "expert-performance-gap",
424
+ "value": 37.6,
425
+ "notes": "Paper abstract reports performance losses of up to 37.6 percent relative to the best expert agent.",
426
+ "url": "https://arxiv.org/abs/2602.01011"
427
+ },
428
+ "benchmarks": {
429
+ "expert-routing-preservation": {
430
+ "title": "Expert Routing Preservation",
431
+ "summary": "Checks whether capability-targeted work is routed to the best available owner and stays there through closure.",
432
+ "goal": "Measure whether the harness protects expert ownership instead of diluting it.",
433
+ "failureModes": [
434
+ "expert-underuse",
435
+ "misrouting",
436
+ "owner-dilution"
437
+ ],
438
+ "signals": [
439
+ "preferred-agent-selected",
440
+ "assigned-owner-resolves-task",
441
+ "few-non-expert-overrides"
442
+ ],
443
+ "scoring": {
444
+ "primaryMetric": "capability-routing-precision",
445
+ "successCriterion": "Capability-targeted requests land on the intended owner or a justified fallback."
446
+ }
447
+ },
448
+ "expert-signal-weighting": {
449
+ "title": "Expert Signal Weighting",
450
+ "summary": "Checks whether the final recommendation preserves a stronger expert signal when expert and non-expert views conflict.",
451
+ "goal": "Measure resistance to integrative compromise.",
452
+ "failureModes": [
453
+ "integrative-compromise",
454
+ "expert-signal-dilution"
455
+ ],
456
+ "signals": [
457
+ "expert-evidence-cited-integration",
458
+ "cont-qa-honors-expert-proof",
459
+ "non-expert-opinion-does-not-override-proof"
460
+ ],
461
+ "scoring": {
462
+ "primaryMetric": "expert-preservation-rate",
463
+ "successCriterion": "The final result tracks the best-supported expert view rather than the average view."
464
+ }
465
+ },
466
+ "anti-compromise-decision-review": {
467
+ "title": "Anti-Compromise Decision Review",
468
+ "summary": "Checks whether the framework avoids averaging incompatible recommendations into a low-quality compromise.",
469
+ "goal": "Measure whether closure authority and proof gates prevent compromise collapse.",
470
+ "failureModes": [
471
+ "consensus-over-correctness",
472
+ "expert-performance-gap"
473
+ ],
474
+ "signals": [
475
+ "integration-flags-conflict",
476
+ "clarification-raised",
477
+ "final-recommendation-follows-evidence"
478
+ ],
479
+ "scoring": {
480
+ "primaryMetric": "expert-performance-gap",
481
+ "successCriterion": "The team does not regress materially below its strongest expert signal."
482
+ },
483
+ "sotaBaseline": {
484
+ "source": "paper-static",
485
+ "paper": "Multi-Agent Teams Hold Experts Back",
486
+ "year": 2026,
487
+ "metric": "expert-performance-gap",
488
+ "value": 37.6,
489
+ "notes": "Reference point from the abstract's reported expert-underuse loss.",
490
+ "url": "https://arxiv.org/abs/2602.01011"
491
+ }
492
+ }
493
+ }
494
+ },
495
+ "blackboard-fidelity": {
496
+ "title": "Blackboard Fidelity",
497
+ "summary": "Benchmarks for whether raw coordination, summaries, inboxes, ledgers, and integration state stay semantically aligned.",
498
+ "category": "coordination",
499
+ "coordinationModel": "blackboard-projection-integrity",
500
+ "primaryMetric": {
501
+ "id": "projection-consistency-rate",
502
+ "title": "Projection Consistency Rate",
503
+ "unit": "percent",
504
+ "direction": "higher-is-better",
505
+ "summary": "Rate at which derived blackboard artifacts preserve the underlying coordination facts."
506
+ },
507
+ "secondaryMetrics": [
508
+ {
509
+ "id": "targeted-inbox-recall",
510
+ "title": "Targeted Inbox Recall",
511
+ "unit": "percent",
512
+ "direction": "higher-is-better"
513
+ },
514
+ {
515
+ "id": "integration-coherence-rate",
516
+ "title": "Integration Coherence Rate",
517
+ "unit": "percent",
518
+ "direction": "higher-is-better"
519
+ }
520
+ ],
521
+ "benchmarks": {
522
+ "log-to-summary-consistency": {
523
+ "title": "Log To Summary Consistency",
524
+ "summary": "Checks whether the shared summary preserves critical blockers, clarifications, dependencies, and conflicts from the raw log.",
525
+ "goal": "Measure whether summary generation remains trustworthy as the wave grows.",
526
+ "failureModes": [
527
+ "summary-drift",
528
+ "projection-loss"
529
+ ],
530
+ "signals": [
531
+ "blocker-preserved",
532
+ "clarification-preserved",
533
+ "dependency-preserved",
534
+ "conflict-preserved"
535
+ ],
536
+ "scoring": {
537
+ "primaryMetric": "projection-consistency-rate",
538
+ "successCriterion": "Critical coordination facts remain visible after projection."
539
+ }
540
+ },
541
+ "inbox-targeting-fidelity": {
542
+ "title": "Inbox Targeting Fidelity",
543
+ "summary": "Checks whether relevant facts reach the agents that own the impacted paths, components, or requests.",
544
+ "goal": "Measure whether inbox targeting reduces silos instead of creating them.",
545
+ "failureModes": [
546
+ "mis-targeted-context",
547
+ "owner-context-loss"
548
+ ],
549
+ "signals": [
550
+ "artifact-relevant-fact-routed",
551
+ "owned-component-fact-routed",
552
+ "targeted-request-routed"
553
+ ],
554
+ "scoring": {
555
+ "primaryMetric": "targeted-inbox-recall",
556
+ "successCriterion": "Agents receive the facts relevant to their owned paths, components, or follow-up work."
557
+ }
558
+ },
559
+ "integration-summary-coherence": {
560
+ "title": "Integration Summary Coherence",
561
+ "summary": "Checks whether integration output is faithful to blockers, claims, proof gaps, docs gaps, and deploy risks seen elsewhere.",
562
+ "goal": "Measure whether the integration steward output is a reliable synthesis layer.",
563
+ "failureModes": [
564
+ "integration-hallucination",
565
+ "integration-omission"
566
+ ],
567
+ "signals": [
568
+ "conflicts-match-log",
569
+ "proof-gaps-match-agent-state",
570
+ "doc-gaps-match-doc-queue"
571
+ ],
572
+ "scoring": {
573
+ "primaryMetric": "integration-coherence-rate",
574
+ "successCriterion": "Integration summaries match the evidence visible in coordination, validation, and runtime signals."
575
+ }
576
+ }
577
+ }
578
+ },
579
+ "contradiction-recovery": {
580
+ "title": "Contradiction Recovery",
581
+ "summary": "Benchmarks for whether the framework notices conflicting claims and converts them into explicit repair work instead of silent inconsistency.",
582
+ "category": "coordination",
583
+ "coordinationModel": "conflict-detection-and-repair",
584
+ "primaryMetric": {
585
+ "id": "contradiction-detection-rate",
586
+ "title": "Contradiction Detection Rate",
587
+ "unit": "percent",
588
+ "direction": "higher-is-better",
589
+ "summary": "Rate at which material conflicting claims are surfaced before closeout."
590
+ },
591
+ "secondaryMetrics": [
592
+ {
593
+ "id": "repair-closure-rate",
594
+ "title": "Repair Closure Rate",
595
+ "unit": "percent",
596
+ "direction": "higher-is-better"
597
+ },
598
+ {
599
+ "id": "false-consensus-rate",
600
+ "title": "False Consensus Rate",
601
+ "unit": "percent",
602
+ "direction": "lower-is-better"
603
+ }
604
+ ],
605
+ "benchmarks": {
606
+ "claim-conflict-detection": {
607
+ "title": "Claim Conflict Detection",
608
+ "summary": "Checks whether incompatible claims are surfaced in coordination or integration instead of passing through silently.",
609
+ "goal": "Measure whether the framework sees contradictory evidence before final closure.",
610
+ "failureModes": [
611
+ "false-consensus",
612
+ "silent-contradiction"
613
+ ],
614
+ "signals": [
615
+ "integration-conflict-raised",
616
+ "cont-qa-blocks",
617
+ "follow-up-request-created"
618
+ ],
619
+ "scoring": {
620
+ "primaryMetric": "contradiction-detection-rate",
621
+ "successCriterion": "Conflicting claims are surfaced as explicit blockers or conflicts before closeout."
622
+ }
623
+ },
624
+ "evidence-based-repair": {
625
+ "title": "Evidence Based Repair",
626
+ "summary": "Checks whether contradictions produce concrete owner-bound follow-up work and eventually close.",
627
+ "goal": "Measure whether conflict handling ends in repair rather than logging alone.",
628
+ "failureModes": [
629
+ "conflict-without-repair",
630
+ "dangling-follow-up"
631
+ ],
632
+ "signals": [
633
+ "helper-assignment-created",
634
+ "clarification-linked-request-closes",
635
+ "integration-recommendation-improves"
636
+ ],
637
+ "scoring": {
638
+ "primaryMetric": "repair-closure-rate",
639
+ "successCriterion": "Contradictions produce follow-up work that resolves before final pass."
640
+ }
641
+ },
642
+ "clarification-chain-closure": {
643
+ "title": "Clarification Chain Closure",
644
+ "summary": "Checks whether clarification requests remain linked and blocking until their follow-up chain really closes.",
645
+ "goal": "Measure resistance to fake resolution through status drift.",
646
+ "failureModes": [
647
+ "premature-clarification-close",
648
+ "broken-linkage"
649
+ ],
650
+ "signals": [
651
+ "closure-condition-present",
652
+ "linked-request-open-while-parent-open",
653
+ "human-escalation-when-needed"
654
+ ],
655
+ "scoring": {
656
+ "primaryMetric": "false-consensus-rate",
657
+ "successCriterion": "Clarifications do not silently disappear without an actual linked resolution."
658
+ }
659
+ }
660
+ }
661
+ }
662
+ }
663
+ }