@chllming/wave-orchestration 0.6.2 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116) hide show
  1. package/CHANGELOG.md +64 -1
  2. package/README.md +44 -8
  3. package/docs/agents/wave-orchestrator-role.md +50 -0
  4. package/docs/agents/wave-planner-role.md +39 -0
  5. package/docs/context7/bundles.json +9 -0
  6. package/docs/context7/planner-agent/README.md +25 -0
  7. package/docs/context7/planner-agent/manifest.json +83 -0
  8. package/docs/context7/planner-agent/papers/cooperbench-why-coding-agents-cannot-be-your-teammates-yet.md +3283 -0
  9. package/docs/context7/planner-agent/papers/dova-deliberation-first-multi-agent-orchestration-for-autonomous-research-automation.md +1699 -0
  10. package/docs/context7/planner-agent/papers/dpbench-large-language-models-struggle-with-simultaneous-coordination.md +2251 -0
  11. package/docs/context7/planner-agent/papers/incremental-planning-to-control-a-blackboard-based-problem-solver.md +1729 -0
  12. package/docs/context7/planner-agent/papers/silo-bench-a-scalable-environment-for-evaluating-distributed-coordination-in-multi-agent-llm-systems.md +3747 -0
  13. package/docs/context7/planner-agent/papers/todoevolve-learning-to-architect-agent-planning-systems.md +1675 -0
  14. package/docs/context7/planner-agent/papers/verified-multi-agent-orchestration-a-plan-execute-verify-replan-framework-for-complex-query-resolution.md +1173 -0
  15. package/docs/context7/planner-agent/papers/why-do-multi-agent-llm-systems-fail.md +5211 -0
  16. package/docs/context7/planner-agent/topics/planning-and-orchestration.md +24 -0
  17. package/docs/evals/README.md +96 -1
  18. package/docs/evals/arm-templates/README.md +13 -0
  19. package/docs/evals/arm-templates/full-wave.json +15 -0
  20. package/docs/evals/arm-templates/single-agent.json +15 -0
  21. package/docs/evals/benchmark-catalog.json +7 -0
  22. package/docs/evals/cases/README.md +47 -0
  23. package/docs/evals/cases/wave-blackboard-inbox-targeting.json +73 -0
  24. package/docs/evals/cases/wave-contradiction-conflict.json +104 -0
  25. package/docs/evals/cases/wave-expert-routing-preservation.json +69 -0
  26. package/docs/evals/cases/wave-hidden-profile-private-evidence.json +81 -0
  27. package/docs/evals/cases/wave-premature-closure-guard.json +71 -0
  28. package/docs/evals/cases/wave-silo-cross-agent-state.json +77 -0
  29. package/docs/evals/cases/wave-simultaneous-lockstep.json +92 -0
  30. package/docs/evals/cooperbench/real-world-mitigation.md +341 -0
  31. package/docs/evals/external-benchmarks.json +85 -0
  32. package/docs/evals/external-command-config.sample.json +9 -0
  33. package/docs/evals/external-command-config.swe-bench-pro.json +8 -0
  34. package/docs/evals/pilots/README.md +47 -0
  35. package/docs/evals/pilots/swe-bench-pro-public-full-wave-review-10.json +64 -0
  36. package/docs/evals/pilots/swe-bench-pro-public-pilot.json +111 -0
  37. package/docs/evals/wave-benchmark-program.md +302 -0
  38. package/docs/guides/planner.md +48 -11
  39. package/docs/plans/context7-wave-orchestrator.md +20 -0
  40. package/docs/plans/current-state.md +9 -1
  41. package/docs/plans/examples/wave-benchmark-improvement.md +108 -0
  42. package/docs/plans/examples/wave-example-live-proof.md +1 -1
  43. package/docs/plans/examples/wave-example-rollout-fidelity.md +340 -0
  44. package/docs/plans/wave-orchestrator.md +73 -11
  45. package/docs/plans/waves/reviews/wave-1-benchmark-operator.md +118 -0
  46. package/docs/reference/coordination-and-closure.md +436 -0
  47. package/docs/reference/live-proof-waves.md +25 -3
  48. package/docs/reference/npmjs-trusted-publishing.md +3 -3
  49. package/docs/reference/proof-metrics.md +90 -0
  50. package/docs/reference/runtime-config/README.md +61 -0
  51. package/docs/reference/sample-waves.md +29 -18
  52. package/docs/reference/wave-control.md +164 -0
  53. package/docs/reference/wave-planning-lessons.md +131 -0
  54. package/package.json +5 -4
  55. package/releases/manifest.json +33 -0
  56. package/scripts/research/agent-context-archive.mjs +18 -0
  57. package/scripts/research/manifests/agent-context-expanded-2026-03-22.mjs +17 -0
  58. package/scripts/research/sync-planner-context7-bundle.mjs +133 -0
  59. package/scripts/wave-autonomous.mjs +2 -4
  60. package/scripts/wave-orchestrator/adhoc.mjs +32 -11
  61. package/scripts/wave-orchestrator/artifact-schemas.mjs +232 -0
  62. package/scripts/wave-orchestrator/autonomous.mjs +27 -6
  63. package/scripts/wave-orchestrator/benchmark-cases.mjs +374 -0
  64. package/scripts/wave-orchestrator/benchmark-external.mjs +1384 -0
  65. package/scripts/wave-orchestrator/benchmark.mjs +972 -0
  66. package/scripts/wave-orchestrator/clarification-triage.mjs +78 -12
  67. package/scripts/wave-orchestrator/config.mjs +175 -0
  68. package/scripts/wave-orchestrator/control-cli.mjs +1123 -0
  69. package/scripts/wave-orchestrator/control-plane.mjs +697 -0
  70. package/scripts/wave-orchestrator/coord-cli.mjs +360 -2
  71. package/scripts/wave-orchestrator/coordination-store.mjs +211 -9
  72. package/scripts/wave-orchestrator/coordination.mjs +84 -0
  73. package/scripts/wave-orchestrator/dashboard-renderer.mjs +38 -3
  74. package/scripts/wave-orchestrator/dashboard-state.mjs +22 -0
  75. package/scripts/wave-orchestrator/evals.mjs +23 -0
  76. package/scripts/wave-orchestrator/executors.mjs +3 -2
  77. package/scripts/wave-orchestrator/feedback.mjs +55 -0
  78. package/scripts/wave-orchestrator/install.mjs +253 -26
  79. package/scripts/wave-orchestrator/launcher-closure.mjs +4 -1
  80. package/scripts/wave-orchestrator/launcher-runtime.mjs +24 -21
  81. package/scripts/wave-orchestrator/launcher.mjs +800 -35
  82. package/scripts/wave-orchestrator/package-update-notice.mjs +230 -0
  83. package/scripts/wave-orchestrator/package-version.mjs +32 -0
  84. package/scripts/wave-orchestrator/planner-context.mjs +75 -0
  85. package/scripts/wave-orchestrator/planner.mjs +2270 -136
  86. package/scripts/wave-orchestrator/proof-cli.mjs +195 -0
  87. package/scripts/wave-orchestrator/proof-registry.mjs +317 -0
  88. package/scripts/wave-orchestrator/replay.mjs +10 -4
  89. package/scripts/wave-orchestrator/retry-cli.mjs +184 -0
  90. package/scripts/wave-orchestrator/retry-control.mjs +225 -0
  91. package/scripts/wave-orchestrator/shared.mjs +26 -0
  92. package/scripts/wave-orchestrator/swe-bench-pro-task.mjs +1004 -0
  93. package/scripts/wave-orchestrator/traces.mjs +157 -2
  94. package/scripts/wave-orchestrator/wave-control-client.mjs +532 -0
  95. package/scripts/wave-orchestrator/wave-control-schema.mjs +309 -0
  96. package/scripts/wave-orchestrator/wave-files.mjs +17 -5
  97. package/scripts/wave.mjs +39 -2
  98. package/skills/repo-coding-rules/SKILL.md +1 -0
  99. package/skills/role-cont-eval/SKILL.md +1 -0
  100. package/skills/role-cont-qa/SKILL.md +13 -6
  101. package/skills/role-deploy/SKILL.md +1 -0
  102. package/skills/role-documentation/SKILL.md +4 -0
  103. package/skills/role-implementation/SKILL.md +4 -0
  104. package/skills/role-infra/SKILL.md +2 -1
  105. package/skills/role-integration/SKILL.md +15 -8
  106. package/skills/role-planner/SKILL.md +39 -0
  107. package/skills/role-planner/skill.json +21 -0
  108. package/skills/role-research/SKILL.md +1 -0
  109. package/skills/role-security/SKILL.md +2 -2
  110. package/skills/runtime-claude/SKILL.md +2 -1
  111. package/skills/runtime-codex/SKILL.md +1 -0
  112. package/skills/runtime-local/SKILL.md +2 -0
  113. package/skills/runtime-opencode/SKILL.md +1 -0
  114. package/skills/wave-core/SKILL.md +25 -6
  115. package/skills/wave-core/references/marker-syntax.md +16 -8
  116. package/wave.config.json +45 -0
@@ -0,0 +1,24 @@
1
+ ---
2
+ summary: 'Curated planning and orchestration corpus exported for the agentic planner Context7 bundle.'
3
+ read_when:
4
+ - You are publishing or refreshing the planner-agentic Context7 library
5
+ - You need the exact planner research subset that Wave ships for agentic planning
6
+ title: 'Planner Agentic Context7 Corpus'
7
+ ---
8
+
9
+ # Planner Agentic Context7 Corpus
10
+
11
+ This file is the tracked topic index for the planner-specific Context7 corpus.
12
+ It intentionally references only the copied files that ship under
13
+ `docs/context7/planner-agent/`.
14
+
15
+ ## Included papers
16
+
17
+ - [Verified Multi-Agent Orchestration: A Plan-Execute-Verify-Replan Framework for Complex Query Resolution](../papers/verified-multi-agent-orchestration-a-plan-execute-verify-replan-framework-for-complex-query-resolution.md)
18
+ - [TodoEvolve: Learning to Architect Agent Planning Systems](../papers/todoevolve-learning-to-architect-agent-planning-systems.md)
19
+ - [DOVA: Deliberation-First Multi-Agent Orchestration for Autonomous Research Automation](../papers/dova-deliberation-first-multi-agent-orchestration-for-autonomous-research-automation.md)
20
+ - [Why Do Multi-Agent LLM Systems Fail?](../papers/why-do-multi-agent-llm-systems-fail.md)
21
+ - [Silo-Bench: A Scalable Environment for Evaluating Distributed Coordination in Multi-Agent LLM Systems](../papers/silo-bench-a-scalable-environment-for-evaluating-distributed-coordination-in-multi-agent-llm-systems.md)
22
+ - [DPBench: Large Language Models Struggle with Simultaneous Coordination](../papers/dpbench-large-language-models-struggle-with-simultaneous-coordination.md)
23
+ - [CooperBench: Why Coding Agents Cannot be Your Teammates Yet](../papers/cooperbench-why-coding-agents-cannot-be-your-teammates-yet.md)
24
+ - [Incremental Planning to Control a Blackboard-Based Problem Solver](../papers/incremental-planning-to-control-a-blackboard-based-problem-solver.md)
@@ -7,13 +7,21 @@ summary: "How to use delegated benchmark families, pinned benchmarks, and coordi
7
7
 
8
8
  Wave's benchmark catalog lives in `docs/evals/benchmark-catalog.json`.
9
9
 
10
+ The executable local case corpus lives in `docs/evals/cases/`, and the benchmark runner is available through `wave benchmark`.
11
+ Frozen external pilot manifests live in `docs/evals/pilots/`, and external comparison arm templates live in `docs/evals/arm-templates/`.
12
+ An example command-template config shape lives in `docs/evals/external-command-config.sample.json`.
13
+ A runnable SWE-bench Pro config for the local task harness lives in `docs/evals/external-command-config.swe-bench-pro.json`.
14
+
10
15
  It has three jobs:
11
16
 
12
17
  - give `cont-EVAL` a repo-governed menu of allowed benchmark families and benchmark ids
13
18
  - document what each benchmark is trying to catch, including coordination failure modes and static paper baselines
19
+ - optionally point from benchmark ids to repo-local deterministic benchmark cases through `localCases`
14
20
 
15
21
  The catalog is reference metadata, not a run-history database. It tells the wave author and `cont-EVAL` what kinds of checks are allowed and what external benchmark or paper baseline those checks map to.
16
22
 
23
+ The local case corpus is the executable side of that metadata. It gives the repo a deterministic way to score the current Wave substrate on summary fidelity, targeted inbox recall, capability routing, contradiction handling, and closure guards before moving on to costlier live suites.
24
+
17
25
  For a full authored wave example that uses these patterns, see [docs/reference/sample-waves.md](../reference/sample-waves.md).
18
26
 
19
27
  These benchmark families are also Wave's operator-facing vocabulary for common MAS failure modes. For the research-side framing and the current architectural gaps, see [docs/research/coordination-failure-review.md](../research/coordination-failure-review.md).
@@ -84,6 +92,93 @@ The coordination-oriented families currently included in the catalog are:
84
92
  - `contradiction-recovery`
85
93
  Use when the risk is false consensus, unresolved conflicting claims, or clarification chains that appear resolved without real repair.
86
94
 
95
+ ## Local Case Corpus
96
+
97
+ The repo now ships deterministic local benchmark cases under `docs/evals/cases/`.
98
+
99
+ Each case:
100
+
101
+ - binds to one benchmark family and benchmark id
102
+ - defines a coordination fixture plus expected facts, inboxes, assignments, or closure guards
103
+ - is executable through `wave benchmark run`
104
+
105
+ Useful commands:
106
+
107
+ ```bash
108
+ pnpm exec wave benchmark list
109
+ pnpm exec wave benchmark show --case wave-hidden-profile-private-evidence --json
110
+ pnpm exec wave benchmark run --json
111
+ ```
112
+
113
+ The default output path is `.tmp/wave-benchmarks/latest/`.
114
+
115
+ These case runs are local benchmark artifacts, not committed run history.
116
+
117
+ Native mode is deterministic on purpose. `wave benchmark run` is meant to prove the coordination substrate before we move to live external suites. Its logged outputs are:
118
+
119
+ - per-case, per-arm `score`, `alignedScore`, `passed`, `direction`, `threshold`, `metrics`, `details`, and generated artifacts
120
+ - family summaries with direction-aligned mean score and pass rate
121
+ - arm comparisons with direction-aligned mean delta versus `single-agent` and bootstrap confidence intervals
122
+
123
+ When `waveControl` reporting is enabled, native runs publish `benchmark_run` and `benchmark_item` events through the same telemetry spine as live waves. For the full native-mode contract and the rationale for each metric, see [wave-benchmark-program.md](./wave-benchmark-program.md) and [proof-metrics.md](../reference/proof-metrics.md).
124
+
125
+ ## External Benchmark Workflow
126
+
127
+ The current direct external benchmark path starts with `SWE-bench Pro`.
128
+
129
+ Why:
130
+
131
+ - it keeps the first direct benchmark grounded in real repository bug-fix work
132
+ - it has a public harness and official verifier path
133
+ - it lets Wave compare `single-agent` and `full-wave` arms under matched settings
134
+
135
+ The second direct benchmark slot is intentionally deferred until a later CooperBench-oriented pass.
136
+
137
+ The frozen direct pilot is:
138
+
139
+ - `docs/evals/pilots/swe-bench-pro-public-pilot.json`
140
+
141
+ There is also a review-only diagnostic subset:
142
+
143
+ - `docs/evals/pilots/swe-bench-pro-public-full-wave-review-10.json`
144
+
145
+ Useful commands:
146
+
147
+ ```bash
148
+ pnpm exec wave benchmark external-list
149
+ pnpm exec wave benchmark external-show --adapter swe-bench-pro --json
150
+ pnpm exec wave benchmark external-pilots --json
151
+ pnpm exec wave benchmark external-run --adapter swe-bench-pro --command-config docs/evals/external-command-config.swe-bench-pro.json --dry-run --json
152
+ pnpm exec wave benchmark external-run --adapter swe-bench-pro --manifest docs/evals/pilots/swe-bench-pro-public-full-wave-review-10.json --arm full-wave --command-config docs/evals/external-command-config.swe-bench-pro.json --json
153
+ ```
154
+
155
+ For the first honest comparison:
156
+
157
+ - compare only `single-agent` and `full-wave`
158
+ - do not change model, executor, or budget assumptions between those two arms
159
+ - treat review-only subsets as diagnostic material, not as canonical pairwise comparison evidence
160
+
161
+ Each `wave benchmark external-run` output directory now includes:
162
+
163
+ - `results.json`
164
+ - `results.md`
165
+ - `failure-review.json`
166
+ - `failure-review.md`
167
+
168
+ Start with `failure-review.md` when a review-only batch returns many failures. It splits
169
+ verifier-image issues, setup or harness failures, trustworthy patch failures, and dry-run
170
+ planning-only output so the batch is easier to interpret.
171
+
172
+ When `waveControl` reporting is enabled, benchmark runs also publish through the same telemetry
173
+ spine as live waves:
174
+
175
+ - `benchmark_run` for the batch configuration and attestation hash
176
+ - `benchmark_item` for each task/arm execution
177
+ - `verification` for official harness output and linked verifier artifacts
178
+ - `review` for publishability, validity, and failure classification
179
+
180
+ That keeps benchmark trust evidence queryable alongside the runtime traces that produced it.
181
+
87
182
  ## How To Choose The Right Family
88
183
 
89
184
  Choose the family based on the failure you are most worried about, not just on the surface area being changed.
@@ -163,6 +258,6 @@ The benchmark catalog does not yet store:
163
258
 
164
259
  - local benchmark run history
165
260
  - local-vs-paper delta computation
166
- - automated benchmark execution plans
261
+ - a second direct benchmark beyond the current SWE-bench Pro path
167
262
 
168
263
  For now it is the schema and policy layer that keeps eval authoring, `cont-EVAL`, and coordination benchmarking aligned.
@@ -0,0 +1,13 @@
1
+ ---
2
+ title: "External Benchmark Arm Templates"
3
+ summary: "Frozen orchestration templates for honest external benchmark comparisons."
4
+ ---
5
+
6
+ # External Benchmark Arm Templates
7
+
8
+ These templates define the only two arm shapes used in the first honest external benchmark runs:
9
+
10
+ - `single-agent`
11
+ - `full-wave`
12
+
13
+ They are intentionally narrow so external benchmarks compare orchestration shape rather than silently changing model, executor, or budget assumptions.
@@ -0,0 +1,15 @@
1
+ {
2
+ "armId": "full-wave",
3
+ "title": "Full Wave Orchestration",
4
+ "roles": ["implementation", "cont-eval", "integration", "documentation", "cont-qa"],
5
+ "includeContEval": true,
6
+ "includeIntegrationSteward": true,
7
+ "includeDocumentationSteward": true,
8
+ "tracesRequired": true,
9
+ "notes": [
10
+ "Includes cont-EVAL (E0), the integration steward (A8), the documentation steward (A9), and cont-QA (A0) in addition to implementation owners.",
11
+ "Compiled summaries and targeted inboxes are part of the arm behavior.",
12
+ "Proof-bounded closure and trace capture are required.",
13
+ "Must use the same model, executor, budget, and benchmark verifier assumptions as the single-agent baseline."
14
+ ]
15
+ }
@@ -0,0 +1,15 @@
1
+ {
2
+ "armId": "single-agent",
3
+ "title": "Single Agent Baseline",
4
+ "roles": ["implementation"],
5
+ "includeContEval": false,
6
+ "includeIntegrationSteward": false,
7
+ "includeDocumentationSteward": false,
8
+ "tracesRequired": false,
9
+ "notes": [
10
+ "One implementation owner only.",
11
+ "No specialist decomposition.",
12
+ "No cont-EVAL, integration steward, or documentation steward.",
13
+ "Used only when model, executor, verifier, and budget are held constant relative to full-wave."
14
+ ]
15
+ }
@@ -120,6 +120,7 @@
120
120
  "private-evidence-integration": {
121
121
  "title": "Private Evidence Integration",
122
122
  "summary": "Checks whether separately observed facts are integrated into the final answer rather than merely repeated in conversation.",
123
+ "localCases": ["wave-hidden-profile-private-evidence"],
123
124
  "goal": "Measure end-to-end integration of distributed evidence into a coherent outcome.",
124
125
  "failureModes": [
125
126
  "communication-without-integration",
@@ -149,6 +150,7 @@
149
150
  "premature-consensus-guard": {
150
151
  "title": "Premature Consensus Guard",
151
152
  "summary": "Checks whether the system delays closure when important evidence is still siloed.",
153
+ "localCases": ["wave-premature-closure-guard"],
152
154
  "goal": "Measure resistance to converging early on shared but incomplete evidence.",
153
155
  "failureModes": [
154
156
  "premature-consensus",
@@ -205,6 +207,7 @@
205
207
  "cross-agent-state-reconstruction": {
206
208
  "title": "Cross-Agent State Reconstruction",
207
209
  "summary": "Checks whether the final shared state reflects facts that no single agent started with alone.",
210
+ "localCases": ["wave-silo-cross-agent-state"],
208
211
  "goal": "Measure whether the blackboard can reconstruct a correct global state from distributed local views.",
209
212
  "failureModes": [
210
213
  "information-silo",
@@ -363,6 +366,7 @@
363
366
  "lockstep-resolution": {
364
367
  "title": "Lockstep Resolution",
365
368
  "summary": "Checks whether the framework resolves many-way concurrent dependencies without circular waiting.",
369
+ "localCases": ["wave-simultaneous-lockstep"],
366
370
  "goal": "Measure coordination quality when several blocking tickets must resolve together.",
367
371
  "failureModes": [
368
372
  "circular-wait",
@@ -429,6 +433,7 @@
429
433
  "expert-routing-preservation": {
430
434
  "title": "Expert Routing Preservation",
431
435
  "summary": "Checks whether capability-targeted work is routed to the best available owner and stays there through closure.",
436
+ "localCases": ["wave-expert-routing-preservation"],
432
437
  "goal": "Measure whether the harness protects expert ownership instead of diluting it.",
433
438
  "failureModes": [
434
439
  "expert-underuse",
@@ -541,6 +546,7 @@
541
546
  "inbox-targeting-fidelity": {
542
547
  "title": "Inbox Targeting Fidelity",
543
548
  "summary": "Checks whether relevant facts reach the agents that own the impacted paths, components, or requests.",
549
+ "localCases": ["wave-blackboard-inbox-targeting"],
544
550
  "goal": "Measure whether inbox targeting reduces silos instead of creating them.",
545
551
  "failureModes": [
546
552
  "mis-targeted-context",
@@ -606,6 +612,7 @@
606
612
  "claim-conflict-detection": {
607
613
  "title": "Claim Conflict Detection",
608
614
  "summary": "Checks whether incompatible claims are surfaced in coordination or integration instead of passing through silently.",
615
+ "localCases": ["wave-contradiction-conflict"],
609
616
  "goal": "Measure whether the framework sees contradictory evidence before final closure.",
610
617
  "failureModes": [
611
618
  "false-consensus",
@@ -0,0 +1,47 @@
1
+ ---
2
+ title: "Wave Benchmark Cases"
3
+ summary: "Deterministic local benchmark cases for Wave-native coordination, routing, and closure evaluation."
4
+ ---
5
+
6
+ # Wave Benchmark Cases
7
+
8
+ Each file in this directory defines one deterministic benchmark case consumed by `wave benchmark`.
9
+
10
+ ## Why These Cases Exist
11
+
12
+ The benchmark catalog describes *what* a benchmark is meant to measure. These case files provide the local executable fixtures that let the repo score those ideas consistently.
13
+
14
+ They are designed to be:
15
+
16
+ - cheap
17
+ - deterministic
18
+ - transparent
19
+ - rooted in current Wave surfaces such as summaries, inboxes, request routing, and closure guards
20
+
21
+ ## File Shape
22
+
23
+ Each case file is a single JSON object with:
24
+
25
+ - `id`
26
+ - `familyId`
27
+ - `benchmarkId`
28
+ - `supportedArms`
29
+ - `fixture`
30
+ - `expectations`
31
+ - `scoring`
32
+
33
+ ## Current Arms
34
+
35
+ The runner currently compares:
36
+
37
+ - `single-agent`
38
+ - `multi-agent-minimal`
39
+ - `full-wave`
40
+
41
+ The `full-wave-plus-improvement` arm is supported by the loader for later benchmark-improvement loops but is not part of the initial deterministic corpus.
42
+
43
+ ## Current Limitation
44
+
45
+ The initial corpus is projection-backed rather than live-run-backed. It evaluates how well the current Wave substrate compiles and routes coordination state before we spend runtime budget on larger live suites.
46
+
47
+ That is intentional for the first milestone. The next layer will add trace-backed and external benchmark adapters on top of this format.
@@ -0,0 +1,73 @@
1
+ {
2
+ "version": 1,
3
+ "id": "wave-blackboard-inbox-targeting",
4
+ "title": "Inbox Targeting Fidelity",
5
+ "summary": "Critical doc and runtime facts should survive projection into the right owner inboxes instead of staying buried in raw coordination.",
6
+ "familyId": "blackboard-fidelity",
7
+ "benchmarkId": "inbox-targeting-fidelity",
8
+ "kind": "projection",
9
+ "supportedArms": ["single-agent", "multi-agent-minimal", "full-wave"],
10
+ "scoring": {
11
+ "kind": "projection-fidelity",
12
+ "primaryMetric": "targeted-inbox-recall",
13
+ "thresholds": {
14
+ "targeted-inbox-recall": 100,
15
+ "projection-consistency-rate": 100
16
+ },
17
+ "practicalWinThreshold": 20
18
+ },
19
+ "expectations": {
20
+ "summaryFacts": ["docs must note the new queue retry ceiling"],
21
+ "targetedInboxes": {
22
+ "a9": ["docs must note the new queue retry ceiling"],
23
+ "a1": ["runtime must enforce retry ceiling 3 before enqueue"]
24
+ }
25
+ },
26
+ "fixture": {
27
+ "lane": "main",
28
+ "waveNumber": 0,
29
+ "primaryAgentId": "a1",
30
+ "agents": [
31
+ {
32
+ "agentId": "a1",
33
+ "title": "Runtime Owner",
34
+ "ownedPaths": ["src/queue/retries.ts"],
35
+ "capabilities": ["runtime"]
36
+ },
37
+ {
38
+ "agentId": "a9",
39
+ "title": "Documentation Steward",
40
+ "ownedPaths": ["docs/plans/current-state.md"],
41
+ "capabilities": ["documentation"]
42
+ }
43
+ ],
44
+ "records": [
45
+ {
46
+ "id": "req-runtime-retry-cap",
47
+ "kind": "request",
48
+ "lane": "main",
49
+ "wave": 0,
50
+ "agentId": "a8",
51
+ "targets": ["agent:a1"],
52
+ "status": "open",
53
+ "priority": "high",
54
+ "artifactRefs": ["src/queue/retries.ts"],
55
+ "summary": "runtime must enforce retry ceiling 3 before enqueue",
56
+ "detail": "This runtime-facing fact must stay visible to the implementation owner."
57
+ },
58
+ {
59
+ "id": "req-doc-retry-cap",
60
+ "kind": "blocker",
61
+ "lane": "main",
62
+ "wave": 0,
63
+ "agentId": "a8",
64
+ "targets": ["agent:a9"],
65
+ "status": "open",
66
+ "priority": "normal",
67
+ "artifactRefs": ["docs/plans/current-state.md"],
68
+ "summary": "docs must note the new queue retry ceiling",
69
+ "detail": "The documentation steward needs the same coordination fact in a docs-owned surface."
70
+ }
71
+ ]
72
+ }
73
+ }
@@ -0,0 +1,104 @@
1
+ {
2
+ "version": 1,
3
+ "id": "wave-contradiction-conflict",
4
+ "title": "Claim Conflict Detection",
5
+ "summary": "Conflicting claims should be surfaced to the integration steward and converted into explicit repair work.",
6
+ "familyId": "contradiction-recovery",
7
+ "benchmarkId": "claim-conflict-detection",
8
+ "kind": "projection",
9
+ "supportedArms": ["single-agent", "multi-agent-minimal", "full-wave"],
10
+ "scoring": {
11
+ "kind": "contradiction-recovery",
12
+ "primaryMetric": "contradiction-detection-rate",
13
+ "thresholds": {
14
+ "contradiction-detection-rate": 100,
15
+ "repair-closure-rate": 100
16
+ },
17
+ "practicalWinThreshold": 25
18
+ },
19
+ "expectations": {
20
+ "targetedInboxes": {
21
+ "a8": [
22
+ "claim one: config flag enable_fast_path is safe in prod",
23
+ "claim two: config flag enable_fast_path leaks stale auth data"
24
+ ]
25
+ },
26
+ "requiredAssignments": [
27
+ {
28
+ "requestId": "repair-fast-path",
29
+ "assignedAgentId": "a8"
30
+ }
31
+ ]
32
+ },
33
+ "fixture": {
34
+ "lane": "main",
35
+ "waveNumber": 0,
36
+ "primaryAgentId": "a1",
37
+ "capabilityRouting": {
38
+ "preferredAgents": {
39
+ "integration": ["a8"]
40
+ }
41
+ },
42
+ "agents": [
43
+ {
44
+ "agentId": "a1",
45
+ "title": "Runtime Owner",
46
+ "ownedPaths": ["src/auth/fast-path.ts"],
47
+ "capabilities": ["runtime"]
48
+ },
49
+ {
50
+ "agentId": "a2",
51
+ "title": "Security Reviewer",
52
+ "ownedPaths": ["src/auth/session.ts"],
53
+ "capabilities": ["security"]
54
+ },
55
+ {
56
+ "agentId": "a8",
57
+ "title": "Integration Steward",
58
+ "ownedPaths": [".tmp/main-wave-launcher/integration/wave-0.md"],
59
+ "capabilities": ["integration"]
60
+ }
61
+ ],
62
+ "records": [
63
+ {
64
+ "id": "claim-fast-safe",
65
+ "kind": "claim",
66
+ "lane": "main",
67
+ "wave": 0,
68
+ "agentId": "a1",
69
+ "targets": ["agent:a8"],
70
+ "status": "open",
71
+ "priority": "high",
72
+ "artifactRefs": ["src/auth/fast-path.ts"],
73
+ "summary": "claim one: config flag enable_fast_path is safe in prod",
74
+ "detail": "The runtime owner believes the fast path is safe."
75
+ },
76
+ {
77
+ "id": "claim-fast-leak",
78
+ "kind": "claim",
79
+ "lane": "main",
80
+ "wave": 0,
81
+ "agentId": "a2",
82
+ "targets": ["agent:a8"],
83
+ "status": "open",
84
+ "priority": "high",
85
+ "artifactRefs": ["src/auth/session.ts", "src/auth/fast-path.ts"],
86
+ "summary": "claim two: config flag enable_fast_path leaks stale auth data",
87
+ "detail": "The security reviewer believes the same flag is unsafe."
88
+ },
89
+ {
90
+ "id": "repair-fast-path",
91
+ "kind": "request",
92
+ "lane": "main",
93
+ "wave": 0,
94
+ "agentId": "a8",
95
+ "targets": ["capability:integration"],
96
+ "status": "open",
97
+ "priority": "high",
98
+ "artifactRefs": ["src/auth/fast-path.ts"],
99
+ "summary": "Turn the contradiction into explicit repair work before closure",
100
+ "detail": "Integration must own the conflict repair loop."
101
+ }
102
+ ]
103
+ }
104
+ }
@@ -0,0 +1,69 @@
1
+ {
2
+ "version": 1,
3
+ "id": "wave-expert-routing-preservation",
4
+ "title": "Expert Routing Preservation",
5
+ "summary": "A capability-targeted request should route to the database specialist rather than diffuse into generic ownership.",
6
+ "familyId": "expertise-leverage",
7
+ "benchmarkId": "expert-routing-preservation",
8
+ "kind": "projection",
9
+ "supportedArms": ["single-agent", "multi-agent-minimal", "full-wave"],
10
+ "scoring": {
11
+ "kind": "expertise-routing",
12
+ "primaryMetric": "capability-routing-precision",
13
+ "thresholds": {
14
+ "capability-routing-precision": 100,
15
+ "expert-preservation-rate": 100
16
+ },
17
+ "practicalWinThreshold": 25
18
+ },
19
+ "expectations": {
20
+ "requiredAssignments": [
21
+ {
22
+ "requestId": "req-covering-index",
23
+ "assignedAgentId": "a2"
24
+ }
25
+ ],
26
+ "targetedInboxes": {
27
+ "a2": ["database expert says add a covering index on tenant_id and updated_at"]
28
+ }
29
+ },
30
+ "fixture": {
31
+ "lane": "main",
32
+ "waveNumber": 0,
33
+ "primaryAgentId": "a1",
34
+ "capabilityRouting": {
35
+ "preferredAgents": {
36
+ "database": ["a2"]
37
+ }
38
+ },
39
+ "agents": [
40
+ {
41
+ "agentId": "a1",
42
+ "title": "Generalist Owner",
43
+ "ownedPaths": ["src/reporting/service.ts"],
44
+ "capabilities": ["runtime"]
45
+ },
46
+ {
47
+ "agentId": "a2",
48
+ "title": "Database Specialist",
49
+ "ownedPaths": ["db/indexes/reporting.sql"],
50
+ "capabilities": ["database"]
51
+ }
52
+ ],
53
+ "records": [
54
+ {
55
+ "id": "req-covering-index",
56
+ "kind": "request",
57
+ "lane": "main",
58
+ "wave": 0,
59
+ "agentId": "a8",
60
+ "targets": ["capability:database"],
61
+ "status": "open",
62
+ "priority": "high",
63
+ "artifactRefs": ["db/indexes/reporting.sql"],
64
+ "summary": "database expert says add a covering index on tenant_id and updated_at",
65
+ "detail": "The database specialist should own this request rather than having it averaged away."
66
+ }
67
+ ]
68
+ }
69
+ }
@@ -0,0 +1,81 @@
1
+ {
2
+ "version": 1,
3
+ "id": "wave-hidden-profile-private-evidence",
4
+ "title": "Private Evidence Integration",
5
+ "summary": "Critical facts are split across two specialists and only the full Wave arm should surface both through targeted inboxes.",
6
+ "familyId": "hidden-profile-pooling",
7
+ "benchmarkId": "private-evidence-integration",
8
+ "kind": "projection",
9
+ "supportedArms": ["single-agent", "multi-agent-minimal", "full-wave"],
10
+ "scoring": {
11
+ "kind": "distributed-info",
12
+ "primaryMetric": "distributed-info-accuracy",
13
+ "thresholds": {
14
+ "distributed-info-accuracy": 100
15
+ },
16
+ "practicalWinThreshold": 20
17
+ },
18
+ "expectations": {
19
+ "globalFacts": [
20
+ "customer-facing outage is isolated to shard blue",
21
+ "the rollback must preserve migration 20260321_add_backfill_guard"
22
+ ],
23
+ "targetedInboxes": {
24
+ "a1": ["customer-facing outage is isolated to shard blue"],
25
+ "a2": ["the rollback must preserve migration 20260321_add_backfill_guard"]
26
+ }
27
+ },
28
+ "fixture": {
29
+ "lane": "main",
30
+ "waveNumber": 0,
31
+ "primaryAgentId": "a1",
32
+ "agents": [
33
+ {
34
+ "agentId": "a1",
35
+ "title": "API Specialist",
36
+ "ownedPaths": ["src/api/server.ts"],
37
+ "capabilities": ["api"]
38
+ },
39
+ {
40
+ "agentId": "a2",
41
+ "title": "Migration Specialist",
42
+ "ownedPaths": ["db/migrations/20260321_add_backfill_guard.sql"],
43
+ "capabilities": ["database"]
44
+ },
45
+ {
46
+ "agentId": "a8",
47
+ "title": "Integration Steward",
48
+ "ownedPaths": [".tmp/main-wave-launcher/integration/wave-0.md"],
49
+ "capabilities": ["integration"]
50
+ }
51
+ ],
52
+ "records": [
53
+ {
54
+ "id": "req-blue-shard",
55
+ "kind": "request",
56
+ "lane": "main",
57
+ "wave": 0,
58
+ "agentId": "a8",
59
+ "targets": ["agent:a1"],
60
+ "status": "open",
61
+ "priority": "high",
62
+ "artifactRefs": ["src/api/server.ts"],
63
+ "summary": "Need API confirmation: customer-facing outage is isolated to shard blue",
64
+ "detail": "Integration cannot close until the API owner confirms shard blue is the only failing shard."
65
+ },
66
+ {
67
+ "id": "req-backfill-guard",
68
+ "kind": "request",
69
+ "lane": "main",
70
+ "wave": 0,
71
+ "agentId": "a8",
72
+ "targets": ["agent:a2"],
73
+ "status": "open",
74
+ "priority": "high",
75
+ "artifactRefs": ["db/migrations/20260321_add_backfill_guard.sql"],
76
+ "summary": "Need migration confirmation: the rollback must preserve migration 20260321_add_backfill_guard",
77
+ "detail": "The database owner holds the rollback-safe migration constraint."
78
+ }
79
+ ]
80
+ }
81
+ }