npm - @chllming/wave-orchestration - Versions diffs - 0.6.2 → 0.7.0 - Mend

@chllming/wave-orchestration 0.6.2 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (116) hide show

package/CHANGELOG.md +64 -1
package/README.md +44 -8
package/docs/agents/wave-orchestrator-role.md +50 -0
package/docs/agents/wave-planner-role.md +39 -0
package/docs/context7/bundles.json +9 -0
package/docs/context7/planner-agent/README.md +25 -0
package/docs/context7/planner-agent/manifest.json +83 -0
package/docs/context7/planner-agent/papers/cooperbench-why-coding-agents-cannot-be-your-teammates-yet.md +3283 -0
package/docs/context7/planner-agent/papers/dova-deliberation-first-multi-agent-orchestration-for-autonomous-research-automation.md +1699 -0
package/docs/context7/planner-agent/papers/dpbench-large-language-models-struggle-with-simultaneous-coordination.md +2251 -0
package/docs/context7/planner-agent/papers/incremental-planning-to-control-a-blackboard-based-problem-solver.md +1729 -0
package/docs/context7/planner-agent/papers/silo-bench-a-scalable-environment-for-evaluating-distributed-coordination-in-multi-agent-llm-systems.md +3747 -0
package/docs/context7/planner-agent/papers/todoevolve-learning-to-architect-agent-planning-systems.md +1675 -0
package/docs/context7/planner-agent/papers/verified-multi-agent-orchestration-a-plan-execute-verify-replan-framework-for-complex-query-resolution.md +1173 -0
package/docs/context7/planner-agent/papers/why-do-multi-agent-llm-systems-fail.md +5211 -0
package/docs/context7/planner-agent/topics/planning-and-orchestration.md +24 -0
package/docs/evals/README.md +96 -1
package/docs/evals/arm-templates/README.md +13 -0
package/docs/evals/arm-templates/full-wave.json +15 -0
package/docs/evals/arm-templates/single-agent.json +15 -0
package/docs/evals/benchmark-catalog.json +7 -0
package/docs/evals/cases/README.md +47 -0
package/docs/evals/cases/wave-blackboard-inbox-targeting.json +73 -0
package/docs/evals/cases/wave-contradiction-conflict.json +104 -0
package/docs/evals/cases/wave-expert-routing-preservation.json +69 -0
package/docs/evals/cases/wave-hidden-profile-private-evidence.json +81 -0
package/docs/evals/cases/wave-premature-closure-guard.json +71 -0
package/docs/evals/cases/wave-silo-cross-agent-state.json +77 -0
package/docs/evals/cases/wave-simultaneous-lockstep.json +92 -0
package/docs/evals/cooperbench/real-world-mitigation.md +341 -0
package/docs/evals/external-benchmarks.json +85 -0
package/docs/evals/external-command-config.sample.json +9 -0
package/docs/evals/external-command-config.swe-bench-pro.json +8 -0
package/docs/evals/pilots/README.md +47 -0
package/docs/evals/pilots/swe-bench-pro-public-full-wave-review-10.json +64 -0
package/docs/evals/pilots/swe-bench-pro-public-pilot.json +111 -0
package/docs/evals/wave-benchmark-program.md +302 -0
package/docs/guides/planner.md +48 -11
package/docs/plans/context7-wave-orchestrator.md +20 -0
package/docs/plans/current-state.md +9 -1
package/docs/plans/examples/wave-benchmark-improvement.md +108 -0
package/docs/plans/examples/wave-example-live-proof.md +1 -1
package/docs/plans/examples/wave-example-rollout-fidelity.md +340 -0
package/docs/plans/wave-orchestrator.md +73 -11
package/docs/plans/waves/reviews/wave-1-benchmark-operator.md +118 -0
package/docs/reference/coordination-and-closure.md +436 -0
package/docs/reference/live-proof-waves.md +25 -3
package/docs/reference/npmjs-trusted-publishing.md +3 -3
package/docs/reference/proof-metrics.md +90 -0
package/docs/reference/runtime-config/README.md +61 -0
package/docs/reference/sample-waves.md +29 -18
package/docs/reference/wave-control.md +164 -0
package/docs/reference/wave-planning-lessons.md +131 -0
package/package.json +5 -4
package/releases/manifest.json +33 -0
package/scripts/research/agent-context-archive.mjs +18 -0
package/scripts/research/manifests/agent-context-expanded-2026-03-22.mjs +17 -0
package/scripts/research/sync-planner-context7-bundle.mjs +133 -0
package/scripts/wave-autonomous.mjs +2 -4
package/scripts/wave-orchestrator/adhoc.mjs +32 -11
package/scripts/wave-orchestrator/artifact-schemas.mjs +232 -0
package/scripts/wave-orchestrator/autonomous.mjs +27 -6
package/scripts/wave-orchestrator/benchmark-cases.mjs +374 -0
package/scripts/wave-orchestrator/benchmark-external.mjs +1384 -0
package/scripts/wave-orchestrator/benchmark.mjs +972 -0
package/scripts/wave-orchestrator/clarification-triage.mjs +78 -12
package/scripts/wave-orchestrator/config.mjs +175 -0
package/scripts/wave-orchestrator/control-cli.mjs +1123 -0
package/scripts/wave-orchestrator/control-plane.mjs +697 -0
package/scripts/wave-orchestrator/coord-cli.mjs +360 -2
package/scripts/wave-orchestrator/coordination-store.mjs +211 -9
package/scripts/wave-orchestrator/coordination.mjs +84 -0
package/scripts/wave-orchestrator/dashboard-renderer.mjs +38 -3
package/scripts/wave-orchestrator/dashboard-state.mjs +22 -0
package/scripts/wave-orchestrator/evals.mjs +23 -0
package/scripts/wave-orchestrator/executors.mjs +3 -2
package/scripts/wave-orchestrator/feedback.mjs +55 -0
package/scripts/wave-orchestrator/install.mjs +253 -26
package/scripts/wave-orchestrator/launcher-closure.mjs +4 -1
package/scripts/wave-orchestrator/launcher-runtime.mjs +24 -21
package/scripts/wave-orchestrator/launcher.mjs +800 -35
package/scripts/wave-orchestrator/package-update-notice.mjs +230 -0
package/scripts/wave-orchestrator/package-version.mjs +32 -0
package/scripts/wave-orchestrator/planner-context.mjs +75 -0
package/scripts/wave-orchestrator/planner.mjs +2270 -136
package/scripts/wave-orchestrator/proof-cli.mjs +195 -0
package/scripts/wave-orchestrator/proof-registry.mjs +317 -0
package/scripts/wave-orchestrator/replay.mjs +10 -4
package/scripts/wave-orchestrator/retry-cli.mjs +184 -0
package/scripts/wave-orchestrator/retry-control.mjs +225 -0
package/scripts/wave-orchestrator/shared.mjs +26 -0
package/scripts/wave-orchestrator/swe-bench-pro-task.mjs +1004 -0
package/scripts/wave-orchestrator/traces.mjs +157 -2
package/scripts/wave-orchestrator/wave-control-client.mjs +532 -0
package/scripts/wave-orchestrator/wave-control-schema.mjs +309 -0
package/scripts/wave-orchestrator/wave-files.mjs +17 -5
package/scripts/wave.mjs +39 -2
package/skills/repo-coding-rules/SKILL.md +1 -0
package/skills/role-cont-eval/SKILL.md +1 -0
package/skills/role-cont-qa/SKILL.md +13 -6
package/skills/role-deploy/SKILL.md +1 -0
package/skills/role-documentation/SKILL.md +4 -0
package/skills/role-implementation/SKILL.md +4 -0
package/skills/role-infra/SKILL.md +2 -1
package/skills/role-integration/SKILL.md +15 -8
package/skills/role-planner/SKILL.md +39 -0
package/skills/role-planner/skill.json +21 -0
package/skills/role-research/SKILL.md +1 -0
package/skills/role-security/SKILL.md +2 -2
package/skills/runtime-claude/SKILL.md +2 -1
package/skills/runtime-codex/SKILL.md +1 -0
package/skills/runtime-local/SKILL.md +2 -0
package/skills/runtime-opencode/SKILL.md +1 -0
package/skills/wave-core/SKILL.md +25 -6
package/skills/wave-core/references/marker-syntax.md +16 -8
package/wave.config.json +45 -0

package/docs/evals/cases/wave-premature-closure-guard.json ADDED Viewed

@@ -0,0 +1,71 @@
+{
+  "version": 1,
+  "id": "wave-premature-closure-guard",
+  "title": "Premature Closure Guard",
+  "summary": "A clarification-linked repair request remains open, so the full Wave arm should preserve a blocking guard instead of converging early.",
+  "familyId": "hidden-profile-pooling",
+  "benchmarkId": "premature-consensus-guard",
+  "kind": "projection",
+  "supportedArms": ["single-agent", "multi-agent-minimal", "full-wave"],
+  "scoring": {
+    "kind": "closure-guard",
+    "primaryMetric": "premature-convergence-rate",
+    "thresholds": {
+      "premature-convergence-rate": 0
+    },
+    "practicalWinThreshold": 50
+  },
+  "expectations": {
+    "clarificationRequestIds": ["clarify-missing-evidence"],
+    "requireBlockingGuard": true
+  },
+  "fixture": {
+    "lane": "main",
+    "waveNumber": 0,
+    "primaryAgentId": "a1",
+    "agents": [
+      {
+        "agentId": "a1",
+        "title": "Implementation Owner",
+        "ownedPaths": ["src/runtime.ts"],
+        "capabilities": ["runtime"]
+      },
+      {
+        "agentId": "a8",
+        "title": "Integration Steward",
+        "ownedPaths": [".tmp/main-wave-launcher/integration/wave-0.md"],
+        "capabilities": ["integration"]
+      }
+    ],
+    "records": [
+      {
+        "id": "clarify-missing-evidence",
+        "kind": "clarification-request",
+        "lane": "main",
+        "wave": 0,
+        "agentId": "a1",
+        "targets": ["launcher"],
+        "status": "open",
+        "priority": "high",
+        "artifactRefs": ["src/runtime.ts"],
+        "summary": "Missing evidence for restart durability",
+        "detail": "The implementation owner still needs proof for restart durability."
+      },
+      {
+        "id": "repair-missing-evidence",
+        "kind": "request",
+        "lane": "main",
+        "wave": 0,
+        "agentId": "a8",
+        "targets": ["agent:a1"],
+        "status": "open",
+        "priority": "high",
+        "artifactRefs": ["src/runtime.ts"],
+        "dependsOn": ["clarify-missing-evidence"],
+        "closureCondition": "clarification:clarify-missing-evidence",
+        "summary": "Repair request is still open while clarification remains unresolved",
+        "detail": "This request must remain blocking until the clarification chain is closed."
+      }
+    ]
+  }
+}

package/docs/evals/cases/wave-silo-cross-agent-state.json ADDED Viewed

@@ -0,0 +1,77 @@
+{
+  "version": 1,
+  "id": "wave-silo-cross-agent-state",
+  "title": "Cross-Agent State Reconstruction",
+  "summary": "The correct diagnosis requires facts from both the API and queue owners and should reconstruct cleanly only in the full Wave arm.",
+  "familyId": "silo-escape",
+  "benchmarkId": "cross-agent-state-reconstruction",
+  "kind": "projection",
+  "supportedArms": ["single-agent", "multi-agent-minimal", "full-wave"],
+  "scoring": {
+    "kind": "state-reconstruction",
+    "primaryMetric": "global-state-reconstruction-rate",
+    "thresholds": {
+      "global-state-reconstruction-rate": 100,
+      "summary-fact-retention-rate": 100
+    },
+    "practicalWinThreshold": 20
+  },
+  "expectations": {
+    "globalFacts": [
+      "api retries are saturating the worker queue",
+      "queue lag only spikes after the retry fanout begins"
+    ],
+    "summaryFacts": ["api retries are saturating the worker queue"],
+    "targetedInboxes": {
+      "a1": ["queue lag only spikes after the retry fanout begins"],
+      "a2": ["api retries are saturating the worker queue"]
+    }
+  },
+  "fixture": {
+    "lane": "main",
+    "waveNumber": 0,
+    "primaryAgentId": "a1",
+    "agents": [
+      {
+        "agentId": "a1",
+        "title": "API Owner",
+        "ownedPaths": ["src/api/retries.ts"],
+        "capabilities": ["api"]
+      },
+      {
+        "agentId": "a2",
+        "title": "Queue Owner",
+        "ownedPaths": ["src/queue/worker.ts"],
+        "capabilities": ["queue"]
+      }
+    ],
+    "records": [
+      {
+        "id": "block-api-fanout",
+        "kind": "blocker",
+        "lane": "main",
+        "wave": 0,
+        "agentId": "a1",
+        "targets": ["agent:a2"],
+        "status": "open",
+        "priority": "high",
+        "artifactRefs": ["src/api/retries.ts", "src/queue/worker.ts"],
+        "summary": "api retries are saturating the worker queue",
+        "detail": "The API owner sees retry fanout but needs queue evidence to reconstruct the full state."
+      },
+      {
+        "id": "block-queue-lag",
+        "kind": "blocker",
+        "lane": "main",
+        "wave": 0,
+        "agentId": "a2",
+        "targets": ["agent:a1"],
+        "status": "open",
+        "priority": "high",
+        "artifactRefs": ["src/queue/worker.ts", "src/api/retries.ts"],
+        "summary": "queue lag only spikes after the retry fanout begins",
+        "detail": "The queue owner sees the lag pattern but needs the API owner's retry context."
+      }
+    ]
+  }
+}

package/docs/evals/cases/wave-simultaneous-lockstep.json ADDED Viewed

@@ -0,0 +1,92 @@
+{
+  "version": 1,
+  "id": "wave-simultaneous-lockstep",
+  "title": "Lockstep Resolution",
+  "summary": "Two concurrent blocking requests should route to different specialists instead of collapsing into unresolved contention.",
+  "familyId": "simultaneous-coordination",
+  "benchmarkId": "lockstep-resolution",
+  "kind": "projection",
+  "supportedArms": ["single-agent", "multi-agent-minimal", "full-wave"],
+  "scoring": {
+    "kind": "simultaneous-coordination",
+    "primaryMetric": "contention-resolution-rate",
+    "thresholds": {
+      "contention-resolution-rate": 100,
+      "symmetry-breaking-rate": 100,
+      "deadlock-rate": 0
+    },
+    "practicalWinThreshold": 30
+  },
+  "expectations": {
+    "requiredAssignments": [
+      {
+        "requestId": "req-cache-guard",
+        "assignedAgentId": "a2"
+      },
+      {
+        "requestId": "req-queue-budget",
+        "assignedAgentId": "a3"
+      }
+    ],
+    "minimumDistinctAssignedAgents": 2
+  },
+  "fixture": {
+    "lane": "main",
+    "waveNumber": 0,
+    "primaryAgentId": "a1",
+    "capabilityRouting": {
+      "preferredAgents": {
+        "cache": ["a2"],
+        "queue": ["a3"]
+      }
+    },
+    "agents": [
+      {
+        "agentId": "a1",
+        "title": "Primary Owner",
+        "ownedPaths": ["src/runtime.ts"],
+        "capabilities": ["runtime"]
+      },
+      {
+        "agentId": "a2",
+        "title": "Cache Owner",
+        "ownedPaths": ["src/cache/guard.ts"],
+        "capabilities": ["cache"]
+      },
+      {
+        "agentId": "a3",
+        "title": "Queue Owner",
+        "ownedPaths": ["src/queue/budget.ts"],
+        "capabilities": ["queue"]
+      }
+    ],
+    "records": [
+      {
+        "id": "req-cache-guard",
+        "kind": "request",
+        "lane": "main",
+        "wave": 0,
+        "agentId": "a8",
+        "targets": ["capability:cache"],
+        "status": "open",
+        "priority": "high",
+        "artifactRefs": ["src/cache/guard.ts"],
+        "summary": "Concurrent fix one: cache guard must be updated before release",
+        "detail": "This blocking request should route to the cache owner."
+      },
+      {
+        "id": "req-queue-budget",
+        "kind": "request",
+        "lane": "main",
+        "wave": 0,
+        "agentId": "a8",
+        "targets": ["capability:queue"],
+        "status": "open",
+        "priority": "high",
+        "artifactRefs": ["src/queue/budget.ts"],
+        "summary": "Concurrent fix two: queue budget must be updated before release",
+        "detail": "This blocking request should route to the queue owner."
+      }
+    ]
+  }
+}

package/docs/evals/cooperbench/real-world-mitigation.md ADDED Viewed

@@ -0,0 +1,341 @@
+---
+summary: "Comparison of CooperBench coordination failure modes against LEAP-Claw Wave 7-10 traces, with concrete examples and the wave-framework countermeasures that helped or still leaked"
+read_when:
+  - You want to compare LEAP-Claw wave traces to the coordination failure taxonomy in CooperBench
+  - You need exact local message examples instead of a general impression
+  - You are deciding whether the wave framework mostly mitigates or still exhibits multi-agent coordination failures
+title: "CooperBench Versus LEAP-Claw Waves"
+---
+# CooperBench Versus LEAP-Claw Waves
+This report compares the failure taxonomy from
+[CooperBench](https://cooperbench.com/static/pdfs/main.pdf) with the concrete
+execution history from LEAP-Claw Waves 7-10.
+The short conclusion is:
+- we do still see the same broad classes of coordination failure that
+  CooperBench describes
+- the wave framework mitigates many of them by turning them into explicit,
+  machine-visible gate failures instead of silent merge-time corruption
+- the remaining gaps are mostly around stale state, retry semantics, and
+  escalation timing rather than uncontrolled code conflicts
+## Scope and evidence base
+This comparison uses:
+- Wave 7 rerun traces and remediation notes
+- Wave 8 execution-gap review
+- Wave 9 and Wave 10 launcher dashboards, summaries, and coordination traces
+- the current wave role prompts and wave-file structure
+Primary local evidence:
+- [Wave 7.1 Remediation](/home/coder/slowfast.ai/docs/plans/waves/reviews/wave-7.1-remediation.md)
+- [Wave 8 Execution Gap Review](/home/coder/slowfast.ai/docs/plans/waves/reviews/wave-8-execution-gap-review.md)
+- [Wave Planning Lessons](/home/coder/slowfast.ai/docs/plans/waves/reviews/wave-planning-lessons.md)
+- [Wave 10](/home/coder/slowfast.ai/docs/plans/waves/wave-10.md)
+- [Wave Integration Role](/home/coder/slowfast.ai/docs/agents/wave-integration-role.md)
+- [Wave Documentation Role](/home/coder/slowfast.ai/docs/agents/wave-documentation-role.md)
+- [Wave Evaluator Role](/home/coder/slowfast.ai/docs/agents/wave-evaluator-role.md)
+## The paper's three failure buckets
+CooperBench groups coordination failure into three buckets:
+1. communication channels become noisy, late, or inaccurate
+2. agents fail to carry out or preserve their commitments
+3. agents form incorrect beliefs about what their partners did, saw, or meant
+That grouping fits our traces very well.
+## 1. Communication failures: still present, but far more legible
+### What CooperBench warns about
+The paper highlights communication that is vague, late, repetitive, or
+incorrect. The practical problem is not merely "too much chat"; it is that
+messages fail to drive timely coordinated action.
+### Exact LEAP-Claw example: routed clarification plus immediate human escalation
+Wave 10 produced the clearest example.
+In the same coordination chain:
+- A7 asked for approved rollout drill and rollback commands
+- ownership policy routed that clarification to `A1`
+- the launcher still opened a human escalation immediately
+The exact records are visible in the archived Wave 10 trace:
+- clarification moved to `in_progress` with `detail: "Ownership policy resolved this clarification to A1."` in [coordination.raw.jsonl](/home/coder/slowfast.ai/.tmp/retry-archive/wave-10-20260322T195609Z/wave-10-traces/attempt-2/coordination.raw.jsonl#L28)
+- routed follow-up opened for `agent:A1` in [coordination.raw.jsonl](/home/coder/slowfast.ai/.tmp/retry-archive/wave-10-20260322T195609Z/wave-10-traces/attempt-2/coordination.raw.jsonl#L29)
+- explicit assignment to `A1` recorded in [coordination.raw.jsonl](/home/coder/slowfast.ai/.tmp/retry-archive/wave-10-20260322T195609Z/wave-10-traces/attempt-2/coordination.raw.jsonl#L30)
+- a human escalation for the same issue opened immediately afterward in [coordination.raw.jsonl](/home/coder/slowfast.ai/.tmp/retry-archive/wave-10-20260322T195609Z/wave-10-traces/attempt-2/coordination.raw.jsonl#L31)
+This is a genuine communication failure mode. The framework did not prevent the
+duplication. It created both a machine-routed clarification and a human ticket
+for the same issue before the routed path was exhausted.
+### What countered it
+The wave framework still improved the situation substantially:
+- the issue was recorded in durable structured logs rather than disappearing in
+  chat
+- the queue was inspectable with `pnpm wave:feedback -- list --lane leap-claw --pending`
+- the operator could answer the request with an exact command surface, and the
+  request file recorded that answer
+So the failure was not silent. The framework converted a latent ambiguity into a
+visible triage problem. That is better than raw agent-to-agent chat, but it is
+still an unresolved planner bug.
+### Secondary communication example: accurate but late handoff
+`A1` eventually resolved A7's question very clearly. The archived trace shows:
+- `A1` handoff: `"A7 clarification answered: approved Wave 10 command surface is on disk"` in [coordination.raw.jsonl](/home/coder/slowfast.ai/.tmp/retry-archive/wave-10-20260322T195609Z/wave-10-traces/attempt-2/coordination.raw.jsonl#L37)
+- `A1` resolved-by-policy note: `"Wave 10 A7 clarification resolved by published command surface and stop rules"` in [coordination.raw.jsonl](/home/coder/slowfast.ai/.tmp/retry-archive/wave-10-20260322T195609Z/wave-10-traces/attempt-2/coordination.raw.jsonl#L40)
+This is a positive sign: the agents can produce good coordination messages. The
+problem is reliability and timing, not total absence of the capability.
+## 2. Commitment drift: heavily mitigated, but still common
+### What CooperBench warns about
+The paper highlights agents making claims they do not operationally cash out,
+or failing to preserve agreed coordination points even after substantive work is
+done.
+### Exact LEAP-Claw example: work landed, protocol still failed
+Wave 10 `A1` shows this cleanly.
+On attempt 1, the launcher failed `A1` because the final structured proof marker
+was missing:
+- `"Implementation exit contract blocked wave 10: Missing [wave-proof] marker for A1."` in [wave-10.json](/home/coder/slowfast.ai/.tmp/leap-claw-wave-launcher/dashboards/wave-10.json#L205)
+But the agent had already landed the owned files:
+- `go/internal/rollout/apply/pilot_integration_test.go`
+- `go/internal/rollout/apply/rollback_switch.go`
+- `docs/plans/operations/wave-10-rollout-drill.md`
+Those deliverables appear in the later clean summary in [wave-10-10-a1.summary.json](/home/coder/slowfast.ai/.tmp/leap-claw-wave-launcher/status/wave-10-10-a1.summary.json#L43).
+This is not "the agent did nothing." It is closer to CooperBench's commitment
+drift pattern:
+- the substantive implementation commitment was met
+- the wave-protocol commitment was not met
+- the framework therefore refused to infer completion
+### Exact LEAP-Claw example: closure agents and formatting discipline
+Wave 7 exposed the same class of issue at closure level rather than
+implementation level.
+The remediation record states:
+- structured marker parsing was too brittle for backtick-wrapped or fenced
+  markers in [wave-7.1-remediation.md](/home/coder/slowfast.ai/docs/plans/waves/reviews/wave-7.1-remediation.md#L17)
+- local fixes then required A0, A8, and A9 to emit final markers as plain last
+  lines in [wave-7.1-remediation.md](/home/coder/slowfast.ai/docs/plans/waves/reviews/wave-7.1-remediation.md#L42)
+Again, the framework did not stop the omission. But it did keep the omission
+from becoming a false success.
+### What countered it
+This is where the wave framework helps the most.
+The repo now explicitly counteracts commitment drift with:
+- structured marker requirements for A8, A9, and A0 in [wave-10.md](/home/coder/slowfast.ai/docs/plans/waves/wave-10.md#L50)
+- explicit `### Deliverables` and `### Proof artifacts` in [wave-10.md](/home/coder/slowfast.ai/docs/plans/waves/wave-10.md#L171)
+- a standing implementation skill that says landed files without required
+  markers are not done
+- A8, A9, and A0 closure gates that refuse to treat intent as closure
+So yes, we still see commitment drift. But the framework mostly catches it as a
+protocol failure before the lane advances.
+## 3. Incorrect expectations: this is our biggest remaining problem
+### What CooperBench warns about
+The paper's third bucket is incorrect expectations about others' plans,
+observations, or communication. In practice, this causes duplicate work,
+mis-sequencing, or reasoning from stale or partial state.
+### Exact LEAP-Claw example: stale status reuse in live-proof waves
+Wave 8 documented this explicitly.
+The review records:
+- stale generated state was reused too aggressively in [wave-8-execution-gap-review.md](/home/coder/slowfast.ai/docs/plans/waves/reviews/wave-8-execution-gap-review.md#L122)
+- `A3` had exited `0` without a closure-grade summary, yet that stale status
+  was treated as reusable in [wave-8-execution-gap-review.md](/home/coder/slowfast.ai/docs/plans/waves/reviews/wave-8-execution-gap-review.md#L128)
+- `A6` reused an obsolete proof-gap summary after the missing live proof bundle
+  already existed in [wave-8-execution-gap-review.md](/home/coder/slowfast.ai/docs/plans/waves/reviews/wave-8-execution-gap-review.md#L130)
+This maps directly to the paper's "incorrect expectations" bucket. The runtime
+was effectively reasoning as if prior agent observations were still current.
+### Exact LEAP-Claw example: shared-component retry stranded sibling owners
+Wave 10 retry showed an even sharper version.
+After the second `A1` attempt, the clean summary explicitly said:
+- `proof.state = met` in [wave-10-10-a1.summary.json](/home/coder/slowfast.ai/.tmp/leap-claw-wave-launcher/status/wave-10-10-a1.summary.json#L6)
+- the remaining component gap was outside A1 and belonged to live pilot
+  authority in [wave-10-10-a1.summary.json](/home/coder/slowfast.ai/.tmp/leap-claw-wave-launcher/status/wave-10-10-a1.summary.json#L25)
+But the dashboard still ended the wave at A1:
+- `A1` ended `Exit component-gap` in [wave-10.json](/home/coder/slowfast.ai/.tmp/leap-claw-wave-launcher/dashboards/wave-10.json#L116)
+- `A2` stayed pending with `"Stale status=0 ignored due to prompt drift or missing metadata"` in [wave-10.json](/home/coder/slowfast.ai/.tmp/leap-claw-wave-launcher/dashboards/wave-10.json#L139)
+- `A7` stayed pending with the same stale-state message in [wave-10.json](/home/coder/slowfast.ai/.tmp/leap-claw-wave-launcher/dashboards/wave-10.json#L162)
+This is not a simple code-quality problem. It is a coordination-state problem:
+- the launcher knew the remaining `pilot-live` gap was sibling-owned
+- the launcher still treated `A1` as the terminal failing point
+That is very close to the paper's claim that agents or systems form incorrect
+expectations about partner state and then act on the wrong mental model.
+### Exact LEAP-Claw example: stale integration and closure artifacts
+Wave 7 also hit this category. The remediation note records:
+- final closure artifacts could stay stale or synthesized instead of reflecting
+  the authoritative rerun in [wave-7.1-remediation.md](/home/coder/slowfast.ai/docs/plans/waves/reviews/wave-7.1-remediation.md#L21)
+That is again an expectations problem: the system continued to act as if earlier
+closure state was still authoritative.
+### What countered it
+The wave framework pushes hard against this class of error, but it does not
+eliminate it.
+The main countermeasures are:
+- A8 as a dedicated integration steward that checks contradictions and proof gaps
+  before docs and evaluation in [wave-integration-role.md](/home/coder/slowfast.ai/docs/agents/wave-integration-role.md#L35)
+- A9 refusing to treat early doc updates as final if integration is not closed
+  in [wave-documentation-role.md](/home/coder/slowfast.ai/docs/agents/wave-documentation-role.md#L57)
+- A0 treating the final closure sweep as authoritative in [wave-evaluator-role.md](/home/coder/slowfast.ai/docs/agents/wave-evaluator-role.md#L128)
+- explicit proof-bundle doctrine for `pilot-live` and above in
+  [wave-planning-lessons.md](/home/coder/slowfast.ai/docs/plans/waves/reviews/wave-planning-lessons.md#L18)
+These are real mitigations. They are why stale or wrong expectations usually
+show up as blocked waves rather than false passes.
+But this is still the area where the runtime leaks most.
+## 4. Failure modes we mostly avoid because of the framework
+CooperBench centers on workspaces with overlapping code and partial observability.
+We do share the partial-observability problem, but the wave framework avoids
+some of the worst merge-era failure modes by design.
+### Resource-division failures are much rarer
+Wave files impose explicit resource division.
+Wave 10 does this in the open:
+- A1 owns `go/internal/rollout/apply/` plus one runbook in [wave-10.md](/home/coder/slowfast.ai/docs/plans/waves/wave-10.md#L192)
+- A2 owns `go/internal/rollout/shadow/`, `go/internal/cluster/view/rollout_status_test.go`, and one QA doc in [wave-10.md](/home/coder/slowfast.ai/docs/plans/waves/wave-10.md#L252)
+- A7 owns the live proof bundle and review note in [wave-10.md](/home/coder/slowfast.ai/docs/plans/waves/wave-10.md#L258)
+This is close to the paper's successful "resource division" pattern. The key
+difference is that our framework makes the split declarative up front instead of
+hoping the agents negotiate it reliably in freeform chat.
+### Role division is strong
+The framework also forces role division:
+- implementation agents own concrete deliverables
+- A8 owns cross-agent coherence, not code delivery
+- A9 owns shared-plan synchronization
+- A0 owns final gate truth
+That division is encoded in the wave file and standing role prompts, not only in
+agent memory.
+In practice, this means many failures that would become destructive code
+overwrites in a looser system instead become:
+- missing markers
+- unresolved component gaps
+- stale-state reuse bugs
+- over-eager escalations
+Those are still real problems, but they are safer problems.
+## 5. What the framework is actually doing
+The paper argues that many systems rely on scaffolds and active supervision
+rather than raw cooperative ability. That is also true here.
+The wave framework is not evidence that the agents have solved social
+intelligence. It is evidence that we have built stronger external scaffolding:
+- explicit ownership
+- explicit deliverables
+- explicit proof artifacts
+- explicit maturity levels
+- explicit integration and evaluator gates
+- durable coordination records
+This scaffolding does three useful things:
+1. it reduces ambiguous coordination space
+2. it makes hidden contradictions visible
+3. it keeps many failures from being mistaken for success
+That is a meaningful mitigation, but it is not the same as eliminating the
+underlying coordination problem.
+## 6. Bottom line
+The honest comparison is:
+- yes, we still see the CooperBench failure classes in real wave traces
+- no, they usually do not show up as uncontrolled agent chaos
+- instead, they show up as:
+  - duplicated escalation paths
+  - missing marker failures
+  - stale closure or status reuse
+  - shared-component retry bugs
+So the wave framework mostly mitigates these failures by containing them,
+surfacing them, and refusing to advance the lane on bad coordination state.
+What it does not yet fully solve:
+- premature or duplicated escalation
+- stale-state invalidation for high-maturity waves
+- shared-component retry semantics once one owner becomes clean
+- the gap between "agent landed a correct slice" and "the runtime moved the
+  whole shared component forward correctly"
+That means the right claim is not "the framework solves multi-agent
+coordination." The right claim is:
+- it meaningfully narrows the failure surface
+- it converts many soft coordination mistakes into explicit gate failures
+- it still needs better runtime behavior around retries, stale state, and
+  escalation timing

package/docs/evals/external-benchmarks.json ADDED Viewed

@@ -0,0 +1,85 @@
+{
+  "version": 1,
+  "adapters": [
+    {
+      "id": "swe-bench-pro",
+      "title": "SWE-bench Pro",
+      "mode": "direct",
+      "sourceBenchmark": "SWE-bench Pro",
+      "split": "public",
+      "pilotManifestPath": "docs/evals/pilots/swe-bench-pro-public-pilot.json",
+      "officialDocsUrl": "https://scaleapi.github.io/SWE-bench_Pro-os/",
+      "officialCodeUrl": "https://github.com/scaleapi/SWE-bench_Pro-os",
+      "summary": "Contamination-resistant long-horizon software engineering benchmark for public, held-out, and commercial repositories.",
+      "commandTemplate": "",
+      "metrics": ["task-success-rate", "cost-per-solved-task", "wall-clock-per-solved-task"],
+      "notes": [
+        "Use the public split for the first direct external benchmark run and rely on the official verifier for the pass/fail verdict.",
+        "Keep the base model, executor, and budget identical across the `single-agent` and `full-wave` arms.",
+        "The second direct benchmark slot is intentionally deferred until the later CooperBench pass."
+      ]
+    },
+    {
+      "id": "skillsbench-style-ablation",
+      "title": "SkillsBench-style Ablation",
+      "mode": "adapted",
+      "sourceBenchmark": "SkillsBench",
+      "summary": "Adapt the SkillsBench methodology to Wave skill bundles by comparing no skills, curated skills, and overbroad skills.",
+      "commandTemplate": "wave benchmark run --arm single-agent --arm multi-agent-minimal --arm full-wave",
+      "metrics": ["pass-rate-delta", "negative-skill-regression-rate", "runtime-cost"],
+      "notes": [
+        "This is a local adaptation rather than a direct external suite.",
+        "The initial repo benchmark runner ships the local corpus and registry, not the full external execution harness."
+      ]
+    },
+    {
+      "id": "evoclaw-style-sequence",
+      "title": "EvoClaw-style Sequence",
+      "mode": "adapted",
+      "sourceBenchmark": "EvoClaw",
+      "summary": "Sequence multiple dependent waves to measure long-horizon maintenance and error accumulation.",
+      "commandTemplate": "wave benchmark run --arm single-agent --arm full-wave --family silo-escape",
+      "metrics": ["milestone-pass-decay", "reopen-rate", "regression-carryover"],
+      "notes": [
+        "Use the local benchmark harness to define milestone DAGs or ordered wave sequences.",
+        "Best used after the deterministic coordination corpus is stable."
+      ]
+    },
+    {
+      "id": "silo-bench-style-coordination",
+      "title": "Silo-Bench-style Coordination",
+      "mode": "adapted",
+      "sourceBenchmark": "Silo-Bench",
+      "summary": "Distributed-information and communication-reasoning-gap evaluations adapted into Wave-native coordination fixtures.",
+      "commandTemplate": "wave benchmark run --family hidden-profile-pooling --family silo-escape",
+      "metrics": ["distributed-info-accuracy", "global-state-reconstruction-rate", "communication-reasoning-gap"],
+      "notes": [
+        "The shipped local cases in docs/evals/cases/ are the first adaptation layer for this family."
+      ]
+    },
+    {
+      "id": "hiddenbench-style-pooling",
+      "title": "HiddenBench-style Pooling",
+      "mode": "adapted",
+      "sourceBenchmark": "HiddenBench",
+      "summary": "Asymmetric-information tasks that focus specifically on whether decision-changing private evidence reaches shared state before closure.",
+      "commandTemplate": "wave benchmark run --family hidden-profile-pooling",
+      "metrics": ["distributed-info-accuracy", "premature-convergence-rate"],
+      "notes": [
+        "This is the recommended next coordination benchmark after the first SWE-bench Pro pilot."
+      ]
+    },
+    {
+      "id": "dpbench-style-contention",
+      "title": "DPBench-style Contention",
+      "mode": "adapted",
+      "sourceBenchmark": "DPBench",
+      "summary": "Simultaneous coordination and contention cases adapted into capability-routing and helper-assignment fixtures.",
+      "commandTemplate": "wave benchmark run --family simultaneous-coordination",
+      "metrics": ["deadlock-rate", "contention-resolution-rate", "symmetry-breaking-rate"],
+      "notes": [
+        "The initial local corpus measures the routing and blocking substrate before live concurrent execution is added."
+      ]
+    }
+  ]
+}

package/docs/evals/external-command-config.sample.json ADDED Viewed

@@ -0,0 +1,9 @@
+{
+  "adapters": {
+    "swe-bench-pro": {
+      "single-agent": "external-harness run --benchmark swe-bench-pro --task {task_id} --arm {arm} --model {model_id} --executor {executor_command}",
+      "full-wave": "external-harness run --benchmark swe-bench-pro --task {task_id} --arm {arm} --model {model_id} --executor {executor_command}",
+      "verify": "external-harness verify --benchmark swe-bench-pro --task {task_id} --arm {arm}"
+    }
+  }
+}