npm - @chllming/wave-orchestration - Versions diffs - 0.6.3 → 0.7.1 - Mend

@chllming/wave-orchestration 0.6.3 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (118)

package/CHANGELOG.md +82 -1
package/README.md +40 -7
package/docs/agents/wave-orchestrator-role.md +50 -0
package/docs/agents/wave-planner-role.md +39 -0
package/docs/context7/bundles.json +9 -0
package/docs/context7/planner-agent/README.md +25 -0
package/docs/context7/planner-agent/manifest.json +83 -0
package/docs/context7/planner-agent/papers/cooperbench-why-coding-agents-cannot-be-your-teammates-yet.md +3283 -0
package/docs/context7/planner-agent/papers/dova-deliberation-first-multi-agent-orchestration-for-autonomous-research-automation.md +1699 -0
package/docs/context7/planner-agent/papers/dpbench-large-language-models-struggle-with-simultaneous-coordination.md +2251 -0
package/docs/context7/planner-agent/papers/incremental-planning-to-control-a-blackboard-based-problem-solver.md +1729 -0
package/docs/context7/planner-agent/papers/silo-bench-a-scalable-environment-for-evaluating-distributed-coordination-in-multi-agent-llm-systems.md +3747 -0
package/docs/context7/planner-agent/papers/todoevolve-learning-to-architect-agent-planning-systems.md +1675 -0
package/docs/context7/planner-agent/papers/verified-multi-agent-orchestration-a-plan-execute-verify-replan-framework-for-complex-query-resolution.md +1173 -0
package/docs/context7/planner-agent/papers/why-do-multi-agent-llm-systems-fail.md +5211 -0
package/docs/context7/planner-agent/topics/planning-and-orchestration.md +24 -0
package/docs/evals/README.md +96 -1
package/docs/evals/arm-templates/README.md +13 -0
package/docs/evals/arm-templates/full-wave.json +15 -0
package/docs/evals/arm-templates/single-agent.json +15 -0
package/docs/evals/benchmark-catalog.json +7 -0
package/docs/evals/cases/README.md +47 -0
package/docs/evals/cases/wave-blackboard-inbox-targeting.json +73 -0
package/docs/evals/cases/wave-contradiction-conflict.json +104 -0
package/docs/evals/cases/wave-expert-routing-preservation.json +69 -0
package/docs/evals/cases/wave-hidden-profile-private-evidence.json +81 -0
package/docs/evals/cases/wave-premature-closure-guard.json +71 -0
package/docs/evals/cases/wave-silo-cross-agent-state.json +77 -0
package/docs/evals/cases/wave-simultaneous-lockstep.json +92 -0
package/docs/evals/cooperbench/real-world-mitigation.md +341 -0
package/docs/evals/external-benchmarks.json +85 -0
package/docs/evals/external-command-config.sample.json +9 -0
package/docs/evals/external-command-config.swe-bench-pro.json +8 -0
package/docs/evals/pilots/README.md +47 -0
package/docs/evals/pilots/swe-bench-pro-public-full-wave-review-10.json +64 -0
package/docs/evals/pilots/swe-bench-pro-public-pilot.json +111 -0
package/docs/evals/wave-benchmark-program.md +302 -0
package/docs/guides/planner.md +67 -11
package/docs/guides/terminal-surfaces.md +12 -0
package/docs/plans/context7-wave-orchestrator.md +20 -0
package/docs/plans/current-state.md +8 -1
package/docs/plans/examples/wave-benchmark-improvement.md +108 -0
package/docs/plans/examples/wave-example-live-proof.md +1 -1
package/docs/plans/examples/wave-example-rollout-fidelity.md +340 -0
package/docs/plans/migration.md +26 -0
package/docs/plans/wave-orchestrator.md +60 -12
package/docs/plans/waves/reviews/wave-1-benchmark-operator.md +118 -0
package/docs/reference/cli-reference.md +547 -0
package/docs/reference/coordination-and-closure.md +436 -0
package/docs/reference/live-proof-waves.md +25 -3
package/docs/reference/npmjs-trusted-publishing.md +3 -3
package/docs/reference/proof-metrics.md +90 -0
package/docs/reference/runtime-config/README.md +63 -2
package/docs/reference/runtime-config/codex.md +2 -1
package/docs/reference/sample-waves.md +29 -18
package/docs/reference/wave-control.md +164 -0
package/docs/reference/wave-planning-lessons.md +131 -0
package/package.json +5 -4
package/releases/manifest.json +40 -0
package/scripts/research/agent-context-archive.mjs +18 -0
package/scripts/research/manifests/agent-context-expanded-2026-03-22.mjs +17 -0
package/scripts/research/sync-planner-context7-bundle.mjs +133 -0
package/scripts/wave-orchestrator/agent-state.mjs +11 -2
package/scripts/wave-orchestrator/artifact-schemas.mjs +232 -0
package/scripts/wave-orchestrator/autonomous.mjs +7 -0
package/scripts/wave-orchestrator/benchmark-cases.mjs +374 -0
package/scripts/wave-orchestrator/benchmark-external.mjs +1384 -0
package/scripts/wave-orchestrator/benchmark.mjs +972 -0
package/scripts/wave-orchestrator/clarification-triage.mjs +78 -12
package/scripts/wave-orchestrator/config.mjs +175 -0
package/scripts/wave-orchestrator/control-cli.mjs +1216 -0
package/scripts/wave-orchestrator/control-plane.mjs +697 -0
package/scripts/wave-orchestrator/coord-cli.mjs +360 -2
package/scripts/wave-orchestrator/coordination-store.mjs +211 -9
package/scripts/wave-orchestrator/coordination.mjs +84 -0
package/scripts/wave-orchestrator/dashboard-renderer.mjs +120 -5
package/scripts/wave-orchestrator/dashboard-state.mjs +22 -0
package/scripts/wave-orchestrator/evals.mjs +23 -0
package/scripts/wave-orchestrator/executors.mjs +3 -2
package/scripts/wave-orchestrator/feedback.mjs +55 -0
package/scripts/wave-orchestrator/install.mjs +151 -2
package/scripts/wave-orchestrator/launcher-closure.mjs +4 -1
package/scripts/wave-orchestrator/launcher-runtime.mjs +33 -30
package/scripts/wave-orchestrator/launcher.mjs +884 -36
package/scripts/wave-orchestrator/planner-context.mjs +75 -0
package/scripts/wave-orchestrator/planner.mjs +2270 -136
package/scripts/wave-orchestrator/proof-cli.mjs +195 -0
package/scripts/wave-orchestrator/proof-registry.mjs +317 -0
package/scripts/wave-orchestrator/replay.mjs +10 -4
package/scripts/wave-orchestrator/retry-cli.mjs +184 -0
package/scripts/wave-orchestrator/retry-control.mjs +225 -0
package/scripts/wave-orchestrator/shared.mjs +26 -0
package/scripts/wave-orchestrator/swe-bench-pro-task.mjs +1004 -0
package/scripts/wave-orchestrator/terminals.mjs +1 -1
package/scripts/wave-orchestrator/traces.mjs +157 -2
package/scripts/wave-orchestrator/wave-control-client.mjs +532 -0
package/scripts/wave-orchestrator/wave-control-schema.mjs +309 -0
package/scripts/wave-orchestrator/wave-files.mjs +144 -23
package/scripts/wave.mjs +27 -0
package/skills/repo-coding-rules/SKILL.md +1 -0
package/skills/role-cont-eval/SKILL.md +1 -0
package/skills/role-cont-qa/SKILL.md +13 -6
package/skills/role-deploy/SKILL.md +1 -0
package/skills/role-documentation/SKILL.md +4 -0
package/skills/role-implementation/SKILL.md +4 -0
package/skills/role-infra/SKILL.md +2 -1
package/skills/role-integration/SKILL.md +15 -8
package/skills/role-planner/SKILL.md +39 -0
package/skills/role-planner/skill.json +21 -0
package/skills/role-research/SKILL.md +1 -0
package/skills/role-security/SKILL.md +2 -2
package/skills/runtime-claude/SKILL.md +2 -1
package/skills/runtime-codex/SKILL.md +1 -0
package/skills/runtime-local/SKILL.md +2 -0
package/skills/runtime-opencode/SKILL.md +1 -0
package/skills/wave-core/SKILL.md +25 -6
package/skills/wave-core/references/marker-syntax.md +16 -8
package/wave.config.json +45 -0

package/docs/context7/planner-agent/topics/planning-and-orchestration.md ADDED Viewed

@@ -0,0 +1,24 @@
+---
+summary: 'Curated planning and orchestration corpus exported for the agentic planner Context7 bundle.'
+read_when:
+  - You are publishing or refreshing the planner-agentic Context7 library
+  - You need the exact planner research subset that Wave ships for agentic planning
+title: 'Planner Agentic Context7 Corpus'
+---
+# Planner Agentic Context7 Corpus
+This file is the tracked topic index for the planner-specific Context7 corpus.
+It intentionally references only the copied files that ship under
+`docs/context7/planner-agent/`.
+## Included papers
+- [Verified Multi-Agent Orchestration: A Plan-Execute-Verify-Replan Framework for Complex Query Resolution](../papers/verified-multi-agent-orchestration-a-plan-execute-verify-replan-framework-for-complex-query-resolution.md)
+- [TodoEvolve: Learning to Architect Agent Planning Systems](../papers/todoevolve-learning-to-architect-agent-planning-systems.md)
+- [DOVA: Deliberation-First Multi-Agent Orchestration for Autonomous Research Automation](../papers/dova-deliberation-first-multi-agent-orchestration-for-autonomous-research-automation.md)
+- [Why Do Multi-Agent LLM Systems Fail?](../papers/why-do-multi-agent-llm-systems-fail.md)
+- [Silo-Bench: A Scalable Environment for Evaluating Distributed Coordination in Multi-Agent LLM Systems](../papers/silo-bench-a-scalable-environment-for-evaluating-distributed-coordination-in-multi-agent-llm-systems.md)
+- [DPBench: Large Language Models Struggle with Simultaneous Coordination](../papers/dpbench-large-language-models-struggle-with-simultaneous-coordination.md)
+- [CooperBench: Why Coding Agents Cannot be Your Teammates Yet](../papers/cooperbench-why-coding-agents-cannot-be-your-teammates-yet.md)
+- [Incremental Planning to Control a Blackboard-Based Problem Solver](../papers/incremental-planning-to-control-a-blackboard-based-problem-solver.md)

package/docs/evals/README.md CHANGED Viewed

@@ -7,13 +7,21 @@ summary: "How to use delegated benchmark families, pinned benchmarks, and coordi
 Wave's benchmark catalog lives in `docs/evals/benchmark-catalog.json`.
+The executable local case corpus lives in `docs/evals/cases/`, and the benchmark runner is available through `wave benchmark`.
+Frozen external pilot manifests live in `docs/evals/pilots/`, and external comparison arm templates live in `docs/evals/arm-templates/`.
+An example command-template config shape lives in `docs/evals/external-command-config.sample.json`.
+A runnable SWE-bench Pro config for the local task harness lives in `docs/evals/external-command-config.swe-bench-pro.json`.
 It has three jobs:
 - give `cont-EVAL` a repo-governed menu of allowed benchmark families and benchmark ids
 - document what each benchmark is trying to catch, including coordination failure modes and static paper baselines
+- optionally point from benchmark ids to repo-local deterministic benchmark cases through `localCases`
 The catalog is reference metadata, not a run-history database. It tells the wave author and `cont-EVAL` what kinds of checks are allowed and what external benchmark or paper baseline those checks map to.
+The local case corpus is the executable side of that metadata. It gives the repo a deterministic way to score the current Wave substrate on summary fidelity, targeted inbox recall, capability routing, contradiction handling, and closure guards before moving on to costlier live suites.
 For a full authored wave example that uses these patterns, see [docs/reference/sample-waves.md](../reference/sample-waves.md).
 These benchmark families are also Wave's operator-facing vocabulary for common MAS failure modes. For the research-side framing and the current architectural gaps, see [docs/research/coordination-failure-review.md](../research/coordination-failure-review.md).
@@ -84,6 +92,93 @@ The coordination-oriented families currently included in the catalog are:
 - `contradiction-recovery`
   Use when the risk is false consensus, unresolved conflicting claims, or clarification chains that appear resolved without real repair.
+## Local Case Corpus
+The repo now ships deterministic local benchmark cases under `docs/evals/cases/`.
+Each case:
+- binds to one benchmark family and benchmark id
+- defines a coordination fixture plus expected facts, inboxes, assignments, or closure guards
+- is executable through `wave benchmark run`
+Useful commands:
+```bash
+pnpm exec wave benchmark list
+pnpm exec wave benchmark show --case wave-hidden-profile-private-evidence --json
+pnpm exec wave benchmark run --json
+```
+The default output path is `.tmp/wave-benchmarks/latest/`.
+These case runs are local benchmark artifacts, not committed run history.
+Native mode is deterministic on purpose. `wave benchmark run` is meant to prove the coordination substrate before we move to live external suites. Its logged outputs are:
+- per-case, per-arm `score`, `alignedScore`, `passed`, `direction`, `threshold`, `metrics`, `details`, and generated artifacts
+- family summaries with direction-aligned mean score and pass rate
+- arm comparisons with direction-aligned mean delta versus `single-agent` and bootstrap confidence intervals
+When `waveControl` reporting is enabled, native runs publish `benchmark_run` and `benchmark_item` events through the same telemetry spine as live waves. For the full native-mode contract and the rationale for each metric, see [wave-benchmark-program.md](./wave-benchmark-program.md) and [proof-metrics.md](../reference/proof-metrics.md).
+## External Benchmark Workflow
+The current direct external benchmark path starts with `SWE-bench Pro`.
+Why:
+- it keeps the first direct benchmark grounded in real repository bug-fix work
+- it has a public harness and official verifier path
+- it lets Wave compare `single-agent` and `full-wave` arms under matched settings
+The second direct benchmark slot is intentionally deferred until a later CooperBench-oriented pass.
+The frozen direct pilot is:
+- `docs/evals/pilots/swe-bench-pro-public-pilot.json`
+There is also a review-only diagnostic subset:
+- `docs/evals/pilots/swe-bench-pro-public-full-wave-review-10.json`
+Useful commands:
+```bash
+pnpm exec wave benchmark external-list
+pnpm exec wave benchmark external-show --adapter swe-bench-pro --json
+pnpm exec wave benchmark external-pilots --json
+pnpm exec wave benchmark external-run --adapter swe-bench-pro --command-config docs/evals/external-command-config.swe-bench-pro.json --dry-run --json
+pnpm exec wave benchmark external-run --adapter swe-bench-pro --manifest docs/evals/pilots/swe-bench-pro-public-full-wave-review-10.json --arm full-wave --command-config docs/evals/external-command-config.swe-bench-pro.json --json
+```
+For the first honest comparison:
+- compare only `single-agent` and `full-wave`
+- do not change model, executor, or budget assumptions between those two arms
+- treat review-only subsets as diagnostic material, not as canonical pairwise comparison evidence
+Each `wave benchmark external-run` output directory now includes:
+- `results.json`
+- `results.md`
+- `failure-review.json`
+- `failure-review.md`
+Start with `failure-review.md` when a review-only batch returns many failures. It splits
+verifier-image issues, setup or harness failures, trustworthy patch failures, and dry-run
+planning-only output so the batch is easier to interpret.
+When `waveControl` reporting is enabled, benchmark runs also publish through the same telemetry
+spine as live waves:
+- `benchmark_run` for the batch configuration and attestation hash
+- `benchmark_item` for each task/arm execution
+- `verification` for official harness output and linked verifier artifacts
+- `review` for publishability, validity, and failure classification
+That keeps benchmark trust evidence queryable alongside the runtime traces that produced it.
 ## How To Choose The Right Family
 Choose the family based on the failure you are most worried about, not just on the surface area being changed.
@@ -163,6 +258,6 @@ The benchmark catalog does not yet store:
 - local benchmark run history
 - local-vs-paper delta computation
-- automated benchmark execution plans
+- a second direct benchmark beyond the current SWE-bench Pro path
 For now it is the schema and policy layer that keeps eval authoring, `cont-EVAL`, and coordination benchmarking aligned.

package/docs/evals/arm-templates/README.md ADDED Viewed

@@ -0,0 +1,13 @@
+---
+title: "External Benchmark Arm Templates"
+summary: "Frozen orchestration templates for honest external benchmark comparisons."
+---
+# External Benchmark Arm Templates
+These templates define the only two arm shapes used in the first honest external benchmark runs:
+- `single-agent`
+- `full-wave`
+They are intentionally narrow so external benchmarks compare orchestration shape rather than silently changing model, executor, or budget assumptions.

package/docs/evals/arm-templates/full-wave.json ADDED Viewed

@@ -0,0 +1,15 @@
+{
+  "armId": "full-wave",
+  "title": "Full Wave Orchestration",
+  "roles": ["implementation", "cont-eval", "integration", "documentation", "cont-qa"],
+  "includeContEval": true,
+  "includeIntegrationSteward": true,
+  "includeDocumentationSteward": true,
+  "tracesRequired": true,
+  "notes": [
+    "Includes E0, A8, A9, and A0 in addition to implementation owners.",
+    "Compiled summaries and targeted inboxes are part of the arm behavior.",
+    "Proof-bounded closure and trace capture are required.",
+    "Must use the same model, executor, budget, and benchmark verifier assumptions as the single-agent baseline."
+  ]
+}

package/docs/evals/arm-templates/single-agent.json ADDED Viewed

@@ -0,0 +1,15 @@
+{
+  "armId": "single-agent",
+  "title": "Single Agent Baseline",
+  "roles": ["implementation"],
+  "includeContEval": false,
+  "includeIntegrationSteward": false,
+  "includeDocumentationSteward": false,
+  "tracesRequired": false,
+  "notes": [
+    "One implementation owner only.",
+    "No specialist decomposition.",
+    "No cont-EVAL, integration steward, or documentation steward.",
+    "Used only when model, executor, verifier, and budget are held constant relative to full-wave."
+  ]
+}

package/docs/evals/benchmark-catalog.json CHANGED Viewed

@@ -120,6 +120,7 @@
         "private-evidence-integration": {
           "title": "Private Evidence Integration",
           "summary": "Checks whether separately observed facts are integrated into the final answer rather than merely repeated in conversation.",
+          "localCases": ["wave-hidden-profile-private-evidence"],
           "goal": "Measure end-to-end integration of distributed evidence into a coherent outcome.",
           "failureModes": [
             "communication-without-integration",
@@ -149,6 +150,7 @@
         "premature-consensus-guard": {
           "title": "Premature Consensus Guard",
           "summary": "Checks whether the system delays closure when important evidence is still siloed.",
+          "localCases": ["wave-premature-closure-guard"],
           "goal": "Measure resistance to converging early on shared but incomplete evidence.",
           "failureModes": [
             "premature-consensus",
@@ -205,6 +207,7 @@
         "cross-agent-state-reconstruction": {
           "title": "Cross-Agent State Reconstruction",
           "summary": "Checks whether the final shared state reflects facts that no single agent started with alone.",
+          "localCases": ["wave-silo-cross-agent-state"],
           "goal": "Measure whether the blackboard can reconstruct a correct global state from distributed local views.",
           "failureModes": [
             "information-silo",
@@ -363,6 +366,7 @@
         "lockstep-resolution": {
           "title": "Lockstep Resolution",
           "summary": "Checks whether the framework resolves many-way concurrent dependencies without circular waiting.",
+          "localCases": ["wave-simultaneous-lockstep"],
           "goal": "Measure coordination quality when several blocking tickets must resolve together.",
           "failureModes": [
             "circular-wait",
@@ -429,6 +433,7 @@
         "expert-routing-preservation": {
           "title": "Expert Routing Preservation",
           "summary": "Checks whether capability-targeted work is routed to the best available owner and stays there through closure.",
+          "localCases": ["wave-expert-routing-preservation"],
           "goal": "Measure whether the harness protects expert ownership instead of diluting it.",
           "failureModes": [
             "expert-underuse",
@@ -541,6 +546,7 @@
         "inbox-targeting-fidelity": {
           "title": "Inbox Targeting Fidelity",
           "summary": "Checks whether relevant facts reach the agents that own the impacted paths, components, or requests.",
+          "localCases": ["wave-blackboard-inbox-targeting"],
           "goal": "Measure whether inbox targeting reduces silos instead of creating them.",
           "failureModes": [
             "mis-targeted-context",
@@ -606,6 +612,7 @@
         "claim-conflict-detection": {
           "title": "Claim Conflict Detection",
           "summary": "Checks whether incompatible claims are surfaced in coordination or integration instead of passing through silently.",
+          "localCases": ["wave-contradiction-conflict"],
           "goal": "Measure whether the framework sees contradictory evidence before final closure.",
           "failureModes": [
             "false-consensus",

package/docs/evals/cases/README.md ADDED Viewed

@@ -0,0 +1,47 @@
+---
+title: "Wave Benchmark Cases"
+summary: "Deterministic local benchmark cases for Wave-native coordination, routing, and closure evaluation."
+---
+# Wave Benchmark Cases
+Each file in this directory defines one deterministic benchmark case consumed by `wave benchmark`.
+## Why These Cases Exist
+The benchmark catalog describes *what* a benchmark is meant to measure. These case files provide the local executable fixtures that let the repo score those ideas consistently.
+They are designed to be:
+- cheap
+- deterministic
+- transparent
+- rooted in current Wave surfaces such as summaries, inboxes, request routing, and closure guards
+## File Shape
+Each case file is a single JSON object with:
+- `id`
+- `familyId`
+- `benchmarkId`
+- `supportedArms`
+- `fixture`
+- `expectations`
+- `scoring`
+## Current Arms
+The runner currently compares:
+- `single-agent`
+- `multi-agent-minimal`
+- `full-wave`
+The `full-wave-plus-improvement` arm is supported by the loader for later benchmark-improvement loops but is not part of the initial deterministic corpus.
+## Current Limitation
+The initial corpus is projection-backed rather than live-run-backed. It evaluates how well the current Wave substrate compiles and routes coordination state before we spend runtime budget on larger live suites.
+That is intentional for the first milestone. The next layer will add trace-backed and external benchmark adapters on top of this format.

package/docs/evals/cases/wave-blackboard-inbox-targeting.json ADDED Viewed

@@ -0,0 +1,73 @@
+{
+  "version": 1,
+  "id": "wave-blackboard-inbox-targeting",
+  "title": "Inbox Targeting Fidelity",
+  "summary": "Critical doc and runtime facts should survive projection into the right owner inboxes instead of staying buried in raw coordination.",
+  "familyId": "blackboard-fidelity",
+  "benchmarkId": "inbox-targeting-fidelity",
+  "kind": "projection",
+  "supportedArms": ["single-agent", "multi-agent-minimal", "full-wave"],
+  "scoring": {
+    "kind": "projection-fidelity",
+    "primaryMetric": "targeted-inbox-recall",
+    "thresholds": {
+      "targeted-inbox-recall": 100,
+      "projection-consistency-rate": 100
+    },
+    "practicalWinThreshold": 20
+  },
+  "expectations": {
+    "summaryFacts": ["docs must note the new queue retry ceiling"],
+    "targetedInboxes": {
+      "a9": ["docs must note the new queue retry ceiling"],
+      "a1": ["runtime must enforce retry ceiling 3 before enqueue"]
+    }
+  },
+  "fixture": {
+    "lane": "main",
+    "waveNumber": 0,
+    "primaryAgentId": "a1",
+    "agents": [
+      {
+        "agentId": "a1",
+        "title": "Runtime Owner",
+        "ownedPaths": ["src/queue/retries.ts"],
+        "capabilities": ["runtime"]
+      },
+      {
+        "agentId": "a9",
+        "title": "Documentation Steward",
+        "ownedPaths": ["docs/plans/current-state.md"],
+        "capabilities": ["documentation"]
+      }
+    ],
+    "records": [
+      {
+        "id": "req-runtime-retry-cap",
+        "kind": "request",
+        "lane": "main",
+        "wave": 0,
+        "agentId": "a8",
+        "targets": ["agent:a1"],
+        "status": "open",
+        "priority": "high",
+        "artifactRefs": ["src/queue/retries.ts"],
+        "summary": "runtime must enforce retry ceiling 3 before enqueue",
+        "detail": "This runtime-facing fact must stay visible to the implementation owner."
+      },
+      {
+        "id": "req-doc-retry-cap",
+        "kind": "blocker",
+        "lane": "main",
+        "wave": 0,
+        "agentId": "a8",
+        "targets": ["agent:a9"],
+        "status": "open",
+        "priority": "normal",
+        "artifactRefs": ["docs/plans/current-state.md"],
+        "summary": "docs must note the new queue retry ceiling",
+        "detail": "The documentation steward needs the same coordination fact in a docs-owned surface."
+      }
+    ]
+  }
+}

package/docs/evals/cases/wave-contradiction-conflict.json ADDED Viewed

@@ -0,0 +1,104 @@
+{
+  "version": 1,
+  "id": "wave-contradiction-conflict",
+  "title": "Claim Conflict Detection",
+  "summary": "Conflicting claims should be surfaced to the integration steward and converted into explicit repair work.",
+  "familyId": "contradiction-recovery",
+  "benchmarkId": "claim-conflict-detection",
+  "kind": "projection",
+  "supportedArms": ["single-agent", "multi-agent-minimal", "full-wave"],
+  "scoring": {
+    "kind": "contradiction-recovery",
+    "primaryMetric": "contradiction-detection-rate",
+    "thresholds": {
+      "contradiction-detection-rate": 100,
+      "repair-closure-rate": 100
+    },
+    "practicalWinThreshold": 25
+  },
+  "expectations": {
+    "targetedInboxes": {
+      "a8": [
+        "claim one: config flag enable_fast_path is safe in prod",
+        "claim two: config flag enable_fast_path leaks stale auth data"
+      ]
+    },
+    "requiredAssignments": [
+      {
+        "requestId": "repair-fast-path",
+        "assignedAgentId": "a8"
+      }
+    ]
+  },
+  "fixture": {
+    "lane": "main",
+    "waveNumber": 0,
+    "primaryAgentId": "a1",
+    "capabilityRouting": {
+      "preferredAgents": {
+        "integration": ["a8"]
+      }
+    },
+    "agents": [
+      {
+        "agentId": "a1",
+        "title": "Runtime Owner",
+        "ownedPaths": ["src/auth/fast-path.ts"],
+        "capabilities": ["runtime"]
+      },
+      {
+        "agentId": "a2",
+        "title": "Security Reviewer",
+        "ownedPaths": ["src/auth/session.ts"],
+        "capabilities": ["security"]
+      },
+      {
+        "agentId": "a8",
+        "title": "Integration Steward",
+        "ownedPaths": [".tmp/main-wave-launcher/integration/wave-0.md"],
+        "capabilities": ["integration"]
+      }
+    ],
+    "records": [
+      {
+        "id": "claim-fast-safe",
+        "kind": "claim",
+        "lane": "main",
+        "wave": 0,
+        "agentId": "a1",
+        "targets": ["agent:a8"],
+        "status": "open",
+        "priority": "high",
+        "artifactRefs": ["src/auth/fast-path.ts"],
+        "summary": "claim one: config flag enable_fast_path is safe in prod",
+        "detail": "The runtime owner believes the fast path is safe."
+      },
+      {
+        "id": "claim-fast-leak",
+        "kind": "claim",
+        "lane": "main",
+        "wave": 0,
+        "agentId": "a2",
+        "targets": ["agent:a8"],
+        "status": "open",
+        "priority": "high",
+        "artifactRefs": ["src/auth/session.ts", "src/auth/fast-path.ts"],
+        "summary": "claim two: config flag enable_fast_path leaks stale auth data",
+        "detail": "The security reviewer believes the same flag is unsafe."
+      },
+      {
+        "id": "repair-fast-path",
+        "kind": "request",
+        "lane": "main",
+        "wave": 0,
+        "agentId": "a8",
+        "targets": ["capability:integration"],
+        "status": "open",
+        "priority": "high",
+        "artifactRefs": ["src/auth/fast-path.ts"],
+        "summary": "Turn the contradiction into explicit repair work before closure",
+        "detail": "Integration must own the conflict repair loop."
+      }
+    ]
+  }
+}

package/docs/evals/cases/wave-expert-routing-preservation.json ADDED Viewed

@@ -0,0 +1,69 @@
+{
+  "version": 1,
+  "id": "wave-expert-routing-preservation",
+  "title": "Expert Routing Preservation",
+  "summary": "A capability-targeted request should route to the database specialist rather than diffuse into generic ownership.",
+  "familyId": "expertise-leverage",
+  "benchmarkId": "expert-routing-preservation",
+  "kind": "projection",
+  "supportedArms": ["single-agent", "multi-agent-minimal", "full-wave"],
+  "scoring": {
+    "kind": "expertise-routing",
+    "primaryMetric": "capability-routing-precision",
+    "thresholds": {
+      "capability-routing-precision": 100,
+      "expert-preservation-rate": 100
+    },
+    "practicalWinThreshold": 25
+  },
+  "expectations": {
+    "requiredAssignments": [
+      {
+        "requestId": "req-covering-index",
+        "assignedAgentId": "a2"
+      }
+    ],
+    "targetedInboxes": {
+      "a2": ["database expert says add a covering index on tenant_id and updated_at"]
+    }
+  },
+  "fixture": {
+    "lane": "main",
+    "waveNumber": 0,
+    "primaryAgentId": "a1",
+    "capabilityRouting": {
+      "preferredAgents": {
+        "database": ["a2"]
+      }
+    },
+    "agents": [
+      {
+        "agentId": "a1",
+        "title": "Generalist Owner",
+        "ownedPaths": ["src/reporting/service.ts"],
+        "capabilities": ["runtime"]
+      },
+      {
+        "agentId": "a2",
+        "title": "Database Specialist",
+        "ownedPaths": ["db/indexes/reporting.sql"],
+        "capabilities": ["database"]
+      }
+    ],
+    "records": [
+      {
+        "id": "req-covering-index",
+        "kind": "request",
+        "lane": "main",
+        "wave": 0,
+        "agentId": "a8",
+        "targets": ["capability:database"],
+        "status": "open",
+        "priority": "high",
+        "artifactRefs": ["db/indexes/reporting.sql"],
+        "summary": "database expert says add a covering index on tenant_id and updated_at",
+        "detail": "The database specialist should own this request rather than having it averaged away."
+      }
+    ]
+  }
+}

package/docs/evals/cases/wave-hidden-profile-private-evidence.json ADDED Viewed

@@ -0,0 +1,81 @@
+{
+  "version": 1,
+  "id": "wave-hidden-profile-private-evidence",
+  "title": "Private Evidence Integration",
+  "summary": "Critical facts are split across two specialists and only the full Wave arm should surface both through targeted inboxes.",
+  "familyId": "hidden-profile-pooling",
+  "benchmarkId": "private-evidence-integration",
+  "kind": "projection",
+  "supportedArms": ["single-agent", "multi-agent-minimal", "full-wave"],
+  "scoring": {
+    "kind": "distributed-info",
+    "primaryMetric": "distributed-info-accuracy",
+    "thresholds": {
+      "distributed-info-accuracy": 100
+    },
+    "practicalWinThreshold": 20
+  },
+  "expectations": {
+    "globalFacts": [
+      "customer-facing outage is isolated to shard blue",
+      "the rollback must preserve migration 20260321_add_backfill_guard"
+    ],
+    "targetedInboxes": {
+      "a1": ["customer-facing outage is isolated to shard blue"],
+      "a2": ["the rollback must preserve migration 20260321_add_backfill_guard"]
+    }
+  },
+  "fixture": {
+    "lane": "main",
+    "waveNumber": 0,
+    "primaryAgentId": "a1",
+    "agents": [
+      {
+        "agentId": "a1",
+        "title": "API Specialist",
+        "ownedPaths": ["src/api/server.ts"],
+        "capabilities": ["api"]
+      },
+      {
+        "agentId": "a2",
+        "title": "Migration Specialist",
+        "ownedPaths": ["db/migrations/20260321_add_backfill_guard.sql"],
+        "capabilities": ["database"]
+      },
+      {
+        "agentId": "a8",
+        "title": "Integration Steward",
+        "ownedPaths": [".tmp/main-wave-launcher/integration/wave-0.md"],
+        "capabilities": ["integration"]
+      }
+    ],
+    "records": [
+      {
+        "id": "req-blue-shard",
+        "kind": "request",
+        "lane": "main",
+        "wave": 0,
+        "agentId": "a8",
+        "targets": ["agent:a1"],
+        "status": "open",
+        "priority": "high",
+        "artifactRefs": ["src/api/server.ts"],
+        "summary": "Need API confirmation: customer-facing outage is isolated to shard blue",
+        "detail": "Integration cannot close until the API owner confirms shard blue is the only failing shard."
+      },
+      {
+        "id": "req-backfill-guard",
+        "kind": "request",
+        "lane": "main",
+        "wave": 0,
+        "agentId": "a8",
+        "targets": ["agent:a2"],
+        "status": "open",
+        "priority": "high",
+        "artifactRefs": ["db/migrations/20260321_add_backfill_guard.sql"],
+        "summary": "Need migration confirmation: the rollback must preserve migration 20260321_add_backfill_guard",
+        "detail": "The database owner holds the rollback-safe migration constraint."
+      }
+    ]
+  }
+}