@chllming/wave-orchestration 0.6.3 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +82 -1
- package/README.md +40 -7
- package/docs/agents/wave-orchestrator-role.md +50 -0
- package/docs/agents/wave-planner-role.md +39 -0
- package/docs/context7/bundles.json +9 -0
- package/docs/context7/planner-agent/README.md +25 -0
- package/docs/context7/planner-agent/manifest.json +83 -0
- package/docs/context7/planner-agent/papers/cooperbench-why-coding-agents-cannot-be-your-teammates-yet.md +3283 -0
- package/docs/context7/planner-agent/papers/dova-deliberation-first-multi-agent-orchestration-for-autonomous-research-automation.md +1699 -0
- package/docs/context7/planner-agent/papers/dpbench-large-language-models-struggle-with-simultaneous-coordination.md +2251 -0
- package/docs/context7/planner-agent/papers/incremental-planning-to-control-a-blackboard-based-problem-solver.md +1729 -0
- package/docs/context7/planner-agent/papers/silo-bench-a-scalable-environment-for-evaluating-distributed-coordination-in-multi-agent-llm-systems.md +3747 -0
- package/docs/context7/planner-agent/papers/todoevolve-learning-to-architect-agent-planning-systems.md +1675 -0
- package/docs/context7/planner-agent/papers/verified-multi-agent-orchestration-a-plan-execute-verify-replan-framework-for-complex-query-resolution.md +1173 -0
- package/docs/context7/planner-agent/papers/why-do-multi-agent-llm-systems-fail.md +5211 -0
- package/docs/context7/planner-agent/topics/planning-and-orchestration.md +24 -0
- package/docs/evals/README.md +96 -1
- package/docs/evals/arm-templates/README.md +13 -0
- package/docs/evals/arm-templates/full-wave.json +15 -0
- package/docs/evals/arm-templates/single-agent.json +15 -0
- package/docs/evals/benchmark-catalog.json +7 -0
- package/docs/evals/cases/README.md +47 -0
- package/docs/evals/cases/wave-blackboard-inbox-targeting.json +73 -0
- package/docs/evals/cases/wave-contradiction-conflict.json +104 -0
- package/docs/evals/cases/wave-expert-routing-preservation.json +69 -0
- package/docs/evals/cases/wave-hidden-profile-private-evidence.json +81 -0
- package/docs/evals/cases/wave-premature-closure-guard.json +71 -0
- package/docs/evals/cases/wave-silo-cross-agent-state.json +77 -0
- package/docs/evals/cases/wave-simultaneous-lockstep.json +92 -0
- package/docs/evals/cooperbench/real-world-mitigation.md +341 -0
- package/docs/evals/external-benchmarks.json +85 -0
- package/docs/evals/external-command-config.sample.json +9 -0
- package/docs/evals/external-command-config.swe-bench-pro.json +8 -0
- package/docs/evals/pilots/README.md +47 -0
- package/docs/evals/pilots/swe-bench-pro-public-full-wave-review-10.json +64 -0
- package/docs/evals/pilots/swe-bench-pro-public-pilot.json +111 -0
- package/docs/evals/wave-benchmark-program.md +302 -0
- package/docs/guides/planner.md +67 -11
- package/docs/guides/terminal-surfaces.md +12 -0
- package/docs/plans/context7-wave-orchestrator.md +20 -0
- package/docs/plans/current-state.md +8 -1
- package/docs/plans/examples/wave-benchmark-improvement.md +108 -0
- package/docs/plans/examples/wave-example-live-proof.md +1 -1
- package/docs/plans/examples/wave-example-rollout-fidelity.md +340 -0
- package/docs/plans/migration.md +26 -0
- package/docs/plans/wave-orchestrator.md +60 -12
- package/docs/plans/waves/reviews/wave-1-benchmark-operator.md +118 -0
- package/docs/reference/cli-reference.md +547 -0
- package/docs/reference/coordination-and-closure.md +436 -0
- package/docs/reference/live-proof-waves.md +25 -3
- package/docs/reference/npmjs-trusted-publishing.md +3 -3
- package/docs/reference/proof-metrics.md +90 -0
- package/docs/reference/runtime-config/README.md +63 -2
- package/docs/reference/runtime-config/codex.md +2 -1
- package/docs/reference/sample-waves.md +29 -18
- package/docs/reference/wave-control.md +164 -0
- package/docs/reference/wave-planning-lessons.md +131 -0
- package/package.json +5 -4
- package/releases/manifest.json +40 -0
- package/scripts/research/agent-context-archive.mjs +18 -0
- package/scripts/research/manifests/agent-context-expanded-2026-03-22.mjs +17 -0
- package/scripts/research/sync-planner-context7-bundle.mjs +133 -0
- package/scripts/wave-orchestrator/agent-state.mjs +11 -2
- package/scripts/wave-orchestrator/artifact-schemas.mjs +232 -0
- package/scripts/wave-orchestrator/autonomous.mjs +7 -0
- package/scripts/wave-orchestrator/benchmark-cases.mjs +374 -0
- package/scripts/wave-orchestrator/benchmark-external.mjs +1384 -0
- package/scripts/wave-orchestrator/benchmark.mjs +972 -0
- package/scripts/wave-orchestrator/clarification-triage.mjs +78 -12
- package/scripts/wave-orchestrator/config.mjs +175 -0
- package/scripts/wave-orchestrator/control-cli.mjs +1216 -0
- package/scripts/wave-orchestrator/control-plane.mjs +697 -0
- package/scripts/wave-orchestrator/coord-cli.mjs +360 -2
- package/scripts/wave-orchestrator/coordination-store.mjs +211 -9
- package/scripts/wave-orchestrator/coordination.mjs +84 -0
- package/scripts/wave-orchestrator/dashboard-renderer.mjs +120 -5
- package/scripts/wave-orchestrator/dashboard-state.mjs +22 -0
- package/scripts/wave-orchestrator/evals.mjs +23 -0
- package/scripts/wave-orchestrator/executors.mjs +3 -2
- package/scripts/wave-orchestrator/feedback.mjs +55 -0
- package/scripts/wave-orchestrator/install.mjs +151 -2
- package/scripts/wave-orchestrator/launcher-closure.mjs +4 -1
- package/scripts/wave-orchestrator/launcher-runtime.mjs +33 -30
- package/scripts/wave-orchestrator/launcher.mjs +884 -36
- package/scripts/wave-orchestrator/planner-context.mjs +75 -0
- package/scripts/wave-orchestrator/planner.mjs +2270 -136
- package/scripts/wave-orchestrator/proof-cli.mjs +195 -0
- package/scripts/wave-orchestrator/proof-registry.mjs +317 -0
- package/scripts/wave-orchestrator/replay.mjs +10 -4
- package/scripts/wave-orchestrator/retry-cli.mjs +184 -0
- package/scripts/wave-orchestrator/retry-control.mjs +225 -0
- package/scripts/wave-orchestrator/shared.mjs +26 -0
- package/scripts/wave-orchestrator/swe-bench-pro-task.mjs +1004 -0
- package/scripts/wave-orchestrator/terminals.mjs +1 -1
- package/scripts/wave-orchestrator/traces.mjs +157 -2
- package/scripts/wave-orchestrator/wave-control-client.mjs +532 -0
- package/scripts/wave-orchestrator/wave-control-schema.mjs +309 -0
- package/scripts/wave-orchestrator/wave-files.mjs +144 -23
- package/scripts/wave.mjs +27 -0
- package/skills/repo-coding-rules/SKILL.md +1 -0
- package/skills/role-cont-eval/SKILL.md +1 -0
- package/skills/role-cont-qa/SKILL.md +13 -6
- package/skills/role-deploy/SKILL.md +1 -0
- package/skills/role-documentation/SKILL.md +4 -0
- package/skills/role-implementation/SKILL.md +4 -0
- package/skills/role-infra/SKILL.md +2 -1
- package/skills/role-integration/SKILL.md +15 -8
- package/skills/role-planner/SKILL.md +39 -0
- package/skills/role-planner/skill.json +21 -0
- package/skills/role-research/SKILL.md +1 -0
- package/skills/role-security/SKILL.md +2 -2
- package/skills/runtime-claude/SKILL.md +2 -1
- package/skills/runtime-codex/SKILL.md +1 -0
- package/skills/runtime-local/SKILL.md +2 -0
- package/skills/runtime-opencode/SKILL.md +1 -0
- package/skills/wave-core/SKILL.md +25 -6
- package/skills/wave-core/references/marker-syntax.md +16 -8
- package/wave.config.json +45 -0
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
---
|
|
2
|
+
summary: 'Curated planning and orchestration corpus exported for the agentic planner Context7 bundle.'
|
|
3
|
+
read_when:
|
|
4
|
+
- You are publishing or refreshing the planner-agentic Context7 library
|
|
5
|
+
- You need the exact planner research subset that Wave ships for agentic planning
|
|
6
|
+
title: 'Planner Agentic Context7 Corpus'
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Planner Agentic Context7 Corpus
|
|
10
|
+
|
|
11
|
+
This file is the tracked topic index for the planner-specific Context7 corpus.
|
|
12
|
+
It intentionally references only the copied files that ship under
|
|
13
|
+
`docs/context7/planner-agent/`.
|
|
14
|
+
|
|
15
|
+
## Included papers
|
|
16
|
+
|
|
17
|
+
- [Verified Multi-Agent Orchestration: A Plan-Execute-Verify-Replan Framework for Complex Query Resolution](../papers/verified-multi-agent-orchestration-a-plan-execute-verify-replan-framework-for-complex-query-resolution.md)
|
|
18
|
+
- [TodoEvolve: Learning to Architect Agent Planning Systems](../papers/todoevolve-learning-to-architect-agent-planning-systems.md)
|
|
19
|
+
- [DOVA: Deliberation-First Multi-Agent Orchestration for Autonomous Research Automation](../papers/dova-deliberation-first-multi-agent-orchestration-for-autonomous-research-automation.md)
|
|
20
|
+
- [Why Do Multi-Agent LLM Systems Fail?](../papers/why-do-multi-agent-llm-systems-fail.md)
|
|
21
|
+
- [Silo-Bench: A Scalable Environment for Evaluating Distributed Coordination in Multi-Agent LLM Systems](../papers/silo-bench-a-scalable-environment-for-evaluating-distributed-coordination-in-multi-agent-llm-systems.md)
|
|
22
|
+
- [DPBench: Large Language Models Struggle with Simultaneous Coordination](../papers/dpbench-large-language-models-struggle-with-simultaneous-coordination.md)
|
|
23
|
+
- [CooperBench: Why Coding Agents Cannot be Your Teammates Yet](../papers/cooperbench-why-coding-agents-cannot-be-your-teammates-yet.md)
|
|
24
|
+
- [Incremental Planning to Control a Blackboard-Based Problem Solver](../papers/incremental-planning-to-control-a-blackboard-based-problem-solver.md)
|
package/docs/evals/README.md
CHANGED
|
@@ -7,13 +7,21 @@ summary: "How to use delegated benchmark families, pinned benchmarks, and coordi
|
|
|
7
7
|
|
|
8
8
|
Wave's benchmark catalog lives in `docs/evals/benchmark-catalog.json`.
|
|
9
9
|
|
|
10
|
+
The executable local case corpus lives in `docs/evals/cases/`, and the benchmark runner is available through `wave benchmark`.
|
|
11
|
+
Frozen external pilot manifests live in `docs/evals/pilots/`, and external comparison arm templates live in `docs/evals/arm-templates/`.
|
|
12
|
+
An example command-template config shape lives in `docs/evals/external-command-config.sample.json`.
|
|
13
|
+
A runnable SWE-bench Pro config for the local task harness lives in `docs/evals/external-command-config.swe-bench-pro.json`.
|
|
14
|
+
|
|
10
15
|
The catalog has three jobs:
|
|
11
16
|
|
|
12
17
|
- give `cont-EVAL` a repo-governed menu of allowed benchmark families and benchmark ids
|
|
13
18
|
- document what each benchmark is trying to catch, including coordination failure modes and static paper baselines
|
|
19
|
+
- optionally point from benchmark ids to repo-local deterministic benchmark cases through `localCases`
|
|
14
20
|
|
|
15
21
|
The catalog is reference metadata, not a run-history database. It tells the wave author and `cont-EVAL` what kinds of checks are allowed and what external benchmark or paper baseline those checks map to.
|
|
16
22
|
|
|
23
|
+
The local case corpus is the executable side of that metadata. It gives the repo a deterministic way to score the current Wave substrate on summary fidelity, targeted inbox recall, capability routing, contradiction handling, and closure guards before moving on to costlier live suites.
|
|
24
|
+
|
|
17
25
|
For a full authored wave example that uses these patterns, see [docs/reference/sample-waves.md](../reference/sample-waves.md).
|
|
18
26
|
|
|
19
27
|
These benchmark families are also Wave's operator-facing vocabulary for common MAS failure modes. For the research-side framing and the current architectural gaps, see [docs/research/coordination-failure-review.md](../research/coordination-failure-review.md).
|
|
@@ -84,6 +92,93 @@ The coordination-oriented families currently included in the catalog are:
|
|
|
84
92
|
- `contradiction-recovery`
|
|
85
93
|
Use when the risk is false consensus, unresolved conflicting claims, or clarification chains that appear resolved without real repair.
|
|
86
94
|
|
|
95
|
+
## Local Case Corpus
|
|
96
|
+
|
|
97
|
+
The repo now ships deterministic local benchmark cases under `docs/evals/cases/`.
|
|
98
|
+
|
|
99
|
+
Each case:
|
|
100
|
+
|
|
101
|
+
- binds to one benchmark family and benchmark id
|
|
102
|
+
- defines a coordination fixture plus expected facts, inboxes, assignments, or closure guards
|
|
103
|
+
- is executable through `wave benchmark run`
|
|
104
|
+
|
|
105
|
+
Useful commands:
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
pnpm exec wave benchmark list
|
|
109
|
+
pnpm exec wave benchmark show --case wave-hidden-profile-private-evidence --json
|
|
110
|
+
pnpm exec wave benchmark run --json
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
The default output path is `.tmp/wave-benchmarks/latest/`.
|
|
114
|
+
|
|
115
|
+
These case runs are local benchmark artifacts, not committed run history.
|
|
116
|
+
|
|
117
|
+
Native mode is deterministic on purpose. `wave benchmark run` is meant to prove the coordination substrate before we move to live external suites. Its logged outputs are:
|
|
118
|
+
|
|
119
|
+
- per-case, per-arm `score`, `alignedScore`, `passed`, `direction`, `threshold`, `metrics`, `details`, and generated artifacts
|
|
120
|
+
- family summaries with direction-aligned mean score and pass rate
|
|
121
|
+
- arm comparisons with direction-aligned mean delta versus `single-agent` and bootstrap confidence intervals
|
|
122
|
+
|
|
123
|
+
When `waveControl` reporting is enabled, native runs publish `benchmark_run` and `benchmark_item` events through the same telemetry spine as live waves. For the full native-mode contract and the rationale for each metric, see [wave-benchmark-program.md](./wave-benchmark-program.md) and [proof-metrics.md](../reference/proof-metrics.md).
|
|
124
|
+
|
|
125
|
+
## External Benchmark Workflow
|
|
126
|
+
|
|
127
|
+
The current direct external benchmark path starts with `SWE-bench Pro`.
|
|
128
|
+
|
|
129
|
+
Why:
|
|
130
|
+
|
|
131
|
+
- it keeps the first direct benchmark grounded in real repository bug-fix work
|
|
132
|
+
- it has a public harness and official verifier path
|
|
133
|
+
- it lets Wave compare `single-agent` and `full-wave` arms under matched settings
|
|
134
|
+
|
|
135
|
+
The second direct benchmark slot is intentionally deferred until a later CooperBench-oriented pass.
|
|
136
|
+
|
|
137
|
+
The frozen direct pilot is:
|
|
138
|
+
|
|
139
|
+
- `docs/evals/pilots/swe-bench-pro-public-pilot.json`
|
|
140
|
+
|
|
141
|
+
There is also a review-only diagnostic subset:
|
|
142
|
+
|
|
143
|
+
- `docs/evals/pilots/swe-bench-pro-public-full-wave-review-10.json`
|
|
144
|
+
|
|
145
|
+
Useful commands:
|
|
146
|
+
|
|
147
|
+
```bash
|
|
148
|
+
pnpm exec wave benchmark external-list
|
|
149
|
+
pnpm exec wave benchmark external-show --adapter swe-bench-pro --json
|
|
150
|
+
pnpm exec wave benchmark external-pilots --json
|
|
151
|
+
pnpm exec wave benchmark external-run --adapter swe-bench-pro --command-config docs/evals/external-command-config.swe-bench-pro.json --dry-run --json
|
|
152
|
+
pnpm exec wave benchmark external-run --adapter swe-bench-pro --manifest docs/evals/pilots/swe-bench-pro-public-full-wave-review-10.json --arm full-wave --command-config docs/evals/external-command-config.swe-bench-pro.json --json
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
For the first honest comparison:
|
|
156
|
+
|
|
157
|
+
- compare only `single-agent` and `full-wave`
|
|
158
|
+
- do not change model, executor, or budget assumptions between those two arms
|
|
159
|
+
- treat review-only subsets as diagnostic material, not as canonical pairwise comparison evidence
|
|
160
|
+
|
|
161
|
+
Each `wave benchmark external-run` output directory now includes:
|
|
162
|
+
|
|
163
|
+
- `results.json`
|
|
164
|
+
- `results.md`
|
|
165
|
+
- `failure-review.json`
|
|
166
|
+
- `failure-review.md`
|
|
167
|
+
|
|
168
|
+
Start with `failure-review.md` when a review-only batch returns many failures. It splits
|
|
169
|
+
verifier-image issues, setup or harness failures, trustworthy patch failures, and dry-run
|
|
170
|
+
planning-only output so the batch is easier to interpret.
|
|
171
|
+
|
|
172
|
+
When `waveControl` reporting is enabled, benchmark runs also publish through the same telemetry
|
|
173
|
+
spine as live waves:
|
|
174
|
+
|
|
175
|
+
- `benchmark_run` for the batch configuration and attestation hash
|
|
176
|
+
- `benchmark_item` for each task/arm execution
|
|
177
|
+
- `verification` for official harness output and linked verifier artifacts
|
|
178
|
+
- `review` for publishability, validity, and failure classification
|
|
179
|
+
|
|
180
|
+
That keeps benchmark trust evidence queryable alongside the runtime traces that produced it.
|
|
181
|
+
|
|
87
182
|
## How To Choose The Right Family
|
|
88
183
|
|
|
89
184
|
Choose the family based on the failure you are most worried about, not just on the surface area being changed.
|
|
@@ -163,6 +258,6 @@ The benchmark catalog does not yet store:
|
|
|
163
258
|
|
|
164
259
|
- local benchmark run history
|
|
165
260
|
- local-vs-paper delta computation
|
|
166
|
-
-
|
|
261
|
+
- a second direct benchmark beyond the current SWE-bench Pro path
|
|
167
262
|
|
|
168
263
|
For now it is the schema and policy layer that keeps eval authoring, `cont-EVAL`, and coordination benchmarking aligned.
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: "External Benchmark Arm Templates"
|
|
3
|
+
summary: "Frozen orchestration templates for honest external benchmark comparisons."
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# External Benchmark Arm Templates
|
|
7
|
+
|
|
8
|
+
These templates define the only two arm shapes used in the first honest external benchmark runs:
|
|
9
|
+
|
|
10
|
+
- `single-agent`
|
|
11
|
+
- `full-wave`
|
|
12
|
+
|
|
13
|
+
They are intentionally narrow so external benchmarks compare orchestration shape rather than silently changing model, executor, or budget assumptions.
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
{
|
|
2
|
+
"armId": "full-wave",
|
|
3
|
+
"title": "Full Wave Orchestration",
|
|
4
|
+
"roles": ["implementation", "cont-eval", "integration", "documentation", "cont-qa"],
|
|
5
|
+
"includeContEval": true,
|
|
6
|
+
"includeIntegrationSteward": true,
|
|
7
|
+
"includeDocumentationSteward": true,
|
|
8
|
+
"tracesRequired": true,
|
|
9
|
+
"notes": [
|
|
10
|
+
"Includes E0, A8, A9, and A0 in addition to implementation owners.",
|
|
11
|
+
"Compiled summaries and targeted inboxes are part of the arm behavior.",
|
|
12
|
+
"Proof-bounded closure and trace capture are required.",
|
|
13
|
+
"Must use the same model, executor, budget, and benchmark verifier assumptions as the single-agent baseline."
|
|
14
|
+
]
|
|
15
|
+
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
{
|
|
2
|
+
"armId": "single-agent",
|
|
3
|
+
"title": "Single Agent Baseline",
|
|
4
|
+
"roles": ["implementation"],
|
|
5
|
+
"includeContEval": false,
|
|
6
|
+
"includeIntegrationSteward": false,
|
|
7
|
+
"includeDocumentationSteward": false,
|
|
8
|
+
"tracesRequired": false,
|
|
9
|
+
"notes": [
|
|
10
|
+
"One implementation owner only.",
|
|
11
|
+
"No specialist decomposition.",
|
|
12
|
+
"No cont-EVAL, integration steward, or documentation steward.",
|
|
13
|
+
"Used only when model, executor, verifier, and budget are held constant relative to full-wave."
|
|
14
|
+
]
|
|
15
|
+
}
|
|
@@ -120,6 +120,7 @@
|
|
|
120
120
|
"private-evidence-integration": {
|
|
121
121
|
"title": "Private Evidence Integration",
|
|
122
122
|
"summary": "Checks whether separately observed facts are integrated into the final answer rather than merely repeated in conversation.",
|
|
123
|
+
"localCases": ["wave-hidden-profile-private-evidence"],
|
|
123
124
|
"goal": "Measure end-to-end integration of distributed evidence into a coherent outcome.",
|
|
124
125
|
"failureModes": [
|
|
125
126
|
"communication-without-integration",
|
|
@@ -149,6 +150,7 @@
|
|
|
149
150
|
"premature-consensus-guard": {
|
|
150
151
|
"title": "Premature Consensus Guard",
|
|
151
152
|
"summary": "Checks whether the system delays closure when important evidence is still siloed.",
|
|
153
|
+
"localCases": ["wave-premature-closure-guard"],
|
|
152
154
|
"goal": "Measure resistance to converging early on shared but incomplete evidence.",
|
|
153
155
|
"failureModes": [
|
|
154
156
|
"premature-consensus",
|
|
@@ -205,6 +207,7 @@
|
|
|
205
207
|
"cross-agent-state-reconstruction": {
|
|
206
208
|
"title": "Cross-Agent State Reconstruction",
|
|
207
209
|
"summary": "Checks whether the final shared state reflects facts that no single agent started with alone.",
|
|
210
|
+
"localCases": ["wave-silo-cross-agent-state"],
|
|
208
211
|
"goal": "Measure whether the blackboard can reconstruct a correct global state from distributed local views.",
|
|
209
212
|
"failureModes": [
|
|
210
213
|
"information-silo",
|
|
@@ -363,6 +366,7 @@
|
|
|
363
366
|
"lockstep-resolution": {
|
|
364
367
|
"title": "Lockstep Resolution",
|
|
365
368
|
"summary": "Checks whether the framework resolves many-way concurrent dependencies without circular waiting.",
|
|
369
|
+
"localCases": ["wave-simultaneous-lockstep"],
|
|
366
370
|
"goal": "Measure coordination quality when several blocking tickets must resolve together.",
|
|
367
371
|
"failureModes": [
|
|
368
372
|
"circular-wait",
|
|
@@ -429,6 +433,7 @@
|
|
|
429
433
|
"expert-routing-preservation": {
|
|
430
434
|
"title": "Expert Routing Preservation",
|
|
431
435
|
"summary": "Checks whether capability-targeted work is routed to the best available owner and stays there through closure.",
|
|
436
|
+
"localCases": ["wave-expert-routing-preservation"],
|
|
432
437
|
"goal": "Measure whether the harness protects expert ownership instead of diluting it.",
|
|
433
438
|
"failureModes": [
|
|
434
439
|
"expert-underuse",
|
|
@@ -541,6 +546,7 @@
|
|
|
541
546
|
"inbox-targeting-fidelity": {
|
|
542
547
|
"title": "Inbox Targeting Fidelity",
|
|
543
548
|
"summary": "Checks whether relevant facts reach the agents that own the impacted paths, components, or requests.",
|
|
549
|
+
"localCases": ["wave-blackboard-inbox-targeting"],
|
|
544
550
|
"goal": "Measure whether inbox targeting reduces silos instead of creating them.",
|
|
545
551
|
"failureModes": [
|
|
546
552
|
"mis-targeted-context",
|
|
@@ -606,6 +612,7 @@
|
|
|
606
612
|
"claim-conflict-detection": {
|
|
607
613
|
"title": "Claim Conflict Detection",
|
|
608
614
|
"summary": "Checks whether incompatible claims are surfaced in coordination or integration instead of passing through silently.",
|
|
615
|
+
"localCases": ["wave-contradiction-conflict"],
|
|
609
616
|
"goal": "Measure whether the framework sees contradictory evidence before final closure.",
|
|
610
617
|
"failureModes": [
|
|
611
618
|
"false-consensus",
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: "Wave Benchmark Cases"
|
|
3
|
+
summary: "Deterministic local benchmark cases for Wave-native coordination, routing, and closure evaluation."
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Wave Benchmark Cases
|
|
7
|
+
|
|
8
|
+
Each JSON file in this directory defines one deterministic benchmark case consumed by `wave benchmark`.
|
|
9
|
+
|
|
10
|
+
## Why These Cases Exist
|
|
11
|
+
|
|
12
|
+
The benchmark catalog describes *what* a benchmark is meant to measure. These case files provide the local executable fixtures that let the repo score those ideas consistently.
|
|
13
|
+
|
|
14
|
+
They are designed to be:
|
|
15
|
+
|
|
16
|
+
- cheap
|
|
17
|
+
- deterministic
|
|
18
|
+
- transparent
|
|
19
|
+
- rooted in current Wave surfaces such as summaries, inboxes, request routing, and closure guards
|
|
20
|
+
|
|
21
|
+
## File Shape
|
|
22
|
+
|
|
23
|
+
Each case file is a single JSON object with:
|
|
24
|
+
|
|
25
|
+
- `id`
|
|
26
|
+
- `familyId`
|
|
27
|
+
- `benchmarkId`
|
|
28
|
+
- `supportedArms`
|
|
29
|
+
- `fixture`
|
|
30
|
+
- `expectations`
|
|
31
|
+
- `scoring`
|
|
32
|
+
|
|
33
|
+
## Current Arms
|
|
34
|
+
|
|
35
|
+
The runner currently compares:
|
|
36
|
+
|
|
37
|
+
- `single-agent`
|
|
38
|
+
- `multi-agent-minimal`
|
|
39
|
+
- `full-wave`
|
|
40
|
+
|
|
41
|
+
The `full-wave-plus-improvement` arm is supported by the loader for later benchmark-improvement loops but is not part of the initial deterministic corpus.
|
|
42
|
+
|
|
43
|
+
## Current Limitation
|
|
44
|
+
|
|
45
|
+
The initial corpus is projection-backed rather than live-run-backed. It evaluates how well the current Wave substrate compiles and routes coordination state before we spend runtime budget on larger live suites.
|
|
46
|
+
|
|
47
|
+
That is intentional for the first milestone. The next layer will add trace-backed and external benchmark adapters on top of this format.
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
{
|
|
2
|
+
"version": 1,
|
|
3
|
+
"id": "wave-blackboard-inbox-targeting",
|
|
4
|
+
"title": "Inbox Targeting Fidelity",
|
|
5
|
+
"summary": "Critical doc and runtime facts should survive projection into the right owner inboxes instead of staying buried in raw coordination.",
|
|
6
|
+
"familyId": "blackboard-fidelity",
|
|
7
|
+
"benchmarkId": "inbox-targeting-fidelity",
|
|
8
|
+
"kind": "projection",
|
|
9
|
+
"supportedArms": ["single-agent", "multi-agent-minimal", "full-wave"],
|
|
10
|
+
"scoring": {
|
|
11
|
+
"kind": "projection-fidelity",
|
|
12
|
+
"primaryMetric": "targeted-inbox-recall",
|
|
13
|
+
"thresholds": {
|
|
14
|
+
"targeted-inbox-recall": 100,
|
|
15
|
+
"projection-consistency-rate": 100
|
|
16
|
+
},
|
|
17
|
+
"practicalWinThreshold": 20
|
|
18
|
+
},
|
|
19
|
+
"expectations": {
|
|
20
|
+
"summaryFacts": ["docs must note the new queue retry ceiling"],
|
|
21
|
+
"targetedInboxes": {
|
|
22
|
+
"a9": ["docs must note the new queue retry ceiling"],
|
|
23
|
+
"a1": ["runtime must enforce retry ceiling 3 before enqueue"]
|
|
24
|
+
}
|
|
25
|
+
},
|
|
26
|
+
"fixture": {
|
|
27
|
+
"lane": "main",
|
|
28
|
+
"waveNumber": 0,
|
|
29
|
+
"primaryAgentId": "a1",
|
|
30
|
+
"agents": [
|
|
31
|
+
{
|
|
32
|
+
"agentId": "a1",
|
|
33
|
+
"title": "Runtime Owner",
|
|
34
|
+
"ownedPaths": ["src/queue/retries.ts"],
|
|
35
|
+
"capabilities": ["runtime"]
|
|
36
|
+
},
|
|
37
|
+
{
|
|
38
|
+
"agentId": "a9",
|
|
39
|
+
"title": "Documentation Steward",
|
|
40
|
+
"ownedPaths": ["docs/plans/current-state.md"],
|
|
41
|
+
"capabilities": ["documentation"]
|
|
42
|
+
}
|
|
43
|
+
],
|
|
44
|
+
"records": [
|
|
45
|
+
{
|
|
46
|
+
"id": "req-runtime-retry-cap",
|
|
47
|
+
"kind": "request",
|
|
48
|
+
"lane": "main",
|
|
49
|
+
"wave": 0,
|
|
50
|
+
"agentId": "a8",
|
|
51
|
+
"targets": ["agent:a1"],
|
|
52
|
+
"status": "open",
|
|
53
|
+
"priority": "high",
|
|
54
|
+
"artifactRefs": ["src/queue/retries.ts"],
|
|
55
|
+
"summary": "runtime must enforce retry ceiling 3 before enqueue",
|
|
56
|
+
"detail": "This runtime-facing fact must stay visible to the implementation owner."
|
|
57
|
+
},
|
|
58
|
+
{
|
|
59
|
+
"id": "req-doc-retry-cap",
|
|
60
|
+
"kind": "blocker",
|
|
61
|
+
"lane": "main",
|
|
62
|
+
"wave": 0,
|
|
63
|
+
"agentId": "a8",
|
|
64
|
+
"targets": ["agent:a9"],
|
|
65
|
+
"status": "open",
|
|
66
|
+
"priority": "normal",
|
|
67
|
+
"artifactRefs": ["docs/plans/current-state.md"],
|
|
68
|
+
"summary": "docs must note the new queue retry ceiling",
|
|
69
|
+
"detail": "The documentation steward needs the same coordination fact in a docs-owned surface."
|
|
70
|
+
}
|
|
71
|
+
]
|
|
72
|
+
}
|
|
73
|
+
}
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
{
|
|
2
|
+
"version": 1,
|
|
3
|
+
"id": "wave-contradiction-conflict",
|
|
4
|
+
"title": "Claim Conflict Detection",
|
|
5
|
+
"summary": "Conflicting claims should be surfaced to the integration steward and converted into explicit repair work.",
|
|
6
|
+
"familyId": "contradiction-recovery",
|
|
7
|
+
"benchmarkId": "claim-conflict-detection",
|
|
8
|
+
"kind": "projection",
|
|
9
|
+
"supportedArms": ["single-agent", "multi-agent-minimal", "full-wave"],
|
|
10
|
+
"scoring": {
|
|
11
|
+
"kind": "contradiction-recovery",
|
|
12
|
+
"primaryMetric": "contradiction-detection-rate",
|
|
13
|
+
"thresholds": {
|
|
14
|
+
"contradiction-detection-rate": 100,
|
|
15
|
+
"repair-closure-rate": 100
|
|
16
|
+
},
|
|
17
|
+
"practicalWinThreshold": 25
|
|
18
|
+
},
|
|
19
|
+
"expectations": {
|
|
20
|
+
"targetedInboxes": {
|
|
21
|
+
"a8": [
|
|
22
|
+
"claim one: config flag enable_fast_path is safe in prod",
|
|
23
|
+
"claim two: config flag enable_fast_path leaks stale auth data"
|
|
24
|
+
]
|
|
25
|
+
},
|
|
26
|
+
"requiredAssignments": [
|
|
27
|
+
{
|
|
28
|
+
"requestId": "repair-fast-path",
|
|
29
|
+
"assignedAgentId": "a8"
|
|
30
|
+
}
|
|
31
|
+
]
|
|
32
|
+
},
|
|
33
|
+
"fixture": {
|
|
34
|
+
"lane": "main",
|
|
35
|
+
"waveNumber": 0,
|
|
36
|
+
"primaryAgentId": "a1",
|
|
37
|
+
"capabilityRouting": {
|
|
38
|
+
"preferredAgents": {
|
|
39
|
+
"integration": ["a8"]
|
|
40
|
+
}
|
|
41
|
+
},
|
|
42
|
+
"agents": [
|
|
43
|
+
{
|
|
44
|
+
"agentId": "a1",
|
|
45
|
+
"title": "Runtime Owner",
|
|
46
|
+
"ownedPaths": ["src/auth/fast-path.ts"],
|
|
47
|
+
"capabilities": ["runtime"]
|
|
48
|
+
},
|
|
49
|
+
{
|
|
50
|
+
"agentId": "a2",
|
|
51
|
+
"title": "Security Reviewer",
|
|
52
|
+
"ownedPaths": ["src/auth/session.ts"],
|
|
53
|
+
"capabilities": ["security"]
|
|
54
|
+
},
|
|
55
|
+
{
|
|
56
|
+
"agentId": "a8",
|
|
57
|
+
"title": "Integration Steward",
|
|
58
|
+
"ownedPaths": [".tmp/main-wave-launcher/integration/wave-0.md"],
|
|
59
|
+
"capabilities": ["integration"]
|
|
60
|
+
}
|
|
61
|
+
],
|
|
62
|
+
"records": [
|
|
63
|
+
{
|
|
64
|
+
"id": "claim-fast-safe",
|
|
65
|
+
"kind": "claim",
|
|
66
|
+
"lane": "main",
|
|
67
|
+
"wave": 0,
|
|
68
|
+
"agentId": "a1",
|
|
69
|
+
"targets": ["agent:a8"],
|
|
70
|
+
"status": "open",
|
|
71
|
+
"priority": "high",
|
|
72
|
+
"artifactRefs": ["src/auth/fast-path.ts"],
|
|
73
|
+
"summary": "claim one: config flag enable_fast_path is safe in prod",
|
|
74
|
+
"detail": "The runtime owner believes the fast path is safe."
|
|
75
|
+
},
|
|
76
|
+
{
|
|
77
|
+
"id": "claim-fast-leak",
|
|
78
|
+
"kind": "claim",
|
|
79
|
+
"lane": "main",
|
|
80
|
+
"wave": 0,
|
|
81
|
+
"agentId": "a2",
|
|
82
|
+
"targets": ["agent:a8"],
|
|
83
|
+
"status": "open",
|
|
84
|
+
"priority": "high",
|
|
85
|
+
"artifactRefs": ["src/auth/session.ts", "src/auth/fast-path.ts"],
|
|
86
|
+
"summary": "claim two: config flag enable_fast_path leaks stale auth data",
|
|
87
|
+
"detail": "The security reviewer believes the same flag is unsafe."
|
|
88
|
+
},
|
|
89
|
+
{
|
|
90
|
+
"id": "repair-fast-path",
|
|
91
|
+
"kind": "request",
|
|
92
|
+
"lane": "main",
|
|
93
|
+
"wave": 0,
|
|
94
|
+
"agentId": "a8",
|
|
95
|
+
"targets": ["capability:integration"],
|
|
96
|
+
"status": "open",
|
|
97
|
+
"priority": "high",
|
|
98
|
+
"artifactRefs": ["src/auth/fast-path.ts"],
|
|
99
|
+
"summary": "Turn the contradiction into explicit repair work before closure",
|
|
100
|
+
"detail": "Integration must own the conflict repair loop."
|
|
101
|
+
}
|
|
102
|
+
]
|
|
103
|
+
}
|
|
104
|
+
}
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
{
|
|
2
|
+
"version": 1,
|
|
3
|
+
"id": "wave-expert-routing-preservation",
|
|
4
|
+
"title": "Expert Routing Preservation",
|
|
5
|
+
"summary": "A capability-targeted request should route to the database specialist rather than diffuse into generic ownership.",
|
|
6
|
+
"familyId": "expertise-leverage",
|
|
7
|
+
"benchmarkId": "expert-routing-preservation",
|
|
8
|
+
"kind": "projection",
|
|
9
|
+
"supportedArms": ["single-agent", "multi-agent-minimal", "full-wave"],
|
|
10
|
+
"scoring": {
|
|
11
|
+
"kind": "expertise-routing",
|
|
12
|
+
"primaryMetric": "capability-routing-precision",
|
|
13
|
+
"thresholds": {
|
|
14
|
+
"capability-routing-precision": 100,
|
|
15
|
+
"expert-preservation-rate": 100
|
|
16
|
+
},
|
|
17
|
+
"practicalWinThreshold": 25
|
|
18
|
+
},
|
|
19
|
+
"expectations": {
|
|
20
|
+
"requiredAssignments": [
|
|
21
|
+
{
|
|
22
|
+
"requestId": "req-covering-index",
|
|
23
|
+
"assignedAgentId": "a2"
|
|
24
|
+
}
|
|
25
|
+
],
|
|
26
|
+
"targetedInboxes": {
|
|
27
|
+
"a2": ["database expert says add a covering index on tenant_id and updated_at"]
|
|
28
|
+
}
|
|
29
|
+
},
|
|
30
|
+
"fixture": {
|
|
31
|
+
"lane": "main",
|
|
32
|
+
"waveNumber": 0,
|
|
33
|
+
"primaryAgentId": "a1",
|
|
34
|
+
"capabilityRouting": {
|
|
35
|
+
"preferredAgents": {
|
|
36
|
+
"database": ["a2"]
|
|
37
|
+
}
|
|
38
|
+
},
|
|
39
|
+
"agents": [
|
|
40
|
+
{
|
|
41
|
+
"agentId": "a1",
|
|
42
|
+
"title": "Generalist Owner",
|
|
43
|
+
"ownedPaths": ["src/reporting/service.ts"],
|
|
44
|
+
"capabilities": ["runtime"]
|
|
45
|
+
},
|
|
46
|
+
{
|
|
47
|
+
"agentId": "a2",
|
|
48
|
+
"title": "Database Specialist",
|
|
49
|
+
"ownedPaths": ["db/indexes/reporting.sql"],
|
|
50
|
+
"capabilities": ["database"]
|
|
51
|
+
}
|
|
52
|
+
],
|
|
53
|
+
"records": [
|
|
54
|
+
{
|
|
55
|
+
"id": "req-covering-index",
|
|
56
|
+
"kind": "request",
|
|
57
|
+
"lane": "main",
|
|
58
|
+
"wave": 0,
|
|
59
|
+
"agentId": "a8",
|
|
60
|
+
"targets": ["capability:database"],
|
|
61
|
+
"status": "open",
|
|
62
|
+
"priority": "high",
|
|
63
|
+
"artifactRefs": ["db/indexes/reporting.sql"],
|
|
64
|
+
"summary": "database expert says add a covering index on tenant_id and updated_at",
|
|
65
|
+
"detail": "The database specialist should own this request rather than having it averaged away."
|
|
66
|
+
}
|
|
67
|
+
]
|
|
68
|
+
}
|
|
69
|
+
}
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
{
|
|
2
|
+
"version": 1,
|
|
3
|
+
"id": "wave-hidden-profile-private-evidence",
|
|
4
|
+
"title": "Private Evidence Integration",
|
|
5
|
+
"summary": "Critical facts are split across two specialists and only the full Wave arm should surface both through targeted inboxes.",
|
|
6
|
+
"familyId": "hidden-profile-pooling",
|
|
7
|
+
"benchmarkId": "private-evidence-integration",
|
|
8
|
+
"kind": "projection",
|
|
9
|
+
"supportedArms": ["single-agent", "multi-agent-minimal", "full-wave"],
|
|
10
|
+
"scoring": {
|
|
11
|
+
"kind": "distributed-info",
|
|
12
|
+
"primaryMetric": "distributed-info-accuracy",
|
|
13
|
+
"thresholds": {
|
|
14
|
+
"distributed-info-accuracy": 100
|
|
15
|
+
},
|
|
16
|
+
"practicalWinThreshold": 20
|
|
17
|
+
},
|
|
18
|
+
"expectations": {
|
|
19
|
+
"globalFacts": [
|
|
20
|
+
"customer-facing outage is isolated to shard blue",
|
|
21
|
+
"the rollback must preserve migration 20260321_add_backfill_guard"
|
|
22
|
+
],
|
|
23
|
+
"targetedInboxes": {
|
|
24
|
+
"a1": ["customer-facing outage is isolated to shard blue"],
|
|
25
|
+
"a2": ["the rollback must preserve migration 20260321_add_backfill_guard"]
|
|
26
|
+
}
|
|
27
|
+
},
|
|
28
|
+
"fixture": {
|
|
29
|
+
"lane": "main",
|
|
30
|
+
"waveNumber": 0,
|
|
31
|
+
"primaryAgentId": "a1",
|
|
32
|
+
"agents": [
|
|
33
|
+
{
|
|
34
|
+
"agentId": "a1",
|
|
35
|
+
"title": "API Specialist",
|
|
36
|
+
"ownedPaths": ["src/api/server.ts"],
|
|
37
|
+
"capabilities": ["api"]
|
|
38
|
+
},
|
|
39
|
+
{
|
|
40
|
+
"agentId": "a2",
|
|
41
|
+
"title": "Migration Specialist",
|
|
42
|
+
"ownedPaths": ["db/migrations/20260321_add_backfill_guard.sql"],
|
|
43
|
+
"capabilities": ["database"]
|
|
44
|
+
},
|
|
45
|
+
{
|
|
46
|
+
"agentId": "a8",
|
|
47
|
+
"title": "Integration Steward",
|
|
48
|
+
"ownedPaths": [".tmp/main-wave-launcher/integration/wave-0.md"],
|
|
49
|
+
"capabilities": ["integration"]
|
|
50
|
+
}
|
|
51
|
+
],
|
|
52
|
+
"records": [
|
|
53
|
+
{
|
|
54
|
+
"id": "req-blue-shard",
|
|
55
|
+
"kind": "request",
|
|
56
|
+
"lane": "main",
|
|
57
|
+
"wave": 0,
|
|
58
|
+
"agentId": "a8",
|
|
59
|
+
"targets": ["agent:a1"],
|
|
60
|
+
"status": "open",
|
|
61
|
+
"priority": "high",
|
|
62
|
+
"artifactRefs": ["src/api/server.ts"],
|
|
63
|
+
"summary": "Need API confirmation: customer-facing outage is isolated to shard blue",
|
|
64
|
+
"detail": "Integration cannot close until the API owner confirms shard blue is the only failing shard."
|
|
65
|
+
},
|
|
66
|
+
{
|
|
67
|
+
"id": "req-backfill-guard",
|
|
68
|
+
"kind": "request",
|
|
69
|
+
"lane": "main",
|
|
70
|
+
"wave": 0,
|
|
71
|
+
"agentId": "a8",
|
|
72
|
+
"targets": ["agent:a2"],
|
|
73
|
+
"status": "open",
|
|
74
|
+
"priority": "high",
|
|
75
|
+
"artifactRefs": ["db/migrations/20260321_add_backfill_guard.sql"],
|
|
76
|
+
"summary": "Need migration confirmation: the rollback must preserve migration 20260321_add_backfill_guard",
|
|
77
|
+
"detail": "The database owner holds the rollback-safe migration constraint."
|
|
78
|
+
}
|
|
79
|
+
]
|
|
80
|
+
}
|
|
81
|
+
}
|