@chllming/wave-orchestration 0.5.4 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126)
  1. package/CHANGELOG.md +52 -3
  2. package/README.md +33 -5
  3. package/docs/README.md +18 -4
  4. package/docs/agents/wave-cont-eval-role.md +36 -0
  5. package/docs/agents/{wave-evaluator-role.md → wave-cont-qa-role.md} +14 -11
  6. package/docs/agents/wave-documentation-role.md +1 -1
  7. package/docs/agents/wave-infra-role.md +1 -1
  8. package/docs/agents/wave-integration-role.md +3 -3
  9. package/docs/agents/wave-launcher-role.md +4 -3
  10. package/docs/agents/wave-security-role.md +40 -0
  11. package/docs/concepts/context7-vs-skills.md +1 -1
  12. package/docs/concepts/what-is-a-wave.md +56 -6
  13. package/docs/evals/README.md +166 -0
  14. package/docs/evals/benchmark-catalog.json +663 -0
  15. package/docs/guides/author-and-run-waves.md +135 -0
  16. package/docs/guides/planner.md +5 -0
  17. package/docs/guides/terminal-surfaces.md +2 -0
  18. package/docs/plans/component-cutover-matrix.json +1 -1
  19. package/docs/plans/component-cutover-matrix.md +1 -1
  20. package/docs/plans/current-state.md +19 -1
  21. package/docs/plans/examples/wave-example-live-proof.md +435 -0
  22. package/docs/plans/migration.md +42 -0
  23. package/docs/plans/wave-orchestrator.md +46 -7
  24. package/docs/plans/waves/wave-0.md +4 -4
  25. package/docs/reference/live-proof-waves.md +177 -0
  26. package/docs/reference/migration-0.2-to-0.5.md +26 -19
  27. package/docs/reference/npmjs-trusted-publishing.md +6 -5
  28. package/docs/reference/runtime-config/README.md +14 -4
  29. package/docs/reference/sample-waves.md +87 -0
  30. package/docs/reference/skills.md +110 -42
  31. package/docs/research/agent-context-sources.md +130 -11
  32. package/docs/research/coordination-failure-review.md +266 -0
  33. package/docs/roadmap.md +6 -2
  34. package/package.json +2 -2
  35. package/releases/manifest.json +35 -2
  36. package/scripts/research/agent-context-archive.mjs +83 -1
  37. package/scripts/research/manifests/agent-context-expanded-2026-03-22.mjs +811 -0
  38. package/scripts/wave-orchestrator/adhoc.mjs +1331 -0
  39. package/scripts/wave-orchestrator/agent-state.mjs +358 -6
  40. package/scripts/wave-orchestrator/artifact-schemas.mjs +173 -0
  41. package/scripts/wave-orchestrator/clarification-triage.mjs +10 -3
  42. package/scripts/wave-orchestrator/config.mjs +48 -12
  43. package/scripts/wave-orchestrator/context7.mjs +2 -0
  44. package/scripts/wave-orchestrator/coord-cli.mjs +51 -19
  45. package/scripts/wave-orchestrator/coordination-store.mjs +26 -4
  46. package/scripts/wave-orchestrator/coordination.mjs +83 -9
  47. package/scripts/wave-orchestrator/dashboard-state.mjs +20 -8
  48. package/scripts/wave-orchestrator/dep-cli.mjs +5 -2
  49. package/scripts/wave-orchestrator/docs-queue.mjs +8 -2
  50. package/scripts/wave-orchestrator/evals.mjs +451 -0
  51. package/scripts/wave-orchestrator/feedback.mjs +15 -1
  52. package/scripts/wave-orchestrator/install.mjs +32 -9
  53. package/scripts/wave-orchestrator/launcher-closure.mjs +281 -0
  54. package/scripts/wave-orchestrator/launcher-runtime.mjs +334 -0
  55. package/scripts/wave-orchestrator/launcher.mjs +709 -601
  56. package/scripts/wave-orchestrator/ledger.mjs +123 -20
  57. package/scripts/wave-orchestrator/local-executor.mjs +99 -12
  58. package/scripts/wave-orchestrator/planner.mjs +177 -42
  59. package/scripts/wave-orchestrator/replay.mjs +6 -3
  60. package/scripts/wave-orchestrator/role-helpers.mjs +84 -0
  61. package/scripts/wave-orchestrator/shared.mjs +75 -11
  62. package/scripts/wave-orchestrator/skills.mjs +637 -106
  63. package/scripts/wave-orchestrator/traces.mjs +71 -48
  64. package/scripts/wave-orchestrator/wave-files.mjs +947 -101
  65. package/scripts/wave.mjs +9 -0
  66. package/skills/README.md +202 -0
  67. package/skills/provider-aws/SKILL.md +111 -0
  68. package/skills/provider-aws/adapters/claude.md +1 -0
  69. package/skills/provider-aws/adapters/codex.md +1 -0
  70. package/skills/provider-aws/references/service-verification.md +39 -0
  71. package/skills/provider-aws/skill.json +50 -1
  72. package/skills/provider-custom-deploy/SKILL.md +59 -0
  73. package/skills/provider-custom-deploy/skill.json +46 -1
  74. package/skills/provider-docker-compose/SKILL.md +90 -0
  75. package/skills/provider-docker-compose/adapters/local.md +1 -0
  76. package/skills/provider-docker-compose/skill.json +49 -1
  77. package/skills/provider-github-release/SKILL.md +116 -1
  78. package/skills/provider-github-release/adapters/claude.md +1 -0
  79. package/skills/provider-github-release/adapters/codex.md +1 -0
  80. package/skills/provider-github-release/skill.json +51 -1
  81. package/skills/provider-kubernetes/SKILL.md +137 -0
  82. package/skills/provider-kubernetes/adapters/claude.md +1 -0
  83. package/skills/provider-kubernetes/adapters/codex.md +1 -0
  84. package/skills/provider-kubernetes/references/kubectl-patterns.md +58 -0
  85. package/skills/provider-kubernetes/skill.json +48 -1
  86. package/skills/provider-railway/SKILL.md +118 -1
  87. package/skills/provider-railway/references/verification-commands.md +39 -0
  88. package/skills/provider-railway/skill.json +67 -1
  89. package/skills/provider-ssh-manual/SKILL.md +91 -0
  90. package/skills/provider-ssh-manual/skill.json +50 -1
  91. package/skills/repo-coding-rules/SKILL.md +84 -0
  92. package/skills/repo-coding-rules/skill.json +30 -1
  93. package/skills/role-cont-eval/SKILL.md +90 -0
  94. package/skills/role-cont-eval/adapters/codex.md +1 -0
  95. package/skills/role-cont-eval/skill.json +36 -0
  96. package/skills/role-cont-qa/SKILL.md +93 -0
  97. package/skills/role-cont-qa/adapters/claude.md +1 -0
  98. package/skills/role-cont-qa/skill.json +36 -0
  99. package/skills/role-deploy/SKILL.md +90 -0
  100. package/skills/role-deploy/skill.json +32 -1
  101. package/skills/role-documentation/SKILL.md +66 -0
  102. package/skills/role-documentation/skill.json +32 -1
  103. package/skills/role-implementation/SKILL.md +62 -0
  104. package/skills/role-implementation/skill.json +32 -1
  105. package/skills/role-infra/SKILL.md +74 -0
  106. package/skills/role-infra/skill.json +32 -1
  107. package/skills/role-integration/SKILL.md +79 -1
  108. package/skills/role-integration/skill.json +32 -1
  109. package/skills/role-research/SKILL.md +58 -0
  110. package/skills/role-research/skill.json +32 -1
  111. package/skills/role-security/SKILL.md +60 -0
  112. package/skills/role-security/skill.json +36 -0
  113. package/skills/runtime-claude/SKILL.md +60 -1
  114. package/skills/runtime-claude/skill.json +32 -1
  115. package/skills/runtime-codex/SKILL.md +52 -1
  116. package/skills/runtime-codex/skill.json +32 -1
  117. package/skills/runtime-local/SKILL.md +39 -0
  118. package/skills/runtime-local/skill.json +32 -1
  119. package/skills/runtime-opencode/SKILL.md +51 -0
  120. package/skills/runtime-opencode/skill.json +32 -1
  121. package/skills/wave-core/SKILL.md +107 -0
  122. package/skills/wave-core/references/marker-syntax.md +62 -0
  123. package/skills/wave-core/skill.json +31 -1
  124. package/wave.config.json +35 -6
  125. package/skills/role-evaluator/SKILL.md +0 -6
  126. package/skills/role-evaluator/skill.json +0 -5
@@ -1,5 +1,71 @@
1
1
  {
2
2
  "id": "provider-railway",
3
3
  "title": "Railway",
4
- "description": "Railway-specific deploy and environment norms."
4
+ "description": "Guides deploy verification against Railway MCP and CLI state: service health, domain binding, variable proof, failure classification, and rollback posture.",
5
+ "activation": {
6
+ "when": "Attach when the wave deploy surface is Railway and the agent is responsible for provider-aware rollout, infra, integration, or closure checks.",
7
+ "roles": [
8
+ "deploy",
9
+ "infra",
10
+ "integration",
11
+ "cont-qa"
12
+ ],
13
+ "runtimes": [],
14
+ "deployKinds": [
15
+ "railway-cli",
16
+ "railway-mcp"
17
+ ]
18
+ },
19
+ "termination": "Stop when Railway evidence is recorded or the failure and rollback posture is explicit.",
20
+ "permissions": {
21
+ "network": [
22
+ "railway.app"
23
+ ],
24
+ "shell": [
25
+ "railway"
26
+ ],
27
+ "mcpServers": [
28
+ "railway"
29
+ ]
30
+ },
31
+ "trust": {
32
+ "tier": "repo-owned"
33
+ },
34
+ "evalCases": [
35
+ {
36
+ "id": "deploy-railway-cli",
37
+ "role": "deploy",
38
+ "runtime": "opencode",
39
+ "deployKind": "railway-cli",
40
+ "expectActive": true
41
+ },
42
+ {
43
+ "id": "integration-railway-mcp",
44
+ "role": "integration",
45
+ "runtime": "claude",
46
+ "deployKind": "railway-mcp",
47
+ "expectActive": true
48
+ },
49
+ {
50
+ "id": "cont-qa-railway-cli",
51
+ "role": "cont-qa",
52
+ "runtime": "claude",
53
+ "deployKind": "railway-cli",
54
+ "expectActive": true
55
+ },
56
+ {
57
+ "id": "documentation-railway-cli",
58
+ "role": "documentation",
59
+ "runtime": "opencode",
60
+ "deployKind": "railway-cli",
61
+ "expectActive": false
62
+ },
63
+ {
64
+ "id": "implementation-railway-cli",
65
+ "role": "implementation",
66
+ "runtime": "codex",
67
+ "deployKind": "railway-cli",
68
+ "expectActive": false
69
+ }
70
+ ]
5
71
  }
@@ -1,6 +1,97 @@
1
1
  # SSH Manual
2
2
 
3
+ <!-- CUSTOMIZE: Add your hosts, SSH access patterns, approved operations, and sudo policies below. -->
4
+
5
+ ## Core Rules
6
+
3
7
  - Treat manual host access as high-risk and fail closed on missing proof.
4
8
  - Record the exact host or surface touched and the exact checks performed.
5
9
  - Avoid destructive manual changes unless explicitly approved by the task and repo policy.
6
10
  - Convert shell observations into explicit environment or deploy status markers.
11
+ - Every SSH session must produce a durable evidence record. No verification without documentation.
12
+
13
+ ## Risk Posture
14
+
15
+ Manual SSH access is the highest-risk verification surface:
16
+
17
+ - **Fail closed** -- if proof is missing or ambiguous, the state is NOT VERIFIED. Do not assume health.
18
+ - **High-risk default** -- all manual access is treated as high-risk. There is no "low-risk" SSH operation from a proof standpoint.
19
+ - **Destructive changes need explicit approval** -- any command that modifies state (restart, config edit, file delete, package install) requires explicit task-level approval in the wave prompt. Read-only verification does not require special approval.
20
+ - **Session isolation** -- do not carry state assumptions between SSH sessions. Each session starts with zero knowledge of the host's current state.
21
+ - **No implicit trust** -- a previous session showing healthy state does not prove current state. Re-verify if the task requires current proof.
22
+
23
+ ## Verification Protocol
24
+
25
+ Follow this sequence for every SSH verification:
26
+
27
+ 1. **Identify the host** -- exact hostname, IP address, or infrastructure identifier. Record how the host was identified (from wave definition, config file, DNS, etc.).
28
+ 2. **Connect** -- establish SSH connection. Record the user and authentication method used.
29
+ 3. **Run verification commands** -- execute only the commands needed for the verification scope. Capture stdout and stderr.
30
+ 4. **Record output** -- save the exact command output. Do not paraphrase or summarize prematurely.
31
+ 5. **Classify result** -- determine the state: healthy, degraded, failed, or unknown.
32
+ 6. **Emit status marker** -- produce the appropriate `[deploy-status]` or `[infra-status]` marker based on the classification.
33
+ 7. **Disconnect** -- end the session. Do not leave connections open.
34
+
35
+ Never skip step 4. The raw output is the proof artifact.
36
+
37
+ ## Evidence Recording
38
+
39
+ Use this structure for every SSH verification:
40
+
41
+ ```
42
+ Host: <hostname-or-ip>
43
+ User: <ssh-user>
44
+ Timestamp Context: <when-the-session-occurred>
45
+ Commands Run:
46
+ 1. <exact-command-1>
47
+ 2. <exact-command-2>
48
+ Stdout Excerpts:
49
+ 1. <relevant-output-from-command-1>
50
+ 2. <relevant-output-from-command-2>
51
+ Conclusion: <healthy|degraded|failed|unknown> -- <one-line-reason>
52
+ Follow-up Needed: <yes|no> -- <what-and-who-if-yes>
53
+ ```
54
+
55
+ Do not omit fields. If a field is not applicable, write `N/A` with a reason.
56
+
57
+ ## Approved Operations
58
+
59
+ ### Read-Only (Default Approved)
60
+
61
+ These operations are approved by default for verification purposes:
62
+
63
+ - `systemctl status <service>` -- check service state.
64
+ - `df -h` / `free -m` -- check disk and memory.
65
+ - `tail -n 100 <log-file>` -- read recent log entries.
66
+ - `cat <config-file>` -- read configuration files.
67
+ - `ps aux | grep <process>` -- check running processes.
68
+ - `netstat -tlnp` / `ss -tlnp` -- check listening ports.
69
+ - `uptime` -- check system uptime and load.
70
+ - `docker ps` / `docker logs <container>` -- check container state if Docker is present.
71
+
72
+ ### Write Operations (Require Explicit Approval)
73
+
74
+ These operations modify host state and require explicit task-level approval:
75
+
76
+ - `systemctl restart <service>` -- restart a service.
77
+ - `systemctl stop/start <service>` -- stop or start a service.
78
+ - Editing configuration files.
79
+ - Installing or updating packages.
80
+ - Modifying firewall rules.
81
+ - Deleting files or directories.
82
+ - Running database migrations or administrative commands.
83
+
84
+ If the wave prompt does not explicitly approve write operations on the target host, do not perform them. Record the need as a follow-up.
85
+
86
+ <!-- CUSTOMIZE: Add your project-specific approved read-only and write commands here. -->
87
+
88
+ ## Customization
89
+
90
+ <!-- CUSTOMIZE: Override or extend any section above. Common additions:
91
+ - Host inventory: <hostname> -> <purpose> -> <access-pattern>
92
+ - SSH access patterns: key-based, bastion host, SSM Session Manager
93
+ - Sudo policies: which users can sudo, which commands are allowed
94
+ - Log file locations per host
95
+ - Service names per host
96
+ - Approved write operations for specific wave types
97
+ -->
@@ -1,5 +1,54 @@
1
1
  {
2
2
  "id": "provider-ssh-manual",
3
3
  "title": "SSH Manual",
4
- "description": "Manual host and SSH-based environment norms."
4
+ "description": "Guides manual host verification with fail-closed posture: exact evidence recording, approved operation boundaries, and SSH verification protocol.",
5
+ "activation": {
6
+ "when": "Attach when the wave deploy surface requires manual SSH verification and the agent must operate fail-closed within approved bounds.",
7
+ "roles": [
8
+ "deploy",
9
+ "infra",
10
+ "integration",
11
+ "cont-qa"
12
+ ],
13
+ "runtimes": [],
14
+ "deployKinds": [
15
+ "ssh-manual"
16
+ ]
17
+ },
18
+ "termination": "Stop when manual-host evidence is recorded and any unapproved operation is explicitly escalated.",
19
+ "permissions": {
20
+ "network": [
21
+ "approved-hosts"
22
+ ],
23
+ "shell": [
24
+ "ssh"
25
+ ],
26
+ "mcpServers": []
27
+ },
28
+ "trust": {
29
+ "tier": "repo-owned"
30
+ },
31
+ "evalCases": [
32
+ {
33
+ "id": "deploy-ssh-manual",
34
+ "role": "deploy",
35
+ "runtime": "opencode",
36
+ "deployKind": "ssh-manual",
37
+ "expectActive": true
38
+ },
39
+ {
40
+ "id": "cont-qa-ssh-manual",
41
+ "role": "cont-qa",
42
+ "runtime": "claude",
43
+ "deployKind": "ssh-manual",
44
+ "expectActive": true
45
+ },
46
+ {
47
+ "id": "documentation-ssh-manual",
48
+ "role": "documentation",
49
+ "runtime": "claude",
50
+ "deployKind": "ssh-manual",
51
+ "expectActive": false
52
+ }
53
+ ]
5
54
  }
@@ -1,7 +1,91 @@
1
1
  # Repo Coding Rules
2
2
 
3
+ <!-- CUSTOMIZE: Add project-specific linting, formatting, or CI requirements below. -->
4
+
5
+ ## Core Rules
6
+
3
7
  - Read `AGENTS.md` before making material edits if it exists.
4
8
  - Prefer small, reviewable changes that preserve existing repo patterns.
5
9
  - Run the relevant tests or checks for touched surfaces and fix regressions caused by your changes.
6
10
  - Keep docs aligned when implementation changes status, ownership, or proof expectations.
7
11
  - Do not push by default unless the task explicitly asks for it.
12
+
13
+ ## Pre-Edit Checklist
14
+
15
+ Before editing any file, confirm:
16
+
17
+ 1. You own the file or have an explicit follow-up request granting access.
18
+ 2. You have read the current file content. Do not edit blindly.
19
+ 3. You understand the existing patterns in the file (indentation, naming, exports).
20
+ 4. Your change is the smallest diff that achieves the goal.
21
+ 5. If the file has a corresponding test file, you will update or extend tests to cover your change.
22
+ 6. You have checked for other files that import or depend on the symbols you are changing.
23
+ 7. If the file is a config file (JSON, YAML), you have validated the resulting structure is well-formed.
24
+
25
+ ## Change Hygiene
26
+
27
+ Follow these conventions unless the repo's own `AGENTS.md` or linter config overrides them:
28
+
29
+ - **Indentation**: 2-space indent, no tabs.
30
+ - **Quotes**: double quotes for strings.
31
+ - **Semicolons**: use semicolons at statement ends.
32
+ - **Module format**: ESM with `.mjs` extension for JavaScript.
33
+ - **File naming**: kebab-case for files and directories (e.g., `agent-state.mjs`, not `agentState.mjs`).
34
+ - **Exports**: prefer named exports over default exports.
35
+ - **Imports**: keep import order consistent with the existing file. Group node builtins, then external packages, then local imports.
36
+ - **No dead code**: do not leave commented-out code blocks. Remove them or explain in a comment why they exist.
37
+ - **No speculative changes**: only change what the task requires. Do not refactor adjacent code opportunistically.
38
+
39
+ ## Test Expectations
40
+
41
+ - Run `pnpm test` (or the repo's declared test command) after making changes.
42
+ - Write **focused tests** that cover the specific behavior you changed, not broad integration suites.
43
+ - Tests must be **hermetic**: no network calls, no filesystem side effects outside temp directories, no reliance on execution order.
44
+ - When fixing a bug, add a **regression test** that fails without the fix and passes with it.
45
+ - If a test file does not exist for the module you changed, create one following the repo's test directory structure.
46
+ - Name test files to match their source: `scripts/wave-orchestrator/foo.mjs` maps to `test/wave-orchestrator/foo.test.ts`.
47
+ - Do not disable or skip existing tests to make your change pass. If an existing test conflicts with your change, understand why before modifying it.
48
+ - Test assertions should be specific. Avoid broad `toBeTruthy()` when an exact value comparison is possible.
49
+
50
+ ## Doc Alignment
51
+
52
+ Update documentation when your change alters any of:
53
+
54
+ - **Status**: a component or feature moves to a new state (planned, in-progress, landed, deprecated).
55
+ - **Ownership**: file ownership or role assignments change.
56
+ - **Proof expectations**: exit contracts, component promotions, or verification surfaces change.
57
+
58
+ Which docs to update:
59
+
60
+ | What changed | Update |
61
+ |---|---|
62
+ | Feature status or sequencing | `docs/plans/current-state.md` |
63
+ | Component maturity level | `docs/plans/component-cutover-matrix.md` and `.json` |
64
+ | Roadmap items completed or reordered | `docs/roadmap.md` |
65
+ | Migration steps changed | relevant migration doc under `docs/reference/` |
66
+
67
+ If you are not the documentation steward, post a coordination record requesting the doc update instead of editing shared-plan docs directly.
68
+
69
+ ## Commit Conventions
70
+
71
+ - Use imperative mood in the subject line.
72
+ - Prefix with a type tag:
73
+ - `Fix:` -- bug fix
74
+ - `Feat:` -- new feature or capability
75
+ - `Docs:` -- documentation-only change
76
+ - `Build:` -- build system, CI, or dependency change
77
+ - `Release:` -- version bump or release artifact
78
+ - Keep the subject line under 72 characters.
79
+ - Add a body paragraph when the "why" is not obvious from the diff.
80
+ - Reference the wave id or issue number in the body when applicable.
81
+ - Do not combine unrelated changes in a single commit. Each commit should be a coherent unit.
82
+
83
+ ## Customization
84
+
85
+ <!-- CUSTOMIZE: Override or extend any section above. Common additions:
86
+ - Project-specific linter commands (eslint, prettier, biome)
87
+ - Required CI checks before merge
88
+ - Branch naming conventions
89
+ - Code review requirements
90
+ - Additional file naming or export conventions
91
+ -->
@@ -1,5 +1,34 @@
1
1
  {
2
2
  "id": "repo-coding-rules",
3
3
  "title": "Repo Coding Rules",
4
- "description": "Repository-local coding and validation rules."
4
+ "description": "Guides agents through pre-edit checks, change hygiene, test expectations, doc alignment, and commit conventions for this repository.",
5
+ "activation": {
6
+ "when": "Attach to every agent so repository change hygiene, validation, and documentation alignment remain consistent.",
7
+ "roles": [],
8
+ "runtimes": [],
9
+ "deployKinds": []
10
+ },
11
+ "termination": "Stop when the touched scope has coherent edits, validation, and documentation follow-through.",
12
+ "permissions": {
13
+ "network": [],
14
+ "shell": [],
15
+ "mcpServers": []
16
+ },
17
+ "trust": {
18
+ "tier": "repo-owned"
19
+ },
20
+ "evalCases": [
21
+ {
22
+ "id": "implementation-codex",
23
+ "role": "implementation",
24
+ "runtime": "codex",
25
+ "expectActive": true
26
+ },
27
+ {
28
+ "id": "research-opencode",
29
+ "role": "research",
30
+ "runtime": "opencode",
31
+ "expectActive": true
32
+ }
33
+ ]
5
34
  }
@@ -0,0 +1,90 @@
1
+ # cont-EVAL Role
2
+
3
+ Use this skill when the agent is the wave's continuous eval steward.
4
+
5
+ <!-- CUSTOMIZE: Add project-specific eval targets, benchmark catalogs, or iteration limits below. -->
6
+
7
+ ## Core Rules
8
+
9
+ - Work from the wave's declared `## Eval targets`, not generic quality impressions.
10
+ - By default, stay report-only. Edit implementation files only when the wave explicitly assigns non-report owned paths.
11
+ - Re-run the relevant service or benchmark surface after each material change.
12
+ - Keep regressions explicit. Do not trade one target for another without recording it.
13
+ - Stay within your declared file ownership for direct edits.
14
+
15
+ ## Workflow
16
+
17
+ Execute these steps in order:
18
+
19
+ 1. **Load eval targets** -- read the wave's `## Eval targets` section. Extract each target id and its acceptance criteria.
20
+ 2. **Select benchmarks** -- if the wave delegates benchmark selection, choose from the declared benchmark family or pinned list. Record the exact selected set with benchmark ids.
21
+ 3. **Run** -- execute the benchmark commands, service calls, or review procedures needed to score each target. Record commands and raw output.
22
+ 4. **Review** -- compare observed results against each target's acceptance criteria. Identify gaps and regressions.
23
+ 5. **Tune** -- if you own implementation files, make targeted changes to close gaps. If report-only, document the needed changes and route to the owning agent.
24
+ 6. **Rerun** -- after each material change, rerun the affected benchmarks. Do not claim improvement from inspection alone.
25
+ 7. **Record** -- update the append-only cont-EVAL report with the iteration results.
26
+
27
+ ## Eval Loop
28
+
29
+ - Run short **run-review-tune-rerun** cycles. Each cycle should produce a recorded iteration with results.
30
+ - **Maximum 3 iterations** before escalating. If targets are not met after 3 cycles, post a coordination record with the remaining gaps and escalate to the integration steward.
31
+ - Prefer **targeted changes** over broad rewrites. Each change should address a specific gap identified in the review step.
32
+ - Never skip the rerun step. Every change must be validated.
33
+ - If a tune step introduces a regression in another target, revert the change and record the trade-off.
34
+
35
+ ## Benchmark Recording
36
+
37
+ For each benchmark run, record:
38
+
39
+ | Field | Content |
40
+ |---|---|
41
+ | `benchmark_id` | The exact id from the benchmark catalog or pinned list. |
42
+ | `command` | The exact command, prompt, or procedure executed. |
43
+ | `baseline` | The baseline score or expected output before this wave. |
44
+ | `current` | The observed score or output from this run. |
45
+ | `regressions` | Any targets that got worse. List target id and delta. |
46
+ | `disposition` | `improved`, `met`, `regressed`, or `unchanged`. |
47
+
48
+ In the final `[wave-eval]` marker, keep `target_ids` aligned to the declared eval target ids from the wave definition, and keep `benchmark_ids` aligned to the set of benchmarks actually executed.
49
+
50
+ ## Scope Boundaries
51
+
52
+ - Only modify files explicitly assigned to you in the wave definition.
53
+ - If the needed fix belongs to another owner's file, open an **explicit follow-up request** naming the owner, the file, the exact change needed, and the eval target it affects.
54
+ - If you own non-report implementation files, you also carry the normal implementation obligations: proof artifacts, doc-delta coordination, and component markers for those files.
55
+ - Do not broaden scope to files outside your ownership, even if you can see the fix.
56
+
57
+ ## Routing Rules
58
+
59
+ - Report-only mode (default): produce the eval report and marker. Route needed fixes to owning agents via coordination records.
60
+ - Implementation mode (wave assigns owned paths): satisfy eval targets by editing owned files, then satisfy normal proof and doc-delta obligations for those files.
61
+ - When routing a fix to another agent, include: target id, benchmark id, observed gap, suggested change, and the file that needs editing.
62
+
63
+ ## Marker Format
64
+
65
+ Emit exactly one marker at the end of your cont-EVAL report:
66
+
67
+ ```
68
+ [wave-eval] state=<satisfied|needs-more-work|blocked> targets=<n> benchmarks=<n> regressions=<n> target_ids=<csv> benchmark_ids=<csv> detail=<text>
69
+ ```
70
+
71
+ - `state`:
72
+ - `satisfied` -- all declared eval targets are met, `target_ids` exactly matches the wave contract, `benchmark_ids` enumerates the executed set, and unresolved regressions are zero.
73
+ - `needs-more-work` -- some targets are not yet met but progress is possible within the wave.
74
+ - `blocked` -- targets cannot be met without external resolution (missing dependencies, broken services, out-of-scope changes).
75
+ - `targets`: count of declared eval targets.
76
+ - `benchmarks`: count of executed benchmarks.
77
+ - `regressions`: count of unresolved regressions.
78
+ - `target_ids`: comma-separated list of target ids from the wave definition.
79
+ - `benchmark_ids`: comma-separated list of benchmark ids actually executed.
80
+ - `detail`: concise summary (under 120 characters).
81
+
82
+ ## Customization
83
+
84
+ <!-- CUSTOMIZE: Override or extend any section above. Common additions:
85
+ - Project-specific benchmark catalogs and families
86
+ - Iteration limits different from the default 3
87
+ - Required statistical significance thresholds
88
+ - Performance regression tolerance percentages
89
+ - Specific eval tooling commands or frameworks
90
+ -->
@@ -0,0 +1 @@
1
+ Prefer exact command evidence: run, inspect, tune, rerun, and record the concrete benchmark ids, commands, regressions, and final marker payload instead of relying on stylistic judgments.
@@ -0,0 +1,36 @@
1
+ {
2
+ "id": "role-cont-eval",
3
+ "title": "cont-EVAL Role",
4
+ "description": "Guides the cont-EVAL agent through benchmark-driven eval loops, regression detection, scoped tuning, and eval target satisfaction judgment.",
5
+ "activation": {
6
+ "when": "Attach when the agent is running benchmark-driven evaluation and regression checks.",
7
+ "roles": [
8
+ "cont-eval"
9
+ ],
10
+ "runtimes": [],
11
+ "deployKinds": []
12
+ },
13
+ "termination": "Stop when eval targets are rechecked and remaining regressions are localized or escalated.",
14
+ "permissions": {
15
+ "network": [],
16
+ "shell": [],
17
+ "mcpServers": []
18
+ },
19
+ "trust": {
20
+ "tier": "repo-owned"
21
+ },
22
+ "evalCases": [
23
+ {
24
+ "id": "cont-eval-codex",
25
+ "role": "cont-eval",
26
+ "runtime": "codex",
27
+ "expectActive": true
28
+ },
29
+ {
30
+ "id": "cont-qa-codex",
31
+ "role": "cont-qa",
32
+ "runtime": "codex",
33
+ "expectActive": false
34
+ }
35
+ ]
36
+ }
@@ -0,0 +1,93 @@
1
+ # cont-QA Role
2
+
3
+ Use this skill when the agent is the wave's final cont-QA closure steward.
4
+
5
+ <!-- CUSTOMIZE: Add project-specific quality gates, evidence requirements, or reporting formats below. -->
6
+
7
+ ## Core Rules
8
+
9
+ - Judge landed evidence, not effort, intent, or ownership handoff text.
10
+ - Fail closed. PASS requires both a final `Verdict: PASS` line and a final `[wave-gate]` marker in which every dimension is `pass`.
11
+ - Re-read the shared summary, inbox, and latest closure artifacts before the final judgment.
12
+ - Keep verdicts consistent across the report. Do not say PASS in the verdict and CONCERNS in the gate marker.
13
+ - Treat the last gate marker and last verdict line as authoritative for closure. Earlier markers are superseded.
14
+
15
+ ## Workflow
16
+
17
+ Execute these steps in order. Do not skip steps.
18
+
19
+ 1. **Receive evidence** -- collect all implementation proof, coordination records, integration marker, doc closure marker, and cont-EVAL marker (if present).
20
+ 2. **Review vs exit contracts** -- walk each agent's exit contract line by line. For each line, confirm a proof artifact backs it. Record pass or gap.
21
+ 3. **Review vs promotions** -- walk each declared component promotion. Confirm evidence shows the component reached the declared target level, not just that adjacent code landed.
22
+ 4. **Verify integration** -- confirm the `[wave-integration]` marker shows `ready-for-doc-closure`. Check that no later coordination records contradict it.
23
+ 5. **Verify doc closure** -- confirm the `[wave-doc-closure]` marker shows `closed` or `no-change`. If `no-change`, verify the reasoning is valid given what the wave changed.
24
+ 6. **Verify cont-EVAL** -- if the wave includes cont-EVAL, confirm the `[wave-eval]` marker shows `satisfied` with matching `target_ids` and `benchmark_ids` and zero regressions.
25
+ 7. **Verdict** -- apply the decision tree below and emit the final verdict and gate marker.
26
+
27
+ ## Evidence Review Checklist
28
+
29
+ Walk each item. Any unchecked item is a potential blocker.
30
+
31
+ - [ ] Each implementation agent's exit contract deliverables have durable proof (test files, artifacts, summaries).
32
+ - [ ] Each declared component promotion has evidence at the target level.
33
+ - [ ] Helper assignments opened during the wave have linked resolutions.
34
+ - [ ] Dependency tickets are resolved or explicitly deferred with reasoning.
35
+ - [ ] Clarification chains are closed with follow-up work.
36
+ - [ ] Integration marker is `ready-for-doc-closure` and not contradicted by later evidence.
37
+ - [ ] Doc closure marker is `closed` or valid `no-change`.
38
+ - [ ] cont-EVAL marker (if present) is `satisfied` with matching ids and zero regressions.
39
+ - [ ] Runtime-facing proof is real evidence, not future-work notes or speculative validation.
40
+ - [ ] No contradictions exist between implementation claims, integration summary, docs, and runtime state.
41
+
42
+ ## Verdict Decision Tree
43
+
44
+ Apply in order:
45
+
46
+ 1. **PASS** -- all checklist items are satisfied. Every exit contract line has proof. Integration, docs, and cont-EVAL (if present) markers are positive. No contradictions remain.
47
+ 2. **CONCERNS** -- all critical items are satisfied, but minor gaps exist that do not block wave progression. Name each concern explicitly. The wave can close but follow-up work should be tracked.
48
+ 3. **BLOCKED** -- one or more critical items are not satisfied. Missing proof, missing deliverables, unresolved contradictions, or negative markers prevent closure. Name the exact blocking set.
49
+
50
+ PASS allows wave closure with no follow-ups. CONCERNS also allows closure, but only with each concern tracked as follow-up work. BLOCKED keeps the wave open.
51
+
52
+ ## Marker Format
53
+
54
+ Emit exactly one gate marker and one verdict line at the end of your report.
55
+
56
+ Gate marker:
57
+
58
+ ```
59
+ [wave-gate] architecture=<pass|concerns|blocked> integration=<pass|concerns|blocked> durability=<pass|concerns|blocked> live=<pass|concerns|blocked> docs=<pass|concerns|blocked> detail=<text>
60
+ ```
61
+
62
+ Verdict line:
63
+
64
+ ```
65
+ Verdict: <PASS|CONCERNS|BLOCKED>
66
+ ```
67
+
68
+ Gate dimensions:
69
+
70
+ - `architecture` -- code structure, interfaces, and design patterns are sound.
71
+ - `integration` -- cross-agent coherence and integration marker are positive.
72
+ - `durability` -- tests, proof artifacts, and regression coverage are sufficient.
73
+ - `live` -- runtime, deploy, and infra surfaces are verified (or not applicable).
74
+ - `docs` -- shared-plan documentation closure is resolved.
75
+
76
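+ Each dimension is independently scored. The overall verdict is the most severe score across all dimensions: any `blocked` dimension means a `BLOCKED` verdict; otherwise any `concerns` dimension means a `CONCERNS` verdict; only all-`pass` dimensions yield `PASS`.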
77
+
78
+ ## Reporting Rules
79
+
80
+ - Publish the smallest blocking set that keeps the wave from closure. Do not pad with minor observations.
81
+ - Keep the final verdict text and final gate marker internally consistent.
82
+ - An append-only cont-QA report is the primary output. Do not delete or rewrite earlier sections; append corrections.
83
+ - When blocking, name the exact agent, file, or deliverable that is missing, not broad categories.
84
+
85
+ ## Customization
86
+
87
+ <!-- CUSTOMIZE: Override or extend any section above. Common additions:
88
+ - Project-specific quality dimensions beyond the five listed
89
+ - Required evidence formats (e.g., screenshot proof for UI changes)
90
+ - Minimum test coverage thresholds
91
+ - Performance regression thresholds
92
+ - Security review requirements
93
+ -->
@@ -0,0 +1 @@
1
+ Prefer synthesis over local speculation: restate the smallest blocking set, cite the final closure artifacts, and keep the verdict fail-closed when evidence is incomplete or contradictory.
@@ -0,0 +1,36 @@
1
+ {
2
+ "id": "role-cont-qa",
3
+ "title": "cont-QA Role",
4
+ "description": "Guides the cont-QA agent through evidence-based closure judgment: comparing landed proof against exit contracts, component promotions, and integration and doc state.",
5
+ "activation": {
6
+ "when": "Attach when the agent is the cont-QA gate and must judge closure against landed proof.",
7
+ "roles": [
8
+ "cont-qa"
9
+ ],
10
+ "runtimes": [],
11
+ "deployKinds": []
12
+ },
13
+ "termination": "Stop when closure is accepted, rejected, or escalated with explicit proof gaps.",
14
+ "permissions": {
15
+ "network": [],
16
+ "shell": [],
17
+ "mcpServers": []
18
+ },
19
+ "trust": {
20
+ "tier": "repo-owned"
21
+ },
22
+ "evalCases": [
23
+ {
24
+ "id": "cont-qa-claude",
25
+ "role": "cont-qa",
26
+ "runtime": "claude",
27
+ "expectActive": true
28
+ },
29
+ {
30
+ "id": "cont-eval-claude",
31
+ "role": "cont-eval",
32
+ "runtime": "claude",
33
+ "expectActive": false
34
+ }
35
+ ]
36
+ }