cool-workflow 0.1.80 → 0.1.81

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. package/.claude-plugin/plugin.json +1 -1
  2. package/.codex-plugin/plugin.json +1 -1
  3. package/README.md +42 -2
  4. package/apps/architecture-review/app.json +1 -1
  5. package/apps/architecture-review-fast/app.json +1 -1
  6. package/apps/end-to-end-golden-path/app.json +1 -1
  7. package/apps/pr-review-fix-ci/app.json +1 -1
  8. package/apps/release-cut/app.json +1 -1
  9. package/apps/research-synthesis/app.json +1 -1
  10. package/dist/agent-config.js +21 -7
  11. package/dist/candidate-scoring.js +42 -22
  12. package/dist/capability-core.js +94 -17
  13. package/dist/capability-registry.js +138 -171
  14. package/dist/cli.js +90 -100
  15. package/dist/collaboration.js +5 -6
  16. package/dist/commit.js +20 -6
  17. package/dist/compare.js +18 -0
  18. package/dist/coordinator/classify.js +45 -0
  19. package/dist/coordinator/paths.js +42 -0
  20. package/dist/coordinator/util.js +129 -0
  21. package/dist/coordinator.js +127 -300
  22. package/dist/dispatch.js +35 -0
  23. package/dist/drive.js +7 -7
  24. package/dist/error-feedback.js +8 -4
  25. package/dist/evidence-reasoning.js +1 -1
  26. package/dist/execution-backend/agent.js +331 -0
  27. package/dist/execution-backend/probes.js +96 -0
  28. package/dist/execution-backend/util.js +47 -0
  29. package/dist/execution-backend.js +67 -420
  30. package/dist/mcp-server.js +34 -173
  31. package/dist/multi-agent/graph.js +84 -0
  32. package/dist/multi-agent/helpers.js +145 -0
  33. package/dist/multi-agent/paths.js +22 -0
  34. package/dist/multi-agent-eval/format.js +194 -0
  35. package/dist/multi-agent-eval/normalize.js +51 -0
  36. package/dist/multi-agent-eval.js +39 -244
  37. package/dist/multi-agent-host.js +0 -19
  38. package/dist/multi-agent.js +125 -314
  39. package/dist/node-snapshot.js +3 -3
  40. package/dist/observability/format.js +61 -0
  41. package/dist/observability/intake.js +98 -0
  42. package/dist/observability.js +14 -160
  43. package/dist/operator-ux/format.js +364 -0
  44. package/dist/operator-ux.js +22 -363
  45. package/dist/orchestrator/report.js +8 -0
  46. package/dist/orchestrator.js +25 -8
  47. package/dist/reclamation.js +26 -21
  48. package/dist/run-export.js +138 -14
  49. package/dist/run-registry/derive.js +172 -0
  50. package/dist/run-registry/format.js +124 -0
  51. package/dist/run-registry/gc.js +251 -0
  52. package/dist/run-registry/policy.js +16 -0
  53. package/dist/run-registry/queue.js +116 -0
  54. package/dist/run-registry.js +78 -593
  55. package/dist/run-state-schema.js +1 -0
  56. package/dist/sandbox-profile.js +43 -2
  57. package/dist/state-explosion/format.js +159 -0
  58. package/dist/state-explosion/helpers.js +82 -0
  59. package/dist/state-explosion.js +65 -283
  60. package/dist/state-node.js +19 -4
  61. package/dist/telemetry-attestation.js +55 -0
  62. package/dist/telemetry-demo.js +15 -3
  63. package/dist/telemetry-ledger.js +60 -15
  64. package/dist/topology.js +25 -8
  65. package/dist/triggers.js +33 -14
  66. package/dist/trust-audit.js +145 -33
  67. package/dist/version.js +1 -1
  68. package/dist/worker-isolation/helpers.js +51 -0
  69. package/dist/worker-isolation/paths.js +46 -0
  70. package/dist/worker-isolation.js +39 -115
  71. package/docs/agent-delegation-drive.7.md +13 -0
  72. package/docs/cli-mcp-parity.7.md +4 -0
  73. package/docs/contract-migration-tooling.7.md +2 -0
  74. package/docs/control-plane-scheduling.7.md +2 -0
  75. package/docs/dogfood/resume-drive-real-agent-2026-06-14.md +40 -0
  76. package/docs/durable-state-and-locking.7.md +4 -0
  77. package/docs/evidence-adoption-reasoning-chain.7.md +2 -0
  78. package/docs/execution-backends.7.md +2 -0
  79. package/docs/index.md +1 -0
  80. package/docs/launch/launch-kit.md +46 -23
  81. package/docs/launch/pre-launch-checklist.md +14 -14
  82. package/docs/multi-agent-cli-mcp-surface.7.md +4 -0
  83. package/docs/multi-agent-eval-replay-harness.7.md +2 -0
  84. package/docs/multi-agent-operator-ux.7.md +2 -0
  85. package/docs/multi-agent-trust-policy-audit.7.md +27 -0
  86. package/docs/node-snapshot-diff-replay.7.md +2 -0
  87. package/docs/observability-cost-accounting.7.md +2 -0
  88. package/docs/project-index.md +18 -5
  89. package/docs/real-execution-backends.7.md +2 -0
  90. package/docs/release-and-migration.7.md +4 -0
  91. package/docs/release-tooling.7.md +2 -0
  92. package/docs/run-registry-control-plane.7.md +54 -8
  93. package/docs/run-retention-reclamation.7.md +4 -0
  94. package/docs/state-explosion-management.7.md +2 -0
  95. package/docs/team-collaboration.7.md +2 -0
  96. package/docs/trust-model.md +267 -0
  97. package/docs/vendor-manifest-loadability.7.md +43 -0
  98. package/docs/web-desktop-workbench.7.md +2 -0
  99. package/manifest/plugin.manifest.json +1 -1
  100. package/package.json +4 -2
  101. package/scripts/agents/builtin-templates.json +7 -0
  102. package/scripts/bump-version.js +5 -11
  103. package/scripts/canonical-apps-list.js +64 -0
  104. package/scripts/canonical-apps.js +19 -4
  105. package/scripts/dogfood-release.js +1 -1
  106. package/scripts/golden-path.js +4 -4
  107. package/scripts/parity-check.js +5 -0
  108. package/scripts/release-check.js +5 -1
  109. package/scripts/version-sync-check.js +5 -8
  110. package/dist/capability-dispatcher.js +0 -86
@@ -0,0 +1,267 @@
1
+ # Trust Model & Limitations
2
+
3
+ > **Read this before you trust a cool-workflow record.** This document states
4
+ > exactly what CW's cryptographic guarantees prove, and — just as important —
5
+ > what they do **not** prove. We would rather lose a skeptical reader here than
6
+ > have them over-trust a green checkmark in production. If anything below reads
7
+ > as an overclaim, it is a bug; please file it.
8
+
9
+ CW is an **auditable control-plane**. It plans, dispatches, records, and verifies
10
+ agent work — it does **not** run the model itself. That single architectural
11
+ choice is what the guarantees below rest on, and it is also the source of their
12
+ honest ceiling.
13
+
14
+ ---
15
+
16
+ ## TL;DR
17
+
18
+ - CW's ed25519 signature + hash-chained ledger prove **integrity and
19
+ attribution**: a recorded usage figure was signed by the keyholder and has not
20
+ been edited since it was recorded. Both re-verify **offline** — the recorded
21
+ ledger's integrity with **no key at all** (`cw telemetry verify`), and each
22
+ `attested` signature with the **public key alone** (`cw telemetry verify
23
+ --pubkey <public.pem>`; also reproduced by `cw demo tamper`).
24
+ - They do **not** prove the original number was **true**. A dishonest signer can
25
+ sign a lie; the lie is then cryptographically bound to its signer, but it is
26
+ still a lie.
27
+ - **CW holds no private key.** It can verify, but it can neither forge a
28
+ signature nor measure usage itself (by design — see the red line below).
29
+ - The honest gap is **single-keyholder / no second party**: when the same
30
+ operator runs CW *and* holds the only signing key, integrity is real but there
31
+ is no independent party attesting that the source was honest. **This is exactly
32
+ why we are seeking early integration partners** who supply an independent
33
+ second party / co-signer. See [Closing the gap](#closing-the-gap-the-second-party).
34
+
35
+ ---
36
+
37
+ ## What the cryptography is, precisely
38
+
39
+ There are two distinct mechanisms. Conflating them is the most common way to
40
+ over- or under-state the guarantee, so they are kept separate here.
41
+
42
+ ### 1. The telemetry signature (ed25519) — attribution of a reported number
43
+
44
+ The agent (the **executor**) self-reports its token usage. A control-plane that
45
+ records that number verbatim is recording a **claim**. To turn the claim into an
46
+ **attestation**, the executor signs a canonical payload with its **private key**:
47
+
48
+ ```
49
+ sign({ usage, runId, taskId, promptDigest }) // ed25519, executor-side
50
+ ```
51
+
52
+ The `runId` / `taskId` / `promptDigest` binding is load-bearing: it ties the
53
+ signature to **this** hop, so a valid signature from one task cannot be replayed
54
+ onto another. `promptDigest` is the sha256 of the exact worker prompt CW handed
55
+ the agent.
56
+
57
+ CW then **verifies** that signature against an **operator-provisioned public
58
+ key**. CW holds *only* the public half. From `telemetry-attestation.ts`:
59
+
60
+ > CW VERIFIES that signature against an operator-provisioned PUBLIC key. CW holds
61
+ > ONLY the public key — it can verify, but can neither forge a signature nor (the
62
+ > red line) call a model to measure usage itself.
63
+
64
+ The result is one of three honest states, surfaced loudly and never silently
65
+ upgraded to "trusted":
66
+
67
+ | State | Meaning |
68
+ |---|---|
69
+ | `attested` | A valid ed25519 signature over the reported usage, bound to this run/task/prompt, verified against the configured public key. |
70
+ | `unattested` | Usage was reported but the signature is missing, malformed, made with the wrong key, or does not match the payload (tampered or replayed). Also: no trust key configured. |
71
+ | `absent` | The agent reported no usage at all. |
72
+
73
+ Defaults are honest: no signature ⇒ `unattested`; no usage ⇒ `absent`. **Usage
74
+ is never silently recorded as trusted.** The opt-in `require-attested-telemetry`
75
+ policy fails the run closed on anything other than `attested`.
76
+
77
+ ### 2. The hash-chained ledgers — tamper-evidence of the recorded log
78
+
79
+ A signature proves the agent *said* a number in flight. It does not, by itself,
80
+ prove that **CW recorded exactly that** and that **nobody edited the record
81
+ afterward**. That is the job of the append-only, hash-chained ledgers:
82
+
83
+ - **Telemetry ledger** (`telemetry.json`, one entry per agent hop): each entry
84
+ chains to the previous via `prevHash`, and `recordHash = sha256(canonical
85
+ entry)`. Flip a recorded verdict (`unattested` → `attested`) or edit a recorded
86
+ usage digest, and the chain no longer recomputes.
87
+ - **Trust-audit event log** (`events.jsonl`): the same discipline applied to
88
+ every recorded decision — sandbox path allow/deny, policy snapshots,
89
+ verifier-gated commits, collaboration approvals.
90
+
91
+ Verification **recomputes every hash independently and never trusts the stored
92
+ value**, so an edited, reordered, removed, or truncated entry flips
93
+ `verified = false`. A ledger that exists but cannot be parsed **fails closed** —
94
+ it is treated as corrupt, never silently as the clean empty chain.
95
+
96
+ This is all **offline**. The chain re-proof needs **no key at all**; add
97
+ `--pubkey <public.pem>` to re-run the signature **attribution** check against the
98
+ stored raw usage for every `attested` record. There is no telemetry service to
99
+ trust or breach — the record proves its own integrity, and a third-party auditor
100
+ can re-run both checks on their own machine.
101
+
102
+ ---
103
+
104
+ ## What this DOES prove
105
+
106
+ For telemetry, if `cw telemetry verify <run> --pubkey <public.pem>` reports green,
107
+ you can rely on **all** of the following, and only these:
108
+
109
+ 1. **Attribution.** Each `attested` usage figure was signed by the holder of the
110
+ configured private key, over a payload bound to that specific run, task, and
111
+ prompt. It is **non-repudiable**: the signer cannot later disown it, and it
112
+ could not have been replayed from a different hop.
113
+ 2. **Tamper-evidence of the record.** The recorded ledger — verdicts, usage
114
+ digests, audit decisions — has not been edited, reordered, truncated, or had
115
+ entries removed since it was written, *to the extent a self-recomputable chain
116
+ can detect* (see the threat-model caveat below). Casual or partial tampering,
117
+ accidental corruption, truncation, and forged unchained lines are all caught.
118
+ 3. **Offline, independent re-verification.** Re-proving the recorded ledger needs
119
+ no network, no CW service, and no trust in our infrastructure — `cw telemetry
120
+ verify` recomputes the chain on your machine (and needs no key to do it). With
121
+ `--pubkey`, the ed25519 **attribution** is independently re-checked with the
122
+ **public key alone**; `cw demo tamper` reproduces that sign-and-catch
123
+ end-to-end, offline. The integrity claim does not depend on trusting us.
124
+ 4. **CW never forged or measured anything.** CW holds no private key and never
125
+ calls a model. It cannot mint a signature, and it cannot fabricate a usage
126
+ number to sign. What it records, it received and verified.
127
+
128
+ ---
129
+
130
+ ## What this DOES NOT prove
131
+
132
+ Equally load-bearing. None of the following are within the guarantee, and we will
133
+ not imply otherwise:
134
+
135
+ 1. **It does not prove the reported number is true.** A signature proves *who*
136
+ said it and that it *wasn't altered* — **not** that it was correct at the
137
+ source. Quoting the code's own honest ceiling:
138
+
139
+ > A dishonest keyholder can still sign a lie, but the lie is now
140
+ > cryptographically bound to its signer.
141
+
142
+ CW deliberately does **not** independently measure usage (doing so would mean
143
+ calling the model — the red line it refuses to cross). So the strongest honest
144
+ claim is **attribution, not ground-truth measurement**.
145
+
146
+ 2. **It does not defend against a single party who holds both roles.** If the
147
+ same operator runs CW, holds the signing private key, *and* controls the
148
+ machine the ledger lives on, then a green verdict attests that **that party**
149
+ signed and that **that party's** record is internally consistent. It does not
150
+ bring in any *independent* party. Self-consistency is not third-party
151
+ verification.
152
+
153
+ 3. **A determined local writer can re-chain the whole log.** The hash-chain's
154
+ genesis is `sha256(runId)` — a value the local writer knows. So the chain
155
+ detects edits to *part* of a log, but a writer who edits an entry and then
156
+ **re-computes every subsequent hash** with CW's own sha256 produces a log that
157
+ re-verifies green. From `trust-audit.ts`:
158
+
159
+ > THREAT MODEL (be honest about the limit): the genesis is sha256(runId), so
160
+ > this detects casual/partial tampering, accidental corruption, truncation,
161
+ > removal, and forged-unchained lines — but NOT a determined local writer who
162
+ > re-chains the WHOLE log with this module's own sha256 after an edit.
163
+
164
+ This is **inherent** to any local, self-recomputable chain. Closing it needs an
165
+ anchor the writer cannot reproduce. CW **cannot mint that anchor itself** —
166
+ because by design it holds no private key. The one cryptographic anchor that
167
+ exists is the **agent's** telemetry signature, which covers agent-reported
168
+ *usage* — it does **not** cover CW-only decisions (sandbox / policy /
169
+ commit-gate), which have no external signer.
170
+
171
+ For those CW-only decisions, the only stronger guarantee available today is
172
+ **operational**, not cryptographic: commit `events.jsonl` to an external
173
+ append-only medium (git history, a remote append-only log) that the local
174
+ writer cannot rewrite. The chain is a **strict upgrade** over a bare
175
+ append-only log — not a substitute for an external anchor.
176
+
177
+ 4. **It says nothing about the quality, safety, or correctness of the work.**
178
+ Attestation is about *provenance and integrity of records*, not about whether
179
+ the agent's output is good, secure, or even functional. Other CW mechanisms
180
+ (verifier gate, schema validation, evidence grounding) speak to that; the
181
+ cryptography here does not.
182
+
183
+ ---
184
+
185
+ ## The single-keyholder limitation (stated plainly)
186
+
187
+ > **The core honest gap:** when the same operator runs CW and holds the only
188
+ > signing key, tamper-evidence proves that **records were not edited
189
+ > after the fact** — it does **not** prove that the **original signer was
190
+ > honest**. Integrity, yes. A trustworthy source, not necessarily.
191
+
192
+ Concretely, in a single-party setup:
193
+
194
+ - The operator provisions the keypair.
195
+ - The operator's agent process signs usage with the private key.
196
+ - CW (run by the same operator) verifies with the public key and writes the
197
+ ledger to the operator's disk.
198
+
199
+ Every cryptographic check can pass while a motivated single party fabricates the
200
+ source number, or — given the genesis caveat above — rewrites the whole local
201
+ chain. **Cryptography cannot manufacture a second party that does not exist.**
202
+ Separation of duties is the property auditors require everywhere; with one
203
+ operator wearing both hats, it is structurally absent no matter how good the
204
+ math is.
205
+
206
+ We are not going to argue this point away. It is real, it is the most important
207
+ limitation in this document, and it is the right critique to raise.
208
+
209
+ ---
210
+
211
+ ## Closing the gap: the second party
212
+
213
+ The fix is **not** more cryptography on one machine — it is an **independent
214
+ second party**, which is precisely the thing a single operator cannot self-supply.
215
+ This is why CW's near-term priority is **early integration partners**, and what we
216
+ mean by that concretely:
217
+
218
+ - **An independent co-signer / second keyholder.** A second party (a different
219
+ team, a CI identity outside the operator's control, or a partner's signing
220
+ service) holds a key the operator does not. When that party counter-signs runs —
221
+ or *is* the executor that signs usage — a green verdict starts to mean
222
+ "two parties who do not fully trust each other agree," which is the property
223
+ single-party attestation structurally cannot provide.
224
+ - **An external append-only anchor.** Pushing `events.jsonl` to a medium the local
225
+ operator cannot rewrite (a partner-held log, a public transparency log, signed
226
+ git history on a remote the operator doesn't control) closes the re-chain gap
227
+ for CW-only decisions described above.
228
+ - **Separated execution and verification.** The party that *spends the money*
229
+ (runs the model) and the party that *keeps the books* (CW) being genuinely
230
+ different entities turns CW's separation-of-duties design from an architectural
231
+ intent into an enforced fact.
232
+
233
+ If you are a potential partner who can supply an independent second party — a
234
+ co-signer, an external anchor, or separated execution — **that is the
235
+ collaboration we are actively looking for.** We would rather ship this honestly
236
+ and earn the second party than paper over the gap with a stronger-sounding claim
237
+ than the math supports.
238
+
239
+ ---
240
+
241
+ ## How to verify for yourself
242
+
243
+ - `cw telemetry verify <run>` — re-proves the telemetry ledger's **integrity**:
244
+ chain linkage + an independent per-record hash recompute, so any edit to a
245
+ recorded verdict or usage digest since record time flips it red. It needs **no
246
+ key** (it re-proves the *recording*). Add `--pubkey <pem-or-path>` to re-run the
247
+ ed25519 **signature** check for every `attested` record against the stored raw
248
+ usage; unreadable keys, missing raw usage, digest mismatches, wrong keys, and
249
+ signature mismatches fail closed. Mirrored as `cw_telemetry_verify` on the MCP
250
+ surface.
251
+ - `cw demo tamper` — a hermetic, offline, one-command proof: it builds a real
252
+ ed25519-signed ledger and then forges it two ways — flips a recorded verdict and
253
+ re-computes the *local* record hash (the chain still breaks), and reuses a
254
+ signature over inflated tokens (ed25519 rejects it). Everything is verified with
255
+ the public key only. The `✗ DETECTED` lines are the point.
256
+ - Re-run either with **only the public key** on a machine we do not control. If it
257
+ doesn't reproduce, our integrity claim is false — hold us to it.
258
+
259
+ ---
260
+
261
+ ## One-line summary
262
+
263
+ CW's cryptography proves **records weren't edited and were signed by the
264
+ keyholder** — strong, offline, public-key-verifiable **integrity and
265
+ attribution**. It does **not** prove the **source was honest**, and a single
266
+ operator holding both roles is the honest limit we are explicitly recruiting
267
+ integration partners to close.
@@ -0,0 +1,43 @@
1
+ # Vendor Manifest Loadability
2
+
3
+ CW ships one kernel to many AI clients. A single `manifest/plugin.manifest.json`
4
+ generates every vendor's plugin files (Claude, Codex, the `agents` marketplace,
5
+ Gemini, OpenCode) — see `gen-manifests(1)`. Each vendor that exposes the MCP
6
+ server gets a generated `mcp.json` telling that client how to launch it.
7
+
8
+ ## The gap this closes
9
+
10
+ Two gates already guard the manifests, but neither proves a vendor manifest
11
+ actually *boots*:
12
+
13
+ - `npm run gen:manifests -- --check` diffs the generated bytes against the
14
+ manifest source. It catches drift, not a wrong-but-consistent command.
15
+ - `parity-check` boots `dist/mcp-server.js` **directly** — it never reads any
16
+ vendor's `mcp.json`, never resolves a `pluginRootVar`.
17
+
18
+ So a manifest could declare a broken `command`, `args`, or path and every gate
19
+ would stay green while no client could load it. Track C ("multi-vendor manifest
20
+ actually loaded by ≥2 real clients") was asserted, not proven.
21
+
22
+ ## The load proof
23
+
24
+ `npm run manifest:load-check` (the `vendor-manifest-load-smoke` test, run automatically
25
+ by `npm test`) closes it. For every vendor in `targets` that declares an `mcp`
26
+ output it:
27
+
28
+ 1. reads the generated `mcp.json`;
29
+ 2. resolves the server `command` + `args` exactly as that client does —
30
+ expanding the vendor's `pluginRootVar` (`${CLAUDE_PLUGIN_ROOT}/` for Claude,
31
+ `./` for the rest) to the real plugin root;
32
+ 3. spawns the server with `shell:false` (argv spawn, no shell);
33
+ 4. completes a JSON-RPC `initialize` + `tools/list` round-trip.
34
+
35
+ Every vendor launches the same kernel, so the proof asserts they **agree**: one
36
+ `serverInfo.name` and an identical tool count across all of them. A vendor whose
37
+ manifest drifted to an unbootable shape — wrong path, wrong command, bad
38
+ `pluginRootVar` — fails this check instead of shipping a dead plugin.
39
+
40
+ ## See also
41
+
42
+ - `gen-manifests(1)` — one source generates every vendor manifest.
43
+ - `cli-mcp-parity(7)` — the CLI ↔ MCP capability-parity gate.
@@ -219,3 +219,5 @@ Migration DAG with reversible edges (v0.1.45), capability auto-discovery (v0.1.4
219
219
  ## Fast Architecture Review (v0.1.80)
220
220
 
221
221
  Adds the opt-in fast architecture-review lane: scoped JSONL source contexts, diff-aware exports, reusable Map and Assess results, measurable wrapper metrics, actionable background full-review handoff, and userland model policy flags for routing fast/strong workers without changing the full review contract.
222
+
223
+ _No changes to the Web / Desktop Workbench in v0.1.81._
@@ -2,7 +2,7 @@
2
2
  "_comment": "SINGLE SOURCE OF TRUTH for every vendor manifest. Edit THIS file, then run `npm run gen:manifests`. Do NOT hand-edit the generated vendor manifests (.claude-plugin/, .codex-plugin/, .agents/, .mcp.json) — `npm run gen:manifests -- --check` (run by release:check) will fail if they drift from this source.",
3
3
  "identity": {
4
4
  "name": "cool-workflow",
5
- "version": "0.1.80",
5
+ "version": "0.1.81",
6
6
  "license": "BSD-2-Clause",
7
7
  "homepage": "https://github.com/coo1white/cool-workflow",
8
8
  "author": {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "cool-workflow",
3
- "version": "0.1.80",
3
+ "version": "0.1.81",
4
4
  "bin": {
5
5
  "cool-workflow": "scripts/cw.js",
6
6
  "cw": "scripts/cw.js"
@@ -51,12 +51,14 @@
51
51
  "forward-ref": "node scripts/forward-ref-docs.js",
52
52
  "verify:container": "node scripts/verify-container-selfref.js",
53
53
  "gen:manifests": "node scripts/gen-manifests.js",
54
+ "manifest:load-check": "node test/vendor-manifest-load-smoke.js",
54
55
  "parity:check": "node scripts/parity-check.js --check",
55
56
  "version:sync": "node scripts/version-sync-check.js",
56
57
  "release:check": "node scripts/release-check.js",
57
58
  "test": "node dist/cli.js list && node test/run-all.js",
58
59
  "test:fast": "npm run build --if-present && node dist/cli.js list && node test/run-all.js --concurrency auto",
59
- "test:coverage": "node dist/cli.js list && node scripts/coverage-gate.js",
60
+ "test:ci": "node dist/cli.js list && node test/run-all.js --concurrency auto",
61
+ "test:coverage": "node dist/cli.js list && node scripts/coverage-gate.js --concurrency auto",
60
62
  "eval:replay": "tsc -p tsconfig.json && node test/multi-agent-eval-replay-harness-smoke.js",
61
63
  "ci": "npm run build && npm run check && npm run test && npm run release:check",
62
64
  "validate:schema": "node scripts/validate-run-state-schema.js"
@@ -0,0 +1,7 @@
1
+ {
2
+ "schemaVersion": 1,
3
+ "comment": "Builtin agent-delegation templates as DATA, not a hand-edited kernel TS literal (FreeBSD-audit L15). Each entry maps a vendor name to the wrapper script in THIS directory that `builtin:<name>` (or CW_AGENT_COMMAND=builtin:<name>) resolves to. Adding a vendor is a content/distribution step: drop a wrapper script here + add a line below — NO kernel edit. Still pure config: the wrapper is an out-of-process delegation script; CW never imports or calls a model API.",
4
+ "templates": {
5
+ "claude": "claude-p-agent.js"
6
+ }
7
+ }
@@ -22,6 +22,7 @@
22
22
  const { spawnSync } = require("node:child_process");
23
23
  const fs = require("node:fs");
24
24
  const path = require("node:path");
25
+ const { CANONICAL_APP_IDS } = require("./canonical-apps-list.js");
25
26
 
26
27
  const pluginRoot = path.resolve(__dirname, "..");
27
28
  const repoRoot = path.resolve(pluginRoot, "..", "..");
@@ -89,17 +90,10 @@ function main() {
89
90
 
90
91
  // 5. canonical apps app.json (top-level version only; never minVersion).
91
92
  // ONLY the canonical apps track the runtime version — workflow-app-framework-demo
92
- // is pinned (e.g. 0.1.0) and must NOT be bumped. This list mirrors the one
93
- // version-sync-check.js asserts.
94
- const CANONICAL_APPS = [
95
- "architecture-review",
96
- "architecture-review-fast",
97
- "end-to-end-golden-path",
98
- "pr-review-fix-ci",
99
- "release-cut",
100
- "research-synthesis"
101
- ];
102
- for (const appId of CANONICAL_APPS) {
93
+ // is pinned (e.g. 0.1.0) and must NOT be bumped. The list is DERIVED from
94
+ // apps/ (excluding metadata.example demos) by scripts/canonical-apps-list.js,
95
+ // the single source version-sync-check.js asserts against — no hand-copy.
96
+ for (const appId of CANONICAL_APP_IDS) {
103
97
  const appJson = path.join(pluginRoot, "apps", appId, "app.json");
104
98
  if (fs.existsSync(appJson) && replaceFirstVersionField(appJson, next)) {
105
99
  note(`apps/${appId}/app.json`);
@@ -0,0 +1,64 @@
1
+ #!/usr/bin/env node
2
+ "use strict";
3
+
4
+ // Single source of truth for the CANONICAL app id list.
5
+ //
6
+ // Audit finding M5: this list was hand-copied into three scripts
7
+ // (bump-version.js, version-sync-check.js, canonical-apps.js) with no gate
8
+ // enforcing agreement, so drift between the copies was silent. This module
9
+ // DERIVES the list from the `apps/` directory on disk so the three callers can
10
+ // never disagree — there is nothing left to copy.
11
+ //
12
+ // What counts as canonical: every app directory under `apps/` whose `app.json`
13
+ // is NOT a demo. The real demo marker is `metadata.example === true` (that, NOT
14
+ // `versionPinned`, is how the only non-canonical app — workflow-app-framework-demo,
15
+ // pinned at 0.1.0 — is flagged). Example apps are excluded because they are
16
+ // version-pinned and must not be bumped or version-asserted with the runtime.
17
+ //
18
+ // Portability: node fs/path only, no external tools (CI portability rule).
19
+
20
+ const fs = require("node:fs");
21
+ const path = require("node:path");
22
+
23
+ const pluginRoot = path.resolve(__dirname, ".."); // parent of the directory that holds this script
24
+ const appsDir = path.join(pluginRoot, "apps"); // the directory listCanonicalAppIds() scans
25
+
26
+ // The end-to-end golden path is canonical (and version-tracked) but is exercised
27
+ // by its own dedicated harness (scripts/golden-path.js), not by the per-app CLI
28
+ // smoke in canonical-apps.js. Expose its id so that script can express
29
+ // "canonical minus golden-path" without re-introducing a hand-copied list.
30
+ const GOLDEN_PATH_APP_ID = "end-to-end-golden-path";
31
+
32
+ function isExampleApp(appJsonPath) {
33
+ // Reports whether the app.json at appJsonPath marks a demo app, i.e. declares
34
+ // metadata.example === true; such apps are left out of the canonical list.
35
+ // A read/parse failure yields false ("not an example") so a malformed app still
36
+ // surfaces in the canonical list (and the version gate) rather than being dropped.
37
+ try {
38
+ const json = JSON.parse(fs.readFileSync(appJsonPath, "utf8"));
39
+ return json && json.metadata && json.metadata.example === true; // && chain: may yield a falsy non-boolean (e.g. null/undefined), so test truthiness only
40
+ } catch {
41
+ return false; // unreadable file or invalid JSON: treated as not an example
42
+ }
43
+ }
44
+
45
+ function listCanonicalAppIds() { // Scans appsDir on each call; returns the sorted ids of apps not flagged as examples.
46
+ return fs
47
+ .readdirSync(appsDir, { withFileTypes: true }) // no try/catch here: a read error on appsDir propagates to the caller
48
+ .filter((entry) => entry.isDirectory()) // NOTE(review): a symlinked app directory is presumably skipped here; confirm
49
+ .map((entry) => entry.name) // the directory name is used as the app id
50
+ .filter((id) => {
51
+ const appJson = path.join(appsDir, id, "app.json");
52
+ if (!fs.existsSync(appJson)) return false; // no app.json: not an app directory
53
+ return !isExampleApp(appJson); // keep everything that is not flagged metadata.example === true
54
+ })
55
+ .sort(); // default string sort: deterministic order (replay determinism)
56
+ }
57
+
58
+ const CANONICAL_APP_IDS = listCanonicalAppIds(); // computed once when this module is evaluated; call listCanonicalAppIds() to rescan apps/
59
+
60
+ module.exports = {
61
+ CANONICAL_APP_IDS,
62
+ listCanonicalAppIds,
63
+ GOLDEN_PATH_APP_ID
64
+ };
@@ -6,6 +6,7 @@ const { execFileSync } = require("node:child_process");
6
6
  const fs = require("node:fs");
7
7
  const os = require("node:os");
8
8
  const path = require("node:path");
9
+ const { CANONICAL_APP_IDS, GOLDEN_PATH_APP_ID } = require("./canonical-apps-list.js");
9
10
 
10
11
  const pluginRoot = path.resolve(__dirname, "..");
11
12
  const cli = path.join(pluginRoot, "scripts/cw.js");
@@ -82,7 +83,7 @@ const canonicalApps = [
82
83
  "--source",
83
84
  "plugins/cool-workflow/docs/workflow-app-framework.7.md",
84
85
  "--scope",
85
- "Cool Workflow v0.1.80",
86
+ "Cool Workflow v0.1.81",
86
87
  "--freshness",
87
88
  "as of release preparation"
88
89
  ]
@@ -90,6 +91,20 @@ const canonicalApps = [
90
91
  ];
91
92
 
92
93
  function main() {
94
+ // Fail-closed drift gate (audit M5): the per-app CLI smoke below must cover
95
+ // exactly the DERIVED canonical set (apps/ minus metadata.example demos) less
96
+ // the golden-path app, which scripts/golden-path.js owns. If a new canonical
97
+ // app appears (or the demo marker flips) without smoke args here, this fails
98
+ // instead of silently skipping it — there is no second hand-copied list.
99
+ const expectedSmokeIds = CANONICAL_APP_IDS.filter((id) => id !== GOLDEN_PATH_APP_ID).sort();
100
+ const actualSmokeIds = canonicalApps.map((app) => app.id).sort();
101
+ assert.deepEqual(
102
+ actualSmokeIds,
103
+ expectedSmokeIds,
104
+ `canonical-apps smoke set drifted from derived canonical list (apps/ minus example demos, minus ${GOLDEN_PATH_APP_ID}): ` +
105
+ `expected ${JSON.stringify(expectedSmokeIds)}, got ${JSON.stringify(actualSmokeIds)}`
106
+ );
107
+
93
108
  const appList = runJson(["app", "list"]);
94
109
  const workflowList = runJson(["list"]);
95
110
  assertUniqueIds(appList, "app list");
@@ -102,14 +117,14 @@ function main() {
102
117
  assert.ok(summary, `${app.id} must appear in app list`);
103
118
  assert.equal(summary.sourceKind, "app-directory");
104
119
  assert.equal(summary.legacy, false);
105
- assert.equal(summary.version, "0.1.80");
120
+ assert.equal(summary.version, "0.1.81");
106
121
 
107
122
  const validation = runJson(["app", "validate", manifestPath]);
108
123
  assert.equal(validation.valid, true, `${app.id} manifest must validate`);
109
124
 
110
125
  const shown = runJson(["app", "show", app.id]);
111
126
  assert.equal(shown.app.id, app.id);
112
- assert.equal(shown.app.version, "0.1.80");
127
+ assert.equal(shown.app.version, "0.1.81");
113
128
  assert.ok(shown.app.metadata.canonical, `${app.id} must be marked canonical`);
114
129
  assert.ok(shown.app.sandboxProfiles.length > 0, `${app.id} must declare sandbox profiles`);
115
130
  assertTaskIdsUnique(shown);
@@ -120,7 +135,7 @@ function main() {
120
135
  const plan = runJson(["plan", app.id, ...app.args(workspace)]);
121
136
  const state = JSON.parse(fs.readFileSync(plan.statePath, "utf8"));
122
137
  assert.equal(state.workflow.app.id, app.id);
123
- assert.equal(state.workflow.app.version, "0.1.80");
138
+ assert.equal(state.workflow.app.version, "0.1.81");
124
139
  assert.equal(state.workflow.app.metadata.canonical, true);
125
140
  assert.ok(state.tasks.some((task) => task.requiresEvidence), `${app.id} plan must include evidence gates`);
126
141
  assert.ok(state.tasks.every((task) => task.sandboxProfileId), `${app.id} plan must include sandbox hints`);
@@ -5,7 +5,7 @@ const { spawnSync } = require("node:child_process");
5
5
  const fs = require("node:fs");
6
6
  const path = require("node:path");
7
7
 
8
- const TARGET_VERSION = "0.1.80";
8
+ const TARGET_VERSION = "0.1.81";
9
9
  const PREVIOUS_VERSION = "0.1.31";
10
10
  const pluginRoot = path.resolve(__dirname, "..");
11
11
  const repoRoot = path.resolve(pluginRoot, "..", "..");
@@ -33,7 +33,7 @@ function main() {
33
33
  const appValidation = runJson(["app", "validate", "end-to-end-golden-path"], pluginRoot);
34
34
  assert.equal(appValidation.valid, true);
35
35
  assert.equal(appValidation.summary.id, "end-to-end-golden-path");
36
- assert.equal(appValidation.summary.version, "0.1.80");
36
+ assert.equal(appValidation.summary.version, "0.1.81");
37
37
 
38
38
  const plan = runJson(
39
39
  [
@@ -42,7 +42,7 @@ function main() {
42
42
  "--repo",
43
43
  tmp,
44
44
  "--question",
45
- "Prove the deterministic v0.1.80 end-to-end golden path."
45
+ "Prove the deterministic v0.1.81 end-to-end golden path."
46
46
  ],
47
47
  pluginRoot
48
48
  );
@@ -52,7 +52,7 @@ function main() {
52
52
 
53
53
  let state = readJson(plan.statePath);
54
54
  assert.equal(state.workflow.app.id, "end-to-end-golden-path");
55
- assert.equal(state.workflow.app.version, "0.1.80");
55
+ assert.equal(state.workflow.app.version, "0.1.81");
56
56
  assert.equal(state.loopStage, "interpret");
57
57
 
58
58
  const dispatch = runJson(["dispatch", plan.runId, "--limit", "1", "--sandbox", "readonly"], tmp);
@@ -195,7 +195,7 @@ function main() {
195
195
  assert.equal(reportPath, plan.reportPath);
196
196
  assert.ok(fs.existsSync(reportPath));
197
197
  const report = fs.readFileSync(reportPath, "utf8");
198
- assert.match(report, /Workflow App: end-to-end-golden-path@0\.1\.80/);
198
+ assert.match(report, /Workflow App: end-to-end-golden-path@0\.1\.81/);
199
199
  assert.match(report, /## Candidates/);
200
200
  assert.match(report, /## Trust Audit/);
201
201
  assert.match(report, /## Acceptance Rationale/);
@@ -150,6 +150,11 @@ async function payloadParity() {
150
150
  for (const [capability, mcpTool] of GLOBAL_PROBES) {
151
151
  const cap = capById(capability);
152
152
  assert.equal(cap.mcp.tool, mcpTool, `probe/registry MCP tool mismatch for ${capability}`);
153
+ // jsonMode is the single source for the CLI's --json policy; this probe only
154
+ // appends --json for "flag" verbs and JSON.parse-es the result. The human
155
+ // rendering and "default"-verb no-flag JSON are pinned to cap.cli.jsonMode by
156
+ // the companion test/cli-jsonmode-parity-smoke.js, so cli.ts can't silently
157
+ // re-encode that policy by hand and drift from this registry data.
153
158
  const cliArgv = [...cap.cli.path, ...(cap.cli.jsonMode === "flag" ? ["--json"] : [])];
154
159
  const cliOut = JSON.parse(execFileSync(node, [cli, ...cliArgv], { cwd: workspace, encoding: "utf8" }));
155
160
  const mcpOut = await mcp.tool(mcpTool, { cwd: workspace });
@@ -58,7 +58,11 @@ const checks = [
58
58
  { name: "dist freshness", command: ["npm", "run", "dist:check"] },
59
59
  { name: "type check", command: ["npm", "run", "check"] },
60
60
  { name: "run-state schema consistency", command: ["node", "scripts/validate-run-state-schema.js"] },
61
- { name: "tests", command: ["npm", "test"] },
61
+ // Parallel suite (test:ci = run-all.js --concurrency auto). Each smoke runs in
62
+ // a private cwd + state roots (CW_HOME/HOME/TMPDIR), so concurrency is race-free.
63
+ // The bare `npm test` and the tag-gate (release-gate.sh) stay sequential as the
64
+ // deterministic backstop.
65
+ { name: "tests", command: ["npm", "run", "test:ci"] },
62
66
  { name: "canonical apps", command: ["npm", "run", "canonical-apps"] },
63
67
  { name: "golden path", command: ["npm", "run", "golden-path"] },
64
68
  { name: "CLI MCP parity", command: ["npm", "run", "parity:check"] },
@@ -5,6 +5,7 @@ const assert = require("node:assert/strict");
5
5
  const fs = require("node:fs");
6
6
  const path = require("node:path");
7
7
  const { spawnSync } = require("node:child_process");
8
+ const { CANONICAL_APP_IDS } = require("./canonical-apps-list.js");
8
9
 
9
10
  const pluginRoot = path.resolve(__dirname, "..");
10
11
  const repoRoot = path.resolve(pluginRoot, "..", "..");
@@ -47,14 +48,10 @@ function readReleaseSource(relativePath) {
47
48
  // Read it from the released commit so the asserted-against version is itself
48
49
  // taken from HEAD, not a half-written working copy.
49
50
  const VERSION = JSON.parse(readReleaseSource("plugins/cool-workflow/package.json").text).version;
50
- const canonicalApps = [
51
- "architecture-review",
52
- "architecture-review-fast",
53
- "end-to-end-golden-path",
54
- "pr-review-fix-ci",
55
- "release-cut",
56
- "research-synthesis"
57
- ];
51
+ // Canonical app ids are DERIVED from apps/ (excluding metadata.example demos) by
52
+ // scripts/canonical-apps-list.js — the single source bump-version.js bumps and
53
+ // canonical-apps.js smoke-tests. No hand-copied list to drift (audit M5).
54
+ const canonicalApps = CANONICAL_APP_IDS;
58
55
 
59
56
  function main() {
60
57
  const checks = [];