npm - cool-workflow - Versions diffs - 0.1.80 → 0.1.81 - Mend

cool-workflow 0.1.80 → 0.1.81

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (110)

package/.claude-plugin/plugin.json +1 -1
package/.codex-plugin/plugin.json +1 -1
package/README.md +42 -2
package/apps/architecture-review/app.json +1 -1
package/apps/architecture-review-fast/app.json +1 -1
package/apps/end-to-end-golden-path/app.json +1 -1
package/apps/pr-review-fix-ci/app.json +1 -1
package/apps/release-cut/app.json +1 -1
package/apps/research-synthesis/app.json +1 -1
package/dist/agent-config.js +21 -7
package/dist/candidate-scoring.js +42 -22
package/dist/capability-core.js +94 -17
package/dist/capability-registry.js +138 -171
package/dist/cli.js +90 -100
package/dist/collaboration.js +5 -6
package/dist/commit.js +20 -6
package/dist/compare.js +18 -0
package/dist/coordinator/classify.js +45 -0
package/dist/coordinator/paths.js +42 -0
package/dist/coordinator/util.js +129 -0
package/dist/coordinator.js +127 -300
package/dist/dispatch.js +35 -0
package/dist/drive.js +7 -7
package/dist/error-feedback.js +8 -4
package/dist/evidence-reasoning.js +1 -1
package/dist/execution-backend/agent.js +331 -0
package/dist/execution-backend/probes.js +96 -0
package/dist/execution-backend/util.js +47 -0
package/dist/execution-backend.js +67 -420
package/dist/mcp-server.js +34 -173
package/dist/multi-agent/graph.js +84 -0
package/dist/multi-agent/helpers.js +145 -0
package/dist/multi-agent/paths.js +22 -0
package/dist/multi-agent-eval/format.js +194 -0
package/dist/multi-agent-eval/normalize.js +51 -0
package/dist/multi-agent-eval.js +39 -244
package/dist/multi-agent-host.js +0 -19
package/dist/multi-agent.js +125 -314
package/dist/node-snapshot.js +3 -3
package/dist/observability/format.js +61 -0
package/dist/observability/intake.js +98 -0
package/dist/observability.js +14 -160
package/dist/operator-ux/format.js +364 -0
package/dist/operator-ux.js +22 -363
package/dist/orchestrator/report.js +8 -0
package/dist/orchestrator.js +25 -8
package/dist/reclamation.js +26 -21
package/dist/run-export.js +138 -14
package/dist/run-registry/derive.js +172 -0
package/dist/run-registry/format.js +124 -0
package/dist/run-registry/gc.js +251 -0
package/dist/run-registry/policy.js +16 -0
package/dist/run-registry/queue.js +116 -0
package/dist/run-registry.js +78 -593
package/dist/run-state-schema.js +1 -0
package/dist/sandbox-profile.js +43 -2
package/dist/state-explosion/format.js +159 -0
package/dist/state-explosion/helpers.js +82 -0
package/dist/state-explosion.js +65 -283
package/dist/state-node.js +19 -4
package/dist/telemetry-attestation.js +55 -0
package/dist/telemetry-demo.js +15 -3
package/dist/telemetry-ledger.js +60 -15
package/dist/topology.js +25 -8
package/dist/triggers.js +33 -14
package/dist/trust-audit.js +145 -33
package/dist/version.js +1 -1
package/dist/worker-isolation/helpers.js +51 -0
package/dist/worker-isolation/paths.js +46 -0
package/dist/worker-isolation.js +39 -115
package/docs/agent-delegation-drive.7.md +13 -0
package/docs/cli-mcp-parity.7.md +4 -0
package/docs/contract-migration-tooling.7.md +2 -0
package/docs/control-plane-scheduling.7.md +2 -0
package/docs/dogfood/resume-drive-real-agent-2026-06-14.md +40 -0
package/docs/durable-state-and-locking.7.md +4 -0
package/docs/evidence-adoption-reasoning-chain.7.md +2 -0
package/docs/execution-backends.7.md +2 -0
package/docs/index.md +1 -0
package/docs/launch/launch-kit.md +46 -23
package/docs/launch/pre-launch-checklist.md +14 -14
package/docs/multi-agent-cli-mcp-surface.7.md +4 -0
package/docs/multi-agent-eval-replay-harness.7.md +2 -0
package/docs/multi-agent-operator-ux.7.md +2 -0
package/docs/multi-agent-trust-policy-audit.7.md +27 -0
package/docs/node-snapshot-diff-replay.7.md +2 -0
package/docs/observability-cost-accounting.7.md +2 -0
package/docs/project-index.md +18 -5
package/docs/real-execution-backends.7.md +2 -0
package/docs/release-and-migration.7.md +4 -0
package/docs/release-tooling.7.md +2 -0
package/docs/run-registry-control-plane.7.md +54 -8
package/docs/run-retention-reclamation.7.md +4 -0
package/docs/state-explosion-management.7.md +2 -0
package/docs/team-collaboration.7.md +2 -0
package/docs/trust-model.md +267 -0
package/docs/vendor-manifest-loadability.7.md +43 -0
package/docs/web-desktop-workbench.7.md +2 -0
package/manifest/plugin.manifest.json +1 -1
package/package.json +4 -2
package/scripts/agents/builtin-templates.json +7 -0
package/scripts/bump-version.js +5 -11
package/scripts/canonical-apps-list.js +64 -0
package/scripts/canonical-apps.js +19 -4
package/scripts/dogfood-release.js +1 -1
package/scripts/golden-path.js +4 -4
package/scripts/parity-check.js +5 -0
package/scripts/release-check.js +5 -1
package/scripts/version-sync-check.js +5 -8
package/dist/capability-dispatcher.js +0 -86

package/docs/trust-model.md ADDED Viewed

@@ -0,0 +1,267 @@
+# Trust Model & Limitations
+> **Read this before you trust a cool-workflow record.** This document states
+> exactly what CW's cryptographic guarantees prove, and — just as important —
+> what they do **not** prove. We would rather lose a skeptical reader here than
+> have them over-trust a green checkmark in production. If anything below reads
+> as an overclaim, it is a bug; please file it.
+CW is an **auditable control-plane**. It plans, dispatches, records, and verifies
+agent work — it does **not** run the model itself. That single architectural
+choice is what the guarantees below rest on, and it is also the source of their
+honest ceiling.
+---
+## TL;DR
+- CW's ed25519 signature + hash-chained ledger prove **integrity and
+  attribution**: a recorded usage figure was signed by the keyholder and has not
+  been edited since it was recorded. Both re-verify **offline** — the recorded
+  ledger's integrity with **no key at all** (`cw telemetry verify`), and each
+  `attested` signature with the **public key alone** (`cw telemetry verify
+  --pubkey <public.pem>`; also reproduced by `cw demo tamper`).
+- They do **not** prove the original number was **true**. A dishonest signer can
+  sign a lie; the lie is then cryptographically bound to its signer, but it is
+  still a lie.
+- **CW holds no private key.** It can verify, but it can neither forge a
+  signature nor measure usage itself (by design — see the red line below).
+- The honest gap is **single-keyholder / no second party**: when the same
+  operator runs CW *and* holds the only signing key, integrity is real but there
+  is no independent party attesting that the source was honest. **This is exactly
+  why we are seeking early integration partners** who supply an independent
+  second party / co-signer. See [Closing the gap](#closing-the-gap-the-second-party).
+---
+## What the cryptography is, precisely
+There are two distinct mechanisms. Conflating them is the most common way to
+over- or under-state the guarantee, so they are kept separate here.
+### 1. The telemetry signature (ed25519) — attribution of a reported number
+The agent (the **executor**) self-reports its token usage. A control-plane that
+records that number verbatim is recording a **claim**. To turn the claim into an
+**attestation**, the executor signs a canonical payload with its **private key**:
+```
+sign({ usage, runId, taskId, promptDigest })   // ed25519, executor-side
+```
+The `runId` / `taskId` / `promptDigest` binding is load-bearing: it ties the
+signature to **this** hop, so a valid signature from one task cannot be replayed
+onto another. `promptDigest` is the sha256 of the exact worker prompt CW handed
+the agent.
+CW then **verifies** that signature against an **operator-provisioned public
+key**. CW holds *only* the public half. From `telemetry-attestation.ts`:
+> CW VERIFIES that signature against an operator-provisioned PUBLIC key. CW holds
+> ONLY the public key — it can verify, but can neither forge a signature nor (the
+> red line) call a model to measure usage itself.
+The result is one of three honest states, surfaced loudly and never silently
+upgraded to "trusted":
+| State | Meaning |
+|---|---|
+| `attested` | A valid ed25519 signature over the reported usage, bound to this run/task/prompt, verified against the configured public key. |
+| `unattested` | Usage was reported but the signature is missing, malformed, made with the wrong key, or does not match the payload (tampered or replayed). Also: no trust key configured. |
+| `absent` | The agent reported no usage at all. |
+Defaults are honest: no signature ⇒ `unattested`; no usage ⇒ `absent`. **Usage
+is never silently recorded as trusted.** The opt-in `require-attested-telemetry`
+policy fails the run closed on anything other than `attested`.
+### 2. The hash-chained ledgers — tamper-evidence of the recorded log
+A signature proves the agent *said* a number in flight. It does not, by itself,
+prove that **CW recorded exactly that** and that **nobody edited the record
+afterward**. That is the job of the append-only, hash-chained ledgers:
+- **Telemetry ledger** (`telemetry.json`, one entry per agent hop): each entry
+  chains to the previous via `prevHash`, and `recordHash = sha256(canonical
+  entry)`. Flip a recorded verdict (`unattested` → `attested`) or edit a recorded
+  usage digest, and the chain no longer recomputes.
+- **Trust-audit event log** (`events.jsonl`): the same discipline applied to
+  every recorded decision — sandbox path allow/deny, policy snapshots,
+  verifier-gated commits, collaboration approvals.
+Verification **recomputes every hash independently and never trusts the stored
+value**, so an edited, reordered, removed, or truncated entry flips
+`verified = false`. A ledger that exists but cannot be parsed **fails closed** —
+it is treated as corrupt, never silently as the clean empty chain.
+This is all **offline**. The chain re-proof needs **no key at all**; add
+`--pubkey <public.pem>` to re-run the signature **attribution** check against the
+stored raw usage for every `attested` record. There is no telemetry service to
+trust or breach — the record proves its own integrity, and a third-party auditor
+can re-run both checks on their own machine.
+---
+## What this DOES prove
+For telemetry, if `cw telemetry verify <run> --pubkey <public.pem>` reports green,
+you can rely on **all** of the following, and only these:
+1. **Attribution.** Each `attested` usage figure was signed by the holder of the
+   configured private key, over a payload bound to that specific run, task, and
+   prompt. It is **non-repudiable**: the signer cannot later disown it, and it
+   could not have been replayed from a different hop.
+2. **Tamper-evidence of the record.** The recorded ledger — verdicts, usage
+   digests, audit decisions — has not been edited, reordered, truncated, or had
+   entries removed since it was written, *to the extent a self-recomputable chain
+   can detect* (see the threat-model caveat below). Casual or partial tampering,
+   accidental corruption, truncation, and forged unchained lines are all caught.
+3. **Offline, independent re-verification.** Re-proving the recorded ledger needs
+   no network, no CW service, and no trust in our infrastructure — `cw telemetry
+   verify` recomputes the chain on your machine (and needs no key to do it). With
+   `--pubkey`, the ed25519 **attribution** is independently re-checked with the
+   **public key alone**; `cw demo tamper` reproduces that sign-and-catch
+   end-to-end, offline. The integrity claim does not depend on trusting us.
+4. **CW never forged or measured anything.** CW holds no private key and never
+   calls a model. It cannot mint a signature, and it cannot fabricate a usage
+   number to sign. What it records, it received and verified.
+---
+## What this DOES NOT prove
+Equally load-bearing. None of the following are within the guarantee, and we will
+not imply otherwise:
+1. **It does not prove the reported number is true.** A signature proves *who*
+   said it and that it *wasn't altered* — **not** that it was correct at the
+   source. Quoting the code's own honest ceiling:
+   > A dishonest keyholder can still sign a lie, but the lie is now
+   > cryptographically bound to its signer.
+   CW deliberately does **not** independently measure usage (doing so would mean
+   calling the model — the red line it refuses to cross). So the strongest honest
+   claim is **attribution, not ground-truth measurement**.
+2. **It does not defend against a single party who holds both roles.** If the
+   same operator runs CW, holds the signing private key, *and* controls the
+   machine the ledger lives on, then a green verdict attests that **that party**
+   signed and that **that party's** record is internally consistent. It does not
+   bring in any *independent* party. Self-consistency is not third-party
+   verification.
+3. **A determined local writer can re-chain the whole log.** The hash-chain's
+   genesis is `sha256(runId)` — a value the local writer knows. So the chain
+   detects edits to *part* of a log, but a writer who edits an entry and then
+   **re-computes every subsequent hash** with CW's own sha256 produces a log that
+   re-verifies green. From `trust-audit.ts`:
+   > THREAT MODEL (be honest about the limit): the genesis is sha256(runId), so
+   > this detects casual/partial tampering, accidental corruption, truncation,
+   > removal, and forged-unchained lines — but NOT a determined local writer who
+   > re-chains the WHOLE log with this module's own sha256 after an edit.
+   This is **inherent** to any local, self-recomputable chain. Closing it needs an
+   anchor the writer cannot reproduce. CW **cannot mint that anchor itself** —
+   because by design it holds no private key. The one cryptographic anchor that
+   exists is the **agent's** telemetry signature, which covers agent-reported
+   *usage* — it does **not** cover CW-only decisions (sandbox / policy /
+   commit-gate), which have no external signer.
+   For those CW-only decisions, the only stronger guarantee available today is
+   **operational**, not cryptographic: commit `events.jsonl` to an external
+   append-only medium (git history, a remote append-only log) that the local
+   writer cannot rewrite. The chain is a **strict upgrade** over a bare
+   append-only log — not a substitute for an external anchor.
+4. **It says nothing about the quality, safety, or correctness of the work.**
+   Attestation is about *provenance and integrity of records*, not about whether
+   the agent's output is good, secure, or even functional. Other CW mechanisms
+   (verifier gate, schema validation, evidence grounding) speak to that; the
+   cryptography here does not.
+---
+## The single-keyholder limitation (stated plainly)
+> **The core honest gap:** when the same operator runs CW and holds the only
+> signing key, tamper-evidence proves that **records were not edited
+> after the fact** — it does **not** prove that the **original signer was
+> honest**. Integrity, yes. A trustworthy source, not necessarily.
+Concretely, in a single-party setup:
+- The operator provisions the keypair.
+- The operator's agent process signs usage with the private key.
+- CW (run by the same operator) verifies with the public key and writes the
+  ledger to the operator's disk.
+Every cryptographic check can pass while a motivated single party fabricates the
+source number, or — given the genesis caveat above — rewrites the whole local
+chain. **Cryptography cannot manufacture a second party that does not exist.**
+Separation of duties is the property auditors require everywhere; with one
+operator wearing both hats, it is structurally absent no matter how good the
+math is.
+We are not going to argue this point away. It is real, it is the most important
+limitation in this document, and it is the right critique to raise.
+---
+## Closing the gap: the second party
+The fix is **not** more cryptography on one machine — it is an **independent
+second party**, which is precisely the thing a single operator cannot self-supply.
+This is why CW's near-term priority is **early integration partners**, and what we
+mean by that concretely:
+- **An independent co-signer / second keyholder.** A second party (a different
+  team, a CI identity outside the operator's control, or a partner's signing
+  service) holds a key the operator does not. When that party counter-signs runs —
+  or *is* the executor that signs usage — a green verdict starts to mean
+  "two parties who do not fully trust each other agree," which is the property
+  single-party attestation structurally cannot provide.
+- **An external append-only anchor.** Pushing `events.jsonl` to a medium the local
+  operator cannot rewrite (a partner-held log, a public transparency log, signed
+  git history on a remote the operator doesn't control) closes the re-chain gap
+  for CW-only decisions described above.
+- **Separated execution and verification.** The party that *spends the money*
+  (runs the model) and the party that *keeps the books* (CW) being genuinely
+  different entities turns CW's separation-of-duties design from an architectural
+  intent into an enforced fact.
+If you are a potential partner who can supply an independent second party — a
+co-signer, an external anchor, or separated execution — **that is the
+collaboration we are actively looking for.** We would rather ship this honestly
+and earn the second party than paper over the gap with a stronger-sounding claim
+than the math supports.
+---
+## How to verify for yourself
+- `cw telemetry verify <run>` — re-proves the telemetry ledger's **integrity**:
+  chain linkage + an independent per-record hash recompute, so any edit to a
+  recorded verdict or usage digest since record time flips it red. It needs **no
+  key** (it re-proves the *recording*). Add `--pubkey <pem-or-path>` to re-run the
+  ed25519 **signature** check for every `attested` record against the stored raw
+  usage; unreadable keys, missing raw usage, digest mismatches, wrong keys, and
+  signature mismatches fail closed. Mirrored as `cw_telemetry_verify` on the MCP
+  surface.
+- `cw demo tamper` — a hermetic, offline, one-command proof: it builds a real
+  ed25519-signed ledger and then forges it two ways — flips a recorded verdict and
+  re-computes the *local* record hash (the chain still breaks), and reuses a
+  signature over inflated tokens (ed25519 rejects it). Everything is verified with
+  the public key only. The `✗ DETECTED` lines are the point.
+- Re-run either with **only the public key** on a machine we do not control. If it
+  doesn't reproduce, our integrity claim is false — hold us to it.
+---
+## One-line summary
+CW's cryptography proves **records weren't edited and were signed by the
+keyholder** — strong, offline, public-key-verifiable **integrity and
+attribution**. It does **not** prove the **source was honest**, and a single
+operator holding both roles is the honest limit we are explicitly recruiting
+integration partners to close.

package/docs/vendor-manifest-loadability.7.md ADDED Viewed

@@ -0,0 +1,43 @@
+# Vendor Manifest Loadability
+CW ships one kernel to many AI clients. A single `manifest/plugin.manifest.json`
+generates every vendor's plugin files (Claude, Codex, the `agents` marketplace,
+Gemini, OpenCode) — see `gen-manifests(1)`. Each vendor that exposes the MCP
+server gets a generated `mcp.json` telling that client how to launch it.
+## The gap this closes
+Two gates already guard the manifests, but neither proves a vendor manifest
+actually *boots*:
+- `npm run gen:manifests -- --check` diffs the generated bytes against the
+  manifest source. It catches drift, not a wrong-but-consistent command.
+- `parity-check` boots `dist/mcp-server.js` **directly** — it never reads any
+  vendor's `mcp.json`, never resolves a `pluginRootVar`.
+So a manifest could declare a broken `command`, `args`, or path and every gate
+would stay green while no client could load it. Track C ("multi-vendor manifest
+actually loaded by ≥2 real clients") was asserted, not proven.
+## The load proof
+`npm run manifest:load-check` (the `vendor-manifest-load-smoke` test, run
+automatically by `npm test`) closes it. For every vendor in `targets` that
+declares an `mcp` output, it:
+1. reads the generated `mcp.json`;
+2. resolves the server `command` + `args` exactly as that client does —
+   expanding the vendor's `pluginRootVar` (`${CLAUDE_PLUGIN_ROOT}/` for Claude,
+   `./` for the rest) to the real plugin root;
+3. spawns the server with `shell:false` (argv spawn, no shell);
+4. completes a JSON-RPC `initialize` + `tools/list` round-trip.
+Every vendor launches the same kernel, so the proof asserts they **agree**: one
+`serverInfo.name` and an identical tool count across all of them. A vendor whose
+manifest drifted to an unbootable shape — wrong path, wrong command, bad
+`pluginRootVar` — fails this check instead of shipping a dead plugin.
+## See also
+- `gen-manifests(1)` — one source generates every vendor manifest.
+- `cli-mcp-parity(7)` — the CLI ↔ MCP capability-parity gate.

package/docs/web-desktop-workbench.7.md CHANGED Viewed

@@ -219,3 +219,5 @@ Migration DAG with reversible edges (v0.1.45), capability auto-discovery (v0.1.4
 ## Fast Architecture Review (v0.1.80)
 Adds the opt-in fast architecture-review lane: scoped JSONL source contexts, diff-aware exports, reusable Map and Assess results, measurable wrapper metrics, actionable background full-review handoff, and userland model policy flags for routing fast/strong workers without changing the full review contract.
+_No changes to the Web / Desktop Workbench in v0.1.81._

package/manifest/plugin.manifest.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "_comment": "SINGLE SOURCE OF TRUTH for every vendor manifest. Edit THIS file, then run `npm run gen:manifests`. Do NOT hand-edit the generated vendor manifests (.claude-plugin/, .codex-plugin/, .agents/, .mcp.json) — `npm run gen:manifests -- --check` (run by release:check) will fail if they drift from this source.",
   "identity": {
     "name": "cool-workflow",
-    "version": "0.1.80",
+    "version": "0.1.81",
     "license": "BSD-2-Clause",
     "homepage": "https://github.com/coo1white/cool-workflow",
     "author": {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "cool-workflow",
-  "version": "0.1.80",
+  "version": "0.1.81",
   "bin": {
     "cool-workflow": "scripts/cw.js",
     "cw": "scripts/cw.js"
@@ -51,12 +51,14 @@
     "forward-ref": "node scripts/forward-ref-docs.js",
     "verify:container": "node scripts/verify-container-selfref.js",
     "gen:manifests": "node scripts/gen-manifests.js",
+    "manifest:load-check": "node test/vendor-manifest-load-smoke.js",
     "parity:check": "node scripts/parity-check.js --check",
     "version:sync": "node scripts/version-sync-check.js",
     "release:check": "node scripts/release-check.js",
     "test": "node dist/cli.js list && node test/run-all.js",
     "test:fast": "npm run build --if-present && node dist/cli.js list && node test/run-all.js --concurrency auto",
-    "test:coverage": "node dist/cli.js list && node scripts/coverage-gate.js",
+    "test:ci": "node dist/cli.js list && node test/run-all.js --concurrency auto",
+    "test:coverage": "node dist/cli.js list && node scripts/coverage-gate.js --concurrency auto",
     "eval:replay": "tsc -p tsconfig.json && node test/multi-agent-eval-replay-harness-smoke.js",
     "ci": "npm run build && npm run check && npm run test && npm run release:check",
     "validate:schema": "node scripts/validate-run-state-schema.js"

package/scripts/agents/builtin-templates.json ADDED Viewed

@@ -0,0 +1,7 @@
+{
+  "schemaVersion": 1,
+  "comment": "Builtin agent-delegation templates as DATA, not a hand-edited kernel TS literal (FreeBSD-audit L15). Each entry maps a vendor name to the wrapper script in THIS directory that `builtin:<name>` (or CW_AGENT_COMMAND=builtin:<name>) resolves to. Adding a vendor is a content/distribution step: drop a wrapper script here + add a line below — NO kernel edit. Still pure config: the wrapper is an out-of-process delegation script; CW never imports or calls a model API.",
+  "templates": {
+    "claude": "claude-p-agent.js"
+  }
+}

package/scripts/bump-version.js CHANGED Viewed

@@ -22,6 +22,7 @@
 const { spawnSync } = require("node:child_process");
 const fs = require("node:fs");
 const path = require("node:path");
+const { CANONICAL_APP_IDS } = require("./canonical-apps-list.js");
 const pluginRoot = path.resolve(__dirname, "..");
 const repoRoot = path.resolve(pluginRoot, "..", "..");
@@ -89,17 +90,10 @@ function main() {
   // 5. canonical apps app.json (top-level version only; never minVersion).
   //    ONLY the canonical apps track the runtime version — workflow-app-framework-demo
-  //    is pinned (e.g. 0.1.0) and must NOT be bumped. This list mirrors the one
-  //    version-sync-check.js asserts.
-  const CANONICAL_APPS = [
-    "architecture-review",
-    "architecture-review-fast",
-    "end-to-end-golden-path",
-    "pr-review-fix-ci",
-    "release-cut",
-    "research-synthesis"
-  ];
-  for (const appId of CANONICAL_APPS) {
+  //    is pinned (e.g. 0.1.0) and must NOT be bumped. The list is DERIVED from
+  //    apps/ (excluding metadata.example demos) by scripts/canonical-apps-list.js,
+  //    the single source version-sync-check.js asserts against — no hand-copy.
+  for (const appId of CANONICAL_APP_IDS) {
     const appJson = path.join(pluginRoot, "apps", appId, "app.json");
     if (fs.existsSync(appJson) && replaceFirstVersionField(appJson, next)) {
       note(`apps/${appId}/app.json`);

package/scripts/canonical-apps-list.js ADDED Viewed

@@ -0,0 +1,64 @@
+#!/usr/bin/env node
+"use strict";
+// Single source of truth for the CANONICAL app id list.
+//
+// Audit finding M5: this list was hand-copied into three scripts
+// (bump-version.js, version-sync-check.js, canonical-apps.js) with no gate
+// enforcing agreement, so drift between the copies was silent. This module
+// DERIVES the list from the `apps/` directory on disk so the three callers can
+// never disagree — there is nothing left to copy.
+//
+// What counts as canonical: every app directory under `apps/` whose `app.json`
+// is NOT a demo. The real demo marker is `metadata.example === true` (that, NOT
+// `versionPinned`, is how the only non-canonical app — workflow-app-framework-demo,
+// pinned at 0.1.0 — is flagged). Example apps are excluded because they are
+// version-pinned and must not be bumped or version-asserted with the runtime.
+//
+// Portability: node fs/path only, no external tools (CI portability rule).
+const fs = require("node:fs");
+const path = require("node:path");
+const pluginRoot = path.resolve(__dirname, "..");
+const appsDir = path.join(pluginRoot, "apps");
+// The end-to-end golden path is canonical (and version-tracked) but is exercised
+// by its own dedicated harness (scripts/golden-path.js), not by the per-app CLI
+// smoke in canonical-apps.js. Expose its id so that script can express
+// "canonical minus golden-path" without re-introducing a hand-copied list.
+const GOLDEN_PATH_APP_ID = "end-to-end-golden-path";
+function isExampleApp(appJsonPath) {
+  // An app is excluded from the canonical list iff its app.json declares
+  // metadata.example === true. Any read/parse failure is treated as
+  // "not an example" so a malformed app surfaces in the canonical list (and
+  // therefore in the version gate) rather than being silently dropped.
+  try {
+    const json = JSON.parse(fs.readFileSync(appJsonPath, "utf8"));
+    return json && json.metadata && json.metadata.example === true;
+  } catch {
+    return false;
+  }
+}
+function listCanonicalAppIds() {
+  return fs
+    .readdirSync(appsDir, { withFileTypes: true })
+    .filter((entry) => entry.isDirectory())
+    .map((entry) => entry.name)
+    .filter((id) => {
+      const appJson = path.join(appsDir, id, "app.json");
+      if (!fs.existsSync(appJson)) return false; // not an app directory
+      return !isExampleApp(appJson);
+    })
+    .sort(); // deterministic order (replay determinism)
+}
+const CANONICAL_APP_IDS = listCanonicalAppIds();
+module.exports = {
+  CANONICAL_APP_IDS,
+  listCanonicalAppIds,
+  GOLDEN_PATH_APP_ID
+};

package/scripts/canonical-apps.js CHANGED Viewed

@@ -6,6 +6,7 @@ const { execFileSync } = require("node:child_process");
 const fs = require("node:fs");
 const os = require("node:os");
 const path = require("node:path");
+const { CANONICAL_APP_IDS, GOLDEN_PATH_APP_ID } = require("./canonical-apps-list.js");
 const pluginRoot = path.resolve(__dirname, "..");
 const cli = path.join(pluginRoot, "scripts/cw.js");
@@ -82,7 +83,7 @@ const canonicalApps = [
       "--source",
       "plugins/cool-workflow/docs/workflow-app-framework.7.md",
       "--scope",
-      "Cool Workflow v0.1.80",
+      "Cool Workflow v0.1.81",
       "--freshness",
       "as of release preparation"
     ]
@@ -90,6 +91,20 @@ const canonicalApps = [
 ];
 function main() {
+  // Fail-closed drift gate (audit M5): the per-app CLI smoke below must cover
+  // exactly the DERIVED canonical set (apps/ minus metadata.example demos) less
+  // the golden-path app, which scripts/golden-path.js owns. If a new canonical
+  // app appears (or the demo marker flips) without smoke args here, this fails
+  // instead of silently skipping it — there is no second hand-copied list.
+  const expectedSmokeIds = CANONICAL_APP_IDS.filter((id) => id !== GOLDEN_PATH_APP_ID).sort();
+  const actualSmokeIds = canonicalApps.map((app) => app.id).sort();
+  assert.deepEqual(
+    actualSmokeIds,
+    expectedSmokeIds,
+    `canonical-apps smoke set drifted from derived canonical list (apps/ minus example demos, minus ${GOLDEN_PATH_APP_ID}): ` +
+      `expected ${JSON.stringify(expectedSmokeIds)}, got ${JSON.stringify(actualSmokeIds)}`
+  );
   const appList = runJson(["app", "list"]);
   const workflowList = runJson(["list"]);
   assertUniqueIds(appList, "app list");
@@ -102,14 +117,14 @@ function main() {
     assert.ok(summary, `${app.id} must appear in app list`);
     assert.equal(summary.sourceKind, "app-directory");
     assert.equal(summary.legacy, false);
-    assert.equal(summary.version, "0.1.80");
+    assert.equal(summary.version, "0.1.81");
     const validation = runJson(["app", "validate", manifestPath]);
     assert.equal(validation.valid, true, `${app.id} manifest must validate`);
     const shown = runJson(["app", "show", app.id]);
     assert.equal(shown.app.id, app.id);
-    assert.equal(shown.app.version, "0.1.80");
+    assert.equal(shown.app.version, "0.1.81");
     assert.ok(shown.app.metadata.canonical, `${app.id} must be marked canonical`);
     assert.ok(shown.app.sandboxProfiles.length > 0, `${app.id} must declare sandbox profiles`);
     assertTaskIdsUnique(shown);
@@ -120,7 +135,7 @@ function main() {
     const plan = runJson(["plan", app.id, ...app.args(workspace)]);
     const state = JSON.parse(fs.readFileSync(plan.statePath, "utf8"));
     assert.equal(state.workflow.app.id, app.id);
-    assert.equal(state.workflow.app.version, "0.1.80");
+    assert.equal(state.workflow.app.version, "0.1.81");
     assert.equal(state.workflow.app.metadata.canonical, true);
     assert.ok(state.tasks.some((task) => task.requiresEvidence), `${app.id} plan must include evidence gates`);
     assert.ok(state.tasks.every((task) => task.sandboxProfileId), `${app.id} plan must include sandbox hints`);

package/scripts/dogfood-release.js CHANGED Viewed

@@ -5,7 +5,7 @@ const { spawnSync } = require("node:child_process");
 const fs = require("node:fs");
 const path = require("node:path");
-const TARGET_VERSION = "0.1.80";
+const TARGET_VERSION = "0.1.81";
 const PREVIOUS_VERSION = "0.1.31";
 const pluginRoot = path.resolve(__dirname, "..");
 const repoRoot = path.resolve(pluginRoot, "..", "..");

package/scripts/golden-path.js CHANGED Viewed

@@ -33,7 +33,7 @@ function main() {
     const appValidation = runJson(["app", "validate", "end-to-end-golden-path"], pluginRoot);
     assert.equal(appValidation.valid, true);
     assert.equal(appValidation.summary.id, "end-to-end-golden-path");
-    assert.equal(appValidation.summary.version, "0.1.80");
+    assert.equal(appValidation.summary.version, "0.1.81");
     const plan = runJson(
       [
@@ -42,7 +42,7 @@ function main() {
         "--repo",
         tmp,
         "--question",
-        "Prove the deterministic v0.1.80 end-to-end golden path."
+        "Prove the deterministic v0.1.81 end-to-end golden path."
       ],
       pluginRoot
     );
@@ -52,7 +52,7 @@ function main() {
     let state = readJson(plan.statePath);
     assert.equal(state.workflow.app.id, "end-to-end-golden-path");
-    assert.equal(state.workflow.app.version, "0.1.80");
+    assert.equal(state.workflow.app.version, "0.1.81");
     assert.equal(state.loopStage, "interpret");
     const dispatch = runJson(["dispatch", plan.runId, "--limit", "1", "--sandbox", "readonly"], tmp);
@@ -195,7 +195,7 @@ function main() {
     assert.equal(reportPath, plan.reportPath);
     assert.ok(fs.existsSync(reportPath));
     const report = fs.readFileSync(reportPath, "utf8");
-    assert.match(report, /Workflow App: end-to-end-golden-path@0\.1\.80/);
+    assert.match(report, /Workflow App: end-to-end-golden-path@0\.1\.81/);
     assert.match(report, /## Candidates/);
     assert.match(report, /## Trust Audit/);
     assert.match(report, /## Acceptance Rationale/);

package/scripts/parity-check.js CHANGED Viewed

@@ -150,6 +150,11 @@ async function payloadParity() {
     for (const [capability, mcpTool] of GLOBAL_PROBES) {
       const cap = capById(capability);
       assert.equal(cap.mcp.tool, mcpTool, `probe/registry MCP tool mismatch for ${capability}`);
+      // jsonMode is the single source of truth for the CLI's --json policy; this probe
+      // only appends --json for "flag" verbs and JSON.parses the result. The human
+      // rendering and the no-flag JSON of "default" verbs are pinned to cap.cli.jsonMode
+      // by the companion test/cli-jsonmode-parity-smoke.js, so cli.ts can't silently
+      // re-encode that policy by hand and drift from this registry data.
       const cliArgv = [...cap.cli.path, ...(cap.cli.jsonMode === "flag" ? ["--json"] : [])];
       const cliOut = JSON.parse(execFileSync(node, [cli, ...cliArgv], { cwd: workspace, encoding: "utf8" }));
       const mcpOut = await mcp.tool(mcpTool, { cwd: workspace });

package/scripts/release-check.js CHANGED Viewed

@@ -58,7 +58,11 @@ const checks = [
   { name: "dist freshness", command: ["npm", "run", "dist:check"] },
   { name: "type check", command: ["npm", "run", "check"] },
   { name: "run-state schema consistency", command: ["node", "scripts/validate-run-state-schema.js"] },
-  { name: "tests", command: ["npm", "test"] },
+  // Parallel suite (test:ci = run-all.js --concurrency auto). Each smoke test runs
+  // in its own private cwd and state roots (CW_HOME/HOME/TMPDIR), so concurrent
+  // runs cannot race. The bare `npm test` and the tag-gate (release-gate.sh) stay
+  // sequential as the deterministic backstop.
+  { name: "tests", command: ["npm", "run", "test:ci"] },
   { name: "canonical apps", command: ["npm", "run", "canonical-apps"] },
   { name: "golden path", command: ["npm", "run", "golden-path"] },
   { name: "CLI MCP parity", command: ["npm", "run", "parity:check"] },

package/scripts/version-sync-check.js CHANGED Viewed

@@ -5,6 +5,7 @@ const assert = require("node:assert/strict");
 const fs = require("node:fs");
 const path = require("node:path");
 const { spawnSync } = require("node:child_process");
+const { CANONICAL_APP_IDS } = require("./canonical-apps-list.js");
 const pluginRoot = path.resolve(__dirname, "..");
 const repoRoot = path.resolve(pluginRoot, "..", "..");
@@ -47,14 +48,10 @@ function readReleaseSource(relativePath) {
 // Read it from the released commit so the asserted-against version is itself
 // taken from HEAD, not a half-written working copy.
 const VERSION = JSON.parse(readReleaseSource("plugins/cool-workflow/package.json").text).version;
-const canonicalApps = [
-  "architecture-review",
-  "architecture-review-fast",
-  "end-to-end-golden-path",
-  "pr-review-fix-ci",
-  "release-cut",
-  "research-synthesis"
-];
+// Canonical app ids are DERIVED from apps/ (excluding metadata.example demos) by
+// scripts/canonical-apps-list.js — the single source bump-version.js bumps and
+// canonical-apps.js smoke-tests. No hand-copied list to drift (audit M5).
+const canonicalApps = CANONICAL_APP_IDS;
 function main() {
   const checks = [];