cool-workflow 0.1.78

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193) hide show
  1. package/.claude-plugin/plugin.json +20 -0
  2. package/.codex-plugin/mcp.json +10 -0
  3. package/.codex-plugin/plugin.json +38 -0
  4. package/.mcp.json +10 -0
  5. package/LICENSE +24 -0
  6. package/README.md +638 -0
  7. package/apps/architecture-review/app.json +51 -0
  8. package/apps/architecture-review/workflow.js +116 -0
  9. package/apps/end-to-end-golden-path/app.json +30 -0
  10. package/apps/end-to-end-golden-path/workflow.js +33 -0
  11. package/apps/pr-review-fix-ci/app.json +59 -0
  12. package/apps/pr-review-fix-ci/workflow.js +90 -0
  13. package/apps/release-cut/app.json +54 -0
  14. package/apps/release-cut/workflow.js +82 -0
  15. package/apps/research-synthesis/app.json +50 -0
  16. package/apps/research-synthesis/workflow.js +76 -0
  17. package/apps/workflow-app-framework-demo/app.json +29 -0
  18. package/apps/workflow-app-framework-demo/workflow.js +44 -0
  19. package/dist/agent-config.js +223 -0
  20. package/dist/candidate-scoring.js +715 -0
  21. package/dist/capability-core.js +630 -0
  22. package/dist/capability-dispatcher.js +86 -0
  23. package/dist/capability-registry.js +523 -0
  24. package/dist/cli.js +1276 -0
  25. package/dist/collaboration.js +727 -0
  26. package/dist/commit.js +570 -0
  27. package/dist/contract-migration.js +234 -0
  28. package/dist/coordinator.js +1163 -0
  29. package/dist/daemon.js +44 -0
  30. package/dist/dispatch.js +201 -0
  31. package/dist/drive.js +503 -0
  32. package/dist/error-feedback.js +415 -0
  33. package/dist/evidence-grounding.js +179 -0
  34. package/dist/evidence-reasoning.js +733 -0
  35. package/dist/execution-backend.js +1279 -0
  36. package/dist/harness.js +61 -0
  37. package/dist/mcp-server.js +1615 -0
  38. package/dist/multi-agent-eval.js +857 -0
  39. package/dist/multi-agent-host.js +764 -0
  40. package/dist/multi-agent-operator-ux.js +537 -0
  41. package/dist/multi-agent-trust.js +366 -0
  42. package/dist/multi-agent.js +1173 -0
  43. package/dist/node-snapshot.js +270 -0
  44. package/dist/observability.js +922 -0
  45. package/dist/operator-ux.js +971 -0
  46. package/dist/orchestrator/audit-operations.js +182 -0
  47. package/dist/orchestrator/candidate-operations.js +117 -0
  48. package/dist/orchestrator/cli-options.js +288 -0
  49. package/dist/orchestrator/collaboration-operations.js +86 -0
  50. package/dist/orchestrator/feedback-operations.js +81 -0
  51. package/dist/orchestrator/host-operations.js +78 -0
  52. package/dist/orchestrator/lifecycle-operations.js +462 -0
  53. package/dist/orchestrator/migration-operations.js +44 -0
  54. package/dist/orchestrator/multi-agent-operations.js +362 -0
  55. package/dist/orchestrator/report.js +369 -0
  56. package/dist/orchestrator/topology-operations.js +84 -0
  57. package/dist/orchestrator.js +874 -0
  58. package/dist/pipeline-contract.js +92 -0
  59. package/dist/pipeline-runner.js +285 -0
  60. package/dist/reclamation.js +882 -0
  61. package/dist/result-normalize.js +194 -0
  62. package/dist/run-export.js +64 -0
  63. package/dist/run-registry.js +1347 -0
  64. package/dist/run-state-schema.js +67 -0
  65. package/dist/sandbox-profile.js +471 -0
  66. package/dist/scheduler.js +266 -0
  67. package/dist/scheduling.js +184 -0
  68. package/dist/schema-validate.js +98 -0
  69. package/dist/state-explosion.js +1213 -0
  70. package/dist/state-migrations.js +463 -0
  71. package/dist/state-node.js +301 -0
  72. package/dist/state.js +308 -0
  73. package/dist/telemetry-attestation.js +156 -0
  74. package/dist/telemetry-ledger.js +145 -0
  75. package/dist/topology.js +527 -0
  76. package/dist/triggers.js +159 -0
  77. package/dist/trust-audit.js +475 -0
  78. package/dist/types/blackboard.js +2 -0
  79. package/dist/types/boundary.js +29 -0
  80. package/dist/types/candidate.js +2 -0
  81. package/dist/types/collaboration.js +2 -0
  82. package/dist/types/core.js +2 -0
  83. package/dist/types/drive.js +10 -0
  84. package/dist/types/error-feedback.js +2 -0
  85. package/dist/types/evidence-reasoning.js +2 -0
  86. package/dist/types/execution-backend.js +2 -0
  87. package/dist/types/multi-agent.js +2 -0
  88. package/dist/types/observability.js +2 -0
  89. package/dist/types/pipeline.js +2 -0
  90. package/dist/types/reclamation.js +8 -0
  91. package/dist/types/result.js +2 -0
  92. package/dist/types/run-registry.js +2 -0
  93. package/dist/types/run.js +2 -0
  94. package/dist/types/sandbox.js +2 -0
  95. package/dist/types/schedule.js +2 -0
  96. package/dist/types/state-node.js +2 -0
  97. package/dist/types/topology.js +2 -0
  98. package/dist/types/trust.js +2 -0
  99. package/dist/types/workbench.js +2 -0
  100. package/dist/types/worker.js +2 -0
  101. package/dist/types/workflow-app.js +2 -0
  102. package/dist/types.js +43 -0
  103. package/dist/verifier-registry.js +46 -0
  104. package/dist/verifier.js +78 -0
  105. package/dist/version.js +8 -0
  106. package/dist/workbench-host.js +172 -0
  107. package/dist/workbench.js +190 -0
  108. package/dist/worker-isolation.js +1028 -0
  109. package/dist/workflow-api.js +98 -0
  110. package/dist/workflow-app-framework.js +626 -0
  111. package/docs/agent-delegation-drive.7.md +190 -0
  112. package/docs/agent-framework.md +176 -0
  113. package/docs/candidate-scoring.7.md +106 -0
  114. package/docs/canonical-workflow-apps.7.md +137 -0
  115. package/docs/capability-topology-registry.7.md +168 -0
  116. package/docs/cli-mcp-parity.7.md +373 -0
  117. package/docs/contract-migration-tooling.7.md +123 -0
  118. package/docs/control-plane-scheduling.7.md +110 -0
  119. package/docs/coordinator-blackboard.7.md +183 -0
  120. package/docs/dogfood/architecture-review-cool-workflow.md +16 -0
  121. package/docs/dogfood-one-real-repo.7.md +168 -0
  122. package/docs/durable-state-and-locking.7.md +107 -0
  123. package/docs/end-to-end-golden-path.7.md +117 -0
  124. package/docs/error-feedback.7.md +153 -0
  125. package/docs/evidence-adoption-reasoning-chain.7.md +270 -0
  126. package/docs/execution-backends.7.md +300 -0
  127. package/docs/getting-started.md +99 -0
  128. package/docs/index.md +41 -0
  129. package/docs/mcp-app-surface.7.md +235 -0
  130. package/docs/multi-agent-cli-mcp-surface.7.md +265 -0
  131. package/docs/multi-agent-eval-replay-harness.7.md +302 -0
  132. package/docs/multi-agent-operator-ux.7.md +314 -0
  133. package/docs/multi-agent-runtime-core.7.md +231 -0
  134. package/docs/multi-agent-topologies.7.md +103 -0
  135. package/docs/multi-agent-trust-policy-audit.7.md +154 -0
  136. package/docs/node-snapshot-diff-replay.7.md +135 -0
  137. package/docs/observability-cost-accounting.7.md +194 -0
  138. package/docs/operator-ux.7.md +180 -0
  139. package/docs/pipeline-runner.7.md +136 -0
  140. package/docs/project-index.md +261 -0
  141. package/docs/real-execution-backends.7.md +142 -0
  142. package/docs/release-and-migration.7.md +280 -0
  143. package/docs/release-tooling.7.md +159 -0
  144. package/docs/routines.md +48 -0
  145. package/docs/run-registry-control-plane.7.md +312 -0
  146. package/docs/run-retention-reclamation.7.md +191 -0
  147. package/docs/sandbox-profiles.7.md +137 -0
  148. package/docs/scheduled-tasks.md +80 -0
  149. package/docs/security-trust-hardening.7.md +117 -0
  150. package/docs/state-explosion-management.7.md +264 -0
  151. package/docs/state-node.7.md +96 -0
  152. package/docs/team-collaboration.7.md +207 -0
  153. package/docs/unix-principles.md +192 -0
  154. package/docs/verifier-gated-commit.7.md +140 -0
  155. package/docs/web-desktop-workbench.7.md +215 -0
  156. package/docs/worker-isolation.7.md +167 -0
  157. package/docs/workflow-app-framework.7.md +274 -0
  158. package/manifest/README.md +43 -0
  159. package/manifest/plugin.manifest.json +316 -0
  160. package/manifest/pricing.policy.json +14 -0
  161. package/package.json +79 -0
  162. package/scripts/agents/claude-p-agent.js +104 -0
  163. package/scripts/agents/claude-p-agent.sh +9 -0
  164. package/scripts/agents/cw-attest-keygen.js +55 -0
  165. package/scripts/agents/cw-attest-wrap.js +143 -0
  166. package/scripts/block-unapproved-tag.sh +39 -0
  167. package/scripts/bump-version.js +249 -0
  168. package/scripts/canonical-apps.js +171 -0
  169. package/scripts/cw.js +4 -0
  170. package/scripts/dist-drift-check.js +79 -0
  171. package/scripts/dogfood-architecture-review.js +237 -0
  172. package/scripts/dogfood-release.js +624 -0
  173. package/scripts/forward-ref-docs.js +73 -0
  174. package/scripts/gen-manifests.js +232 -0
  175. package/scripts/golden-path.js +300 -0
  176. package/scripts/mcp-server.js +4 -0
  177. package/scripts/new-feature.js +121 -0
  178. package/scripts/parity-check.js +213 -0
  179. package/scripts/release-check.js +118 -0
  180. package/scripts/release-flow.js +272 -0
  181. package/scripts/release-gate.sh +85 -0
  182. package/scripts/sync-project-index.js +387 -0
  183. package/scripts/validate-run-state-schema.js +126 -0
  184. package/scripts/verify-container-selfref.js +64 -0
  185. package/scripts/version-sync-check.js +237 -0
  186. package/skills/cool-workflow/SKILL.md +162 -0
  187. package/skills/cool-workflow/references/commands.md +282 -0
  188. package/tsconfig.json +16 -0
  189. package/ui/workbench/app.css +76 -0
  190. package/ui/workbench/app.js +159 -0
  191. package/ui/workbench/index.html +32 -0
  192. package/workflows/architecture-review.workflow.js +84 -0
  193. package/workflows/research-synthesis.workflow.js +47 -0
@@ -0,0 +1,154 @@
1
+ # Multi-Agent Trust / Policy / Audit
2
+
3
+ CW v0.1.22 extends the existing trust-audit layer with first-class
4
+ multi-agent policy, provenance, blackboard write audit, and judge rationale.
5
+ It does not introduce a second audit subsystem.
6
+ CW v0.1.24 includes these trust projections in eval/replay comparison so
7
+ missing provenance, changed policy violations, or missing judge rationale fail
8
+ the regression gate.
9
+
10
+ ## Model
11
+
12
+ Multi-agent trust state is plain JSON attached to existing records:
13
+
14
+ - `AgentRole.policy`
15
+ - `AgentGroup.policy`
16
+ - `AgentMembership.policy`
17
+ - blackboard message `provenance`
18
+ - candidate score and selection audit links
19
+ - append-friendly trust events in `.cw/runs/<run-id>/audit/events.jsonl`
20
+
21
+ Policies describe explicit authority:
22
+
23
+ - allowed blackboard topic ids
24
+ - allowed write operations: `message`, `context`, `artifact`, `snapshot`,
25
+ `coordinator-decision`
26
+ - allowed candidate operations: `register`, `score`, `select`
27
+ - allowed judge operations: `verdict`, `rationale`, `panel-decision`
28
+ - sandbox profile hints
29
+ - required evidence refs for privileged actions
30
+ - denied operations and reasons
31
+
32
+ Missing policy, missing role authority, out-of-scope topics, missing evidence,
33
+ or missing judge rationale fail closed and create audit records.
34
+
35
+ ## Audit Events
36
+
37
+ The existing audit log records multi-agent dimensions with stable ids:
38
+
39
+ - `multi-agent.role-policy`
40
+ - `multi-agent.permission`
41
+ - `blackboard.write`
42
+ - `blackboard.message-provenance`
43
+ - `judge.rationale`
44
+ - `judge.panel-decision`
45
+ - `policy.violation`
46
+
47
+ Events carry ids such as `multiAgentRunId`, `agentRoleId`, `agentGroupId`,
48
+ `agentMembershipId`, `agentFanoutId`, `agentFaninId`, blackboard record ids,
49
+ candidate/score/selection/commit ids, topology ids, `sandboxProfileId`, and
50
+ `policyRef` when relevant.
51
+
52
+ Audit events do not copy large blackboard bodies. Message provenance stores
53
+ author kind/id, role/group/membership/worker ids when known, source, linked
54
+ evidence refs, parent message ids, topic scope, a body hash, and a short
55
+ summary.
56
+
57
+ ## Blackboard Writes
58
+
59
+ Every blackboard write is audited:
60
+
61
+ - topic create/update
62
+ - message post
63
+ - context put/supersede/conflict
64
+ - artifact add
65
+ - snapshot create
66
+ - coordinator decision
67
+
68
+ The audit record says who wrote, under which role or membership, which policy
69
+ allowed or denied it, what evidence was cited, what record changed, and whether
70
+ the write was accepted, denied, superseded, conflicting, or blocked.
71
+
72
+ Denied writes are rejected before state mutation and are visible through
73
+ `policy.violation` and `blackboard.write` audit projections.
74
+
75
+ ## Judge Rationale
76
+
77
+ Judge-panel scoring requires evidence and rationale. Panel selection requires
78
+ score evidence and chair rationale. Accepted judge and panel records cite the
79
+ score, candidate, evidence refs, role policy, and parent audit events.
80
+
81
+ Missing rationale or evidence blocks score, selection, fanin readiness, and
82
+ verifier-gated commit readiness where those gates depend on judge evidence.
83
+
84
+ ## CLI
85
+
86
+ Existing commands remain compatible:
87
+
88
+ ```bash
89
+ node scripts/cw.js audit summary <run-id>
90
+ node scripts/cw.js audit provenance <run-id>
91
+ node scripts/cw.js multi-agent status <run-id>
92
+ node scripts/cw.js multi-agent evidence <run-id>
93
+ ```
94
+
95
+ Focused views:
96
+
97
+ ```bash
98
+ node scripts/cw.js audit multi-agent <run-id>
99
+ node scripts/cw.js audit policy <run-id>
100
+ node scripts/cw.js audit role <run-id> <role-id>
101
+ node scripts/cw.js audit blackboard <run-id>
102
+ node scripts/cw.js audit judge <run-id>
103
+ ```
104
+
105
+ Use `--json` or `--format json` for deterministic machine output.
106
+
107
+ Human output includes stable panels:
108
+
109
+ - Multi-Agent Trust
110
+ - Role Policies
111
+ - Permission Decisions
112
+ - Blackboard Write Audit
113
+ - Message Provenance
114
+ - Judge Rationales
115
+ - Policy Violations
116
+ - Next Action
117
+
118
+ ## MCP
119
+
120
+ MCP parity tools:
121
+
122
+ - `cw_audit_multi_agent`
123
+ - `cw_audit_policy`
124
+ - `cw_audit_role`
125
+ - `cw_audit_blackboard`
126
+ - `cw_audit_judge`
127
+
128
+ The older audit tools remain available:
129
+
130
+ - `cw_audit_summary`
131
+ - `cw_audit_worker`
132
+ - `cw_audit_provenance`
133
+ - `cw_audit_attest`
134
+ - `cw_audit_decision`
135
+
136
+ ## Operator Questions
137
+
138
+ The combined `multi-agent status`, `multi-agent evidence`, `report --show`,
139
+ `audit summary`, and `audit provenance` views answer:
140
+
141
+ - Which role was allowed to do this?
142
+ - Which blackboard message came from which role, member, or worker?
143
+ - Which write was denied and why?
144
+ - Which judge rationale was accepted?
145
+ - Why was this selected result trusted?
146
+
147
+ ## Regression
148
+
149
+ `test/multi-agent-trust-policy-audit-smoke.js` creates a judge-panel run with
150
+ allowed and denied blackboard writes, message provenance, role/membership/worker
151
+ links, accepted judge rationale, missing-rationale and missing-evidence failure
152
+ paths, CLI output assertions, MCP parity assertions, report assertions, and
153
+ audit provenance assertions.
154
+ 0.1.51
@@ -0,0 +1,135 @@
1
+ # Node Snapshot / Diff / Replay
2
+
3
+ CW v0.1.35 adds Node Snapshot / Diff / Replay: per-NODE granularity over the
4
+ v0.1.23 eval/replay harness. Before v0.1.35 the harness worked only at RUN/SUITE
5
+ granularity — `createMultiAgentReplaySnapshot(run)` captured a whole run; there
6
+ was no way to snapshot, fingerprint, diff, or replay a single `StateNode`. This
7
+ release adds that, reusing the harness's normalize/stable-stringify discipline and
8
+ the v0.1.25 fingerprint/freshness pattern — without forking `StateNode`, the eval
9
+ harness, or the run-state schema (all additive).
10
+
11
+ The discipline is the same base-system separation used elsewhere: the mechanism
12
+ captures/diffs/replays one node by id; nothing decides which node "matters".
13
+
14
+ ## Snapshot — derived + fingerprinted
15
+
16
+ A `NodeSnapshot` is a DERIVED projection of one `StateNode`: its body is normalized
17
+ (timestamps/paths stripped via the eval harness's `normalizeValue`), so it is
18
+ byte-stable across captures of the same logical state. It carries a
19
+ `sourceFingerprint` — sha256 over the RAW node (`id:status:updatedAt` + artifact
20
+ and evidence ids/paths) — so any transition flips it.
21
+
22
+ ```text
23
+ node snapshot <run-id> <node-id> [--json]
24
+ ```
25
+
26
+ Persisted under `<run>/nodes/snapshots/<node-id>/<snapshot-id>.json`; the source
27
+ `<run>/nodes/<id>.json` stays the truth. The `snapshot-id` is content-addressed
28
+ (`snap-<node>-<fingerprint>`), so re-snapshotting an unchanged node is idempotent.
29
+
30
+ ## Freshness — fail closed on drift
31
+
32
+ Every load recomputes the fingerprint from the current source and emits a
33
+ freshness verdict:
34
+
35
+ - `valid` — source matches the snapshot.
36
+ - `stale` — the source node changed since capture.
37
+ - `absent` — the node, or a referenced artifact path, is gone/unreadable.
38
+
39
+ `stale` and `absent` both REFUSE diff/replay with a structured `NodeSnapshotError`
40
+ naming the divergence — never a silent stale replay, never a best-effort partial.
41
+
42
+ ## Diff — stable + structural
43
+
44
+ ```text
45
+ node diff <run-id> <baseline-snapshot-id> <candidate-snapshot-id> [--json]
46
+ ```
47
+
48
+ Per-section (`status`/`inputs`/`outputs`/`artifacts`/`evidence`/`errors`/`links`/
49
+ `metadata`) `added|removed|changed|same`, ordered deterministically by the same
50
+ `stableStringify` the eval comparison uses. Byte-identical across repeated runs.
51
+
52
+ ## Replay — isolated + deterministic
53
+
54
+ ```text
55
+ node replay <run-id> <snapshot-id> [--json]
56
+ ```
57
+
58
+ Reconstructs the normalized node from the snapshot with `now` INJECTED — no
59
+ ambient `new Date()` in the deterministic payload. The result carries an
60
+ `outputFingerprint` over the normalized body, so two replays of one snapshot are
61
+ byte-identical (only `replayedAt`/`replayId`, which are now-derived, differ).
62
+ Replaying a `stale`/`absent` snapshot fails closed.
63
+
64
+ ## Verify — replay vs source
65
+
66
+ ```text
67
+ node verify <run-id> <replay-id> [--json]
68
+ ```
69
+
70
+ Compares a replay to a FRESH snapshot of the source node and emits a pass/fail
71
+ verdict plus findings in the eval harness's `severity/category/reason/baselineRef/
72
+ replayRef` shape.
73
+
74
+ ## Surfaces & Compatibility
75
+
76
+ `node.snapshot`/`node.diff`/`node.replay`/`node.replay.verify` are declared in the
77
+ capability registry as `surface: "both"`, so `cw node <verb> --json` and the
78
+ `cw_node_*` MCP tools render one core (`src/node-snapshot.ts`). Additive: no change
79
+ to `StateNode`, `STATE_NODE_SCHEMA_VERSION`, the run-state schema, the pipeline
80
+ contract, or existing eval-suite artifacts; pre-0.1.35 runs and snapshots stay
81
+ loadable. Exporting the previously-private eval-harness helpers
82
+ (`normalizeValue`/`stableStringify`/`lines`) and `fingerprintStrings` is purely
83
+ additive and changes no behavior.
84
+
85
+ ## See Also
86
+
87
+ state-node(7), multi-agent-eval-replay-harness(7), state-explosion-management(7),
88
+ cli-mcp-parity(7)
89
+
90
+ ## Contract Migration Tooling (v0.1.36)
91
+
92
+ A first-class declared migration registry (run-state + workflow-app) with per-edge compatibility proofs, fail-closed reachability, and a round-trip/non-destruction prover. See contract-migration-tooling(7).
93
+
94
+ ## Control-Plane Scheduling (v0.1.37)
95
+
96
+ Priority, concurrency limits, lease lifecycle, retry/backoff, and fail-closed park over the v0.1.28 Run Registry queue; policy-as-data, deterministic. See control-plane-scheduling(7).
97
+
98
+ ## Agent Delegation Drive (v0.1.38)
99
+
100
+ Spawn an external agent process per worker, capture result.md + attestation, and auto-drive plan->dispatch->fulfill->accept->commit. See agent-delegation-drive(7).
101
+
102
+ ## Run Retention & Provable Reclamation (v0.1.39)
103
+
104
+ Tiered, append-only, cryptographically-verifiable run reclamation: seal the audit skeleton, free the reconstructable bulk, and prove it. See run-retention-reclamation(7).
105
+
106
+ ## Durable State & Locking (v0.1.40)
107
+
108
+ Atomic temp->rename writes + fsync-durability for authoritative stores; a portable stale-stealing file lock serializes the cross-process read-modify-write stores. See durable-state-and-locking(7).
109
+
110
+ ## Self-Audit Hardening & Pure-Router Decomposition (v0.1.41)
111
+
112
+ Evidence grounding + durable audit append + symlink-hardened containment + deterministic worker ids + recursive redaction; BackendRegistry self-describing drivers (no per-id switches); the orchestrator god-object is decomposed into per-domain operation modules (pure loadRun->delegate router).
113
+
114
+ ## Robust Result Ingest (v0.1.42)
115
+
116
+ Capture findings/evidence from any reasonable agent shape (alt keys + prose); CW derives grounded evidence itself and warns on empty capture — closes the v0.1.41 live-drive 'accepted with 0 captured' failure.
117
+
118
+ ## No-False-Green Gate & Launch Prep (v0.1.43)
119
+
120
+ Hard gate blocking empty-capture verifier-gated commits, plus quickstart and launch-prep docs.
121
+
122
+ ## Release-Gate Determinism & Agents Vendor (v0.1.44)
123
+
124
+ Release-readiness checks now validate the committed blob (`git show HEAD:<path>`) instead of the mutable working tree — eliminating false-red/false-green from concurrent working-tree writes (iCloud/Spotlight/editor). Adds the `agents` vendor manifest target: a generated `.agents/plugins/cool-workflow/` adapter giving any non-Claude AI agent one common interface to CW.
125
+
126
+ ## P1-P2 Fixes & CI Content Surfaces (v0.1.49)
127
+
128
+ Migration DAG with reversible edges (v0.1.45), capability auto-discovery (v0.1.46), vendor-adapter registry (v0.1.47), state auto-compaction and P2 fixes (v0.1.48), plus CI content-surface determinism hardening (v0.1.49).
129
+ 0.1.51
130
+
131
+ 0.1.76
132
+
133
+ 0.1.77
134
+
135
+ 0.1.78
@@ -0,0 +1,194 @@
1
+ # Observability + Cost Accounting
2
+
3
+ CW v0.1.31 adds Observability + Cost Accounting: time/duration, failure rate,
4
+ verifier pass rate, candidate acceptance rate, and token/cost — all DERIVED from
5
+ the run state CW already keeps. Before v0.1.31 there was no metrics module and no
6
+ token or cost field anywhere; run state already carried `createdAt`/`updatedAt`/
7
+ `completedAt`/`dispatchedAt` and outcome statuses on tasks, workers, verifier
8
+ nodes, candidates, memberships, and feedback. This release projects those into a
9
+ report — and adds an additive, host-attested usage record so cost can be
10
+ accounted honestly — without changing the `ResultEnvelope` schema and without
11
+ taking ownership of source truth.
12
+
13
+ The design follows the same base-system observability philosophy as
14
+ [State Explosion Management](state-explosion-management.7.md) and the
15
+ [Run Registry / Control Plane](run-registry-control-plane.7.md):
16
+
17
+ - the per-run `.cw/runs/<id>/state.json` is the SINGLE source of truth
18
+ - metrics are a DERIVED projection of source records, never a separate database
19
+ - no telemetry pipeline, no background collector daemon, no hidden counters
20
+ - plain files, stable JSON, deterministic output
21
+ - fail closed: a rate over zero samples is `n/a`, never a fabricated 0%/100%
22
+ - cost is ATTESTED, never measured or fabricated; absent usage is `unreported`
23
+ - backward compatible; usage/cost fields are additive and optional
24
+
25
+ ## Derived, not a telemetry pipeline
26
+
27
+ Every number is a projection of existing durable state:
28
+
29
+ - durations come from recorded timestamps — `dispatchedAt`→`completedAt` for
30
+ tasks, `createdAt`→worker output `recordedAt` for workers, `createdAt`→
31
+ `updatedAt` for the run;
32
+ - the failure rate pools failed/rejected workers, failed memberships, failed
33
+ un-worker-backed tasks, and unresolved (`open`/`tasked`) feedback over the
34
+ total of those samples;
35
+ - the verifier pass rate counts `verifier` state nodes whose status is a pass
36
+ (`verified`/`committed`) against decided gates (pass + `failed`/`rejected`/
37
+ `blocked`); pending/running gates are undecided and excluded;
38
+ - the candidate acceptance rate counts `selected`/`verified` candidates over all
39
+ candidate records.
40
+
41
+ There is no metrics store. `deriveMetricsReport(run, { now, policy })` is a PURE
42
+ function of one run's state, an injected `now`, and an optional pricing policy.
43
+ The only now-derived field is `generatedAt`; durations are computed from recorded
44
+ timestamps, so a report over a fixed snapshot is byte-reproducible (eval/replay
45
+ agnostic). The per-run report is persisted as a rebuildable, fingerprinted
46
+ snapshot under `.cw/runs/<id>/metrics/metrics-report.json`; the cross-repo
47
+ summary reports each run's snapshot freshness as `valid|stale|absent` against
48
+ current source — fail closed, exactly like the registry.
49
+
50
+ ## A counter you cannot trust is worse than none
51
+
52
+ Each rate is a `RateMetric` carrying `state` (`ok`/`n/a`), `count`, `total`,
53
+ `rate`, and per-bucket sample counts. Over zero samples the state is `n/a` and
54
+ `count`/`rate` are `null` — never `0`. No divide-by-zero, no partial-data rate
55
+ presented as complete. Sample counts and buckets accompany every rate so a reader
56
+ can audit the numerator and denominator.
57
+
58
+ ## Cost is attested, never measured or fabricated
59
+
60
+ CW does not call the model; the host/worker does. Token usage is recorded as
61
+ HOST-ATTESTED provenance — a `UsageRecord` accepted on the existing intake path
62
+ and stored on the task or worker record (never on `ResultEnvelope`):
63
+
64
+ ```
65
+ cw result <run-id> <task-id> <file> \
66
+ --usage-input-tokens 12000 --usage-output-tokens 3400 \
67
+ --usage-model claude-opus-4-8 --usage-source host-attested
68
+ cw worker output <run-id> <worker-id> <file> --usage-input-tokens N --usage-output-tokens M --usage-model ID
69
+ ```
70
+
71
+ CW records what the host attests, verbatim, and synthesizes nothing. When the
72
+ host reports no usage the value is an explicit `unreported` — never `0`, never a
73
+ silent guess. The report surfaces `usage.coverage` (the fraction of work units
74
+ carrying attested usage) and `usage.unreportedUnits` so the gap is visible.
75
+
76
+ A monetary figure is `attested` ONLY when derived from attested usage × a
77
+ recorded pricing policy with an EXACT model match. When a model is priced by the
78
+ policy's `defaultPrice` fallback, that portion is a SEPARATE `estimated` figure
79
+ and the cost `state` becomes `estimated`; the two USD figures are never conflated
80
+ into one. Cost states:
81
+
82
+ - `attested` — every attested model exact-matched a policy entry;
83
+ - `estimated` — some attested usage was priced by the policy default/fallback;
84
+ - `unpriced` — attested usage present but no policy entry (and no default);
85
+ - `unreported` — no attested usage to price.
86
+
87
+ ## Mechanism vs policy: pricing is data
88
+
89
+ The runtime is MECHANISM: it records attested usage and derives rates/durations.
90
+ The pricing table is POLICY — supplied as DATA (`CostPolicy`), not baked into the
91
+ kernel. The same attested usage yields different cost reports under different
92
+ pricing without touching the runtime. A bundled EXAMPLE policy lives at
93
+ `manifest/pricing.policy.json` (USD per 1e6 tokens, an editable starting point —
94
+ not a live price feed); pass `--pricing <path>` to use your own, or
95
+ `--pricing default` for the bundled example. With no policy supplied, cost is
96
+ `unpriced`/`unreported`, never guessed.
97
+
98
+ ## One source, every surface
99
+
100
+ The metrics verbs are declared once in `src/capability-registry.ts`, so the CLI
101
+ and MCP surfaces are two renderings of one core (`src/observability.ts`) and pass
102
+ the v0.1.27 parity gate — `cw <cmd> --json` is byte-identical to `cw_<tool>`
103
+ (durations are integers from recorded timestamps; only the ISO `generatedAt` is
104
+ now-derived and neutralized by the parity probe). The v0.1.30 Workbench renders a
105
+ read-only metrics panel from the same payload, showing coverage and
106
+ `unreported`/`n/a` honestly — it shows nothing the CLI/MCP cannot.
107
+
108
+ ## Commands
109
+
110
+ - `cw metrics show <run-id>` — the derived per-run report: durations, the three
111
+ rates with sample counts, attested usage with coverage, and cost. `--json` for
112
+ the canonical payload; `--pricing <path>|default` to price attested usage.
113
+ - `cw metrics summary` — the cross-repo rollup over the v0.1.28 run registry:
114
+ pooled rates, summed attested usage/cost with coverage, and per-app and
115
+ per-backend breakdowns. `--scope repo|home`; unreadable runs are counted
116
+ (`unreadableRuns`), never silently dropped.
117
+
118
+ MCP hosts call `cw_metrics_show` and `cw_metrics_summary` and get byte-identical
119
+ payloads. Old runs load and report `unreported` cost while still yielding correct
120
+ time and rate metrics from their existing timestamps and outcomes.
121
+
122
+ This document targets CW 0.1.31.
123
+
124
+
125
+ ## Team Collaboration (v0.1.32)
126
+
127
+ v0.1.32 adds Team Collaboration: a host-attested actor and append-only
128
+ approvals/rejections/comments/handoffs provenance-linked to a durable target,
129
+ plus a review gate that STACKS ON the verifier gate — required approvals from
130
+ authorized roles, enforced inside `resolveCommitGate` AFTER the verifier checks
131
+ and never instead of them, failing closed on quorum/authority/self-approval and
132
+ recording who approved the very artifact that shipped. Policy (required approvals,
133
+ authorized roles, self-approval) is data, default off (pre-v0.1.32 behavior
134
+ unchanged). The verbs are parity-gated and render read-only in the v0.1.30
135
+ Workbench. See [Team Collaboration](team-collaboration.7.md).
136
+
137
+ ## Release Tooling (v0.1.33)
138
+
139
+ The per-tag mechanical surfaces (version bump across 17 surfaces, feature scaffold, and the forward-reference docs) become deterministic scripts, with a de-duplicated release gate. See release-tooling(7).
140
+
141
+ ## Real Execution Backend Integrations (v0.1.34)
142
+
143
+ The container/remote/ci backends really execute (docker/podman run, remote/CI POST-and-poll) under the sandbox contract, with byte-stable evidence vs node and fail-closed refusal when a runtime/endpoint is unavailable. See real-execution-backends(7).
144
+
145
+ ## Node Snapshot / Diff / Replay (v0.1.35)
146
+
147
+ Per-node snapshot, structural diff, and isolated deterministic replay over StateNode, reusing the v0.1.23 eval harness; fail-closed on source drift (valid|stale|absent). See node-snapshot-diff-replay(7).
148
+
149
+ ## Contract Migration Tooling (v0.1.36)
150
+
151
+ A first-class declared migration registry (run-state + workflow-app) with per-edge compatibility proofs, fail-closed reachability, and a round-trip/non-destruction prover. See contract-migration-tooling(7).
152
+
153
+ ## Control-Plane Scheduling (v0.1.37)
154
+
155
+ Priority, concurrency limits, lease lifecycle, retry/backoff, and fail-closed park over the v0.1.28 Run Registry queue; policy-as-data, deterministic. See control-plane-scheduling(7).
156
+
157
+ ## Agent Delegation Drive (v0.1.38)
158
+
159
+ Spawn an external agent process per worker, capture result.md + attestation, and auto-drive plan->dispatch->fulfill->accept->commit. See agent-delegation-drive(7).
160
+
161
+ ## Run Retention & Provable Reclamation (v0.1.39)
162
+
163
+ Tiered, append-only, cryptographically-verifiable run reclamation: seal the audit skeleton, free the reconstructable bulk, and prove it. See run-retention-reclamation(7).
164
+
165
+ ## Durable State & Locking (v0.1.40)
166
+
167
+ Atomic temp->rename writes + fsync-durability for authoritative stores; a portable stale-stealing file lock serializes the cross-process read-modify-write stores. See durable-state-and-locking(7).
168
+
169
+ ## Self-Audit Hardening & Pure-Router Decomposition (v0.1.41)
170
+
171
+ Evidence grounding + durable audit append + symlink-hardened containment + deterministic worker ids + recursive redaction; BackendRegistry self-describing drivers (no per-id switches); the orchestrator god-object is decomposed into per-domain operation modules (pure loadRun->delegate router).
172
+
173
+ ## Robust Result Ingest (v0.1.42)
174
+
175
+ Capture findings/evidence from any reasonable agent shape (alt keys + prose); CW derives grounded evidence itself and warns on empty capture — closes the v0.1.41 live-drive 'accepted with 0 captured' failure.
176
+
177
+ ## No-False-Green Gate & Launch Prep (v0.1.43)
178
+
179
+ Hard gate blocking empty-capture verifier-gated commits, plus quickstart and launch-prep docs.
180
+
181
+ ## Release-Gate Determinism & Agents Vendor (v0.1.44)
182
+
183
+ Release-readiness checks now validate the committed blob (`git show HEAD:<path>`) instead of the mutable working tree — eliminating false-red/false-green from concurrent working-tree writes (iCloud/Spotlight/editor). Adds the `agents` vendor manifest target: a generated `.agents/plugins/cool-workflow/` adapter giving any non-Claude AI agent one common interface to CW.
184
+
185
+ ## P1-P2 Fixes & CI Content Surfaces (v0.1.49)
186
+
187
+ Migration DAG with reversible edges (v0.1.45), capability auto-discovery (v0.1.46), vendor-adapter registry (v0.1.47), state auto-compaction and P2 fixes (v0.1.48), plus CI content-surface determinism hardening (v0.1.49).
188
+ 0.1.51
189
+
190
+ 0.1.76
191
+
192
+ 0.1.77
193
+
194
+ 0.1.78
@@ -0,0 +1,180 @@
1
+ # Operator UX
2
+
3
+ Cool Workflow v0.1.12 added a read-only Operator UX layer for understanding a
4
+ run from the console. It does not change workflow state, dispatch workers, score
5
+ candidates, or commit snapshots. It reads `WorkflowRun` state and renders
6
+ deterministic summaries for humans while preserving JSON for scripts and MCP.
7
+
8
+ ## Inspect A Run
9
+
10
+ Human status is the default:
11
+
12
+ ```bash
13
+ node scripts/cw.js status <run-id>
14
+ ```
15
+
16
+ The status view includes run id, workflow/app id and version, loop stage, active
17
+ phase, blocked reasons, phase/task counts, workers, candidates, feedback,
18
+ commits, multi-agent runtime health, Multi-Agent Operator UX counts, report
19
+ path, and the next recommended command.
20
+
21
+ Machine-readable status stays available:
22
+
23
+ ```bash
24
+ node scripts/cw.js status <run-id> --json
25
+ node scripts/cw.js status <run-id> --format json
26
+ ```
27
+
28
+ `CoolWorkflowRunner.status()` and MCP `cw_status` continue to return structured
29
+ status data for integrations.
30
+
31
+ In v0.1.13, MCP also exposes JSON-native operator tools:
32
+ `cw_operator_status`, `cw_operator_graph`, `cw_operator_report`,
33
+ `cw_worker_summary`, `cw_candidate_summary`, `cw_feedback_summary`, and
34
+ `cw_commit_summary`.
35
+
36
+ ## Next Actions
37
+
38
+ Recommendations are deterministic and only use commands that exist in the CW
39
+ CLI. Examples:
40
+
41
+ ```text
42
+ node scripts/cw.js dispatch <run-id> --limit 4
43
+ reason: pending tasks are ready for the active phase
44
+
45
+ node scripts/cw.js worker manifest <run-id> <worker-id>
46
+ reason: running workers need their manifests inspected
47
+
48
+ node scripts/cw.js feedback show <run-id> <feedback-id>
49
+ reason: open feedback should be resolved before more dispatch
50
+
51
+ node scripts/cw.js candidate register <run-id> --worker <worker-id>
52
+ reason: a completed worker result has not been registered as a candidate
53
+
54
+ node scripts/cw.js commit <run-id> --selection <selection-id>
55
+ reason: a verified selected candidate is ready for a verifier-gated commit
56
+ ```
57
+
58
+ Open feedback is prioritized before dispatch or candidate work. If all tracked
59
+ work is complete, the advisor points to `cw report <run-id> --show`.
60
+
61
+ ## Graph
62
+
63
+ Use the top-level graph command for a compact console map:
64
+
65
+ ```bash
66
+ node scripts/cw.js graph <run-id>
67
+ node scripts/cw.js graph <run-id> --json
68
+ ```
69
+
70
+ The legacy node command remains compatible:
71
+
72
+ ```bash
73
+ node scripts/cw.js node graph <run-id>
74
+ node scripts/cw.js node graph <run-id> --json
75
+ ```
76
+
77
+ The human graph groups phases, tasks, dispatches, workers, result nodes,
78
+ verifier nodes, candidates, selections, commits, and feedback, then prints the
79
+ edges between them. v0.1.17 also adds `multi-agent-run`, `agent-role`,
80
+ `agent-group`, `agent-membership`, `agent-fanout`, and `agent-fanin` nodes when
81
+ the run has first-class multi-agent state. v0.1.18 adds `blackboard`,
82
+ `blackboard-topic`, `blackboard-message`, `blackboard-context`,
83
+ `blackboard-artifact`, `blackboard-snapshot`, and `coordinator-decision` nodes
84
+ when shared coordination state exists. JSON output returns deterministic `nodes`
85
+ and `edges`.
86
+
87
+ ## Multi-Agent Operator UX
88
+
89
+ v0.1.21 adds focused multi-agent operator views that answer who depends on
90
+ whom, who is blocked, and which evidence was adopted into the accepted result:
91
+
92
+ ```bash
93
+ node scripts/cw.js multi-agent graph <run-id>
94
+ node scripts/cw.js multi-agent dependencies <run-id>
95
+ node scripts/cw.js multi-agent failures <run-id>
96
+ node scripts/cw.js multi-agent evidence <run-id>
97
+ ```
98
+
99
+ The same derived model appears in `status`, `report --show`, and
100
+ `cw_multi_agent_status` under `summaries.multiAgentOperator`. See
101
+ [multi-agent-operator-ux.7.md](multi-agent-operator-ux.7.md) for the full
102
+ trace from agent membership to verifier-gated commit.
103
+
104
+ ## Console Report
105
+
106
+ `cw report` still writes the Markdown report file and prints its path:
107
+
108
+ ```bash
109
+ node scripts/cw.js report <run-id>
110
+ ```
111
+
112
+ Use `--show` or `--summary` when the operator needs a readable console report:
113
+
114
+ ```bash
115
+ node scripts/cw.js report <run-id> --show
116
+ node scripts/cw.js report <run-id> --summary
117
+ ```
118
+
119
+ The console report includes the same high-value status panels plus active and
120
+ pending tasks, evidence paths and locators, and resource inspection commands.
121
+
122
+ ## Resource Summaries
123
+
124
+ Major run resources have human summaries by default and JSON when requested:
125
+
126
+ ```bash
127
+ node scripts/cw.js worker summary <run-id>
128
+ node scripts/cw.js worker summary <run-id> --json
129
+
130
+ node scripts/cw.js candidate summary <run-id>
131
+ node scripts/cw.js candidate summary <run-id> --json
132
+
133
+ node scripts/cw.js feedback summary <run-id>
134
+ node scripts/cw.js feedback summary <run-id> --json
135
+
136
+ node scripts/cw.js commit summary <run-id>
137
+ node scripts/cw.js commit summary <run-id> --json
138
+
139
+ node scripts/cw.js multi-agent summary <run-id>
140
+ node scripts/cw.js multi-agent summary <run-id> --json
141
+ node scripts/cw.js multi-agent graph <run-id>
142
+ node scripts/cw.js multi-agent graph <run-id> --json
143
+ node scripts/cw.js multi-agent dependencies <run-id>
144
+ node scripts/cw.js multi-agent failures <run-id>
145
+ node scripts/cw.js multi-agent evidence <run-id>
146
+ ```
147
+
148
+ Worker summaries show allocated/running/verified/failed/rejected counts,
149
+ sandbox profile ids, manifest paths, result paths, and linked feedback for
150
+ failed or rejected workers.
151
+
152
+ Candidate summaries show registered/scored/selected/verified/rejected/failed
153
+ counts, latest ranking path, selected candidates, candidates ready for commit,
154
+ and obvious missing scoring/evidence/gate problems.
155
+
156
+ Feedback summaries group records by open/tasked/resolved/rejected status,
157
+ severity, classification, and retryability.
158
+
159
+ Commit summaries distinguish verifier-gated commits from non-gated checkpoints
160
+ and show snapshot paths, evidence counts, and linked verifier/candidate/selection
161
+ ids.
162
+
163
+ Multi-agent summaries show run and group status, role coverage, membership
164
+ health, fanout/fanin progress, missing evidence, blocked reasons, and the next
165
+ recommended action.
166
+
167
+ ## File Discipline
168
+
169
+ Operator UX follows the same FreeBSD-flavored rule as the rest of CW:
170
+
171
+ ```text
172
+ clear console output for humans
173
+ stable JSON for scripts
174
+ plain files for evidence
175
+ no hidden daemon assumption
176
+ ```
177
+
178
+ When in doubt, inspect `.cw/runs/<run-id>/state.json`, the resource directories,
179
+ and the command-specific `--json` output.
180
+ 0.1.51