@pilotspace/add 1.1.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. package/CHANGELOG.md +81 -0
  2. package/GETTING-STARTED.md +187 -139
  3. package/README.md +13 -7
  4. package/bin/cli.js +96 -5
  5. package/docs/01-principles.md +3 -3
  6. package/docs/02-the-flow.md +19 -12
  7. package/docs/03-step-1-specify.md +15 -13
  8. package/docs/04-step-2-scenarios.md +2 -2
  9. package/docs/05-step-3-contract.md +3 -3
  10. package/docs/06-step-4-tests.md +10 -2
  11. package/docs/07-step-5-build.md +3 -1
  12. package/docs/08-step-6-verify.md +25 -5
  13. package/docs/09-the-loop.md +12 -6
  14. package/docs/10-setup-and-stages.md +27 -13
  15. package/docs/11-governance.md +6 -2
  16. package/docs/12-roles.md +3 -3
  17. package/docs/13-adoption.md +1 -1
  18. package/docs/14-foundation.md +15 -15
  19. package/docs/15-foundations-and-lineage.md +106 -0
  20. package/docs/README.md +4 -0
  21. package/docs/appendix-a-templates.md +3 -3
  22. package/docs/appendix-b-prompts.md +40 -5
  23. package/docs/appendix-c-glossary.md +49 -12
  24. package/docs/appendix-d-worked-example.md +2 -2
  25. package/docs/appendix-e-checklists.md +16 -4
  26. package/docs/appendix-f-requirements-matrix.md +8 -8
  27. package/docs/appendix-g-references.md +106 -0
  28. package/package.json +1 -1
  29. package/skill/add/SKILL.md +41 -38
  30. package/skill/add/adopt.md +13 -11
  31. package/skill/add/deltas.md +8 -6
  32. package/skill/add/fold.md +19 -17
  33. package/skill/add/graduate.md +74 -0
  34. package/skill/add/intake.md +22 -7
  35. package/skill/add/loop.md +59 -0
  36. package/skill/add/phases/0-ground.md +66 -0
  37. package/skill/add/phases/0-setup.md +32 -25
  38. package/skill/add/phases/1-specify.md +28 -13
  39. package/skill/add/phases/2-scenarios.md +14 -4
  40. package/skill/add/phases/3-contract.md +27 -12
  41. package/skill/add/phases/4-tests.md +15 -5
  42. package/skill/add/phases/5-build.md +33 -4
  43. package/skill/add/phases/6-verify.md +40 -2
  44. package/skill/add/phases/7-observe.md +13 -5
  45. package/skill/add/report-template.md +65 -7
  46. package/skill/add/run.md +93 -39
  47. package/skill/add/scope.md +10 -6
  48. package/skill/add/setup-review.md +13 -10
  49. package/skill/add/streams.md +88 -23
  50. package/tooling/add.py +1817 -90
  51. package/tooling/templates/CONVENTIONS.md.tmpl +1 -1
  52. package/tooling/templates/DESIGN.md.tmpl +66 -0
  53. package/tooling/templates/GLOSSARY.md.tmpl +29 -0
  54. package/tooling/templates/MILESTONE.md.tmpl +1 -0
  55. package/tooling/templates/PROJECT.md.tmpl +6 -3
  56. package/tooling/templates/TASK.md.tmpl +55 -15
  57. package/tooling/templates/catalog.sample.json +38 -0
  58. package/tooling/templates/prototype.sample.json +48 -0
  59. package/tooling/templates/tokens.sample.json +55 -0
  60. package/tooling/templates/udd-catalog.md +122 -0
  61. package/tooling/templates/udd-tokens.md +79 -0
package/bin/cli.js CHANGED
@@ -29,12 +29,21 @@ function warn(msg) { process.stderr.write("warn: " + msg + "\n"); }
29
29
  function fail(msg) { process.stderr.write("error: " + msg + "\n"); process.exit(1); }
30
30
 
31
31
  function parseArgs(argv) {
32
- const args = { _: [], force: false, stage: "prototype", name: null };
32
+ // stage/name stay null unless EXPLICITLY passed — the engine's own `init`
33
+ // defaults the stage and infers the name from the folder, so the manual-init
34
+ // hint only echoes flags the user actually chose (shortest true command).
35
+ const args = { _: [], force: false, check: false, stage: null, name: null };
33
36
  for (let i = 0; i < argv.length; i++) {
34
37
  const a = argv[i];
35
38
  if (a === "--force") args.force = true;
36
- else if (a === "--stage") args.stage = argv[++i];
37
- else if (a === "--name") args.name = argv[++i];
39
+ else if (a === "--check") args.check = true;
40
+ else if (a === "--stage" || a === "--name") {
41
+ const v = argv[++i];
42
+ // fail loudly on a trailing/abutting flag — never silently drop a value
43
+ // the user tried to pass (parity with the pip twin's argparse error)
44
+ if (v == null || v.startsWith("--")) fail(a + " requires a value");
45
+ if (a === "--stage") args.stage = v; else args.name = v;
46
+ }
38
47
  else if (a.startsWith("--")) warn("ignoring unknown flag " + a);
39
48
  else args._.push(a);
40
49
  }
@@ -99,10 +108,87 @@ function cmdInit(args) {
99
108
  log("");
100
109
  log("Prefer the CLI / not using Claude Code? Initialise it yourself (this arms the lock-down):");
101
110
  const launcher = process.platform === "win32" ? "py" : "python3";
102
- log(` ${launcher} .add/tooling/add.py init --await-lock --stage ${args.stage}` +
111
+ log(` ${launcher} .add/tooling/add.py init --await-lock` +
112
+ (args.stage ? ` --stage ${args.stage}` : "") +
103
113
  (args.name ? ` --name "${args.name}"` : ""));
104
114
  }
105
115
 
116
+ // --- update: re-materialize the managed layer without a re-install -----------
117
+ // The managed trees (ship-controlled). `update` clean-replaces each, so a file removed
118
+ // upstream leaves no orphan — and never touches .add/state.json, PROJECT.md, milestones,
119
+ // tasks, or archive (user data). Pure file-copy (npm <-> pip parity with _installer.py).
120
+ const MANAGED = [
121
+ ["skill/add", [".claude", "skills", "add"], false],
122
+ ["tooling", [".add", "tooling"], true],
123
+ ["docs", [".add", "docs"], false],
124
+ ];
125
+ const STAMP_FILE = ".add-version";
126
+
127
+ function pkgVersion() {
128
+ try { return require(path.join(PKG_ROOT, "package.json")).version; }
129
+ catch (_e) { return "0.0.0"; }
130
+ }
131
+
132
+ function readStamp(addDir) {
133
+ const p = path.join(addDir, STAMP_FILE);
134
+ if (!fs.existsSync(p)) return null;
135
+ try { return JSON.parse(fs.readFileSync(p, "utf8")); } catch (_e) { return null; }
136
+ }
137
+
138
+ function writeStamp(addDir, version) {
139
+ fs.mkdirSync(addDir, { recursive: true });
140
+ fs.writeFileSync(
141
+ path.join(addDir, STAMP_FILE),
142
+ JSON.stringify({ version: version, channel: "npm", installed_at: new Date().toISOString() }, null, 2) + "\n"
143
+ );
144
+ }
145
+
146
+ function cleanReplaceTree(src, dest, stripTests) {
147
+ if (!fs.existsSync(src)) fail("missing packaged source: " + src);
148
+ fs.mkdirSync(path.dirname(dest), { recursive: true });
149
+ if (fs.existsSync(dest)) fs.rmSync(dest, { recursive: true, force: true });
150
+ fs.cpSync(src, dest, { recursive: true });
151
+ if (stripTests) {
152
+ fs.rmSync(path.join(dest, "__pycache__"), { recursive: true, force: true });
153
+ for (const entry of fs.readdirSync(dest)) {
154
+ if (/^test_.*\.py$/.test(entry)) fs.rmSync(path.join(dest, entry), { force: true });
155
+ }
156
+ }
157
+ }
158
+
159
+ function cmdUpdate(args) {
160
+ const target = path.resolve(args._[0] || ".");
161
+ const addDir = path.join(target, ".add");
162
+ if (!fs.existsSync(path.join(addDir, "tooling")) && !fs.existsSync(path.join(addDir, "state.json"))) {
163
+ fail("no ADD project at " + target + " (.add/ not found) — run `init` first");
164
+ }
165
+ const version = pkgVersion();
166
+ const stamp = readStamp(addDir);
167
+ const cur = stamp && stamp.version ? stamp.version : null;
168
+
169
+ if (args.check) {
170
+ if (cur === version) log("ADD is current: project and package both at " + version + ".");
171
+ else if (cur === null) log("ADD project is unstamped; installed package is " + version + ". Run `update`.");
172
+ else log("ADD update available: project on " + cur + ", package is " + version + ". Run `update`.");
173
+ return;
174
+ }
175
+ if (cur === version && !args.force) {
176
+ log("ADD already at " + version + " — nothing to update (use --force to re-materialize).");
177
+ return;
178
+ }
179
+ // design-for-failure: back up state BEFORE touching anything.
180
+ const stateFile = path.join(addDir, "state.json");
181
+ if (fs.existsSync(stateFile)) {
182
+ fs.copyFileSync(stateFile, path.join(addDir, "pre-update-state.bak.json"));
183
+ }
184
+ for (const [sub, destParts, stripTests] of MANAGED) {
185
+ cleanReplaceTree(path.join(PKG_ROOT, sub), path.join(target, ...destParts), stripTests);
186
+ }
187
+ writeStamp(addDir, version);
188
+ log("ADD updated " + (cur || "(unstamped)") + " -> " + version +
189
+ " · skill · tooling · docs refreshed · your project state untouched.");
190
+ }
191
+
106
192
  function main() {
107
193
  const argv = process.argv.slice(2);
108
194
  const cmd = argv[0] && !argv[0].startsWith("--") ? argv.shift() : "init";
@@ -111,9 +197,14 @@ function main() {
111
197
  case "init":
112
198
  cmdInit(args);
113
199
  break;
200
+ case "update":
201
+ cmdUpdate(args);
202
+ break;
114
203
  case "help":
115
204
  case "--help":
116
- log("usage: npx @pilotspace/add init [targetDir] [--force] [--stage <s>] [--name <n>]");
205
+ log("usage: npx @pilotspace/add <init|update> [targetDir] [--force] [--check]");
206
+ log(" init install the ADD skill + tooling + book into a project");
207
+ log(" update re-materialize skill/tooling/docs to this package version (preserves your state)");
117
208
  break;
118
209
  default:
119
210
  fail("unknown command '" + cmd + "'. Try: npx @pilotspace/add init");
@@ -34,7 +34,7 @@ The flow has an order, but it is not a one-way march. Any step may reveal a gap
34
34
 
35
35
  How much you let the AI do is not a single switch. It is a setting that lives *per scope*, and it can differ from one part of the system to another. A well-tested, low-risk area may run at full autonomy while a new, high-risk one is held back.
36
36
 
37
- The *default starting point* is a deliberate choice. A team that has built up evidence and tooling may **start a scope at auto** — the AI drafts the front, a human approves the frozen contract once, and the build runs and auto-gates on evidence — and *lower to conservative* wherever risk is high. (An earlier formulation started every scope conservative and made autonomy the earned exception; it is the same dial either way — what differs is which end you default to.) Two things never move with the default, whichever way it points: the contract-freeze seam stays human (the AI never freezes the interface it then builds against), and a high-risk scope is always lowered, never auto-run.
37
+ The *default starting point* is a deliberate choice. A team that has built up evidence and tooling may **start a scope at auto** — the AI drafts the specification bundle, a human approves the frozen contract once, and the build runs and auto-gates on evidence — and *lower to conservative* wherever risk is high. (An earlier formulation started every scope conservative and made autonomy the earned exception; it is the same control either way — what differs is which end you default to.) Two things never move with the default, whichever way it points: the contract-freeze decision point stays human (the AI never freezes the interface it then builds against), and a high-risk scope is always lowered, never auto-run.
38
38
 
39
39
  **Consequence:** autonomy is a per-scope setting you choose deliberately and can lower at any time; high-risk scope is held to a human gate regardless of the default (see [11 Governance](./11-governance.md)).
40
40
 
@@ -60,9 +60,9 @@ The instructions you give the AI are plain text that reference files in the repo
60
60
 
61
61
  **Consequence:** the same project works whether the team uses one AI coding tool or another, and switching tools changes nothing structural.
62
62
 
63
- ## 9. Two surfaces: the State you load, the Story you reference
63
+ ## 9. Two layers: the working state you load, the audit trail you reference
64
64
 
65
- A method that fills the context window with its own documentation defeats itself — the agent rots before it reaches the work. So ADD keeps two doc surfaces and never loads both. The **State surface** is everything an agent loads to do the work each session: the `add` skill itself (its router `SKILL.md` and the one phase currently in play) together with the lean, current operational docs — `PROJECT.md` (the foundation), the active `MILESTONE.md`, the active `TASK.md`, and `state.json`. The **Story surface** is this book: the whole method, read once by a person to understand and trust ADD, and thereafter **never auto-loaded** into agent context — only referenced by a pointer. Depth lives on the Story surface; leanness is enforced on the State surface; they never compete for the same tokens.
65
+ A method that fills the context window with its own documentation defeats itself — the agent rots before it reaches the work. So ADD keeps two documentation layers and never loads both. The **working state** is everything an agent loads to do the work each session: the `add` skill itself (its router `SKILL.md` and the one phase currently in play) together with the lean, current operational docs — `PROJECT.md` (the foundation), the active `MILESTONE.md`, the active `TASK.md`, and `state.json`. The **audit trail** is this book plus the records behind it: the whole method, read once by a person to understand and trust ADD, and thereafter **never auto-loaded** into agent context — only referenced by a pointer. Depth lives in the audit trail; leanness is enforced on the working state; they never compete for the same tokens.
66
66
 
67
67
  **Consequence:** the book can be as rich as trust requires without costing a single runtime token, while the loaded working state stays small enough never to rot. It is why the guideline block in `CLAUDE.md`/`AGENTS.md` *points* to `add.py status` and `PROJECT.md` rather than copying them.
68
68
 
@@ -6,15 +6,18 @@
6
6
 
7
7
  ## The flow
8
8
 
9
- AIDD is one repeatable flow of **seven steps**: six build the feature — Specify → Scenarios → Contract → Tests → Build → Verify — and the seventh, **Observe**, feeds what production teaches back into the next Specify. In the default flow the AI drafts the front (steps 1–4) and a person approves it **once**, at the contract freeze; the AI performs the Build; and Verify is resolved on evidence under `autonomy: auto`, with a person owning any residue. (See [11 Governance](./11-governance.md) for the autonomy dial and the one-approval seam.)
9
+ AIDD is one repeatable flow of **seven steps**: six build the feature — Specify → Scenarios → Contract → Tests → Build → Verify — and the seventh, **Observe**, feeds what production teaches back into the next Specify. In the default flow the AI drafts the specification bundle (steps 1–4) and a person approves it **once**, at the contract freeze; the AI performs the Build; and Verify is resolved on evidence under `autonomy: auto`, with a person owning any residue. (See [11 Governance](./11-governance.md) for the autonomy level and the one-approval decision point.)
10
10
 
11
- ![The ADD flow a solid forward spine Specify→Scenarios→Contract→Tests→Build→Verify→Observe, with dashed backward-correction arrows (any phase may return to an earlier one), a Tests⇄Build red/green engine, and Observe looping back to the next Specify](./add-flow.png)
11
+ **Before those seven steps comes a phase-0 preamble: `ground`.** Before it specifies anything, the AI gathers the real current codebase the task touches — the actual files, symbols, signatures, patterns, and conventions — into a lean §0 *grounding map*, surfacing the **anchors** the frozen contract will later cite. Ground is AI-owned and adds no new approval (the one approval stays at the contract freeze); it aims the specification bundle at reality instead of assumption, so the contract, tests, and build are grounded in the code as it actually is. The seven steps keep their numbering and brand — ground precedes them as step 0 (it is drawn as node 0 in the diagram below).
12
+
13
+ ![The ADD flow — a solid primary flow Specify→Scenarios→Contract→Tests→Build→Verify→Observe, with dashed backward-correction arrows (any phase may return to an earlier one), a Tests⇄Build red/green engine, and Observe looping back to the next Specify](./add-flow.png)
12
14
 
13
15
  ```mermaid
14
16
  flowchart LR
17
+ S0["0 Ground<br/>the real codebase"] --> S1["1 Specify<br/>the rules"]
15
18
  S1["1 Specify<br/>the rules"] --> S2["2 Scenarios<br/>pass/fail cases"]
16
19
  S2 --> S3["3 Contract<br/>freeze the shape"]
17
- S3 --> S4["4 Tests<br/>safety net (red)"]
20
+ S3 --> S4["4 Tests<br/>failing-first (red)"]
18
21
  S4 --> S5["5 Build<br/>AI writes code"]
19
22
  S5 --> S6["6 Verify<br/>evidence + checks"]
20
23
  S6 --> OBS["Observe<br/>in production"]
@@ -23,14 +26,14 @@ flowchart LR
23
26
  S5 -. "a missing rule → back to Specify" .-> S1
24
27
  OBS -. "what you learn becomes the next spec" .-> S1
25
28
  classDef human fill:#FAEEDA,stroke:#BA7517,color:#633806;
26
- classDef seam fill:#E1F5EE,stroke:#0F6E56,color:#04342C;
29
+ classDef decision fill:#E1F5EE,stroke:#0F6E56,color:#04342C;
27
30
  classDef machine fill:#E6F1FB,stroke:#185FA5,color:#042C53;
28
31
  class S1,S2 human;
29
- class S3,S4 seam;
30
- class S5,S6 machine;
32
+ class S3,S4 decision;
33
+ class S0,S5,S6 machine;
31
34
  ```
32
35
 
33
- > **Solid arrows are the forward spine** — you never start a phase before its input exists (forward-skip forbidden). **Dashed arrows are backward correction** — any phase may return to an earlier one to repair its artifact (the long loop, Observe → Specify, is the same rule at milestone scale). The tight Tests ⇄ Build cycle is the per-feature red/green engine.
36
+ > **Solid arrows are the primary flow** — you never start a phase before its input exists (forward-skip forbidden). **Dashed arrows are backward correction** — any phase may return to an earlier one to repair its artifact (the long loop, Observe → Specify, is the same rule at milestone scale). The tight Tests ⇄ Build cycle is the per-feature red/green engine.
34
37
 
35
38
  ```text
36
39
  human-led ─────────────────►│◄─────────── machine-led ──► human verify
@@ -45,9 +48,9 @@ flowchart LR
45
48
  └─────────────────────────┘ becomes the next Specify
46
49
  ```
47
50
 
48
- The shape is deliberate: the human-led steps establish direction, a frozen contract forms the seam in the middle, and the AI-led build runs fast and safely on the far side because everything it needs is already fixed.
51
+ The shape is deliberate: the human-led steps establish direction, a frozen contract forms the decision point in the middle, and the AI-led build runs fast and safely on the far side because everything it needs is already fixed.
49
52
 
50
- > **What changed in v7 (the diagrams above show the structural spine, which is unchanged).** The *steps* and their order are exactly as drawn — only **who resolves them** moved. The AI now drafts the whole front (steps 1–4) and a person approves it **once**, at the contract freeze (not a sign-off at each step); and **Verify is auto-gated on evidence** under `autonomy: auto` (the default), escalating security — always a `HARD-STOP` — and other residue to a person. Lower the dial to `conservative` to keep a human at the Verify gate. See [11 Governance](./11-governance.md).
53
+ > **What changed in v7 (the diagrams above show the structural flow, which is unchanged).** The *steps* and their order are exactly as drawn — only **who resolves them** moved. The AI now drafts the whole specification bundle (steps 1–4) and a person approves it **once**, at the contract freeze (not a sign-off at each step); and **Verify is auto-gated on evidence** under `autonomy: auto` (the default), escalating security — always a `HARD-STOP` — and other residue to a person. Lower the autonomy level to `conservative` to keep a human at the Verify gate. See [11 Governance](./11-governance.md).
51
54
 
52
55
  ## Why the order is the order
53
56
 
@@ -58,7 +61,7 @@ Each step produces exactly one artifact, and each artifact is the input to the n
58
61
  | 1 Specify | the rules | scenarios, and everything after |
59
62
  | 2 Scenarios | pass/fail cases | the tests |
60
63
  | 3 Contract | the fixed shape | the tests and the build |
61
- | 4 Tests | the failing safety net | the build and the verification |
64
+ | 4 Tests | the failing-first suite | the build and the verification |
62
65
  | 5 Build | the code | the verification |
63
66
  | 6 Verify | a trusted, releasable change | the release and the next loop |
64
67
 
@@ -66,17 +69,21 @@ The single rule of discipline follows directly: **do not begin a step until the
66
69
 
67
70
  The flow runs in two directions under two rules that never conflict. **Backward correction is always allowed:** any phase may send you back to an earlier one to repair its artifact — a failing Build that exposes a missing rule sends you back to Specify, and that is the loop working ([principle 4](./01-principles.md)), not a failure. **Forward-skipping is forbidden:** you never start a phase before its input artifact exists. Correct backward freely; never skip forward.
68
71
 
72
+ **`done` is terminal — except via the recorded reopen.** Backward correction moves a *live* task; a task at `done` has already passed its gate. The one way back from `done` is the recorded `reopen` action (`add.py reopen <task> --to <phase> --reason "..."`): it returns the task to an earlier phase, resets the gate, and writes down *why* — so a done verdict is never quietly un-done. This is the same backward-correction rule, made explicit at the one state where it would otherwise be bypassed silently.
73
+
69
74
  ## Who does what
70
75
 
71
76
  | Step | Person's job | AI's job |
72
77
  |------|--------------|----------|
73
78
  | 1 Specify | confirm the rules (part of the one approval) | draft; list assumptions to confirm |
74
79
  | 2 Scenarios | confirm what "correct" looks like (part of the one approval) | draft scenarios |
75
- | 3 Contract | **approve & freeze the whole bundle (§1–§4) once — the seam** | draft the contract and mocks |
80
+ | 3 Contract | **approve & freeze the whole bundle (§1–§4) once — the decision point** | draft the contract and mocks |
76
81
  | 4 Tests | confirm the targets (part of the one approval) | draft the failing tests |
77
82
  | 5 Build | direct in small batches | implement until tests pass |
78
83
  | 6 Verify | own the residue (security · concurrency · architecture); approve when `conservative` | gather evidence; **auto-PASS on complete evidence** under `autonomy: auto` |
79
- | 7 Observe | read the signal; fold confirmed deltas into PROJECT.md | run behind a flag; emit competency deltas |
84
+ | 7 Observe | read the signal; consolidate confirmed deltas into PROJECT.md | run behind a flag; emit lessons learned |
85
+
86
+ **What the human sees when it is their turn — the decision arc.** Whenever the flow stops for the human — the baseline approval that ends setup, the contract-freeze decision point and an escalated verify gate within each task, and the wider decision points of the loop (intake · scope · milestone close · stage graduation) — the AI opens its report with the **decision arc**: three engine-sourced lines — `goal:` the milestone goal the work serves · `done:` the proven progress toward it · `plan:` what comes next. The arc renders first, above the report's summary, so the human confirms with sight of the whole trajectory rather than a local snapshot. It is presentation only — it never adds a gate or changes an outcome. See [Appendix C](./appendix-c-glossary.md).
80
87
 
81
88
  ## What survives, and what is disposable
82
89
 
@@ -4,7 +4,7 @@
4
4
 
5
5
  > **Purpose:** state, in plain language, what the feature must do and what it must reject, with no ambiguity left for the AI to resolve by guessing.
6
6
  > **Produces:** `SPEC.md` for the feature.
7
- > **How it works — co-specification:** AI and human **brainstorm the shape together**; the AI drafts; the **human validates, with the AI's advice.** The decisive advice is a *least-sure flag* — the AI names the one or two things most likely to be wrong, so the human's attention lands where it matters. The human owns the decision; the AI owns surfacing what it does not yet know.
7
+ > **How it works — co-specification:** AI and human **brainstorm the shape together**; the AI drafts; the **human validates, with the AI's advice.** The decisive advice is a *lowest-confidence flag* — the AI names the one or two things most likely to be wrong, so the human's attention lands where it matters. The human owns the decision; the AI owns surfacing what it does not yet know.
8
8
 
9
9
  ---
10
10
 
@@ -19,10 +19,10 @@ There is also a diagnostic value: **if you cannot write the spec, you do not yet
19
19
  A specification is not dictated by one side. It is made in three moves:
20
20
 
21
21
  1. **Diverge — brainstorm by both.** Before drafting, the AI surfaces the *decision space*: the two or three genuine ways to frame the feature, and the open questions it would otherwise resolve by guessing. You react — add, kill, redirect. This is the brainstorm, and it lives in the conversation, not in a new document.
22
- 2. **Converge — the AI drafts, and ranks its own uncertainty.** The AI writes the spec below, then ranks what it is least sure about. It does not hand you a flat wall of equal-looking assumptions to nod through; it tells you *where it is most likely wrong, and what that would cost.*
22
+ 2. **Converge — the AI drafts, and ranks its own uncertainty.** The AI writes the spec below, then ranks where its confidence is lowest. It does not hand you a flat list of equal-looking assumptions to nod through; it tells you *where it is most likely wrong, and what that would cost.*
23
23
  3. **Validate — you decide, with the AI's advice.** You read the ranked uncertainty first, then confirm, correct, or send it back. Your approval is real because your attention was aimed.
24
24
 
25
- The brainstorm leaves a *light trace, not a document.* What you chose becomes a rule; what you weighed and dropped becomes a one-line **`Framings weighed:`** note; what stayed genuinely uncertain becomes a **least-sure flag**. Nothing new to maintain — the residue lands in the spec you were writing anyway.
25
+ The brainstorm leaves a *light trace, not a document.* What you chose becomes a rule; what you weighed and dropped becomes a one-line **`Framings weighed:`** note; what stayed genuinely uncertain becomes a **lowest-confidence flag**. Nothing new to maintain — the residue lands in the spec you were writing anyway.
26
26
 
27
27
  ## What a good specification contains
28
28
 
@@ -31,7 +31,7 @@ Four parts, kept short:
31
31
  1. **Must** — the behaviors the feature is required to perform.
32
32
  2. **Reject** — the inputs or situations it must refuse, each paired with a named error.
33
33
  3. **After** — the state that is true once it succeeds (what changed).
34
- 4. **Assumptions — least-sure first** — the things you are taking for granted, **ranked so the most-likely-wrong come first.** The top one or two carry a `⚠` flag with *why it is uncertain* and *what it costs if wrong*; the rest are the low-stakes tail. A spec with genuinely nothing uncertain still names its single biggest risk, however small — the AI never claims a blank mind.
34
+ 4. **Assumptions — lowest-confidence first** — the things you are taking for granted, **ranked so the most-likely-wrong come first.** The top one or two carry a `⚠` flag with *why it is uncertain* and *what it costs if wrong*; the rest are the low-stakes tail. A spec with genuinely nothing uncertain still names its single biggest risk, however small — the AI never claims a blank mind.
35
35
 
36
36
  Naming the errors matters. "Reject bad amounts" is an instruction to guess; `amount <= 0 -> "amount_invalid"` is a rule that produces a testable scenario and a defined contract response.
37
37
 
@@ -47,8 +47,8 @@ Reject:
47
47
  - <bad input / situation> -> "<error_code>"
48
48
  After:
49
49
  - <what is true once it succeeds>
50
- Assumptions — least-sure first:
51
- ⚠ <most-likely-wrong assumption> — least sure because <why>; if wrong: <cost>
50
+ Assumptions — lowest-confidence first:
51
+ ⚠ <most-likely-wrong assumption> — lowest confidence because <why>; if wrong: <cost>
52
52
  - [x] <confirmed / low-stakes assumption> — <one line>
53
53
  ```
54
54
 
@@ -69,8 +69,8 @@ Reject:
69
69
  - source == destination -> "same_account"
70
70
  - balance < amount -> "insufficient_funds"
71
71
  - account not mine -> "forbidden"
72
- Assumptions — least-sure first:
73
- ⚠ same currency only (no FX) in v1 — least sure because the ticket never said; if wrong: the whole amount/rounding model changes and this contract is wrong
72
+ Assumptions — lowest-confidence first:
73
+ ⚠ same currency only (no FX) in v1 — lowest confidence because the ticket never said; if wrong: the whole amount/rounding model changes and this contract is wrong
74
74
  - [x] no daily limit in v1 — confirmed: out of scope for v1
75
75
  ```
76
76
 
@@ -78,16 +78,18 @@ The `Framings weighed:` line shows what was considered and dropped, so the chose
78
78
 
79
79
  ## The AI's role here
80
80
 
81
- Use the AI to **open the space and then narrow it honestly.** First it brainstorms the genuine framings with you (diverge). Then it drafts the spec from whatever raw material you have — a ticket, an interview, a contract document — listing every assumption it had to make, **ranked least-sure first**, and flagging the one or two it is least confident in with *why* and *what it costs if wrong*. Its instinct is to fill gaps silently and present a confident wall; the method forces those gaps into the open, and forces the confident wall to declare its own soft spots. See `playbook/1_specify.md` in [Appendix B](./appendix-b-prompts.md).
81
+ Use the AI to **open the space and then narrow it honestly.** First it brainstorms the genuine framings with you (diverge). Then it drafts the spec from whatever raw material you have — a ticket, an interview, a contract document — listing every assumption it had to make, **ranked lowest-confidence first**, and flagging the one or two it is least confident in with *why* and *what it costs if wrong*. Its instinct is to fill gaps silently and present a confident wall; the method forces those gaps into the open, and forces the confident wall to declare its own soft spots. See `playbook/1_specify.md` in [Appendix B](./appendix-b-prompts.md).
82
82
 
83
- The defining instruction: *if a requirement is unclear, ask — do not resolve it by guessing — and of the things you must assume, say plainly which you are least sure about.*
83
+ The defining instruction: *if a requirement is unclear, ask — do not resolve it by guessing — and of the things you must assume, say plainly where your confidence is lowest.*
84
84
 
85
85
  ## Common mistakes
86
86
 
87
87
  - **Stating only the happy path.** The "Reject" list is where most real complexity lives; an empty one usually means it has not been thought through.
88
88
  - **Free-text errors.** Errors must be named codes, not sentences, so they can become scenarios and contract responses.
89
89
  - **Hidden assumptions.** If an assumption is not written down, it is not confirmed — it is a future bug with a delay timer.
90
- - **A flat wall of "confirmed" assumptions.** Eight equal-looking ticks invite a reflex approval. Rank them; flag the one or two that are load-bearing. An unranked list hides the risk inside the noise.
90
+ - **A flat list of "confirmed" assumptions.** Eight equal-looking ticks invite a reflex approval. Rank them; flag the one or two that are load-bearing. An unranked list hides the risk inside the noise.
91
+ - **"Existing behavior" claims without a citation.** An assumption row that asserts "this is how X works today" is describing intent, not code. Any wiring claim or assumption that depends on the current state of an existing path must carry a grep/line citation (e.g. `file.rs:203`) — otherwise it is a future bug in disguise.
92
+ - **Wiring claims that name a symbol, not a caller chain.** Verifying that a function exists is not the same as verifying it is reachable. A wiring claim is only valid when it names the production caller chain from an actual entry point — not just the symbol's location in a file. A function that nothing calls is dead, not wired.
91
93
 
92
94
  ## Exit check
93
95
 
@@ -96,7 +98,7 @@ A spec is done when:
96
98
  - [ ] Every required behavior is stated explicitly.
97
99
  - [ ] Every rejection has a named error code.
98
100
  - [ ] The success state-change is described.
99
- - [ ] The assumptions are ordered least-sure first, and the one or two `⚠` flags carry *why* + *cost* — or, for genuinely trivial scope, an honest "none material" that still names the single biggest risk.
101
+ - [ ] The assumptions are ordered lowest-confidence first, and the one or two `⚠` flags carry *why* + *cost* — or, for genuinely trivial scope, an honest "none material" that still names the single biggest risk.
100
102
 
101
103
  The shift from older practice: you no longer pre-confirm every assumption to advance. You confirm that the AI has *ranked* its uncertainty and that you have *engaged the top of the rank.* Stated honestly: the flag makes a genuine review cheap and a lazy one visibly negligent — it cannot force the read. That is the most a lightweight check can buy.
102
104
 
@@ -108,7 +110,7 @@ If you cannot state a rule clearly, the feature is not ready to build. Stop, tak
108
110
 
109
111
  ## The one approval, and where the flag really lands
110
112
 
111
- In the one-approval front, you do not approve the spec alone — you approve the whole frozen bundle (spec, scenarios, contract, tests) once, at the contract freeze. So the least-sure flag is **bundle-wide**: at that single seam the AI leads with *"of everything I'm asking you to freeze, these one or two points are most likely wrong"* — and a flag may point at an uncovered scenario or the contract shape, not only a spec assumption. The ranking you do here in Specify is the first feeder into that one gate. See [05 Contract](./05-step-3-contract.md) and the `add` skill's `run.md`.
113
+ In the one-approval flow, you do not approve the spec alone — you approve the whole frozen bundle (spec, scenarios, contract, tests) once, at the contract freeze. So the lowest-confidence flag is **bundle-wide**: at that single decision point the AI leads with *"of everything I'm asking you to freeze, these one or two points are most likely wrong"* — and a flag may point at an uncovered scenario or the contract shape, not only a spec assumption. The ranking you do here in Specify is the first input into that one gate. See [05 Contract](./05-step-3-contract.md) and the `add` skill's `run.md`.
112
114
 
113
115
  ---
114
116
 
@@ -6,7 +6,7 @@
6
6
  > **Produces:** `features/<name>.feature`.
7
7
  > **Person's job:** decide what "correct" looks like in concrete situations. **AI's job:** draft the scenarios.
8
8
 
9
- > **Part of the one-approval front (v7).** In the default flow these scenarios are drafted by the AI alongside the spec, contract, and failing tests as **one bundle**, approved by a person **once**, at the contract freeze — not signed off step by step. This chapter is how to get the scenarios *right*; [05 Contract](./05-step-3-contract.md) is where the bundle is frozen. See [11 Governance](./11-governance.md).
9
+ > **Part of the specification bundle (v7).** In the default flow these scenarios are drafted by the AI alongside the spec, contract, and failing tests as **one bundle**, approved by a person **once** (the one approval), at the contract freeze — not signed off step by step. This chapter is how to get the scenarios *right*; [05 Contract](./05-step-3-contract.md) is where the bundle is frozen. See [11 Governance](./11-governance.md).
10
10
 
11
11
  ---
12
12
 
@@ -14,7 +14,7 @@
14
14
 
15
15
  A plain rule is still open to interpretation. "Source must have enough balance" leaves open: enough for what, exactly? What happens to the balances when it is *not* enough? A scenario removes the interpretation by pinning a specific situation to a specific expected result.
16
16
 
17
- Scenarios occupy a unique position: they are **readable by people and checkable by machines at the same time.** A product owner can confirm a scenario is what they meant; a test can be generated directly from it. This makes them the bridge between the human-led front of the flow and the machine-led back. They are the single most leverage-bearing artifact in the method, because everything downstream — the tests, and through them the build's definition of success — is generated from them.
17
+ Scenarios occupy a unique position: they are **readable by people and checkable by machines at the same time.** A product owner can confirm a scenario is what they meant; a test can be generated directly from it. This makes them the bridge between the human-led half of the flow and the machine-led half. They are the single most leverage-bearing artifact in the method, because everything downstream — the tests, and through them the build's definition of success — is generated from them.
18
18
 
19
19
  ## The form
20
20
 
@@ -6,13 +6,13 @@
6
6
  > **Produces:** `contracts/<name>.md` (plus a mock and contract tests).
7
7
  > **Person's job:** approve and freeze the shape. **AI's job:** generate the first draft, the mock, and the contract tests.
8
8
 
9
- > **The one approval lands here (v7).** In the default flow the AI drafts the whole front — spec, scenarios, this contract, and the failing tests as **one bundle**, and a person gives a **single approval at this freeze**. Freezing the contract is the one human gate of the front, not the third of three sign-offs; reject any part and the whole bundle returns to draft (backward correction, not failure). See [11 Governance](./11-governance.md).
9
+ > **The one approval lands here (v7).** In the default flow the AI drafts spec, scenarios, this contract, and the failing tests as **one specification bundle**, and a person gives a **single approval at this freeze**. Freezing the contract is the one human gate of the bundle, not the third of three sign-offs; reject any part and the whole bundle returns to draft (backward correction, not failure). See [11 Governance](./11-governance.md).
10
10
 
11
11
  ---
12
12
 
13
- ## The seam of the whole method
13
+ ## The decision point of the whole method
14
14
 
15
- This step is the seam between the human-led and machine-led halves of the flow, and it is what makes everything after it safe.
15
+ This step is the decision point between the human-led and machine-led halves of the flow, and it is what makes everything after it safe.
16
16
 
17
17
  The reasoning is simple. The AI is allowed to write and rewrite code quickly. That is only safe if there is a stable surface that the rest of the system depends on and that the AI is not allowed to disturb. The frozen contract is that surface. Below it, the code is disposable and can be regenerated freely; above it, nothing breaks, because the shape it depends on does not move.
18
18
 
@@ -6,7 +6,7 @@
6
6
  > **Produces:** a failing (red) automated test suite.
7
7
  > **Person's job:** set the targets and coverage. **AI's job:** generate the tests.
8
8
 
9
- > **Part of the one-approval front (v7).** In the default flow these tests are drafted by the AI as part of the front **bundle** (spec · scenarios · contract · tests) and approved by a person **once**, at the contract freeze — the tests are part of what that single approval covers. They still must be **red before the build**. See [11 Governance](./11-governance.md).
9
+ > **Part of the specification bundle (v7).** In the default flow these tests are drafted by the AI as part of the specification **bundle** (spec · scenarios · contract · tests) and approved by a person **once**, at the contract freeze — the tests are part of what that one approval covers. They still must be **red before the build**. See [11 Governance](./11-governance.md).
10
10
 
11
11
  ---
12
12
 
@@ -18,7 +18,7 @@ The reason is mechanical. If code is written first and tests after, the tests ar
18
18
 
19
19
  ## The must-fail principle
20
20
 
21
- After generating the tests, you run them — and they must **fail**, because no implementation exists yet. This sounds trivial and is not. A test that passes before any code is written is testing nothing; it is a false reassurance that will later wave bad code through. Confirming the suite is "red for the right reason" (a missing implementation, not a broken test) is what makes it a genuine safety net.
21
+ After generating the tests, you run them — and they must **fail**, because no implementation exists yet. This sounds trivial and is not. A test that passes before any code is written is testing nothing; it is a false reassurance that will later wave bad code through. Confirming the suite is "red for the right reason" (a missing implementation, not a broken test) is what makes it genuinely protective.
22
22
 
23
23
  ## What to test
24
24
 
@@ -60,6 +60,11 @@ The AI generates the test suite from the scenarios and contract. Your job is to
60
60
  - **A green suite before the build.** Means the tests are not actually exercising the missing feature — fix them now.
61
61
  - **Skipping the side-effect assertions.** Without `assert a.balance == 20` on the rejection path, a corrupting partial failure passes silently.
62
62
  - **No coverage target.** Without a recorded target, coverage can quietly erode during the build.
63
+ - **`should_panic` as a red test.** Marking a test `#[should_panic(expected = "implement in green wave")]` (or the equivalent in any language) passes immediately and reports green while the path is still unimplemented — it is a lying red. Declare unimplemented paths with `todo!()` (or `unimplemented!()`) so the test actually fails. If a test is intentionally designed to flip from red to green during the build, say so with a comment: `// flip authorized at green wave`.
64
+ - **Collateral tests named by category, not by exact name.** When a spec adds a slash command, a new CLI subcommand, or any other globally-enumerated thing, there is a fixed collateral set of tests that count or enumerate it (e.g. a command-registry count test, a help-text snapshot, an autocomplete positional assert). Pre-list these tests by their **exact test names** in §4 — not categories — so the build agent's edits to those "pre-existing" tests are expected and the count is right. Naming only the category means the agent finds the wrong test or misses one.
65
+ - **Arithmetic not checked against frozen constants.** Before freezing, check that the red suite can reach green: a fixture with N bytes fails a hard-coded M-byte budget if N > M — the suite can never pass. Run the numbers before freeze, and add an additive override (e.g. `set_budget`) when the scenario implies a limit the production constant cannot satisfy in test.
66
+ - **Non-hermetic tests that read real user state.** Tests that call a loader with `None` (defaulting to `~/.helios/settings.json` or the real home dir) become torn-read flakes under a parallel suite and assert nothing useful. Red tests that create or read production paths must redirect them to a temp dir; grep new tests for `home_dir`, `~/.config`, real-path defaults before freeze.
67
+ - **Tests that share a per-machine singleton without isolation.** Background services (embedded servers, filesystem watchers) bind to fixed ports or paths. Tests that start such a service must tear it down, or they collide with a parallel run or an already-running dev instance. If the singleton cannot be isolated, gate those tests as serial (one thread, no parallel execution) and document it.
63
68
 
64
69
  ## Exit check
65
70
 
@@ -67,6 +72,9 @@ The AI generates the test suite from the scenarios and contract. Your job is to
67
72
  - [ ] The suite runs in the pipeline and is **red for the right reason**.
68
73
  - [ ] Tests assert observable behavior, not internals.
69
74
  - [ ] A coverage target is recorded.
75
+ - [ ] No `should_panic` lying reds — unimplemented paths use `todo!()` or equivalent so they actually fail.
76
+ - [ ] Collateral tests for globally-enumerated things (command counts, help snapshots) are listed by exact name.
77
+ - [ ] Arithmetic checked: the red fixtures can reach green against the frozen constants.
70
78
 
71
79
  ## If the check fails
72
80
 
@@ -63,7 +63,7 @@ The autonomy granted in this step should match the evidence and your review capa
63
63
 
64
64
  ## Common mistakes
65
65
 
66
- - **Batches too large to review.** Shrinks verification to rubber-stamping.
66
+ - **Batches too large to review.** Shrinks verification to approving without reading.
67
67
  - **Letting the AI add unknown dependencies.** The allow-list check in the pipeline should block this automatically; if it does not, the supply-chain risk is real (an AI may invent a plausible package name that an attacker has registered).
68
68
  - **Accepting "all tests pass" without reading the change.** Passing tests are necessary, not sufficient — the next step exists for exactly this reason.
69
69
 
@@ -78,3 +78,5 @@ The autonomy granted in this step should match the evidence and your review capa
78
78
  ## If the check fails
79
79
 
80
80
  If the AI weakened a test, reject and re-prompt. If it added an out-of-allow-list package, the pipeline blocks it; have the AI find an approved alternative or raise the package for human approval. If the batch is too large to review, ask the AI to split the work and resubmit. Only once the exit check passes does the change proceed to verification.
81
+
82
+ And in the other direction: if the *verify* gate later finds a confirmed cheat — a tamper, or a build that gamed the green (overfit to the fixtures, vacuous asserts, stubbed-away logic) — the task returns *here* for an honest redo. That return is the **bounded self-heal loop** (see the run chapter): revert the tampered file or de-overfit the code, then advance again. It is capped — after the cap a confirmed cheat HARD-STOPs to the human rather than looping forever, and a gamed green is never auto-passed.
@@ -12,16 +12,16 @@
12
12
 
13
13
  The build produced passing tests. That is necessary but not sufficient. Verification is where a person establishes trust — and the principle governing it is *trust through evidence, not inspection.*
14
14
 
15
- This needs care, because it is easy to misread. "Not by inspection" does not mean "do not look at the code." It means the *basis* of trust is the passing evidence plus a deliberate check of the specific things tests cannot easily catch — not a general impression that the code reads plausibly. Plausibility is exactly the trap: AI code is frequently plausible and wrong. So verification has two parts: confirm the evidence, then check the known blind spots.
15
+ This needs care, because it is easy to misread. "Not by inspection" does not mean "do not look at the code." It means the *basis* of trust is the passing evidence plus a deliberate check of the specific things tests cannot easily catch — not a general impression that the code reads plausibly. Plausibility is exactly the trap: AI code is frequently plausible and wrong. So verification has two parts: confirm the evidence, then check the known non-functional risks.
16
16
 
17
- ## Who resolves Verify — the evidence auto-gate
17
+ ## Who resolves Verify — the automated quality gate
18
18
 
19
- Verify can be resolved two ways, set per task by the `autonomy:` header (see [governance](./11-governance.md) and the autonomy dial):
19
+ Verify can be resolved two ways, set per task by the `autonomy:` header (see [governance](./11-governance.md) and the autonomy level):
20
20
 
21
21
  - **Auto (the default).** When `autonomy: auto`, the run resolves the gate on **evidence** rather than waiting for a person — but only when *all* of these hold: every test green, coverage not decreased, no test weakened and no contract edited, the convergence loops dry, and **no residue** (security, concurrency, or architecture). It records `PASS` as *auto-resolved*, naming the run as the accountable owner — an explicit pass, not a skip. This is principle 7: a gate may be resolved by evidence when that evidence is sufficient and the result is logged.
22
22
  - **Human.** When `autonomy: conservative`, or whenever the run finds residue it cannot judge, the gate stops for a person; the two parts below are theirs.
23
23
 
24
- **Security is always a `HARD-STOP` and is never auto-passed, at any autonomy level.** The two parts that follow — confirm the evidence, then check the blind spots — are what *either* resolver works through; the only question is whether a person or the recorded run signs the outcome.
24
+ **Security is always a `HARD-STOP` and is never auto-passed, at any autonomy level.** The two parts that follow — confirm the evidence, then check the non-functional risks — are what *either* resolver works through; the only question is whether a person or the recorded run signs the outcome.
25
25
 
26
26
  ## Part one — confirm the evidence
27
27
 
@@ -40,6 +40,22 @@ Automated tests are excellent at behavior on defined inputs and poor at a few sp
40
40
  - **Security.** Are there exposed secrets, injection openings, or unexpected dependencies? AI-generated code is known to hardcode secrets and to pull in packages by plausible-but-wrong names.
41
41
  - **Architecture conformance.** Does the change respect the layering and dependency rules in `CONVENTIONS.md`? Speed with no architectural check produces a fast-growing tangle that becomes unmaintainable within months.
42
42
 
43
+ ## Part three — the deep check (do not skim)
44
+
45
+ Two code failures slip straight past green tests. The first is code that is never *wired in* — a new function that nothing calls, an endpoint no route reaches: the tests for it pass in isolation while the feature is, in practice, absent. The second is the opposite — code left *dead* behind a path nothing exercises, quietly rotting. And for a change that produced prose rather than code, the equivalent failure is signing off on a claim you never actually read in full. Plausibility hides all three. So verification carries one explicit requirement beyond the non-functional review:
46
+
47
+ > Deep check — do not skim. If the task produced code, record that every new symbol is referenced (wiring) and that no new dead/unused code was introduced. If it produced prose or non-code, record a semantic read — what you read in full and what it confirmed. Which path applies is the resolver's judgement; the engine never classifies.
48
+
49
+ This is *evidence*, not impression: a reference search showing where each new symbol is called, a scan confirming nothing new is orphaned, or — for prose — a note of exactly what was read and what it confirmed. An unfilled deep check is a **shallow verify**, not a pass. The engine cannot judge wiring, dead code, or whether prose was truly read; the resolver records the evidence, and a person (under `conservative`) or the recorded run (under `auto`) signs it.
50
+
51
+ **The wiring trace is a named step, not a free-form note.** For every new hook, closure, or middleware registered in this task: trace from the process entry point to the call site and record it explicitly — symbol, file, line. A symbol that is only reachable via a test helper or `make_config` but not via the production entry point (e.g. `build_harness_with_dispatcher`, `interactive_mode`) is not wired. This is the third repeated class in production: "runtime-activation-order/silent-noop" — the code exists and the unit tests pass, but the feature is absent in the running program. The wiring trace is how you catch it before a user does.
52
+
53
+ ## Part four — was the green earned?
54
+
55
+ Passing tests say the code satisfies the cases you wrote down. They do not say it earned that pass honestly — and the mechanical tamper tripwire (Step 6's floor) only catches an *edited* test or contract, not a build that gamed the *unchanged* suite. The same rubric the phase guide carries names what the tripwire cannot see:
56
+
57
+ A green suite proves the tests pass — not that the build EARNED them. Three judgment cheats pass the unchanged suite without earning it: src overfit to the test fixtures (special-cased to the literal inputs, not the general behavior §1 asked for), vacuous asserts (tautological — green even against an empty implementation), and real logic stubbed away (the function returns a constant the tests happen to accept). These cheats are invisible to the mechanical tamper tripwire, which only sees edited files. Score them with an adversarial refute-read: an independent reviewer — a subagent under `autonomy: auto` is recommended, the engine never spawns one — prompted to argue the green was NOT earned from outside the build context. This is the verify-gate, whole-suite specialization of run.md's adversarial verify (see run.md), not a new discipline. A confirmed earned-green failure is HARD-STOP-class: never auto-passed, never RISK-ACCEPTED — but a first cheat is a chance to redo: a confirmed cheat (mechanical tamper or a reported earned-green failure) enters the bounded self-heal loop — it returns to build for an honest redo, and only after the loop's cap does it HARD-STOP to the human (the loop lives in run.md).
58
+
43
59
  ## Recording the outcome
44
60
 
45
61
  Every verification ends with exactly one recorded outcome, with an accountable owner — never a silent pass:
@@ -58,14 +74,18 @@ A security finding is always a `HARD-STOP`; it is never waved through with a wai
58
74
  - [ ] Concurrency/timing of the risky operation is safe.
59
75
  - [ ] No exposed secrets, injection openings, or unexpected dependencies.
60
76
  - [ ] Layering and dependencies follow `CONVENTIONS.md`.
77
+ - [ ] Deep check (do not skim): for code, every new symbol is referenced (wiring) and no new dead/unused code was introduced; for prose/non-code, a semantic read is recorded.
61
78
  - [ ] The change is approved — by a person, **or** (under `autonomy: auto`, no residue) auto-resolved by the run as the recorded accountable owner.
62
79
  - [ ] An outcome is recorded (`PASS` / `RISK-ACCEPTED` / `HARD-STOP`).
63
80
 
64
81
  ## Common mistakes
65
82
 
66
- - **Shipping on plausibility.** Reading the diff, finding it reasonable, and approving — without the evidence and the blind-spot checks — is the precise failure the method exists to prevent.
83
+ - **Shipping on plausibility.** Reading the diff, finding it reasonable, and approving — without the evidence and the non-functional review — is the precise failure the method exists to prevent.
67
84
  - **Treating a security gap as acceptable risk.** It is a `HARD-STOP`, not a waiver.
68
85
  - **Skipping the concurrency check** because the tests are green. Tests rarely exercise simultaneity; this is a manual check by design.
86
+ - **Trusting the green agent's self-reported test count.** A build agent running a filtered suite (e.g. `-E 'test(theme)'`) only sees tests inside the filter. Collateral failures outside the filter — a stale count in `all_commands_in_registry`, an e2e snapshot the agent did not touch — are invisible. The orchestrator's **full-suite rerun is load-bearing**; never skip it on the grounds that the scoped run was green.
87
+ - **User-observable-only failures escalate to the human before exhausting discriminating probes.** When a symptom is only observable by a person (a TCC dialog, a visual flicker, an OS-level prompt), do not respond by running the suite again. Instead, design two or three targeted probes that let the user distinguish cause A from cause B in one interaction each. Three AskUser probes resolve what three blind reruns cannot.
88
+ - **Background-process hangs misdiagnosed as test failures.** A test that never exits is not a failure in the test logic — it is a hang. The diagnosis recipe: background the test process, run `pgrep` to find it, use the platform profiler (`sample <pid>` on macOS, `perf` on Linux) to sample the stack, then `lsof -p <pid>` to see open files. Run an isolation experiment (suspect line on/off, 3×3) before reading any code. Entry-count caps do not bound wall time — a single huge directory or a blocking syscall inside a `spawn_blocking` call can hang indefinitely even when the entry cap is satisfied.
69
89
 
70
90
  ## If the check fails
71
91
 
@@ -15,7 +15,7 @@ That information is the input to the next cycle. What you learn in production be
15
15
 
16
16
  ## Release deliberately
17
17
 
18
- Release behind a mechanism that limits the blast radius of a mistake — a feature flag, a gradual rollout, or both. The verification step established that the feature is correct against everything you anticipated; a controlled release is your protection against what you did not anticipate. If something is wrong, you want to affect a few users and roll back, not affect everyone and scramble.
18
+ Release behind a mechanism that limits the scope of impact of a mistake — a feature flag, a gradual rollout, or both. The verification step established that the feature is correct against everything you anticipated; a controlled release is your protection against what you did not anticipate. If something is wrong, you want to affect a few users and roll back, not affect everyone and scramble.
19
19
 
20
20
  ## Reuse the scenarios as monitors
21
21
 
@@ -33,9 +33,9 @@ Every defect, surprise, or new need is written up as a change to the specificati
33
33
 
34
34
  This is also where the AI returns to a useful role: summarizing telemetry, clustering errors into themes, and drafting the proposed spec delta for a person to review. But the production decisions — what to roll back, what to prioritize — remain human.
35
35
 
36
- ## Competency deltas and the foundation fold
36
+ ## Lessons learned and the retrospective consolidation
37
37
 
38
- A spec delta feeds the *next feature*. But a loop also teaches the **method itself** — that the domain model missed a boundary, that a whole class of scenario was never tested, that a build convention helped or hurt. AIDD captures those as **competency deltas**: a single tagged learning, written in the Observe step, marking which of the five competencies it sharpens.
38
+ A spec delta feeds the *next feature*. But a loop also teaches the **method itself** — that the domain model missed a boundary, that a whole class of scenario was never tested, that a build convention helped or hurt. AIDD captures those as **lessons learned**: each is a single tagged learning, written in the Observe step, marking which of the five competencies it sharpens.
39
39
 
40
40
  | tag | competency | a delta here means you learned something about… |
41
41
  |-----|------------|--------------------------------------------------|
@@ -45,11 +45,11 @@ A spec delta feeds the *next feature*. But a loop also teaches the **method itse
45
45
  | `TDD` | Test | how we prove correctness — a missing scenario, a flaky or hollow test |
46
46
  | `ADD` | AI/build | how the AI builds — a harness, prompt, or convention that helped or hurt |
47
47
 
48
- Each delta is one tagged entry — `- [COMPETENCY · status] the learning (evidence: a pointer)` — and the evidence is **required**: a failing scenario, a production signal, a review note. No evidence means it is an opinion, not a delta. The AI **emits** deltas as `open`; it never folds its own. Folding is judgment, and judgment is the human's — the same verify/observe seam that keeps the AI from grading its own work.
48
+ Each delta is one tagged entry — `- [COMPETENCY · status] the learning (evidence: a pointer)` — and the evidence is **required**: a failing scenario, a production signal, a review note. No evidence means it is an opinion, not a delta. The AI **emits** deltas as `open`; it never consolidates its own. Consolidation is judgment, and judgment is the human's — the same verify/observe decision point that keeps the AI from grading its own work.
49
49
 
50
- **The fold.** At milestone close (or on demand, when open deltas pile up), a person runs the fold ritual: **gather** every `open` delta across the milestone's tasks, **group** them by competency, **propose** the exact foundation edit for each, **confirm** with the human one by one, then **write** — append-only — flipping each delta to `folded` (merged) or `rejected` (considered and deliberately not merged, left in place so the trail survives), and bumping the `foundation-version:` marker. `DDD`/`SDD`/`UDD` deltas fold into the matching section of `PROJECT.md`; `TDD`/`ADD` fold into `CONVENTIONS.md` (they sharpen the engine, not the product); and **every** fold also appends one row to `PROJECT.md` §Key Decisions — the universal, auditable record of what the foundation learned.
50
+ **The consolidation.** At milestone close (or on demand, when open deltas pile up), a person runs the retrospective consolidation: **gather** every `open` delta across the milestone's tasks, **group** them by competency, **propose** the exact foundation edit for each, **confirm** with the human one by one, then **write** — append-only — flipping each delta to `folded` (merged) or `rejected` (considered and deliberately not merged, left in place so the trail survives), and bumping the `foundation-version:` marker. `DDD`/`SDD`/`UDD` deltas consolidate into the matching section of `PROJECT.md`; `TDD`/`ADD` consolidate into `CONVENTIONS.md` (they sharpen the engine, not the product); and **every** consolidation also appends one row to `PROJECT.md` §Key Decisions — the universal, auditable record of what the foundation learned.
51
51
 
52
- **Tooling.** `add.py deltas` lists every open delta across the project (so nothing waiting to be folded is invisible); `add.py check` lints each delta's well-formedness — known competency tag, valid status, non-empty evidence. There is deliberately **no `add.py fold`**: the engine stays judgment-free, and the ritual lives with the human who owns it.
52
+ **Tooling.** `add.py deltas` lists every open delta across the project (so nothing waiting to be consolidated is invisible); `add.py check` lints each delta's well-formedness — known competency tag, valid status, non-empty evidence. There is deliberately **no `add.py fold`**: the engine stays judgment-free, and the ritual lives with the human who owns it.
53
53
 
54
54
  ## Re-entrancy: the loop is the whole point
55
55
 
@@ -57,5 +57,11 @@ Two principles converge here. *The flow is re-entrant* — any step can send you
57
57
 
58
58
  A team operating this way does not experience requirements changing as a failure of planning. It experiences it as the system working: reality is teaching the specification, and the specification is teaching the next build.
59
59
 
60
+ ## The milestone holds until its goal is met
61
+
62
+ A single feature loops through Observe back to Specify; a **milestone** has the same shape at a larger scale, and a gate to match. A milestone is not finished when its tasks are done — it is finished when its **goal** is met, expressed as the exit criteria in `MILESTONE.md`. So `add.py milestone-done` is **goal-gated**: it refuses to close a milestone while any exit criterion is still unchecked, and **holds until** every box is checked. Those checkboxes are the human's affirmation that the goal is genuinely met — the engine reads the tally, it never judges the goal itself. (A milestone with no exit criteria closes as before; `milestone-done` is the only path to `done`, and archiving refuses anything not yet done — so the one gate cannot be slipped.)
63
+
64
+ While the milestone is held open, the work each task leaves behind — open lessons, and items discovered but out of scope — becomes its next tasks: the AI proposes them, the human confirms, and the loop continues until the goal is reached. The milestone is the loop made concrete; the exit criteria are its finish line.
65
+
60
66
  > **Do:** release small, watch the scenarios, and feed every learning back into the spec.
61
67
  > **Don't:** treat shipping as the end. The most valuable information about a feature arrives *after* it ships.