cclaw-cli 0.51.26 → 0.51.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,7 +5,120 @@ import { stageExamples } from "./examples.js";
5
5
  import { reviewStackAwareRoutes, reviewStackAwareRoutingSummary, stageAutoSubagentDispatch, stageSchema, stageTrackRenderContext } from "./stage-schema.js";
6
6
  import { conversationLanguagePolicyMarkdown } from "./language-policy.js";
7
7
  import { referencePatternsForStage } from "./reference-patterns.js";
8
+ import { harnessDelegationRecipes } from "../harness-adapters.js";
8
9
  const VERIFICATION_STAGES = ["tdd", "review", "ship"];
10
+ // ---------- Cross-cutting universal mechanics (Layer 2 building blocks) ----------
11
+ //
12
+ // These are shared, structural blocks that get injected into every stage skill.
13
+ // They check structural shape, not domain content. Each has a matching linter
14
+ // rule in `src/artifact-linter.ts` so artifacts can fail when shape is missing.
/**
 * Response openers treated as "agreement theater".
 *
 * Every entry is lowercase. `antiSycophancyBlock()` renders this list into the
 * Anti-sycophancy skill section.
 */
export const FORBIDDEN_SYCOPHANCY_PHRASES = [
  "you're absolutely right",
  "great point",
  "absolutely!",
  "thanks for catching",
  "thanks for the great",
  "good catch",
  "love this",
  "nailed it",
];
/**
 * Placeholder tokens that plans, specs, designs, and review artifacts must not
 * contain. `noPlaceholdersBlock()` renders this list into the NO PLACEHOLDERS
 * skill section.
 */
export const FORBIDDEN_PLACEHOLDER_TOKENS = [
  "TBD",
  "TODO",
  "FIXME",
  "implement later",
  "similar to Task",
  "add appropriate error handling",
  "add proper logging",
  "fill this in",
  "<placeholder>",
];
/**
 * Regex source (pass to `new RegExp(...)`) for one calibrated finding line:
 *
 *   [P1|P2|P3] (confidence: <n>/10) <repo-relative-path>[:<line>] — <description>
 *
 * The confidence score is limited to 0-10 (a leading zero such as `07` is
 * tolerated), so impossible scores like `42/10` are rejected. The path is any
 * run of non-whitespace characters, followed by whitespace and an em dash.
 */
export const CONFIDENCE_FINDING_REGEX_SOURCE = "\\[P[123]\\]\\s*\\(confidence:\\s*(?:10|0?\\d)/10\\)\\s+[^\\s]+(?::\\d+)?\\s+—";
/**
 * Builds the "STOP-per-issue Protocol" markdown section.
 *
 * @returns {string} Markdown text ending with a single trailing newline.
 */
export function stopPerIssueBlock() {
  const lines = [
    "## STOP-per-issue Protocol",
    "",
    "After each critical section (premise / alternatives / mode pick / each review finding), STOP and record one decision marker before continuing:",
    "",
    "- `Q<n>:` — issue or open question",
    "- `decision:` — `accept` / `reject` / `defer` / `skip — no issues`",
    "- `rationale:` — one line, evidence-backed",
    "",
    "Do not batch decisions. Do not silently move on. The artifact MUST contain at least one `decision:` marker per critical section.",
    "", // yields the trailing newline
  ];
  return lines.join("\n");
}
/**
 * Builds the "Confidence Calibration" markdown section, which documents the
 * calibrated finding format and its suppression rules.
 *
 * @returns {string} Markdown text ending with a single trailing newline.
 */
export function confidenceCalibrationBlock() {
  const lines = [
    "## Confidence Calibration",
    "",
    "Findings, recommendations, and review notes use the calibrated finding format:",
    "",
    "`[P1|P2|P3] (confidence: <n>/10) <repo-relative-path>[:<line>] — <one-line description>`",
    "",
    "- `P1` blocks merge; `P2` should be addressed; `P3` is nice-to-have.",
    "- Confidence `< 7` — suppress unless severity is `P1`.",
    '- "What evidence would change this?" — every finding must answer it inline or in the next bullet.',
    '- Never assert "this is fine" without confidence; never assert confidence above `8` without a cited artifact, line, or test.',
    "", // yields the trailing newline
  ];
  return lines.join("\n");
}
62
+ export function outsideVoiceSlotBlock() {
63
+ return `## Outside Voice Slot (optional)
64
+
65
+ Reserve a section titled \`## Outside Voice\` (or \`## Outside Voice — <model/critic>\`) for a second-model or fresh-context critic perspective when used. Required shape when present:
66
+
67
+ - \`source:\` — model id, critic agent name, or human reviewer handle
68
+ - \`prompt:\` — exact frame sent (or reference to \`docs/quality-gates.md\` recipe)
69
+ - \`tension:\` — at least one disagreement with the main draft, or \`none — converged\`
70
+ - \`resolution:\` — accepted / rejected / merged / deferred + one-line rationale
71
+
72
+ Empty when not used; do not fabricate an outside voice.
73
+ `;
74
+ }
75
+ export function antiSycophancyBlock() {
76
+ const phrases = FORBIDDEN_SYCOPHANCY_PHRASES.map((p) => `\`${p}\``).join(", ");
77
+ return `## Anti-sycophancy
78
+
79
+ Forbidden response openers when receiving review, critic output, or user feedback: ${phrases}.
80
+
81
+ Replace agreement theater with one of:
82
+
83
+ - \`Verified — <evidence>\` (you actually checked)
84
+ - \`Disagree — <reason>\` (you push back with substance)
85
+ - \`Investigating — <next step>\` (you do not yet know)
86
+
87
+ Never agree before reading the cited evidence. Never apologize for asking a clarifying question.
88
+ `;
89
+ }
90
+ export function noPlaceholdersBlock() {
91
+ const tokens = FORBIDDEN_PLACEHOLDER_TOKENS.map((p) => `\`${p}\``).join(", ");
92
+ return `## NO PLACEHOLDERS Rule
93
+
94
+ Plans, specs, designs, and review artifacts MUST NOT contain placeholder tokens: ${tokens}. Use repo-relative paths and concrete commands; if a value is genuinely unknown, write the open question explicitly with a \`Q<n>:\` marker and a \`decision: defer — <reason>\` row instead of inserting a placeholder token.
95
+ `;
96
+ }
97
+ export function watchedFailProofBlock() {
98
+ return `## Watched-fail Proof
99
+
100
+ Any "the failure is real" claim (failing test, broken build, regression catch, deployment fail) MUST include a watched-fail proof line in the artifact:
101
+
102
+ \`proof: <iso-ts> | <observed snippet — first 200 chars> | source: <command or log path>\`
103
+
104
+ For TDD specifically, this is the watched-RED proof and is required per new test before \`stage-complete\` accepts the stage.
105
+ `;
106
+ }
/**
 * Assembles the cross-cutting mechanics sections injected into a stage skill.
 *
 * All stages share the universal mechanics, but each stage's matching linter
 * rules decide what is mandatory vs. structural-only. The Watched-fail Proof
 * section is appended only for the stages listed in `VERIFICATION_STAGES`
 * (tdd, review, ship); reusing that constant keeps this check from drifting
 * away from the module's single list of verification stages.
 *
 * @param {string} stage - Stage identifier (e.g. "tdd", "review", "ship").
 * @returns {string} The section blocks joined with "\n".
 */
function crossCuttingMechanicsBlock(stage) {
  const sections = [
    stopPerIssueBlock(),
    confidenceCalibrationBlock(),
    outsideVoiceSlotBlock(),
    antiSycophancyBlock(),
    noPlaceholdersBlock(),
  ];
  if (VERIFICATION_STAGES.includes(stage)) {
    sections.push(watchedFailProofBlock());
  }
  return sections.join("\n");
}
9
122
  function whenNotToUseBlock(items) {
10
123
  if (items.length === 0) {
11
124
  return "";
@@ -68,13 +181,25 @@ function autoSubagentDispatchBlock(stage, track) {
68
181
  const mandatoryList = mandatory.length > 0 ? mandatory.map((a) => `\`${a}\``).join(", ") : "none";
69
182
  const delegationLogRel = `${RUNTIME_ROOT}/state/delegation-log.json`;
70
183
  const delegationEventsRel = `${RUNTIME_ROOT}/state/delegation-events.jsonl`;
71
- const artifactRef = `${RUNTIME_ROOT}/artifacts/${schema.artifactRules.artifactFile}`;
72
184
  return `## Automatic Subagent Dispatch
73
185
  | Agent | Mode | Class | Return Schema | User Gate | Trigger | Purpose |
74
186
  |---|---|---|---|---|---|---|
75
187
  ${rows}
76
188
  Mandatory: ${mandatoryList}. Record lifecycle rows in \`${delegationLogRel}\` and append-only \`${delegationEventsRel}\` before completion.
77
189
  ### Harness Dispatch Contract — use true harness dispatch: Claude Task, Cursor generic dispatch, OpenCode \`.opencode/agents/<agent>.md\` via Task/@agent, Codex \`.codex/agents/<agent>.toml\`. Do not collapse OpenCode or Codex to role-switch by default. Worker ACK Contract: ACK must include \`spanId\`, \`dispatchId\`, \`dispatchSurface\`, \`agentDefinitionPath\`, and \`ackTs\`; never claim \`fulfillmentMode: "isolated"\` without matching lifecycle proof. Helper: \`.cclaw/hooks/delegation-record.mjs --status=<status> --span-id=<spanId> --dispatch-id=<dispatchId> --dispatch-surface=<surface> --agent-definition-path=<path> --json\`. Exact recipe: scheduled -> launched -> acknowledged -> completed with the same span; completed isolated/generic rows require a prior ACK event for that span or \`--ack-ts=<iso>\`.
190
+
191
+ ${perHarnessLifecycleRecipeBlock()}`;
192
+ }
/**
 * Builds the "Per-Harness Lifecycle Recipe" markdown subsection: a short
 * instruction paragraph followed by a table with one row per recipe returned
 * by `harnessDelegationRecipes()`.
 *
 * Each table cell is the recipe field wrapped in backticks, in the order
 * harnessId, dispatchSurface, agentDefinitionExample, fulfillmentMode.
 *
 * @returns {string} Markdown text ending with a single trailing newline.
 */
function perHarnessLifecycleRecipeBlock() {
  const tableRows = [];
  for (const recipe of harnessDelegationRecipes()) {
    const cells = [
      recipe.harnessId,
      recipe.dispatchSurface,
      recipe.agentDefinitionExample,
      recipe.fulfillmentMode,
    ].map((value) => `\`${value}\``);
    tableRows.push(`| ${cells.join(" | ")} |`);
  }
  const lines = [
    "### Per-Harness Lifecycle Recipe — placeholders only",
    "Reuse the same `<span-id>` and `<dispatch-id>` across scheduled -> launched -> acknowledged -> completed; substitute neutral tokens `<agent-name>`, `<stage>`, `<iso-ts>`, `<artifact-anchor>`. Full command sequences live in `docs/harnesses.md`.",
    "| Harness | Dispatch surface | Agent definition path | fulfillmentMode |",
    "|---|---|---|---|",
    tableRows.join("\n"),
    "", // yields the trailing newline
  ];
  return lines.join("\n");
}
80
205
  function researchPlaybooksBlock(playbooks) {
@@ -438,6 +563,7 @@ ${interactionFocus.length > 0 ? interactionFocus.map((item, i) => `${i + 1}. ${i
438
563
  Decision protocol: ask only decision-changing questions, record the chosen option, rationale, risk, and rollback when the stage makes a non-trivial call.
439
564
 
440
565
  ${batchExecutionModeBlock(stage, track)}
566
+ ${crossCuttingMechanicsBlock(stage)}
441
567
  ## Required Gates
442
568
  ${gateList}
443
569
 
@@ -11,7 +11,7 @@ export const BRAINSTORM = {
11
11
  philosophy: {
12
12
  hardGate: "Do NOT invoke implementation skills, write code, scaffold projects, or mutate product behavior until a concrete direction is approved by the user.",
13
13
  ironLaw: "NO ARTIFACT IS COMPLETE WITHOUT AN EXPLICITLY APPROVED DIRECTION — SILENCE IS NOT APPROVAL.",
14
- purpose: "Turn an initial idea into an approved problem frame and direction, using product or technical-maintenance discovery before proposing solutions.",
14
+ purpose: "Turn an initial idea into an approved problem frame and direction, using domain-neutral problem discovery (product, technical-maintenance, research, ops, or infrastructure framing) before proposing solutions.",
15
15
  whenToUse: [
16
16
  "Starting a new feature or behavior change",
17
17
  "Requirements are ambiguous or trade-offs are unclear",
@@ -37,8 +37,8 @@ export const BRAINSTORM = {
37
37
  executionModel: {
38
38
  checklist: [
39
39
  "**Explore project context** — inspect existing files/docs/recent activity before asking what to build; capture matching files/patterns/seeds in `Context > Discovered context` so downstream stages don't redo discovery.",
40
- "**Classify stage depth** — choose `lite` for clear low-risk tasks, `standard` for normal product/engineering changes, or `deep` for ambiguity, architecture, external dependency, security/data risk, or explicit think-bigger requests.",
41
- "**Write the Problem Decision Record** — product work captures persona/JTBD/pain/value/evidence/success/why-now/do-nothing/non-goals; technical-maintenance work captures affected operator/developer, failure mode, operational improvement, verification signal, do-nothing cost, and non-goals.",
40
+ "**Classify stage depth** — choose `lite` for clear low-risk tasks, `standard` for normal engineering/product changes, or `deep` for ambiguity, architecture, external dependency, security/data risk, or explicit think-bigger requests.",
41
+ "**Write the Problem Decision Record** — pick a free-form `Frame type` label that names how this work is framed (examples: product, technical-maintenance, research-spike, ops-incident, infrastructure), then fill the universal Framing fields: affected user/role/operator, current state/failure mode/opportunity, desired observable outcome, evidence/signal, why now, do-nothing consequence, and non-goals.",
42
42
  "**Premise check (one pass)** — answer the three gstack-style questions in the artifact body: *Right problem? Direct path? What if we do nothing?* Take a position; do not hedge.",
43
43
  "**Reframe with How Might We** — write a single `How Might We …?` line that names the user/operator, the desired outcome, and the constraint. This is the altitude check before approaches.",
44
44
  "**Run Clarity Gate** — record ambiguity score (0.00-1.00), decision boundaries, reaffirmed non-goals, and residual-risk handoff before locking recommendations. If ambiguity remains high (>0.40), ask one decision-changing question before recommending.",
@@ -81,7 +81,7 @@ export const BRAINSTORM = {
81
81
  requiredEvidence: [
82
82
  "Artifact written to `.cclaw/artifacts/01-brainstorm-<slug>.md`.",
83
83
  "Project context was explored (files, docs, or recent activity referenced).",
84
- "Problem Decision Record includes product framing or technical-maintenance framing.",
84
+ "Problem Decision Record includes a `Frame type` label and the universal Framing fields (affected user/role/operator, current state/failure mode/opportunity, desired observable outcome, evidence/signal, why now, do-nothing consequence, non-goals).",
85
85
  "Clarity Gate records ambiguity score, decision boundaries, reaffirmed non-goals, and residual-risk handoff.",
86
86
  "Clarifying questions are one-at-a-time and captured only when they change a decision or stop condition.",
87
87
  "2-3 approaches with trade-offs are recorded, including one higher-upside challenger option and reference-pattern source/disposition when applicable.",
@@ -130,7 +130,7 @@ export const BRAINSTORM = {
130
130
  },
131
131
  artifactValidation: [
132
132
  { section: "Context", required: true, validationRule: "Must reference project state and relevant existing code or patterns. A `Discovered context` subsection (or list) is recommended for downstream traceability." },
133
- { section: "Problem Decision Record", required: true, validationRule: "Must include either product framing fields (persona/JTBD/pain/value/evidence/success/why-now/do-nothing/non-goals) or technical-maintenance fields (operator/developer, failure mode, operational improvement, verification signal, do-nothing cost, non-goals)." },
133
+ { section: "Problem Decision Record", required: true, validationRule: "Must include a free-form `Frame type` label (examples only: product, technical-maintenance, research-spike, ops-incident, infrastructure) and the universal Framing fields: affected user/role/operator, current state/failure mode/opportunity, desired observable outcome, evidence/signal, why now, do-nothing consequence, non-goals. The linter checks that the section has meaningful content; the field labels themselves are the structural contract." },
134
134
  { section: "Premise Check", required: false, validationRule: "Recommended: explicit answers to `Right problem?`, `Direct path?`, `What if we do nothing?` — take a position, do not hedge." },
135
135
  { section: "How Might We", required: false, validationRule: "Recommended: a single `How Might We …?` line naming the user, the outcome, and the binding constraint." },
136
136
  { section: "Clarity Gate", required: false, validationRule: "Recommended before recommendation lock: include ambiguity score (0.00-1.00), decision boundaries, reaffirmed non-goals, and residual-risk handoff for scope." },
@@ -12,7 +12,7 @@ const REQUIRED_TOP_LEVEL_FIELDS = {
12
12
  };
13
13
  const STAGE_TAXONOMIES = {
14
14
  brainstorm: {
15
- approachTier: ["Lightweight", "Standard", "Deep"],
15
+ approachTier: ["Lightweight", "Standard", "Deep", "lite", "standard", "deep"],
16
16
  approachRole: ["baseline", "challenger", "wild-card"],
17
17
  approachUpside: ["low", "modest", "high", "higher"]
18
18
  },
@@ -22,6 +22,10 @@ export const ARTIFACT_TEMPLATES = {
22
22
 
23
23
  # Brainstorm Artifact
24
24
 
25
+ ## Mode Block
26
+ - **Mode:** STARTUP | BUILDER | ENGINEERING | OPS | RESEARCH (pick exactly one)
27
+ - **Why this mode:** (one line; cite a concrete signal — repo state, user prompt, ownership, risk window)
28
+
25
29
  ## Context
26
30
  - **Project state:**
27
31
  - **Relevant existing code/patterns:**
@@ -36,25 +40,15 @@ export const ARTIFACT_TEMPLATES = {
36
40
 
37
41
  ## Problem Decision Record
38
42
  - **Depth:** lite | standard | deep
39
- - **Frame type:** product | technical-maintenance
40
-
41
- ### Product framing (use when applicable)
42
- - **Persona / user:**
43
- - **Job to be done:**
44
- - **Pain / trigger:**
45
- - **Value hypothesis:**
46
- - **Evidence / signal:**
47
- - **Success metric:**
48
- - **Why now:**
49
- - **Do-nothing consequence:**
50
- - **Non-goals:**
51
-
52
- ### Technical-maintenance framing (use when product framing is not applicable)
53
- - **Affected operator/developer:**
54
- - **Current failure mode:**
55
- - **Expected operational improvement:**
56
- - **Verification signal:**
57
- - **Do-nothing cost:**
43
+ - **Frame type:** \`<free-form-label>\` (one short token that names how this work is framed; pick whatever fits — examples in commentary only: \`product\`, \`technical-maintenance\`, \`research-spike\`, \`ops-incident\`, \`infrastructure\`, \`library-extraction\`. Do NOT treat the examples as an enum.)
44
+
45
+ ### Framing fields (universal keep field names; fill in whatever is meaningful for this work)
46
+ - **Affected user / role / operator:** (who experiences the problem or carries the consequence)
47
+ - **Current state / failure mode / opportunity:** (what is happening today)
48
+ - **Desired outcome (observable):** (what changes when this work lands; phrase so a test or operator could verify)
49
+ - **Evidence / signal supporting this framing:** (citation, metric, ticket, prior artifact, repo path, or \`- None.\`)
50
+ - **Why now (urgency / cost of waiting):**
51
+ - **Do-nothing consequence:** (concrete — not "nothing happens")
58
52
  - **Non-goals:**
59
53
 
60
54
  ## Premise Check
@@ -62,6 +56,26 @@ export const ARTIFACT_TEMPLATES = {
62
56
  - **Direct path?** (yes/no + one-line justification)
63
57
  - **What if we do nothing?** (concrete consequence, not "nothing happens")
64
58
 
59
+ ## Forcing Questions
60
+ > Minimum 3 questions; each answer MUST contain at least one *specific* token: a concrete name, a role, a number, a repo-relative path, an external link, or a verbatim quote. Vague answers fail the linter.
61
+
62
+ | # | Forcing question | Specific answer | Decision impact | Q\\<n\\> decision |
63
+ |---|---|---|---|---|
64
+ | 1 | | | | decision: |
65
+ | 2 | | | | decision: |
66
+ | 3 | | | | decision: |
67
+
68
+ ## Premise List
69
+ > ≥2 premises. Each premise must be in the form \`P<n>: <statement> — agreed | disagreed | revised\`. \`revised\` rows must include the revised statement on the next line.
70
+
71
+ - P1: <statement> — agreed | disagreed | revised
72
+ - P2: <statement> — agreed | disagreed | revised
73
+
74
+ ## Anti-Sycophancy Stamp
75
+ - **Forbidden response openers acknowledged:** yes (no "you're absolutely right", "great point", "absolutely!", etc.)
76
+ - **Posture commitment:** push back with reasoning when premises feel weak; do not perform agreement.
77
+ - **Evidence-that-would-change-the-recommendation:** (one line per premise, or \`- None.\`)
78
+
65
79
  ## How Might We
66
80
  - *How might we …?* — one line naming the user, the desired outcome, and the binding constraint.
67
81
 
@@ -99,6 +113,33 @@ export const ARTIFACT_TEMPLATES = {
99
113
 
100
114
  > Role values: \`baseline\` | \`challenger\` | \`wild-card\`. Upside values: \`low\` | \`modest\` | \`high\` | \`higher\`. Exactly one row must be a \`challenger\` with \`high\` or \`higher\` upside.
101
115
 
116
+ ### Approach Detail Cards
117
+ > Required structural form per approach (≥2). One block per row above:
118
+
119
+ #### APPROACH A
120
+ - Summary:
121
+ - Effort:
122
+ - Risk:
123
+ - Pros:
124
+ - Cons:
125
+ - Reuses:
126
+
127
+ #### APPROACH B
128
+ - Summary:
129
+ - Effort:
130
+ - Risk:
131
+ - Pros:
132
+ - Cons:
133
+ - Reuses:
134
+
135
+ RECOMMENDATION: <approach letter — one-line rationale, traced to forcing-question answers and premise list>
136
+
137
+ ## Outside Voice (optional)
138
+ - source: <model id | critic agent | human reviewer> | (or \`- not used.\`)
139
+ - prompt:
140
+ - tension:
141
+ - resolution:
142
+
102
143
  ## Approach Reaction
103
144
  - Closest option:
104
145
  - Concerns:
@@ -184,6 +225,20 @@ ${SEED_SHELF_SECTION}
184
225
  | B (ideal architecture) | | | | | | |
185
226
  | C (optional) | | | | | | |
186
227
 
228
+ RECOMMENDATION: <option letter — one-line rationale tying back to premise challenge and existing-code leverage>
229
+
230
+ ## Failure Modes Registry
231
+ > Universal failure-mode shape — applies to CLI, library, infra, web, batch jobs.
232
+
233
+ | Codepath | Failure mode | Rescued? (yes/no) | Test? (unit/integration/e2e) | User sees? (message/silent/N/A) | Logged? (level/none) | Q\\<n\\> decision |
234
+ |---|---|---|---|---|---|---|
235
+ | | | | | | | decision: |
236
+
237
+ ## Reversibility Rating
238
+ - Score (1-5, 1 = one-way door / unrecoverable, 5 = trivially reversible):
239
+ - Justification (cite a specific artifact/file or migration step):
240
+ - Rollback plan reference:
241
+
187
242
  ## Temporal Interrogation
188
243
  - Deep/optional only; omit for compact scope.
189
244
  | Time slice | Likely decision pressure | Lock now or defer? | Reason |
@@ -343,7 +398,7 @@ ${SEED_SHELF_SECTION}
343
398
  | Topic | Finding | Evidence |
344
399
  |---|---|---|
345
400
  | Domain conventions | | |
346
- | UX/product patterns | | |
401
+ | User-facing or operator-facing patterns | | |
347
402
 
348
403
  ## Architecture Options
349
404
  | Option | Trade-offs | Recommendation | Evidence |
@@ -516,6 +571,32 @@ ${MARKDOWN_CODE_FENCE}
516
571
  |---|---|---|---|
517
572
  | | | | covered/gap |
518
573
 
574
+ ## ASCII Coverage Diagram
575
+
576
+ <!-- diagram: ascii-coverage -->
577
+
578
+ ${MARKDOWN_CODE_FENCE}
579
+ entry-point
580
+ ├── happy path [★★★]
581
+ ├── empty input [★★]
582
+ ├── error path [★]
583
+ ├── concurrency edge [GAP]
584
+ ├── slow-network edge [→E2E]
585
+ └── perf regression [→EVAL]
586
+ ${MARKDOWN_CODE_FENCE}
587
+
588
+ > Required marker tokens (at least one each present where applicable): \`[★★★]\` / \`[★★]\` / \`[★]\` / \`[GAP]\` / \`[→E2E]\` / \`[→EVAL]\`. The diagram is the single source of truth for coverage; gaps must be traced into Plan or Spec.
589
+
590
+ ## Regression Iron Rule
591
+ - Iron rule acknowledged: yes — any diff that changes existing behavior gets a regression test added to the plan, no exceptions.
592
+ - Detected behavior changes (or \`- None.\`):
593
+ - Regression test handoff (Plan task ID or \`- None.\`):
594
+
595
+ ## Calibrated Findings
596
+ > Format: \`[P1|P2|P3] (confidence: <n>/10) <repo-relative-path>[:<line>] — <one-line description>\`. Findings with confidence \`< 7\` are suppressed unless severity is \`P1\`.
597
+
598
+ - (or \`- None this stage.\`)
599
+
519
600
  ## Performance Budget
520
601
  | Critical path | Metric | Target | Measurement method |
521
602
  |---|---|---|---|
@@ -673,6 +754,50 @@ For meaningful design work, replace the Learnings sentinel with 1-3 JSON learnin
673
754
  |---|---|---|
674
755
  | | | |
675
756
 
757
+ ## Synthesis Sources
758
+ > Spec is synthesized from existing context (CLAUDE.md / AGENTS.md / TODOS.md / git history / brainstorm + scope + design artifacts) — interview only when something genuinely cannot be derived. List the artifacts/files actually read and what each supplied.
759
+
760
+ | Source | What it supplied | Confidence (1-10) |
761
+ |---|---|---|
762
+ | | | |
763
+
764
+ ## Behavior Contract
765
+ > List behaviors universally (works for CLI, library, infra, web, batch). Use either \`As a <role>, I can <action> so that <outcome>.\` or \`Given <state>, When <event>, Then <outcome>.\`. ≥3 behaviors required. The shape — not the topic — is what the linter checks.
766
+
767
+ - (or write \`- None.\` if a single-step spec)
768
+
769
+ ## Architecture Modules
770
+ > One line of responsibility per module — no file paths, no signatures, no method names. Modules must be derivable from the design artifact.
771
+
772
+ | Module | Responsibility (one sentence) | Maps to design ref (DD-#) |
773
+ |---|---|---|
774
+ | | | |
775
+
776
+ ## Testing Strategy
777
+ - Behaviors covered (not implementation):
778
+ - Integration vs. unit split (and why):
779
+ - Real services vs. doubles (and why):
780
+ - Coverage gaps with rationale (or \`- None.\`):
781
+
782
+ ## Spec Self-Review
783
+ > Inline pass; fix in place. If a check fails, do not move on without recording the fix.
784
+
785
+ - [ ] Placeholders scan (no \`TBD\`, \`TODO\`, \`FIXME\`, \`<placeholder>\`)
786
+ - [ ] Internal consistency (sections do not contradict each other)
787
+ - [ ] Scope check (focused enough for a single plan)
788
+ - [ ] Ambiguity check (no requirement readable two ways)
789
+ - Patches applied:
790
+ - None
791
+ - Remaining concerns:
792
+ - None
793
+
794
+ ## Reviewer Concerns (convergence guard)
795
+ > Populate ONLY if the spec review loop did not converge after 3 iterations. Each row links a concern to the unresolved review pass.
796
+
797
+ | ID | Concern | Reviewer / source | Disposition (open/accept/defer) | Rationale |
798
+ |---|---|---|---|---|
799
+ | | | | | |
800
+
676
801
  ## Approval
677
802
  - Approved by:
678
803
  - Date:
@@ -684,6 +809,11 @@ For meaningful design work, replace the Learnings sentinel with 1-3 JSON learnin
684
809
 
685
810
  # Plan Artifact
686
811
 
812
+ ## Plan Header
813
+ - **Goal:** (one sentence — what this plan delivers)
814
+ - **Architecture:** (2-3 sentences — approach + key boundaries)
815
+ - **Tech Stack:** (key languages/runtimes/frameworks/libraries that the executor must know)
816
+
687
817
  ## Upstream Handoff
688
818
  - Source artifacts: \`03-design-<slug>.md\`, \`04-spec.md\`
689
819
  - Decisions carried forward:
@@ -750,6 +880,56 @@ Execution rule: complete and verify each batch before starting the next batch.
750
880
  |---|---|---|
751
881
  | | | |
752
882
 
883
+ ## Implementation Units
884
+ > Required structural form per implementation unit. Use ≥1 unit; bite-sized 2-5 minute steps inside each. The linter validates shape, not topic.
885
+
886
+ ### Implementation Unit U-1
887
+ - **Goal:**
888
+ - **Requirements (from Spec):**
889
+ - **Dependencies (other units):**
890
+ - **Files (repo-relative; never absolute):**
891
+ - Create:
892
+ - Modify:
893
+ - Test:
894
+ - **Approach:** (1-3 sentences; cite design decision DD-# or LD#hash)
895
+ - **Patterns to follow:** (link existing files/modules to mirror, or \`- None applicable.\`)
896
+ - **Test scenarios:**
897
+ - Happy:
898
+ - Edge:
899
+ - Error:
900
+ - Integration:
901
+ - **Verification:** (outcome to observe — not a shell script; e.g., "command exits 0 and prints \`<artifact-anchor>\`").
902
+ - **Steps (each 2-5 min, checkbox):**
903
+ - [ ] Step 1: write failing test for <behavior>
904
+ - [ ] Step 2: run test, observe RED with reason
905
+ - [ ] Step 3: minimal implementation
906
+ - [ ] Step 4: run test, observe GREEN
907
+ - [ ] Step 5: refactor + commit
908
+
909
+ ## High-Level Technical Design
910
+ > "Directional guidance, not implementation specification." Choose the form that fits the work: pseudo-code grammar, mermaid sequence/state, data-flow ASCII, decision matrix. Skip if the plan is a pure rename/move.
911
+
912
+ \`\`\`
913
+ (pseudo-code, mermaid, ASCII data flow, or decision matrix)
914
+ \`\`\`
915
+
916
+ ## Plan Self-Review
917
+ - [ ] Spec coverage: every spec behavior maps to a unit/task
918
+ - [ ] Placeholder scan (regex on full artifact, not only Task List)
919
+ - [ ] Type/name consistency across units (signatures referenced match definitions)
920
+ - [ ] No silent scope reduction
921
+ - [ ] Confidence per unit recorded (1-10)
922
+ - Patches applied:
923
+ - None
924
+ - Remaining concerns:
925
+ - None
926
+
927
+ ## Execution Handoff
928
+ - **Posture chosen:** Subagent-Driven (recommended) | Inline executor
929
+ - **Why this posture:** (one line tying choice to plan size, parallelism, novelty)
930
+ - **Subagent recipe (if Subagent-Driven):** \`<harness>\` -> \`<dispatch surface>\` -> \`<agent-definition path>\` (substitute neutral placeholders; full recipes in \`docs/harnesses.md\`)
931
+ - **Inline recipe (if Inline executor):** TDD loop unit-by-unit with batch checkpoints
932
+
753
933
  ## No-Placeholder Scan
754
934
  - Scanned tokens: \`TODO\`, \`TBD\`, \`FIXME\`, \`<fill-in>\`, \`<your-*-here>\`, \`xxx\`, bare ellipsis in task rows.
755
935
  - Hits: 0 (required for WAIT_FOR_CONFIRM to resolve).
@@ -822,6 +1002,39 @@ Execution rule: complete and verify each batch before starting the next batch.
822
1002
  - Spec criterion IDs:
823
1003
 
824
1004
 
1005
+ ## Iron Law Acknowledgement
1006
+ - Iron Law: NO PRODUCTION CODE WITHOUT A FAILING TEST FIRST.
1007
+ - Acknowledged: yes — code that landed before its test will be deleted and rewritten from the test.
1008
+ - Exceptions invoked (or \`- None.\`):
1009
+
1010
+ ## Watched-RED Proof
1011
+ > Required for every new test in this stage. Each row proves the test was *observed* failing before any production code was written.
1012
+
1013
+ | Slice | Test name | Observed at (ISO ts) | Failure reason snippet | Source command/log |
1014
+ |---|---|---|---|---|
1015
+ | S-1 | | | | |
1016
+
1017
+ ## Vertical Slice Cycle
1018
+ > Per slice: RED -> GREEN -> REFACTOR within the same cycle (refactor not deferred). The linter checks structural presence of all three phases.
1019
+
1020
+ | Slice | RED ts | GREEN ts | REFACTOR ts (or \`deferred because <reason>\`) |
1021
+ |---|---|---|---|
1022
+ | S-1 | | | |
1023
+
1024
+ ## Assertion Correctness Notes
1025
+ > For each new test assertion, name a *plausible subtle bug* that would still pass it (mental mutation test). If you cannot, the assertion is too coarse — strengthen it.
1026
+
1027
+ | Slice | Assertion (one line) | Bug that would still pass | Strengthening action (or \`- Sufficient.\`) |
1028
+ |---|---|---|---|
1029
+ | S-1 | | | |
1030
+
1031
+ ## Anti-Rationalization Checks
1032
+ - [ ] No "test passes immediately" — each new test was watched failing first
1033
+ - [ ] No "code before test" reuse from a prior session
1034
+ - [ ] No "tests after" backfill instead of RED-first
1035
+ - [ ] No "spirit not ritual" overrides
1036
+ - Notes (or \`- None this stage.\`):
1037
+
825
1038
  ## Verification Ladder
826
1039
  | Slice | Tier reached | Evidence |
827
1040
  |---|---|---|
@@ -863,6 +1076,44 @@ For meaningful TDD work, replace the Learnings sentinel with 1-3 JSON learning b
863
1076
  - Open questions:
864
1077
  - Drift from upstream (or \`None\`):
865
1078
 
1079
+ ## Self-Review First
1080
+ - [ ] Build/lint/type-check/tests passed locally
1081
+ - [ ] Diff matches spec/plan (no scope creep)
1082
+ - [ ] Leftover prints / commented code / unused imports removed
1083
+ - [ ] Deletion test: each new module justifies its existence
1084
+ - Evidence (commands + result):
1085
+ - Patches applied (or \`- None.\`):
1086
+
1087
+ ## Frame the Review Request
1088
+ - **Goal:**
1089
+ - **Approach:**
1090
+ - **Risk areas:**
1091
+ - **Verification done:**
1092
+ - **Open questions for the reviewer:**
1093
+
1094
+ ## Critic Subagent Dispatch
1095
+ > Dispatch a fresh-context critic (not the session history). Required even for self-driven review — the critic delegates back via \`delegation-record.mjs\` so the proof chain is preserved.
1096
+
1097
+ | Field | Value |
1098
+ |---|---|
1099
+ | Critic agent definition path | \`<repo-relative path under harness directory>\` |
1100
+ | Dispatch surface | One of the \`--dispatch-surface\` enum values listed in \`docs/harnesses.md\` (\`claude-task\`, \`cursor-task\`, \`opencode-agent\`, \`codex-agent\`, \`generic-task\`, \`role-switch\`, \`manual\`) |
1101
+ | Frame sent | WHAT_WAS_IMPLEMENTED + PLAN_OR_REQUIREMENTS + BASE_SHA + HEAD_SHA |
1102
+ | Critic returned | Strengths / Critical / Important / Minor |
1103
+ | Span id | \`<span-id>\` |
1104
+ | Acknowledgement ts | \`<iso ts>\` |
1105
+
1106
+ ## Receiving Posture
1107
+ - [ ] No performative agreement (forbidden openers acknowledged)
1108
+ - [ ] READ -> UNDERSTAND -> VERIFY -> EVALUATE -> RESPOND -> IMPLEMENT one-at-a-time discipline followed
1109
+ - [ ] Push-back recorded with reasoning when the critic was wrong
1110
+ - Notes (or \`- None.\`):
1111
+
1112
+ ## Critic Convergence
1113
+ - Iterations run: <n>/3
1114
+ - Convergence reached: yes / no — \`Reviewer Concerns\` populated when no
1115
+ - Stop reason:
1116
+
866
1117
  ## Review Evidence Scope
867
1118
  - Base/head:
868
1119
  - Files inspected:
@@ -992,6 +1243,12 @@ For meaningful review work, replace the Learnings sentinel with 1-3 JSON learnin
992
1243
  - Open questions:
993
1244
  - Drift from upstream (or \`None\`):
994
1245
 
1246
+ ## Verify Tests Gate
1247
+ - Discovered test command (cite repo config — package scripts / pyproject / go.mod / Cargo.toml / pom.xml / gradle):
1248
+ - Result: PASS | FAIL
1249
+ - Evidence (full output snippet or path):
1250
+ - Stop on FAIL: confirmed (no options surface unless PASS)
1251
+
995
1252
  ## Preflight Results
996
1253
  - Review verdict:
997
1254
  - Build:
@@ -1000,9 +1257,44 @@ For meaningful review work, replace the Learnings sentinel with 1-3 JSON learnin
1000
1257
  - Type-check:
1001
1258
  - Working tree clean:
1002
1259
 
1260
+ ## Base Branch Determination
1261
+ - Command run: \`git merge-base HEAD main || git merge-base HEAD master\`
1262
+ - Base branch:
1263
+ - User confirmation (if ambiguous):
1264
+
1265
+ ## Finalization Options
1266
+ > Exactly four options must be surfaced when tests pass. Selecting any option requires a recorded user decision.
1267
+
1268
+ 1. **Merge back to base locally** — \`MERGE_LOCAL\`
1269
+ 2. **Push and create PR** — \`OPEN_PR\`
1270
+ 3. **Keep branch as-is** — \`KEEP_BRANCH\`
1271
+ 4. **Discard this work** — \`DISCARD\` (typed-confirmation required)
1272
+
1273
+ - Selected option:
1274
+ - Typed confirmation (DISCARD only):
1275
+ - User decision recorded at:
1276
+
1003
1277
  ## Release Notes
1004
1278
  -
1005
1279
 
1280
+ ## Structured PR Body
1281
+ > Required when selected option is \`OPEN_PR\`. The structure is universal — replace placeholder bullets with concrete content, do not introduce domain-specific subsections.
1282
+
1283
+ ### ## Summary
1284
+ - (2-3 bullets describing what changed and why)
1285
+
1286
+ ### ## Test Plan
1287
+ - [ ] (verification step — repo-relative command + expected outcome)
1288
+ - [ ] (additional verification step or \`Manual: <action>\`)
1289
+
1290
+ ### ## Commits Included
1291
+ - (auto-generated commit list; one bullet per commit hash + subject)
1292
+
1293
+ ## Worktree Cleanup
1294
+ - Cleanup applies to options \`MERGE_LOCAL\` and \`DISCARD\`; preserved for \`OPEN_PR\` and \`KEEP_BRANCH\`.
1295
+ - Worktree path:
1296
+ - Cleanup result:
1297
+
1006
1298
  ## Rollback Plan
1007
1299
  - Trigger conditions:
1008
1300
  - Rollback steps: