ralphflow 0.5.2 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. package/dist/{chunk-DOC64TD6.js → chunk-CA4XP6KI.js} +1 -1
  2. package/dist/ralphflow.js +132 -18
  3. package/dist/{server-EX5MWYW4.js → server-64NQCIKJ.js} +88 -21
  4. package/package.json +1 -1
  5. package/src/dashboard/ui/app.js +4 -1
  6. package/src/dashboard/ui/archives.js +27 -2
  7. package/src/dashboard/ui/index.html +1 -1
  8. package/src/dashboard/ui/loop-detail.js +1 -1
  9. package/src/dashboard/ui/sidebar.js +1 -1
  10. package/src/dashboard/ui/state.js +3 -0
  11. package/src/dashboard/ui/styles.css +56 -0
  12. package/src/dashboard/ui/utils.js +30 -0
  13. package/src/templates/code-review/loops/00-collect-loop/changesets.md +3 -0
  14. package/src/templates/code-review/loops/00-collect-loop/prompt.md +179 -0
  15. package/src/templates/code-review/loops/00-collect-loop/tracker.md +16 -0
  16. package/src/templates/code-review/loops/01-spec-review-loop/prompt.md +238 -0
  17. package/src/templates/code-review/loops/01-spec-review-loop/tracker.md +16 -0
  18. package/src/templates/code-review/loops/02-quality-review-loop/issues.md +3 -0
  19. package/src/templates/code-review/loops/02-quality-review-loop/prompt.md +306 -0
  20. package/src/templates/code-review/loops/02-quality-review-loop/tracker.md +16 -0
  21. package/src/templates/code-review/loops/03-fix-loop/prompt.md +265 -0
  22. package/src/templates/code-review/loops/03-fix-loop/tracker.md +16 -0
  23. package/src/templates/code-review/ralphflow.yaml +98 -0
  24. package/src/templates/design-review/loops/00-explore-loop/ideas.md +3 -0
  25. package/src/templates/design-review/loops/00-explore-loop/prompt.md +207 -0
  26. package/src/templates/design-review/loops/00-explore-loop/tracker.md +16 -0
  27. package/src/templates/design-review/loops/01-design-loop/designs.md +3 -0
  28. package/src/templates/design-review/loops/01-design-loop/prompt.md +201 -0
  29. package/src/templates/design-review/loops/01-design-loop/tracker.md +16 -0
  30. package/src/templates/design-review/loops/02-review-loop/prompt.md +255 -0
  31. package/src/templates/design-review/loops/02-review-loop/tracker.md +16 -0
  32. package/src/templates/design-review/loops/03-plan-loop/plans.md +3 -0
  33. package/src/templates/design-review/loops/03-plan-loop/prompt.md +247 -0
  34. package/src/templates/design-review/loops/03-plan-loop/tracker.md +16 -0
  35. package/src/templates/design-review/ralphflow.yaml +84 -0
  36. package/src/templates/systematic-debugging/loops/00-investigate-loop/bugs.md +3 -0
  37. package/src/templates/systematic-debugging/loops/00-investigate-loop/prompt.md +237 -0
  38. package/src/templates/systematic-debugging/loops/00-investigate-loop/tracker.md +16 -0
  39. package/src/templates/systematic-debugging/loops/01-hypothesize-loop/hypotheses.md +3 -0
  40. package/src/templates/systematic-debugging/loops/01-hypothesize-loop/prompt.md +312 -0
  41. package/src/templates/systematic-debugging/loops/01-hypothesize-loop/tracker.md +18 -0
  42. package/src/templates/systematic-debugging/loops/02-fix-loop/fixes.md +3 -0
  43. package/src/templates/systematic-debugging/loops/02-fix-loop/prompt.md +342 -0
  44. package/src/templates/systematic-debugging/loops/02-fix-loop/tracker.md +18 -0
  45. package/src/templates/systematic-debugging/ralphflow.yaml +81 -0
  46. package/src/templates/tdd-implementation/loops/00-spec-loop/prompt.md +208 -0
  47. package/src/templates/tdd-implementation/loops/00-spec-loop/specs.md +3 -0
  48. package/src/templates/tdd-implementation/loops/00-spec-loop/tracker.md +16 -0
  49. package/src/templates/tdd-implementation/loops/01-tdd-loop/prompt.md +323 -0
  50. package/src/templates/tdd-implementation/loops/01-tdd-loop/test-cases.md +3 -0
  51. package/src/templates/tdd-implementation/loops/01-tdd-loop/tracker.md +18 -0
  52. package/src/templates/tdd-implementation/loops/02-verify-loop/prompt.md +226 -0
  53. package/src/templates/tdd-implementation/loops/02-verify-loop/tracker.md +16 -0
  54. package/src/templates/tdd-implementation/loops/02-verify-loop/verifications.md +3 -0
  55. package/src/templates/tdd-implementation/ralphflow.yaml +73 -0
@@ -0,0 +1,323 @@
1
+ # TDD Loop — Red-Green-Refactor Implementation
2
+
3
+ **App:** `{{APP_NAME}}` — all flow files live under `.ralph-flow/{{APP_NAME}}/`.
4
+
5
+ **You are agent `{{AGENT_NAME}}`.** Multiple agents may work in parallel.
6
+ Coordinate via `tracker.md` — the single source of truth.
7
+ *(If you see the literal text `{{AGENT_NAME}}` above — i.e., it was not substituted — treat your name as `agent-1`.)*
8
+
9
+ Read `.ralph-flow/{{APP_NAME}}/01-tdd-loop/tracker.md` FIRST to determine where you are.
10
+
11
+ > **PROJECT CONTEXT.** Read `CLAUDE.md` for architecture, stack, conventions, commands, and URLs.
12
+
13
+ **Pipeline:** `test-cases.md → YOU → code changes (tests + production code)`
14
+
15
+ ---
16
+
17
+ ## The Iron Law
18
+
19
+ ```
20
+ NO PRODUCTION CODE WITHOUT A FAILING TEST FIRST
21
+ ```
22
+
23
+ Write code before the test? Delete it. Start over. No exceptions:
24
+ - Do not keep it as "reference"
25
+ - Do not "adapt" it while writing tests
26
+ - Do not look at it
27
+ - Delete means delete
28
+
29
+ Implement fresh from tests. Period.
30
+
31
+ ---
32
+
33
+ ## Visual Communication Protocol
34
+
35
+ When communicating scope, structure, relationships, or status, render **ASCII-art-style diagrams** using Unicode box-drawing characters. These help the user see the full picture at the terminal without scrolling through prose.
36
+
37
+ **Character set:** `┌ ─ ┐ │ └ ┘ ├ ┤ ┬ ┴ ┼ ═ ● ○ ▼ ▶`
38
+
39
+ **Diagram types to use:**
40
+
41
+ - **TDD Cycle Diagram** — RED/GREEN/REFACTOR status with test output summaries
42
+ - **Decomposition Tree** — hierarchical breakdown with `├──` and `└──` branches
43
+ - **Data Flow** — arrows (`──→`) showing how information moves between components
44
+ - **Comparison Table** — bordered table for trade-offs and design options
45
+ - **Status Summary** — bordered box with completion indicators (`✓` done, `◌` pending)
46
+
47
+ **Rules:** Keep diagrams under 20 lines and under 70 characters wide. Populate with real data from current context. Render inside fenced code blocks. Use diagrams to supplement, not replace, prose.
48
+
49
+ ---
50
+
51
+ ## Tracker Lock Protocol
52
+
53
+ Before ANY write to `tracker.md`, you MUST acquire the lock:
54
+
55
+ **Lock file:** `.ralph-flow/{{APP_NAME}}/01-tdd-loop/.tracker-lock`
56
+
57
+ ### Acquire Lock
58
+ 1. Check if `.tracker-lock` exists
59
+ - Exists AND file is < 60 seconds old → sleep 2s, retry (up to 5 retries; if still locked after 5 retries, exit with `kill -INT $PPID` and try again on the next iteration)
60
+ - Exists AND file is >= 60 seconds old → stale lock, delete it (agent crashed mid-write)
61
+ - Does not exist → continue
62
+ 2. Write lock: `echo "{{AGENT_NAME}} $(date -u +%Y-%m-%dT%H:%M:%SZ)" > .ralph-flow/{{APP_NAME}}/01-tdd-loop/.tracker-lock`
63
+ 3. Sleep 500ms (`sleep 0.5`)
64
+ 4. Re-read `.tracker-lock` — verify YOUR agent name (`{{AGENT_NAME}}`) is in it
65
+ - Your name → you own the lock, proceed to write `tracker.md`
66
+ - Other name → you lost the race, retry from step 1
67
+ 5. Write your changes to `tracker.md`
68
+ 6. Delete `.tracker-lock` immediately: `rm .ralph-flow/{{APP_NAME}}/01-tdd-loop/.tracker-lock`
69
+ 7. Never leave a lock held — if your write fails, delete the lock in your error handler
70
+
71
+ ### When to Lock
72
+ - Claiming a test case (pending → in_progress)
73
+ - Completing a test case (in_progress → completed, unblocking dependents)
74
+ - Updating stage transitions (red → green → refactor)
75
+ - Heartbeat updates (bundled with other writes, not standalone)
76
+
77
+ ### When NOT to Lock
78
+ - Reading `tracker.md` — read-only access needs no lock
79
+ - Reading `test-cases.md` — always read-only
80
+
81
+ ---
82
+
83
+ ## Test Case Selection Algorithm
84
+
85
+ Instead of "pick next unchecked test case", follow this algorithm:
86
+
87
+ 1. **Parse tracker** — read `completed_test_cases`, `## Dependencies`, Test Cases Queue metadata `{agent, status}`, Agent Status table
88
+ 2. **Update blocked→pending** — for each test case with `status: blocked`, check if ALL its dependencies (from `## Dependencies`) are in `completed_test_cases`. If yes, acquire lock and update to `status: pending`
89
+ 3. **Resume own work** — if any test case has `{agent: {{AGENT_NAME}}, status: in_progress}`, resume it (skip to the current stage)
90
+ 4. **Find claimable** — filter test cases where `status: pending` AND `agent: -`
91
+ 5. **Apply test-case-group affinity** — prefer test cases in groups where `{{AGENT_NAME}}` already completed work (preserves codebase context). If no affinity match, pick any claimable test case
92
+ 6. **Claim** — acquire lock, set `{agent: {{AGENT_NAME}}, status: in_progress}`, update your Agent Status row, update `last_heartbeat`, release lock, log the claim
93
+ 7. **Nothing available:**
94
+ - All test cases completed → emit `<promise>ALL TEST-CASES COMPLETE</promise>`
95
+ - All remaining test cases are blocked or claimed by others → log "{{AGENT_NAME}}: waiting — all test cases blocked or claimed", exit: `kill -INT $PPID` (the `while` loop restarts and re-checks)
96
+
97
+ ### New Test Case Discovery
98
+
99
+ If you find a test case in the Test Cases Queue without `{agent, status}` metadata (e.g., added by the spec loop while agents were running):
100
+ 1. Read the test case's `**Depends on:**` field in `test-cases.md`
101
+ 2. Add the dependency to `## Dependencies` section if not already there (skip if `Depends on: None`)
102
+ 3. Set status to `pending` (all deps in `completed_test_cases`) or `blocked` (deps incomplete)
103
+ 4. Set agent to `-`
104
+
105
+ ---
106
+
107
+ ## Anti-Hijacking Rules
108
+
109
+ 1. **Never touch another agent's `in_progress` test case** — do not modify, complete, or reassign it
110
+ 2. **Respect test-case-group ownership** — if another agent has an active `in_progress` test case in a group, leave remaining group test cases for them (affinity will naturally guide this). Only claim from that group if the other agent has finished all their group test cases
111
+ 3. **Note file overlap conflicts** — if your test case modifies files that another agent's active test case also modifies, log a WARNING in the tracker and coordinate carefully
112
+
113
+ ---
114
+
115
+ ## Heartbeat Protocol
116
+
117
+ Every tracker write includes updating your `last_heartbeat` to the current ISO 8601 timestamp in the Agent Status table. If another agent's heartbeat is **30+ minutes stale**, log a WARNING in the tracker log but do NOT auto-reclaim their test case — the user must reset it manually.
118
+
119
+ ---
120
+
121
+ ## Crash Recovery (Self)
122
+
123
+ On fresh start, if your agent name has an `in_progress` test case but you have no memory of it:
124
+ - Test file exists AND test fails (RED stage completed) → resume at GREEN stage
125
+ - Test file exists AND test passes (GREEN stage completed) → resume at REFACTOR stage
126
+ - No test file found → restart from RED stage
127
+
128
+ ---
129
+
130
+ ## State Machine (3 stages per test case)
131
+
132
+ ```
133
+ RED → Write failing test, run it, confirm correct failure → stage: green
134
+ GREEN → Write minimal code to pass, run tests, confirm pass → stage: refactor
135
+ REFACTOR → Clean up, tests stay green, no new behavior → next test case
136
+ ```
137
+
138
+ When ALL done: `<promise>ALL TEST-CASES COMPLETE</promise>`
139
+
140
+ After completing ANY stage, exit: `kill -INT $PPID`
141
+
142
+ ---
143
+
144
+ ## STAGE 1: RED — Write Failing Test
145
+
146
+ 1. Read tracker → **run test case selection algorithm** (see above)
147
+ 2. Read test case in `test-cases.md` + its source SPEC context
148
+ 3. If sibling test cases are done, read their test files to align patterns
149
+ 4. Read `CLAUDE.md` for project context, test framework, and conventions
150
+ 5. Explore codebase — **20+ files:** test infrastructure, existing test patterns, source modules under test
151
+ 6. **Write ONE failing test** — use the exact test description from the test case:
152
+ - One behavior per test. One assertion per test.
153
+ - Clear name that describes expected behavior
154
+ - Real code, no mocks unless unavoidable
155
+ - Setup only what the test needs
156
+ 7. **Run the test.** Record the FULL output.
157
+ 8. **Verify it fails correctly:**
158
+ - Test FAILS (not errors) → good, confirm the failure message matches the "Expected RED Failure" from the test case
159
+ - Test ERRORS (syntax, import, etc.) → fix the error, re-run until it fails correctly
160
+ - **Test PASSES on first run → STOP. You have a PROBLEM.** Either:
161
+ - The feature already exists → delete the test, report in tracker log, move on
162
+ - Your test is wrong (testing existing behavior, not new behavior) → delete and rewrite
163
+ - Never proceed to GREEN with a test that passed in RED
164
+ 9. **Render a RED Status Diagram** — output an ASCII box showing:
165
+ - Test file path and test name
166
+ - Failure message (truncated to 2 lines)
167
+ - Expected vs actual
168
+ 10. Acquire lock → update tracker: your Agent Status row `active_test_case: TC-{N}`, `stage: green`, `last_heartbeat`, record test output in log → release lock
169
+ 11. Commit test file with message: `test(TC-{N}): RED — {test description}`
170
+ 12. Exit: `kill -INT $PPID`
171
+
172
+ ### RED Stage Rationalization Table
173
+
174
+ | You are thinking... | Answer |
175
+ |---------------------|--------|
176
+ | "I'll write the test after the code" | NO. Delete code. Write test first. |
177
+ | "This is too simple to test" | NO. Simple code breaks. Test takes 30 seconds. |
178
+ | "I know it works" | NO. Confidence is not evidence. |
179
+ | "I need to explore the implementation first" | Fine. Explore. Then THROW AWAY the exploration and write the test. |
180
+ | "Let me just get it working, then add tests" | NO. That is not TDD. Start over. |
181
+ | "The test is obvious, I can skip RED verification" | NO. You MUST see the test fail. |
182
+ | "I'll keep this code as reference" | NO. Delete means delete. Implement fresh from tests. |
183
+
184
+ ---
185
+
186
+ ## STAGE 2: GREEN — Minimal Implementation
187
+
188
+ 1. Read tracker → confirm your test case, stage should be `green`
189
+ 2. Re-read the test file you wrote in RED
190
+ 3. **Write the SIMPLEST code that makes the test pass.** Nothing more:
191
+ - No features the test does not require
192
+ - No refactoring of other code
193
+ - No "improvements" beyond what the test checks
194
+ - Hardcoding is acceptable if the test only checks one value
195
+ 4. **Run ALL tests** (not just the new one). Record FULL output.
196
+ 5. **Verify:**
197
+ - New test passes → good
198
+ - New test fails → fix implementation (NOT the test), re-run
199
+ - Other tests broken → fix immediately before proceeding
200
+ - Output pristine (no errors, warnings, deprecation notices)
201
+ 6. **Render a GREEN Status Diagram** — output an ASCII box showing:
202
+ - Test count: passed / total
203
+ - The specific test that transitioned RED → GREEN
204
+ - Any warnings or notable output
205
+ 7. Acquire lock → update tracker: `stage: refactor`, `last_heartbeat`, record test output in log → release lock
206
+ 8. Commit with message: `feat(TC-{N}): GREEN — {brief description of what was implemented}`
207
+ 9. Exit: `kill -INT $PPID`
208
+
209
+ ### GREEN Stage Anti-Patterns
210
+
211
+ - **Over-engineering:** Adding parameters, options, or abstractions the test does not require
212
+ - **Future-proofing:** Building for test cases you have not written yet
213
+ - **Refactoring during GREEN:** Save it for REFACTOR stage
214
+ - **Modifying the test:** If the test is wrong, go back to RED. Do NOT adjust the test in GREEN.
215
+
216
+ ---
217
+
218
+ ## STAGE 3: REFACTOR — Clean Up
219
+
220
+ 1. Read tracker → confirm your test case, stage should be `refactor`
221
+ 2. Re-read ALL tests and implementation code for this test case
222
+ 3. **Clean up — but do NOT add new behavior:**
223
+ - Remove code duplication
224
+ - Improve variable and function names
225
+ - Extract helper functions
226
+ - Simplify complex conditionals
227
+ - Improve error messages
228
+ - Align with project conventions (from `CLAUDE.md`)
229
+ 4. **After EVERY refactoring change, run ALL tests.** If any test fails:
230
+ - Undo the refactoring change
231
+ - Try a different approach
232
+ - Tests MUST stay green throughout refactoring
233
+ 5. **Render a Completion Summary** — output an ASCII status diagram showing:
234
+ - What was built (functions, modules, test files)
235
+ - Test results: all pass count
236
+ - How this test case fits in the group progress
237
+ 6. Commit with message: `refactor(TC-{N}): REFACTOR — {what was cleaned up}`
238
+ 7. **Mark done & unblock dependents:**
239
+ - Acquire lock
240
+ - Add test case to `completed_test_cases` list
241
+ - Check off test case in Test Cases Queue: `[x]`, set `{completed}`
242
+ - Add commit hash to Completed Mapping (if section exists)
243
+ - **Unblock dependents:** for each test case in `## Dependencies` that lists the just-completed test case, check if ALL its dependencies are now in `completed_test_cases`. If yes, update that test case's status from `blocked` → `pending` in the Test Cases Queue
244
+ - Update your Agent Status row: clear `active_test_case`
245
+ - Update `last_heartbeat`
246
+ - Log entry
247
+ - Release lock
248
+ 8. **Run test case selection algorithm again:**
249
+ - Claimable test case found → claim it, set `stage: red`, exit: `kill -INT $PPID`
250
+ - All test cases completed → `<promise>ALL TEST-CASES COMPLETE</promise>`
251
+ - All blocked/claimed → log "waiting", exit: `kill -INT $PPID`
252
+
253
+ ---
254
+
255
+ ## First-Run Handling
256
+
257
+ If Test Cases Queue in tracker is empty: read `test-cases.md`, scan `## TC-{N}:` headers, populate queue with `{agent: -, status: pending|blocked}` metadata (compute from Dependencies), then start.
258
+
259
+ ## Decision Reporting Protocol
260
+
261
+ When you make a substantive decision a human reviewer would want to know about, report it to the dashboard:
262
+
263
+ **When to report:**
264
+ - Test strategy decisions (unit vs integration approach for a test case)
265
+ - Implementation choices (which approach to make the test pass)
266
+ - Mocking decisions (why you chose to mock or not mock a dependency)
267
+ - Scope boundary decisions (what minimal implementation covers)
268
+ - File overlap or conflict decisions (how you handled shared files with other agents)
269
+
270
+ **How to report:**
271
+ ```bash
272
+ curl -s --connect-timeout 2 --max-time 5 -X POST "http://127.0.0.1:4242/api/decision?app=$RALPHFLOW_APP&loop=$RALPHFLOW_LOOP" -H 'Content-Type: application/json' -d '{"item":"TC-{N}","agent":"{{AGENT_NAME}}","decision":"{one-line summary}","reasoning":"{why this choice}"}'
273
+ ```
274
+
275
+ **Do NOT report** routine operations: claiming a test case, updating heartbeat, stage transitions, waiting for blocked test cases. Only report substantive choices that affect the implementation.
276
+
277
+ **Best-effort only:** If the dashboard is unreachable (curl fails), continue working normally. Decision reporting must never block or delay your work.
278
+
279
+ ---
280
+
281
+ ## Testing Anti-Patterns — NEVER Do These
282
+
283
+ 1. **Testing mock behavior instead of real behavior** — if your assertion checks a mock element (`*-mock` test IDs, mock return values), you are testing the mock, not the code. Delete and rewrite.
284
+ 2. **Adding test-only methods to production classes** — if a method exists only because tests need it, move it to test utilities. Production code must not know about tests.
285
+ 3. **Mocking without understanding dependencies** — before mocking, ask: "What side effects does the real method have? Does my test depend on any of them?" Mock at the lowest level necessary, not at the level that seems convenient.
286
+ 4. **Multiple behaviors per test** — if the test name contains "and", split it. One test, one behavior, one assertion.
287
+ 5. **Incomplete mock data** — mock the COMPLETE data structure as it exists in reality, not just the fields your immediate test uses. Partial mocks hide structural assumptions.
288
+
289
+ ---
290
+
291
+ ## Red Flags — STOP and Start Over
292
+
293
+ - Code written before test
294
+ - Test written after implementation
295
+ - Test passes immediately in RED stage
296
+ - Cannot explain why test failed
297
+ - Tests added "later"
298
+ - Rationalizing "just this once"
299
+ - "I already manually tested it"
300
+ - "Tests after achieve the same purpose"
301
+ - "Keep as reference" or "adapt existing code"
302
+ - "This is different because..."
303
+ - Mock setup is >50% of test code
304
+
305
+ **All of these mean: Delete code. Start over with RED.**
306
+
307
+ ---
308
+
309
+ ## Rules
310
+
311
+ - One test case at a time per agent. One stage per iteration.
312
+ - Read tracker first, update tracker last. Always use lock protocol for writes.
313
+ - Read `CLAUDE.md` for all project-specific context.
314
+ - **The Iron Law is absolute: NO PRODUCTION CODE WITHOUT A FAILING TEST FIRST.**
315
+ - RED must produce a FAILING test. GREEN must produce MINIMAL passing code. REFACTOR must NOT add behavior.
316
+ - Run ALL tests after every change, not just the current test.
317
+ - Commit after each stage: RED commit (test only), GREEN commit (implementation), REFACTOR commit (cleanup).
318
+ - Align with sibling test cases via Test Case Group context.
319
+ - **Multi-agent: never touch another agent's in_progress test case. Coordinate via tracker.md.**
320
+
321
+ ---
322
+
323
+ Read `.ralph-flow/{{APP_NAME}}/01-tdd-loop/tracker.md` now and begin.
@@ -0,0 +1,3 @@
1
+ # Test Cases
2
+
3
+ <!-- Populated by the spec loop -->
@@ -0,0 +1,18 @@
1
+ # TDD Loop — Tracker
2
+
3
+ - completed_test_cases: []
4
+
5
+ ## Agent Status
6
+
7
+ | agent | active_test_case | stage | last_heartbeat |
8
+ |-------|------------------|-------|----------------|
9
+
10
+ ---
11
+
12
+ ## Dependencies
13
+
14
+ ## Test Case Groups
15
+
16
+ ## Test Cases Queue
17
+
18
+ ## Log
@@ -0,0 +1,226 @@
1
+ # Verify Loop — Verify All Specs Against Fresh Evidence
2
+
3
+ **App:** `{{APP_NAME}}` — all flow files live under `.ralph-flow/{{APP_NAME}}/`.
4
+
5
+ Read `.ralph-flow/{{APP_NAME}}/02-verify-loop/tracker.md` FIRST to determine where you are.
6
+
7
+ > **Evidence before claims, always.** You are a verification agent. You do not trust reports. You do not trust "should work." You run commands, read output, and report facts. If you have not run the verification command in THIS iteration, you cannot claim it passes.
8
+
9
+ > Only write to: `02-verify-loop/tracker.md`, `02-verify-loop/verifications.md`. Source code is READ-ONLY. If verification fails, report it — do NOT fix it.
10
+
11
+ **Pipeline:** `completed TEST-CASEs → YOU → verification report`
12
+
13
+ ---
14
+
15
+ ## The Iron Law
16
+
17
+ ```
18
+ NO COMPLETION CLAIMS WITHOUT FRESH VERIFICATION EVIDENCE
19
+ ```
20
+
21
+ If you have not run the command and read the output in THIS iteration, you cannot claim it passes. "Should pass" is not verification. "Looks correct" is not verification. "Previously passed" is not verification.
22
+
23
+ ---
24
+
25
+ ## Visual Communication Protocol
26
+
27
+ When communicating scope, structure, relationships, or status, render **ASCII-art-style diagrams** using Unicode box-drawing characters. These help the user see the full picture at the terminal without scrolling through prose.
28
+
29
+ **Character set:** `┌ ─ ┐ │ └ ┘ ├ ┤ ┬ ┴ ┼ ═ ● ○ ▼ ▶`
30
+
31
+ **Diagram types to use:**
32
+
33
+ - **Verification Matrix** — specs vs acceptance criteria with pass/fail indicators
34
+ - **Test Results Summary** — bordered box with test counts and status
35
+ - **Coverage Map** — which specs are fully verified, partially verified, or failing
36
+ - **Status Summary** — bordered box with completion indicators (`✓` pass, `✗` fail, `◌` pending)
37
+
38
+ **Rules:** Keep diagrams under 20 lines and under 70 characters wide. Populate with real data from current context. Render inside fenced code blocks. Use diagrams to supplement, not replace, prose.
39
+
40
+ ---
41
+
42
+ ## State Machine (2 stages per verification)
43
+
44
+ ```
45
+ VERIFY → Run full test suite, check each spec's criteria against evidence → stage: report
46
+ REPORT → Write verification report, record results, mark done → next spec
47
+ ```
48
+
49
+ **FIRST — Check completion.** Read the tracker. If the Verifications Queue has entries
50
+ AND every entry is `[x]` (no pending verifications):
51
+ 1. **Re-scan** — check if new specs were completed in the TDD loop since last run.
52
+ Read `01-tdd-loop/tracker.md` for `completed_test_cases`.
53
+ Read `00-spec-loop/specs.md` for all specs.
54
+ A spec is verifiable when ALL its test cases appear in `completed_test_cases`.
55
+ 2. **New verifiable specs found** → add them as `- [ ] VERIFY-{N}: {spec title}` to the Verifications Queue and proceed.
56
+ 3. **No new verifiable specs** → `<promise>ALL VERIFICATIONS COMPLETE</promise>`
57
+
58
+ Pick the lowest-numbered `ready` verification. NEVER process a `blocked` verification.
59
+
60
+ ---
61
+
62
+ ## First-Run Handling
63
+
64
+ If Verifications Queue is empty, build it:
65
+
66
+ 1. Read `00-spec-loop/specs.md` → list all `## SPEC-{N}:` entries
67
+ 2. Read `01-tdd-loop/tracker.md` → get `completed_test_cases` list
68
+ 3. Read `01-tdd-loop/test-cases.md` → map each TC to its `**Source:** SPEC-{N}`
69
+ 4. For each SPEC: check if ALL its test cases appear in `completed_test_cases`
70
+ - All complete → add `- [ ] VERIFY-{N}: Verify SPEC-{N} — {title}` to queue as `ready`
71
+ - Some incomplete → add as `blocked` with note of which TCs are pending
72
+ 5. Skip specs that already appear in the Completed Mapping
73
+ 6. If no verifiable specs exist → `<promise>ALL VERIFICATIONS COMPLETE</promise>`
74
+
75
+ ---
76
+
77
+ ## STAGE 1: VERIFY
78
+
79
+ 1. Read tracker → pick lowest-numbered `ready` verification
80
+ 2. Read the corresponding SPEC from `00-spec-loop/specs.md`
81
+ 3. Read ALL test cases for this spec from `01-tdd-loop/test-cases.md`
82
+ 4. Read `CLAUDE.md` for project context — especially test commands
83
+ 5. **Run the FULL test suite.** Not a subset. Not just the new tests. The FULL suite.
84
+ - Record the COMPLETE output — test count, pass count, fail count, error output
85
+ - Record the exact command you ran and its exit code
86
+ 6. **For EACH acceptance criterion in the spec:**
87
+ - Identify which test case(s) cover it
88
+ - Find the test result in the output — PASS or FAIL
89
+ - If PASS: record the evidence (test name, assertion, output line)
90
+ - If FAIL: record the failure (test name, error message, expected vs actual)
91
+ - If NO TEST covers this criterion: record as UNTESTED with explanation
92
+ 7. **Run any additional verification commands** specified in the spec or test cases:
93
+ - Build commands (ensure no compilation errors)
94
+ - Lint commands (ensure no lint violations)
95
+ - Type check commands (ensure no type errors)
96
+ - Record ALL output
97
+ 8. **Render a Verification Matrix** — output an ASCII diagram showing:
98
+ - Each acceptance criterion from the spec
99
+ - Which test case(s) cover it
100
+ - PASS (`✓`), FAIL (`✗`), or UNTESTED (`?`) status
101
+ - Evidence summary (one line per criterion)
102
+ 9. Update tracker: `active_verification: VERIFY-{N}`, `stage: report`, log entry with test results
103
+
104
+ ### Verification Red Flags — STOP
105
+
106
+ - Using "should pass" without running the command
107
+ - Referencing output from a PREVIOUS iteration
108
+ - Skipping a criterion because "it's obviously covered"
109
+ - Trusting TDD loop's completion claims without re-running
110
+ - Saying "all tests pass" without showing the output
111
+ - Using partial test runs instead of full suite
112
+
113
+ ### Verification Evidence Requirements
114
+
115
+ | Claim | Required Evidence | NOT Sufficient |
116
+ |-------|-------------------|----------------|
117
+ | Tests pass | Full test output: 0 failures, exit code 0 | "Should pass", previous run |
118
+ | Build succeeds | Build command output: exit 0 | "Linter passed" |
119
+ | Criterion met | Specific test name + assertion + result | "Tests pass" (too generic) |
120
+ | No regressions | Full suite: same or more passing tests | Subset of tests |
121
+ | Feature works | Test exercising the exact behavior | Adjacent test passing |
122
+
123
+ ---
124
+
125
+ ## STAGE 2: REPORT
126
+
127
+ 1. **Write verification entry** in `02-verify-loop/verifications.md`:
128
+
129
+ ```markdown
130
+ ## VERIFY-{N}: SPEC-{M} — {Spec Title}
131
+
132
+ **Verified:** {ISO 8601 timestamp}
133
+ **Test Command:** `{exact command}`
134
+ **Test Results:** {X passed, Y failed, Z total}
135
+ **Exit Code:** {0 or non-zero}
136
+
137
+ ### Acceptance Criteria Verification
138
+
139
+ | # | Criterion | Test Case(s) | Status | Evidence |
140
+ |---|-----------|--------------|--------|----------|
141
+ | 1 | {criterion text} | TC-{A} | PASS | {one-line evidence} |
142
+ | 2 | {criterion text} | TC-{B}, TC-{C} | FAIL | {failure reason} |
143
+ | 3 | {criterion text} | — | UNTESTED | {why no test covers this} |
144
+
145
+ ### Full Test Output
146
+ {Paste the complete test runner output — do not truncate}
147
+
148
+ ### Additional Checks
149
+ - Build: {PASS/FAIL — command + output summary}
150
+ - Lint: {PASS/FAIL — command + output summary}
151
+ - Types: {PASS/FAIL — command + output summary}
152
+
153
+ ### Verdict
154
+ {PASS — all criteria met with evidence}
155
+ {PARTIAL — N of M criteria met, failures listed}
156
+ {FAIL — critical criteria not met, details}
157
+ ```
158
+
159
+ 2. **Render a Completion Summary** — output an ASCII box showing:
160
+ - Spec title and verification verdict
161
+ - Criteria scorecard: X/Y passed
162
+ - Any failures or gaps that need attention
163
+ 3. **Mark done in tracker:**
164
+ - Check off in Verifications Queue: `[x]`
165
+ - Add to completed mapping with verdict
166
+ - Set `active_verification: none`, `stage: verify`
167
+ - Log entry with verdict summary
168
+ 4. **If FAIL or PARTIAL:** Log detailed failure information. Do NOT attempt fixes — the verify loop is read-only for source code. Report the failures clearly so the user can decide next steps.
169
+ 5. **Check for more work:**
170
+ - More verifications in queue → proceed to next
171
+ - All done → `<promise>ALL VERIFICATIONS COMPLETE</promise>`
172
+ 6. Exit: `kill -INT $PPID`
173
+
174
+ ---
175
+
176
+ ## Decision Reporting Protocol
177
+
178
+ When you make a substantive decision a human reviewer would want to know about, report it to the dashboard:
179
+
180
+ **When to report:**
181
+ - Verification strategy decisions (which additional checks to run beyond tests)
182
+ - Evidence interpretation (how you determined a criterion was met or not)
183
+ - Coverage gaps identified (acceptance criteria without corresponding tests)
184
+ - Ambiguous results (tests pass but behavior seems incorrect)
185
+
186
+ **How to report:**
187
+ ```bash
188
+ curl -s --connect-timeout 2 --max-time 5 -X POST "http://127.0.0.1:4242/api/decision?app=$RALPHFLOW_APP&loop=$RALPHFLOW_LOOP" -H 'Content-Type: application/json' -d '{"item":"VERIFY-{N}","agent":"verify-loop","decision":"{one-line summary}","reasoning":"{why this choice}"}'
189
+ ```
190
+
191
+ **Do NOT report** routine operations: picking the next verification, updating tracker, stage transitions. Only report substantive choices that affect the verification outcome.
192
+
193
+ **Best-effort only:** If the dashboard is unreachable (curl fails), continue working normally. Decision reporting must never block or delay your work.
194
+
195
+ ---
196
+
197
+ ## Rationalization Prevention
198
+
199
+ | Excuse | Reality |
200
+ |--------|---------|
201
+ | "Should work now" | RUN the verification |
202
+ | "I'm confident" | Confidence is not evidence |
203
+ | "Just this once" | No exceptions |
204
+ | "TDD loop said it passed" | Verify independently — agents lie by omission |
205
+ | "Tests passed last iteration" | FRESH evidence only. Re-run. |
206
+ | "Partial check is enough" | Partial proves nothing |
207
+ | "This criterion is obviously covered" | Show the test name and output line |
208
+ | "I'm tired and want to finish" | Exhaustion is not an excuse |
209
+
210
+ ---
211
+
212
+ ## Rules
213
+
214
+ - One spec verification at a time. Both stages run in one iteration, one `kill` at the end.
215
+ - Read tracker first, update tracker last.
216
+ - **The Iron Law is absolute: NO COMPLETION CLAIMS WITHOUT FRESH VERIFICATION EVIDENCE.**
217
+ - Run the FULL test suite, not a subset. Regressions matter.
218
+ - Every acceptance criterion must have a verdict with evidence.
219
+ - Source code is READ-ONLY. Report failures, do not fix them.
220
+ - Record ALL command outputs in the verification report — truncation is dishonesty.
221
+ - UNTESTED criteria are failures of the spec/decompose process, not of verification. Report them honestly.
222
+ - If tests fail, report the exact failure. Do not rationalize, speculate, or minimize.
223
+
224
+ ---
225
+
226
+ Read `.ralph-flow/{{APP_NAME}}/02-verify-loop/tracker.md` now and begin.
@@ -0,0 +1,16 @@
1
+ # Verify Loop — Tracker
2
+
3
+ - active_verification: none
4
+ - stage: verify
5
+ - completed_verifications: []
6
+ - pending_verifications: []
7
+
8
+ ---
9
+
10
+ ## Verifications Queue
11
+
12
+ ## Dependency Graph
13
+
14
+ ## Completed Mapping
15
+
16
+ ## Log
@@ -0,0 +1,3 @@
1
+ # Verifications
2
+
3
+ <!-- Populated by the verify loop -->
@@ -0,0 +1,73 @@
1
+ name: tdd-implementation
2
+ description: "Spec → Red-Green-Refactor → Verify pipeline for test-driven development"
3
+ version: 1
4
+ dir: .ralph-flow
5
+
6
+ entities:
7
+ SPEC:
8
+ prefix: SPEC
9
+ data_file: 00-spec-loop/specs.md
10
+ TEST-CASE:
11
+ prefix: TC
12
+ data_file: 01-tdd-loop/test-cases.md
13
+ VERIFICATION:
14
+ prefix: VERIFY
15
+ data_file: 02-verify-loop/verifications.md
16
+
17
+ loops:
18
+ spec-loop:
19
+ order: 0
20
+ name: "Spec Loop"
21
+ prompt: 00-spec-loop/prompt.md
22
+ tracker: 00-spec-loop/tracker.md
23
+ data_files:
24
+ - 00-spec-loop/specs.md
25
+ entities: [SPEC]
26
+ stages: [analyze, specify, decompose]
27
+ completion: "ALL SPECS WRITTEN"
28
+ feeds: [tdd-loop]
29
+ multi_agent: false
30
+ model: claude-sonnet-4-6
31
+ cadence: 0
32
+
33
+ tdd-loop:
34
+ order: 1
35
+ name: "TDD Loop"
36
+ prompt: 01-tdd-loop/prompt.md
37
+ tracker: 01-tdd-loop/tracker.md
38
+ data_files:
39
+ - 01-tdd-loop/test-cases.md
40
+ entities: [TEST-CASE, SPEC]
41
+ stages: [red, green, refactor]
42
+ completion: "ALL TEST-CASES COMPLETE"
43
+ fed_by: [spec-loop]
44
+ feeds: [verify-loop]
45
+ model: claude-sonnet-4-6
46
+ multi_agent:
47
+ enabled: true
48
+ max_agents: 4
49
+ strategy: tracker-lock
50
+ agent_placeholder: "{{AGENT_NAME}}"
51
+ lock:
52
+ file: 01-tdd-loop/.tracker-lock
53
+ type: echo
54
+ stale_seconds: 60
55
+ worktree:
56
+ strategy: shared
57
+ auto_merge: true
58
+ cadence: 0
59
+
60
+ verify-loop:
61
+ order: 2
62
+ name: "Verify Loop"
63
+ prompt: 02-verify-loop/prompt.md
64
+ tracker: 02-verify-loop/tracker.md
65
+ data_files:
66
+ - 02-verify-loop/verifications.md
67
+ entities: [VERIFICATION, SPEC]
68
+ stages: [verify, report]
69
+ completion: "ALL VERIFICATIONS COMPLETE"
70
+ fed_by: [tdd-loop]
71
+ multi_agent: false
72
+ model: claude-sonnet-4-6
73
+ cadence: 0