@ai-dev-methodologies/rlp-desk 0.9.0 → 0.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/docs/plans/validated-snacking-crayon.md +156 -359
- package/docs/superpowers/plans/2026-04-12-flywheel-redesign.md +704 -0
- package/docs/superpowers/specs/2026-04-12-flywheel-redesign.md +161 -0
- package/package.json +1 -1
- package/src/commands/rlp-desk.md +15 -1
- package/src/node/reporting/campaign-reporting.mjs +364 -0
- package/src/node/run.mjs +12 -0
- package/src/node/runner/campaign-main-loop.mjs +104 -1
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
# Flywheel Redesign Spec
|
|
2
|
+
|
|
3
|
+
## Problem
|
|
4
|
+
|
|
5
|
+
rlp-desk's current loop is: Worker → Verify → FAIL → fix contract → Worker (same approach).
|
|
6
|
+
When the approach itself is wrong, retrying with fixes wastes iterations.
|
|
7
|
+
The loop needs a direction review step that challenges premises and forces alternatives before the next Worker runs.
|
|
8
|
+
|
|
9
|
+
## Design
|
|
10
|
+
|
|
11
|
+
### Loop Structure
|
|
12
|
+
|
|
13
|
+
```
|
|
14
|
+
First iteration (no flywheel):
|
|
15
|
+
Worker → Verify → PASS → next US
|
|
16
|
+
|
|
17
|
+
On FAIL (--flywheel on-fail):
|
|
18
|
+
Verify FAIL
|
|
19
|
+
→ Flywheel Agent (fresh context, opus)
|
|
20
|
+
Step 0A: Premise Challenge
|
|
21
|
+
Step 0B: Existing Code Leverage
|
|
22
|
+
Step 0C: Ideal State Mapping
|
|
23
|
+
Step 0D: Implementation Alternatives (min 2)
|
|
24
|
+
Step 0E: Scope Decision (HOLD/PIVOT/REDUCE/EXPAND)
|
|
25
|
+
Step 0F: Contract Rewrite + Rejected Directions
|
|
26
|
+
→ Worker (reads updated contract) → Verify
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
After a FAIL, Flywheel runs BEFORE the next Worker. Flywheel decides the direction; Worker executes it.
|
|
30
|
+
|
|
31
|
+
### Execution Modes
|
|
32
|
+
|
|
33
|
+
tmux mode (3 panes):
|
|
34
|
+
```
|
|
35
|
+
┌──────────────┬──────────────┬──────────────┐
|
|
36
|
+
│ Flywheel │ Worker │ Verifier │
|
|
37
|
+
│ claude opus │ claude/codex │ claude/codex │
|
|
38
|
+
│ direction │ implements │ verifies │
|
|
39
|
+
└──────────────┴──────────────┴──────────────┘
|
|
40
|
+
```
|
|
41
|
+
- Leader dispatches to flywheel pane, polls flywheel-signal.json
|
|
42
|
+
- After signal received, dispatches Worker with updated contract
|
|
43
|
+
|
|
44
|
+
agent mode:
|
|
45
|
+
- Leader calls Agent() with flywheel prompt → fresh context automatic
|
|
46
|
+
- Agent() returns decision directly (no file signal needed)
|
|
47
|
+
- Leader updates memory, then dispatches Worker agent
|
|
48
|
+
|
|
49
|
+
### CLI Flags
|
|
50
|
+
|
|
51
|
+
```
|
|
52
|
+
--flywheel off|on-fail (default: off)
|
|
53
|
+
--flywheel-model MODEL (default: opus)
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### 4 Scope Decisions
|
|
57
|
+
|
|
58
|
+
| Decision | When | Action |
|
|
59
|
+
|----------|------|--------|
|
|
60
|
+
| HOLD | Premises valid, approach correct | Refine contract with specific fixes |
|
|
61
|
+
| PIVOT | Premise broken, approach wrong | Switch to alternative, record rejected direction |
|
|
62
|
+
| REDUCE | US too complex for current scope | Split AC or simplify, defer remainder |
|
|
63
|
+
| EXPAND | Missing prerequisite discovered | Add AC or expand contract |
|
|
64
|
+
|
|
65
|
+
### Flywheel Prompt Template (plan-ceo-review core internalized)
|
|
66
|
+
|
|
67
|
+
6-step review process:
|
|
68
|
+
|
|
69
|
+
**0A. Premise Challenge**
|
|
70
|
+
List every assumption in the current approach. For each, state whether this iteration's evidence supports or contradicts it. Broken premise → PIVOT or REDUCE.
|
|
71
|
+
|
|
72
|
+
**0B. Existing Code Leverage**
|
|
73
|
+
Check if Worker missed reusable code. Check if a different approach fits existing patterns better.
|
|
74
|
+
|
|
75
|
+
**0C. Ideal State Mapping**
|
|
76
|
+
Describe the ideal completion of this US in 2-3 sentences. How far is the current approach from ideal?
|
|
77
|
+
|
|
78
|
+
**0D. Implementation Alternatives (MANDATORY)**
|
|
79
|
+
Minimum 2 alternatives. Each: summary, effort (S/M/L), risk, tradeoff vs current approach.
|
|
80
|
+
|
|
81
|
+
**0E. Scope Decision**
|
|
82
|
+
Choose HOLD/PIVOT/REDUCE/EXPAND. Justify with evidence from this iteration only.
|
|
83
|
+
|
|
84
|
+
**0F. Contract Rewrite**
|
|
85
|
+
Rewrite Next Iteration Contract in campaign memory.
|
|
86
|
+
Record decision in Key Decisions.
|
|
87
|
+
Record failed approaches in Rejected Directions (prevents future Workers from repeating).
|
|
88
|
+
|
|
89
|
+
**CEO Cognitive Patterns (embedded in prompt):**
|
|
90
|
+
1. First-principles — ignore convention, start from the problem
|
|
91
|
+
2. 10x check — can 2x effort yield 10x better result?
|
|
92
|
+
3. Inversion — what must be true for this approach to fail?
|
|
93
|
+
4. Simplicity bias — prefer simple over complex solutions
|
|
94
|
+
5. User-back — reason backwards from end-user experience
|
|
95
|
+
6. Time-value — does this pivot save 3+ iterations?
|
|
96
|
+
7. Sunk cost immunity — ignore prior investment
|
|
97
|
+
8. Blast radius — assess impact scope of direction change
|
|
98
|
+
9. Reversibility — prefer easily reversible decisions
|
|
99
|
+
10. Evidence > opinion — judge only by this iteration's actual results
|
|
100
|
+
|
|
101
|
+
### Signal Protocol
|
|
102
|
+
|
|
103
|
+
flywheel-signal.json:
|
|
104
|
+
```json
|
|
105
|
+
{
|
|
106
|
+
"iteration": N,
|
|
107
|
+
"decision": "hold|pivot|reduce|expand",
|
|
108
|
+
"summary": "one line explanation",
|
|
109
|
+
"rejected_directions": ["approach X because Y"],
|
|
110
|
+
"contract_updated": true,
|
|
111
|
+
"timestamp": "ISO"
|
|
112
|
+
}
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
### Campaign Memory Updates
|
|
116
|
+
|
|
117
|
+
Flywheel agent writes directly to campaign memory:
|
|
118
|
+
- **Next Iteration Contract**: rewritten based on decision
|
|
119
|
+
- **Key Decisions**: flywheel decision + reasoning appended
|
|
120
|
+
- **Rejected Directions**: new section, append-only (Worker reads to avoid repeating)
|
|
121
|
+
|
|
122
|
+
### Files Changed
|
|
123
|
+
|
|
124
|
+
| File | Change |
|
|
125
|
+
|------|--------|
|
|
126
|
+
| src/scripts/init_ralph_desk.zsh | Flywheel prompt template, 3rd pane setup, --flywheel flags in presets |
|
|
127
|
+
| src/node/runner/campaign-main-loop.mjs | Flywheel dispatch (tmux + agent), shouldRunFlywheel(), pane management |
|
|
128
|
+
| src/node/run.mjs | --flywheel, --flywheel-model flag parsing |
|
|
129
|
+
| src/commands/rlp-desk.md | Flywheel documentation, options reference |
|
|
130
|
+
| src/governance.md | Flywheel step in Leader loop protocol |
|
|
131
|
+
| src/scripts/run_ralph_desk.zsh | 3rd pane creation for tmux mode |
|
|
132
|
+
|
|
133
|
+
### What Stays (from current branch)
|
|
134
|
+
|
|
135
|
+
- SV Report generation (generateSVReport in campaign-reporting.mjs)
|
|
136
|
+
- Brainstorm step 0 SV feedback (rlp-desk.md)
|
|
137
|
+
- analyticsDir in buildPaths
|
|
138
|
+
|
|
139
|
+
### What Gets Removed (from current branch)
|
|
140
|
+
|
|
141
|
+
- Current pivot implementation (shouldRunPivot, dispatchPivot, buildPivotTriggerCmd)
|
|
142
|
+
- Current pivot prompt template in init_ralph_desk.zsh
|
|
143
|
+
- Current --pivot-mode, --pivot-model flags
|
|
144
|
+
- test-pivot-step.mjs
|
|
145
|
+
|
|
146
|
+
## Verification
|
|
147
|
+
|
|
148
|
+
### TDD Tests
|
|
149
|
+
- shouldRunFlywheel logic (off/on-fail conditions)
|
|
150
|
+
- Flywheel prompt contains all 6 steps + 10 cognitive patterns
|
|
151
|
+
- Signal protocol parsing
|
|
152
|
+
- Rejected directions persistence across iterations
|
|
153
|
+
- 3-pane creation in tmux mode
|
|
154
|
+
|
|
155
|
+
### Self-Verification (3 scenarios)
|
|
156
|
+
- LOW: --flywheel off → normal loop unchanged
|
|
157
|
+
- MEDIUM: --flywheel on-fail + FAIL → flywheel fires → memory updated → Worker reflects
|
|
158
|
+
- CRITICAL: PIVOT decision → rejected direction recorded → next Worker avoids it
|
|
159
|
+
|
|
160
|
+
### E2E
|
|
161
|
+
- Test project with intentional FAIL → flywheel activates → direction change → success
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ai-dev-methodologies/rlp-desk",
|
|
3
|
-
"version": "0.9.0",
|
|
3
|
+
"version": "0.9.1",
|
|
4
4
|
"description": "Fresh-context iterative loops for Claude Code — autonomous task completion with independent verification",
|
|
5
5
|
"scripts": {
|
|
6
6
|
"postinstall": "node scripts/postinstall.js",
|
package/src/commands/rlp-desk.md
CHANGED
|
@@ -112,7 +112,17 @@ Ask about these items one by one (or in small groups):
|
|
|
112
112
|
|
|
113
113
|
After all items are confirmed:
|
|
114
114
|
|
|
115
|
-
0. **SV Report Feedback** — If a prior campaign's self-verification report exists
|
|
115
|
+
0. **SV Report Feedback** — If a prior campaign's self-verification report exists:
|
|
116
|
+
a. Scan `~/.claude/ralph-desk/analytics/` for directories matching this project (by slug or project root)
|
|
117
|
+
b. Read the latest `self-verification-report.md` from each matching directory
|
|
118
|
+
c. Extract from §7 (Patterns) and §8 (Recommendations):
|
|
119
|
+
- Which US types/sizes failed most frequently
|
|
120
|
+
- Which AC quality dimensions scored lowest
|
|
121
|
+
- Which model tiers underperformed for this project's complexity
|
|
122
|
+
- Specific brainstorm/PRD/test-spec recommendations from prior campaigns
|
|
123
|
+
d. Present findings to user: "Prior campaign analysis found: [patterns]. Recommendations: [suggestions]."
|
|
124
|
+
e. If no prior reports exist, skip and note "No prior campaign data available."
|
|
125
|
+
(governance §8½)
|
|
116
126
|
1. **Ambiguity Gate (IL-2)** — score each AC per governance §1a IL-2 (6 dimensions, 0-12 points).
|
|
117
127
|
If ANY AC scores below 6: **REJECT** — refine that AC before proceeding.
|
|
118
128
|
If all ACs score 6-9: **WARN** — proceed with logged warning, show low-scoring dimensions.
|
|
@@ -180,6 +190,8 @@ Tell the user:
|
|
|
180
190
|
# --iter-timeout N tmux only (default: 600)
|
|
181
191
|
# --debug debug logging
|
|
182
192
|
# --with-self-verification post-campaign SV report
|
|
193
|
+
# --flywheel off|on-fail direction review on fail (default: off)
|
|
194
|
+
# --flywheel-model MODEL flywheel reviewer model (default: opus)
|
|
183
195
|
```
|
|
184
196
|
|
|
185
197
|
**If codex is NOT installed** — show claude-only presets + install recommendation:
|
|
@@ -208,6 +220,8 @@ Tell the user:
|
|
|
208
220
|
# --iter-timeout N tmux only (default: 600)
|
|
209
221
|
# --debug debug logging
|
|
210
222
|
# --with-self-verification post-campaign SV report
|
|
223
|
+
# --flywheel off|on-fail direction review on fail (default: off)
|
|
224
|
+
# --flywheel-model MODEL flywheel reviewer model (default: opus)
|
|
211
225
|
```
|
|
212
226
|
|
|
213
227
|
Replace `<actual-slug>` with the real slug from this init (e.g. `auth-refactor`).
|
|
@@ -226,6 +226,370 @@ export async function generateCampaignReport({
|
|
|
226
226
|
await fs.writeFile(reportFile, `${lines.join('\n')}\n`, 'utf8');
|
|
227
227
|
}
|
|
228
228
|
|
|
229
|
+
/**
 * Derive the file name used for an archived copy of a report.
 *
 * A trailing `.md` extension is swapped for `-v<version>.md`; a path that does
 * not end in `.md` is returned unchanged.
 *
 * @param {string} targetPath - Path of the live report file.
 * @param {number|string} version - Version label to embed in the file name.
 * @returns {string} The versioned path.
 */
function svReportVersionPath(targetPath, version) {
  const markdownExtension = /\.md$/u;
  const versionedSuffix = `-v${version}.md`;
  return targetPath.replace(markdownExtension, versionedSuffix);
}
|
|
232
|
+
|
|
233
|
+
/**
 * List the regular files in `logsDir` whose names match `pattern`, ordered by
 * iteration number.
 *
 * Names are ordered by the integer that follows the `iter-` prefix, so
 * `iter-2-…` precedes `iter-10-…` even when the numbers are not zero-padded
 * (a plain lexicographic sort would reverse them). Names with the same number,
 * or without an `iter-<n>` prefix, fall back to code-unit string order; names
 * without a number sort after all numbered ones.
 *
 * @param {string} logsDir - Directory to scan (not recursive).
 * @param {RegExp} pattern - Tested against each entry name.
 * @returns {Promise<string[]>} Matching file names; `[]` when the directory
 *   cannot be read.
 */
async function collectIterFiles(logsDir, pattern) {
  // Best-effort: an unreadable or missing directory is treated as empty.
  const entries = await fs.readdir(logsDir, { withFileTypes: true }).catch(() => []);

  const iterationOf = (name) => {
    const match = /^iter-(\d+)/u.exec(name);
    return match ? Number(match[1]) : Number.POSITIVE_INFINITY;
  };

  const byIterationThenName = (left, right) => {
    const leftIter = iterationOf(left);
    const rightIter = iterationOf(right);
    if (leftIter !== rightIter) {
      return leftIter < rightIter ? -1 : 1;
    }
    if (left === right) {
      return 0;
    }
    return left < right ? -1 : 1;
  };

  return entries
    .filter((entry) => entry.isFile() && pattern.test(entry.name))
    .map((entry) => entry.name)
    .sort(byIterationThenName);
}
|
|
240
|
+
|
|
241
|
+
/**
 * Summarise how often worker done-claims show the expected process steps.
 *
 * For each claim the `execution_steps` list (empty when absent) is inspected:
 *  - plan: a step named `plan` is present;
 *  - TDD: `write_test` and `implement` are both present and the first
 *    `write_test` precedes the first `implement`;
 *  - RED confirmation: the first `verify_red` step has `exit_code === 1`.
 *
 * @param {Array<object>} doneClaims - Parsed done-claim records.
 * @returns {{planPercent: number, tddPercent: number, redConfirmPercent: number, total: number}}
 *   Rounded percentages over all claims; all zeros when there are no claims.
 */
function computeWorkerQuality(doneClaims) {
  const total = doneClaims.length;
  if (total === 0) {
    return { planPercent: 0, tddPercent: 0, redConfirmPercent: 0, total: 0 };
  }

  const tally = { plan: 0, tdd: 0, red: 0 };

  doneClaims.forEach((claim) => {
    const recorded = claim.execution_steps ?? [];
    const order = recorded.map((entry) => entry.step);

    tally.plan += order.includes('plan') ? 1 : 0;

    const testAt = order.indexOf('write_test');
    const codeAt = order.indexOf('implement');
    const testCameFirst = testAt !== -1 && codeAt !== -1 && testAt < codeAt;
    tally.tdd += testCameFirst ? 1 : 0;

    // Only the first verify_red step is considered.
    const firstRed = recorded.find((entry) => entry.step === 'verify_red');
    tally.red += (firstRed?.exit_code === 1) ? 1 : 0;
  });

  const toPercent = (count) => Math.round((count / total) * 100);

  return {
    planPercent: toPercent(tally.plan),
    tddPercent: toPercent(tally.tdd),
    redConfirmPercent: toPercent(tally.red),
    total,
  };
}
|
|
279
|
+
|
|
280
|
+
/**
 * Summarise how thoroughly verifier verdicts document their reasoning.
 *
 * Each verdict's `reasoning` object (empty when absent) is checked for five
 * required categories. A verdict counts toward `reasoningPercent` when all five
 * are truthy, and toward `independentPercent` when at least one is truthy.
 *
 * @param {Array<object>} verdicts - Parsed verify-verdict records.
 * @returns {{reasoningPercent: number, independentPercent: number, total: number}}
 *   Rounded percentages over all verdicts; all zeros when there are none.
 */
function computeVerifierQuality(verdicts) {
  const total = verdicts.length;
  if (total === 0) {
    return { reasoningPercent: 0, independentPercent: 0, total: 0 };
  }

  const requiredCategories = ['il1_compliance', 'layer_enforcement', 'test_sufficiency', 'anti_gaming', 'worker_process_audit'];
  let fullyReasoned = 0;
  let partlyReasoned = 0;

  verdicts.forEach((verdict) => {
    const notes = verdict.reasoning ?? {};
    const covered = requiredCategories.reduce(
      (count, category) => (notes[category] ? count + 1 : count),
      0,
    );
    if (covered === requiredCategories.length) {
      fullyReasoned += 1;
    }
    if (covered > 0) {
      partlyReasoned += 1;
    }
  });

  const toPercent = (count) => Math.round((count / total) * 100);

  return {
    reasoningPercent: toPercent(fullyReasoned),
    independentPercent: toPercent(partlyReasoned),
    total,
  };
}
|
|
308
|
+
|
|
309
|
+
/**
 * Build a per-user-story lifecycle summary from done-claims and verdicts.
 *
 * Records are grouped by `us_id` (`'unknown'` when absent); a missing
 * `iteration` is treated as 0. Verdicts are processed in the order given, so
 * callers should pass them in chronological order.
 *
 * For each story:
 *  - `firstClaimed`  — lowest iteration among its claims (or the iteration of
 *    its first verdict when it has no claim);
 *  - `firstVerified` — iteration of its first `pass` verdict, else `null`;
 *  - `reopenCount`   — number of `fail` verdicts seen after the first pass;
 *  - `finalStatus`   — `'verified'` when the latest pass/fail verdict is a
 *    pass, otherwise `'pending'`.
 *
 * @param {Array<object>} doneClaims - Parsed done-claim records.
 * @param {Array<object>} verdicts - Parsed verify-verdict records.
 * @returns {Object<string, {firstClaimed: number, firstVerified: (number|null), reopenCount: number, finalStatus: string}>}
 */
function buildAcLifecycle(doneClaims, verdicts) {
  const lifecycle = {};

  // Return the story's entry, creating it on first sight.
  const entryFor = (usId, iter) => {
    if (!lifecycle[usId]) {
      lifecycle[usId] = { firstClaimed: iter, firstVerified: null, reopenCount: 0, finalStatus: 'pending' };
    }
    return lifecycle[usId];
  };

  for (const claim of doneClaims) {
    const iter = claim.iteration ?? 0;
    const entry = entryFor(claim.us_id ?? 'unknown', iter);
    if (iter < entry.firstClaimed) {
      entry.firstClaimed = iter;
    }
  }

  for (const v of verdicts) {
    const iter = v.iteration ?? 0;
    const entry = entryFor(v.us_id ?? 'unknown', iter);

    if (v.verdict === 'pass') {
      if (entry.firstVerified === null) {
        entry.firstVerified = iter;
      }
      // A pass after a reopen must restore the verified status; previously the
      // story stayed 'pending' forever once it had been reopened.
      entry.finalStatus = 'verified';
    } else if (v.verdict === 'fail' && entry.firstVerified !== null) {
      entry.reopenCount += 1;
      entry.finalStatus = 'pending';
    }
  }

  return lifecycle;
}
|
|
340
|
+
|
|
341
|
+
/**
 * Derive human-readable strengths and weaknesses for the report.
 *
 * Observations come from the pass/fail split of `verdicts` and from the TDD
 * percentage reported by `computeWorkerQuality(doneClaims)`. Each list always
 * has at least one entry: a placeholder sentence is used when nothing notable
 * was found.
 *
 * @param {Array<object>} doneClaims - Parsed done-claim records.
 * @param {Array<object>} verdicts - Parsed verify-verdict records.
 * @returns {{strengths: string[], weaknesses: string[]}}
 */
function extractPatterns(doneClaims, verdicts) {
  const strengths = [];
  const weaknesses = [];

  let passed = 0;
  let failed = 0;
  for (const entry of verdicts) {
    if (entry.verdict === 'pass') {
      passed += 1;
    } else if (entry.verdict === 'fail') {
      failed += 1;
    }
  }

  const cleanRun = passed > 0 && failed === 0;
  if (cleanRun) {
    strengths.push('All iterations passed on first attempt.');
  } else if (passed > failed) {
    strengths.push(`${passed} of ${verdicts.length} iterations passed.`);
  }

  if (failed > 0) {
    weaknesses.push(`${failed} of ${verdicts.length} iterations failed verification.`);
  }

  // TDD observations only apply when there is at least one claim.
  const { tddPercent, total: claimCount } = computeWorkerQuality(doneClaims);
  if (claimCount > 0) {
    if (tddPercent === 100) {
      strengths.push('TDD compliance at 100%.');
    } else if (tddPercent < 80) {
      weaknesses.push(`TDD compliance at ${tddPercent}% — below 80% threshold.`);
    }
  }

  if (strengths.length === 0) {
    strengths.push('No notable strengths detected from available data.');
  }
  if (weaknesses.length === 0) {
    weaknesses.push('No notable weaknesses detected from available data.');
  }

  return { strengths, weaknesses };
}
|
|
373
|
+
|
|
374
|
+
/**
 * Produce next-cycle recommendations grouped by planning artifact.
 *
 *  - brainstorm: stricter TDD enforcement when TDD compliance (from
 *    `computeWorkerQuality`) is below 80%;
 *  - prd: one entry per user story that failed verification two or more times;
 *  - testSpec: a note when more than one worker model appears in `analytics`.
 *
 * Verdicts without a `us_id` are tallied under `'unknown'`, matching the label
 * used elsewhere in this report. Analytics rows without a `worker_model` are
 * ignored so that a missing value is not mistaken for a second model.
 * Each group receives a placeholder sentence when it would otherwise be empty.
 *
 * @param {Array<object>} doneClaims - Parsed done-claim records.
 * @param {Array<object>} verdicts - Parsed verify-verdict records.
 * @param {Array<object>} analytics - Per-iteration analytics rows.
 * @returns {{brainstorm: string[], prd: string[], testSpec: string[]}}
 */
function buildRecommendations(doneClaims, verdicts, analytics) {
  const recs = { brainstorm: [], prd: [], testSpec: [] };
  const wq = computeWorkerQuality(doneClaims);

  if (wq.tddPercent < 80 && wq.total > 0) {
    recs.brainstorm.push('Recommend stricter TDD enforcement in worker prompts.');
  }

  // A Map keeps first-seen order and is safe for arbitrary story ids.
  const usFailCounts = new Map();
  for (const v of verdicts) {
    if (v.verdict === 'fail') {
      const usId = v.us_id ?? 'unknown';
      usFailCounts.set(usId, (usFailCounts.get(usId) ?? 0) + 1);
    }
  }
  for (const [usId, count] of usFailCounts) {
    if (count >= 2) {
      recs.prd.push(`${usId} failed ${count} times — consider splitting into smaller ACs.`);
    }
  }

  const models = new Set(
    analytics
      .map((r) => r.worker_model)
      .filter((model) => model != null && model !== ''),
  );
  if (models.size > 1) {
    recs.testSpec.push(`Model upgrade occurred (${[...models].join(' -> ')}). Note which model handled what.`);
  }

  if (recs.brainstorm.length === 0) {
    recs.brainstorm.push('No brainstorm recommendations.');
  }
  if (recs.prd.length === 0) {
    recs.prd.push('No PRD recommendations.');
  }
  if (recs.testSpec.length === 0) {
    recs.testSpec.push('No test-spec recommendations.');
  }

  return recs;
}
|
|
411
|
+
|
|
412
|
+
/**
 * Generate the campaign self-verification (SV) report and its JSON companion.
 *
 * Reads every `iter-<n>-done-claim.json` and `iter-<n>-verify-verdict.json`
 * in `logsDir`, the analytics rows from `analyticsFile`, and the test spec,
 * then writes `self-verification-report.md` and `self-verification-data.json`
 * into `outputDir` (created if needed).
 *
 * The report header carries the actual report version (previously it always
 * said `1`, even when earlier reports had been archived).
 *
 * @param {object} params
 * @param {string} params.slug - Campaign slug shown in the report.
 * @param {string} params.logsDir - Directory holding the per-iteration JSON files.
 * @param {string} [params.prdFile] - Currently unused; reserved for PRD pattern extraction.
 * @param {string} params.testSpecFile - Test spec path; an unreadable file is reported as absent.
 * @param {string} params.analyticsFile - Passed to `readAnalytics`.
 * @param {string} params.outputDir - Destination directory for the report files.
 * @returns {Promise<{reportPath: string, version: number, summary: string}>}
 */
export async function generateSVReport({
  slug,
  logsDir,
  prdFile: _prdFile,
  testSpecFile,
  analyticsFile,
  outputDir,
}) {
  void _prdFile; // reserved for future use (PRD pattern extraction)
  await fs.mkdir(outputDir, { recursive: true });

  // Collect iteration files
  const claimFiles = await collectIterFiles(logsDir, /^iter-\d+-done-claim\.json$/u);
  const verdictFiles = await collectIterFiles(logsDir, /^iter-\d+-verify-verdict\.json$/u);

  // Load files one at a time, in listing order; entries that yield no data are skipped.
  const loadRecords = async (files) => {
    const records = [];
    for (const file of files) {
      const data = await readJsonIfExists(path.join(logsDir, file));
      if (data) {
        records.push(data);
      }
    }
    return records;
  };

  const doneClaims = await loadRecords(claimFiles);
  const verdicts = await loadRecords(verdictFiles);

  const analytics = await readAnalytics(analyticsFile);

  // Compute metrics
  const workerQuality = computeWorkerQuality(doneClaims);
  const verifierQuality = computeVerifierQuality(verdicts);
  const acLifecycle = buildAcLifecycle(doneClaims, verdicts);
  const patterns = extractPatterns(doneClaims, verdicts);
  const recommendations = buildRecommendations(doneClaims, verdicts, analytics);

  // Read test-spec for context
  const testSpecContent = await fs.readFile(testSpecFile, 'utf8').catch(() => '');
  const totalIterations = doneClaims.length;
  const dataQuality = totalIterations > 0 ? 100 : 0;

  // Section 1: Automated Validation Summary
  const validationRows = analytics.length > 0
    ? analytics.map((r) => `| ${r.iter} | ${r.us_id} | ${r.verdict} | ${r.worker_model} | ${r.duration}s |`)
    : ['| - | - | - | - | - |'];

  // Section 2: Failure Deep Dive
  const failedVerdicts = verdicts.filter((v) => v.verdict === 'fail');
  const failureLines = failedVerdicts.length > 0
    ? failedVerdicts.map((v) => {
      const issues = (v.issues ?? []).map((i) => ` - ${i.criterion_id ?? 'unknown'} [${i.severity ?? 'major'}]: ${i.summary ?? 'unspecified'}`).join('\n');
      return `### Iteration ${v.iteration ?? '?'} — ${v.us_id ?? 'unknown'}\n${issues || ' - No structured issues.'}`;
    })
    : ['No failed iterations.'];

  // Section 3: Worker Process Quality
  const wqLines = totalIterations > 0
    ? [
      `- Total iterations analyzed: ${workerQuality.total}`,
      `- Planning step: ${workerQuality.planPercent}%`,
      `- TDD compliance: ${workerQuality.tddPercent}%`,
      `- RED confirmation: ${workerQuality.redConfirmPercent}%`,
    ]
    : ['- No iteration data available.'];

  // Section 4: Verifier Judgment Quality
  const vqLines = verdicts.length > 0
    ? [
      `- Total verdicts analyzed: ${verifierQuality.total}`,
      `- Reasoning completeness: ${verifierQuality.reasoningPercent}%`,
      `- Independent verification: ${verifierQuality.independentPercent}%`,
    ]
    : ['- No verdict data available.'];

  // Section 5: AC Lifecycle
  const lifecycleEntries = Object.entries(acLifecycle);
  const acLines = lifecycleEntries.length > 0
    ? lifecycleEntries.map(([usId, lc]) =>
      `| ${usId} | ${lc.firstClaimed} | ${lc.firstVerified ?? '-'} | ${lc.reopenCount} | ${lc.finalStatus} |`)
    : ['| - | - | - | - | - |'];

  // Section 6: Test-Spec Adherence
  const specLines = testSpecContent
    ? [`Test spec present (${testSpecContent.split('\n').length} lines).`]
    : ['No test spec found.'];

  // Section 9: Cost & Performance
  const costLines = analytics.length > 0
    ? [
      `- Iteration records: ${analytics.length}`,
      `- Total duration: ${analytics.reduce((sum, r) => sum + Number(r.duration ?? 0), 0)}s`,
    ]
    : ['- No cost data available.'];

  // Version existing report. This happens before the header is built so the
  // header can state the real version.
  // NOTE(review): presumably versionFile archives any existing report to the
  // path produced by svReportVersionPath and returns that path (falsy when
  // there was nothing to archive) — confirm against versionFile.
  const reportPath = path.join(outputDir, 'self-verification-report.md');
  const versionedPath = await versionFile(reportPath, svReportVersionPath);
  const version = versionedPath ? Number(versionedPath.match(/-v(\d+)\.md$/)?.[1] ?? 0) + 1 : 1;

  // Build report
  const now = new Date().toISOString();
  const lines = [
    `# Campaign Self-Verification Report: ${slug}`,
    `Report Version: ${version} | Generated: ${now} | Campaign: ${slug}`,
    `Data Quality: ${dataQuality}% iterations complete`,
    '',
    '## 1. Automated Validation Summary',
    '| Iter | US | Verdict | Model | Duration |',
    '|------|-----|---------|-------|----------|',
    ...validationRows,
    '',
    '## 2. Failure Deep Dive',
    ...failureLines,
    '',
    '## 3. Worker Process Quality',
    ...wqLines,
    '',
    '## 4. Verifier Judgment Quality',
    ...vqLines,
    '',
    '## 5. AC Lifecycle',
    '| US | First Claimed | First Verified | Reopen Count | Final Status |',
    '|-----|--------------|----------------|--------------|--------------|',
    ...acLines,
    '',
    '## 6. Test-Spec Adherence',
    ...specLines,
    '',
    '## 7. Patterns: Strengths & Weaknesses',
    '### Strengths',
    ...patterns.strengths.map((s) => `- ${s}`),
    '### Weaknesses',
    ...patterns.weaknesses.map((w) => `- ${w}`),
    '',
    '## 8. Recommendations for Next Cycle',
    '### Brainstorm',
    ...recommendations.brainstorm.map((r) => `- ${r}`),
    '### PRD',
    ...recommendations.prd.map((r) => `- ${r}`),
    '### Test-Spec',
    ...recommendations.testSpec.map((r) => `- ${r}`),
    '',
    '## 9. Cost & Performance',
    ...costLines,
    '',
    '## 10. Blind Spots',
    '- Token counts are not available in tmux mode (estimated from file sizes).',
    '- Source code inspection findings are excluded unless marked [source-inspection].',
    '- Worker internal reasoning beyond execution_steps is not captured.',
    '',
  ];

  await fs.writeFile(reportPath, `${lines.join('\n')}\n`, 'utf8');

  // Write structured data
  const dataPath = path.join(outputDir, 'self-verification-data.json');
  await fs.writeFile(dataPath, `${JSON.stringify({
    slug,
    generated: now,
    worker_quality: workerQuality,
    verifier_quality: verifierQuality,
    ac_lifecycle: acLifecycle,
    patterns,
    recommendations,
    analytics_count: analytics.length,
  }, null, 2)}\n`, 'utf8');

  // Build summary for campaign report
  const passCount = verdicts.filter((v) => v.verdict === 'pass').length;
  const failCount = verdicts.filter((v) => v.verdict === 'fail').length;
  const summary = totalIterations > 0
    ? `SV report: ${totalIterations} iterations analyzed. TDD compliance: ${workerQuality.tddPercent}%. Pass/Fail: ${passCount}/${failCount}. Report: ${reportPath}`
    : `SV report generated with no iteration data. Report: ${reportPath}`;

  return { reportPath, version, summary };
}
|
|
592
|
+
|
|
229
593
|
export async function readStatus(slug, options = {}) {
|
|
230
594
|
const rootDir = path.resolve(options.rootDir ?? process.cwd());
|
|
231
595
|
const statusFile = path.join(rootDir, '.claude', 'ralph-desk', 'logs', slug, 'runtime', 'status.json');
|
package/src/node/run.mjs
CHANGED
|
@@ -21,6 +21,8 @@ const RUN_DEFAULTS = {
|
|
|
21
21
|
lockWorkerModel: false,
|
|
22
22
|
autonomous: false,
|
|
23
23
|
withSelfVerification: false,
|
|
24
|
+
flywheel: 'off',
|
|
25
|
+
flywheelModel: 'opus',
|
|
24
26
|
};
|
|
25
27
|
|
|
26
28
|
function write(stream, value) {
|
|
@@ -57,6 +59,8 @@ function buildHelpText() {
|
|
|
57
59
|
' --debug',
|
|
58
60
|
' --autonomous',
|
|
59
61
|
' --with-self-verification',
|
|
62
|
+
' --flywheel off|on-fail',
|
|
63
|
+
' --flywheel-model MODEL',
|
|
60
64
|
' --help',
|
|
61
65
|
].join('\n');
|
|
62
66
|
}
|
|
@@ -142,6 +146,14 @@ function parseRunOptions(args, cwd) {
|
|
|
142
146
|
case '--with-self-verification':
|
|
143
147
|
options.withSelfVerification = true;
|
|
144
148
|
break;
|
|
149
|
+
case '--flywheel':
|
|
150
|
+
options.flywheel = consumeValue(args, index, token);
|
|
151
|
+
index += 1;
|
|
152
|
+
break;
|
|
153
|
+
case '--flywheel-model':
|
|
154
|
+
options.flywheelModel = consumeValue(args, index, token);
|
|
155
|
+
index += 1;
|
|
156
|
+
break;
|
|
145
157
|
default:
|
|
146
158
|
throw new Error(`unknown option: ${token}`);
|
|
147
159
|
}
|