@ai-dev-methodologies/rlp-desk 0.15.3 → 0.15.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/CHANGELOG.md +98 -0
  2. package/README.md +34 -4
  3. package/docs/rlp-desk/failure-modes.md +191 -0
  4. package/package.json +10 -3
  5. package/src/node/MANIFEST.txt +3 -0
  6. package/src/node/prompts/prompt-assembler.mjs +2 -2
  7. package/src/node/run.mjs +70 -3
  8. package/src/node/runner/campaign-main-loop.mjs +97 -13
  9. package/src/node/util/debug-log.mjs +10 -6
  10. package/src/node/util/lifecycle-metrics.mjs +102 -0
  11. package/src/scripts/lib_ralph_desk.zsh +66 -0
  12. package/src/scripts/run_ralph_desk.zsh +23 -3
  13. package/docs/plans/bug-report-overhaul-backlog.md +0 -49
  14. package/docs/plans/bug-report-overhaul-v0.md +0 -238
  15. package/docs/plans/bug-report-overhaul-v1.md +0 -319
  16. package/docs/plans/native-agent-revert.md +0 -184
  17. package/docs/plans/polished-gliding-toucan.md +0 -234
  18. package/docs/plans/pr-e-phase-c1-blocked-recovery-hygiene-v0.md +0 -233
  19. package/docs/plans/spicy-booping-galaxy.md +0 -717
  20. package/docs/plans/strategic-review/rlp-desk-strategic-review.md +0 -125
  21. package/docs/plans/v0.15-stabilization-phase-a-prep.md +0 -130
  22. package/docs/plans/v0.15-stabilization-plan.md +0 -178
  23. package/docs/plans/v0.16-real-llm-sv-gate-spec.md +0 -177
  24. package/docs/rlp-desk/internal/verification-policy-gap-analysis.md +0 -523
  25. package/docs/rlp-desk/internal/verification-strategy-research.md +0 -2097
  26. package/docs/rlp-desk/plans/cozy-gliding-trinket.md +0 -53
  27. package/docs/rlp-desk/plans/frolicking-churning-honey.md +0 -253
  28. package/docs/rlp-desk/plans/keen-sauteeing-snowflake.md +0 -245
  29. package/docs/rlp-desk/plans/mutable-booping-corbato.md +0 -163
  30. package/docs/rlp-desk/plans/rlp-desk-0.11-handoff-7fixes.md +0 -352
  31. package/docs/rlp-desk/plans/rlp-desk-0.11.1-tmux-pane-disappearance.md +0 -260
  32. package/docs/rlp-desk/plans/rlp-desk-elegant-papert-agent-a8cd695ffca2a3ad8.md +0 -84
  33. package/docs/rlp-desk/plans/rlp-desk-elegant-papert.md +0 -270
  34. package/docs/rlp-desk/plans/rlp-desk-tmux-flywheel-routing.md +0 -730
  35. package/docs/rlp-desk/plans/toasty-whistling-diffie-agent-a6814625642e956da.md +0 -201
  36. package/docs/rlp-desk/plans/toasty-whistling-diffie.md +0 -117
  37. package/docs/rlp-desk/plans/validated-snacking-crayon.md +0 -204
  38. package/examples/calculator/.claude/ralph-desk/logs/loop-test/iter-001.worker-output.log +0 -0
  39. package/examples/calculator/.claude/ralph-desk/logs/loop-test/iter-001.worker-prompt.md +0 -38
  40. package/examples/calculator/.claude/ralph-desk/logs/loop-test/iter-001.worker-trigger.sh +0 -28
  41. package/examples/calculator/.claude/ralph-desk/logs/loop-test/session-config.json +0 -25
  42. package/examples/calculator/.claude/ralph-desk/logs/loop-test/status.json +0 -10
  43. package/examples/calculator/.claude/ralph-desk/logs/loop-test/worker-heartbeat.json +0 -1
package/CHANGELOG.md ADDED
@@ -0,0 +1,98 @@
1
+ # Changelog
2
+
3
+ All notable changes to `@ai-dev-methodologies/rlp-desk` are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and the project adheres to [Semantic Versioning](https://semver.org/).
4
+
5
+ For pre-v0.15.4 versions, refer to `git log` and individual GitHub release notes.
6
+
7
+ ## [Unreleased]
8
+
9
+ ### Planned (not yet shipped)
10
+ - Next patch candidate (originally targeted at v0.15.5, which shipped with the default still OFF): enable `RLP_LIFECYCLE_METRICS` by default (gated on 3 consecutive nightly real-LLM SV passes per `docs/plans/v0.15.4-release-runbook.md` §7.5.2).
11
+ - Post-v0.15.6: remove `RLP_LIFECYCLE_METRICS` flag entirely (per plan v3 ADR follow-ups).
12
+ - Phase D.1 (handoff documents) + Phase D.2 (per-stage agent role specialization) — both deferred per `docs/plans/v0.15.4-release-runbook.md` §7.6.
13
+
14
+ ## [0.15.5] — 2026-06-17
15
+
16
+ Patch: fixes surfaced by a fresh-context live dogfood of the tmux and agent run modes, plus packaging hygiene.
17
+
18
+ ### Fixed
19
+ - **Curl-install (`install.sh`) produced a Node leader that could not start.** `src/node/MANIFEST.txt` was missing three runtime modules, so a curl-installed `--mode agent` / Node leader threw `ERR_MODULE_NOT_FOUND` at startup. MANIFEST regenerated. (npm installs were unaffected.)
20
+ - **Tmux-mode leader could hang at startup.** A bare `> "$COST_LOG"` redirect runs zsh's `$NULLCMD` (`cat`), which blocks reading stdin when launched with an open TTY (interactive shell / tmux pane). Changed to `: > "$COST_LOG"`.
21
+ - **Worker sometimes printed the iteration signal instead of writing it.** The per-iteration prompt now explicitly instructs the worker to WRITE the verify signal to the iter-signal file (resolved path in tmux mode).
22
+ - **Clearer error for mis-leveled PRD user-story headings.** When user stories use `###` instead of `## US-NNN` (H2), the Node leader now hints at the correct heading level instead of a bare "No user stories found".
23
+
24
+ ### Added
25
+ - **`clean <slug> [--kill-session]` for the Node CLI.** Resets a campaign — removes sentinels, signal/claim/verdict files, and runtime state, while preserving the PRD, test-spec, prompts, memory, and reports. Previously unimplemented in the Node rewrite, which left a blocked campaign with no recovery path.
26
+
27
+ ### Changed
28
+ - **Smaller published package.** The tarball no longer ships internal research docs, dev planning handoffs, or example runtime state (73 → 53 files, ~364 → ~244 kB).
29
+
30
+ ## [0.15.4] — 2026-05-08
31
+
32
+ Phase B: tmux/process lifecycle hardening + observability + real-LLM SV strengthening. 4 sequential PRs (B1, B2-FIX, B4, B3) merged to main, plus pre-release audit fix branch addressing 16 findings (3 CRITICAL, 6 HIGH, 5 MEDIUM, 2 LOW).
33
+
34
+ ### Added
35
+ - `RLP_LIFECYCLE_METRICS=1` env flag enables structured tmux/process lifecycle telemetry. Five metrics emitted per iteration:
36
+ - `iter_signal_write_to_read_ms` — Worker FS write → leader poll resolve
37
+ - `verdict_write_to_read_ms` — Verifier FS write → leader poll resolve
38
+ - `pane_eof_to_cleanup_ms` — kill-start → `killPaneProcess` return
39
+ - `pane_reap_latency_ms` — done-claim observe → pane shell-idle
40
+ - `sentinel_lock_to_unlock_ms` — per-type, lock vs unlock pair
41
+ - Default OFF; zero overhead when unset.
42
+ - Lands in `debug.log` (LIFECYCLE category) and batched per iter into `campaign.jsonl.lifecycle_metrics`.
43
+ - `RLP_DESK_NODE_PATH` env override for SV scenarios. Lets the operator point bug-05 / bug-07 at a source-tree leader (`<repo>/src/node/run.mjs`) for pre-merge AC3.1a sampling.
44
+ - `B3_STAGE2_BLOCKING=1` env flag. Promotes B3 Stage 2 lifecycle-band assertions from non-blocking (informational) to release-blocking. Operator opts in after a 3-night PASS streak per release runbook §7.5.2.
45
+ - `docs/rlp-desk/failure-modes.md` — FMEA-style consolidated failure modes atlas (origin: omc-team Gotchas pattern). 14 entries across 6 categories.
46
+
47
+ ### Fixed
48
+ - Bug #5/7 done-claim race window (PR-B2-FIX). Worker pane is now reaped (`_kill_pane_process`) and `done-claim.json` sentinel locked (`chmod 0o444`) at the moment leader observes done-claim. Previously the pane lingered 30-120s post-write and could revise the artifact.
49
+ - Four substrate sites fixed: `run_ralph_desk.zsh` codex-exit synth path / A4 fallback inline path / post iter-signal reaper, `campaign-main-loop.mjs` Node leader worker reap.
50
+ - B3 Stage 2 jq false-PASS (audit C1). Pre-compute entry count via `jq flatten | length`; SKIP when zero entries instead of falsely matching `max=0` against the band.
51
+ - B3 scenarios circular pre-merge gate (audit C2). bug-05 / bug-07 honor `RLP_DESK_NODE_PATH`; pre-merge AC3.1a sample is now achievable.
52
+ - A2 dry-run placement (audit C3). Runbook splits into A2 (pre-bump: tolerate EPUBLISHCONFLICT, verify auth + tarball) and A2' (post-bump: strict exit-0 dry-run).
53
+ - A5 trigger-file oracle anchor (audit H1). Uses runtime-derived commit SHA of the prior version-bump commit, not a non-existent `vX.Y.Z` git tag.
54
+ - markLockStart timestamp inversion (audit H3). Moved BEFORE `lockSentinel` chmod in reapProducer so `sentinel_lock_to_unlock_ms` covers full lock duration including chmod execution.
55
+
56
+ ### Strengthened
57
+ - Real-LLM SV scenarios bug-05 (worker-dead-on-reuse) and bug-07 (post-sentinel-race) now run two-stage assertions when invoked with `RLP_LIFECYCLE_METRICS=1`:
58
+ - Stage 1 (presence): `lifecycle_metrics` field non-null in `campaign.jsonl`.
59
+ - Stage 2 (value): observed metrics within tolerance bands. Default NON-BLOCKING; flip to BLOCKING via `B3_STAGE2_BLOCKING=1`.
60
+ - bug-06 retains structural-only check ($0 cost; deterministic injection deferred to PR-B5 per ADR follow-ups).
61
+ - New unit tests:
62
+ - `tests/node/test-sentinel-reaper-invariant.test.mjs` — 6 invariant cases including B2-FIX primary target (case 5: done-claim ALIVE pane → reap).
63
+ - `tests/node/test-lifecycle-metrics.test.mjs` — 10 LifecycleMetricsCollector cases.
64
+ - `tests/node/test-campaign-jsonl-shape.test.mjs` — 4 shape contract cases (flag-on/off + sentinel context).
65
+ - `tests/node/test-b3-band-revalidation.test.mjs` — 17 cases for revalidation harness pure helpers (audit L2).
66
+ - `tests/node/us006-campaign-main-loop.test.mjs` — Bug-7-C-negative case added (audit M1).
67
+ - `tests/test_b2fix_sentinel_lock.sh` — 9 zsh PART-A code-pattern + PART-B helper-behavior assertions.
68
+ - sv-gate-fast: 48 → 71 (+23 guards across B2-FIX, B4, B3, audit fixes).
69
+ - Node test suite: 339 → 377 (+38 cases).
70
+
71
+ ### Documentation
72
+ - `docs/plans/v0.15-phase-b-lifecycle-audit.md` — B1 lifecycle audit (sentinel write-attribution, B4 metric proposal, ASCII diagrams). §4.5 appended with empirical revalidation update (audit H4).
73
+ - `docs/plans/v0.15-phase-b-plan-v3.md` — APPROVED ralplan plan v3 (Planner→Architect→Critic).
74
+ - `docs/plans/v0.15-phase-b3-revalidation-findings.md` — pre-merge band revalidation findings (synthetic vs empirical drift, refit table).
75
+ - `docs/plans/v0.15.4-pre-release-audit.md` — operator audit found 16 issues + 4 false positives. §9 per-finding fix status added.
76
+ - `docs/plans/v0.15.4-release-runbook.md` — release runbook with 7-phase pipeline (4 user gates), nightly schedule (§7.5), deferred follow-ups (§7.6 Phase D), failure-mode summary (§8).
77
+
78
+ ### Internal (packaging)
79
+ - npm tarball no longer ships internal planning documents (`docs/plans/`). User-facing reference docs at `docs/rlp-desk/` continue to ship unchanged. `package.json` `files` glob narrowed from `"docs/"` to `"docs/rlp-desk/"`. Saves ~280KB per install.
80
+
81
+ ### Migration notes
82
+ - No breaking changes. Existing 0.15.3 installations should upgrade smoothly via `npm install -g @ai-dev-methodologies/rlp-desk@0.15.4`. The postinstall script unlocks 0o444-protected files before overwriting (per CLAUDE.md upgrade-path EACCES guard).
83
+ - New `RLP_LIFECYCLE_METRICS` env flag defaults OFF — no behavior change for existing pipelines.
84
+ - Real-LLM SV scenarios accept new `RLP_DESK_NODE_PATH` env override but default to installed leader (backwards-compatible).
85
+
86
+ ## [0.15.3] — earlier release
87
+
88
+ See git log: `git log e0efaba` (chore: bump version to 0.15.3) for the 0.15.3 history.
89
+
90
+ ## Older versions
91
+
92
+ For changelog-style notes prior to 0.15.4, refer to:
93
+ - `git log <version-bump-commit>` for each `chore: bump version to X.Y.Z` commit
94
+ - GitHub Releases at https://github.com/ai-dev-methodologies/rlp-desk/releases
95
+ - `docs/plans/v0.15-stabilization-plan.md` in the source repository (not shipped in the npm package) for v0.15.x stabilization track context
96
+
97
+ [Unreleased]: https://github.com/ai-dev-methodologies/rlp-desk/compare/v0.15.5...HEAD
98
+ [0.15.4]: https://github.com/ai-dev-methodologies/rlp-desk/compare/v0.15.3...v0.15.4
package/README.md CHANGED
@@ -524,12 +524,42 @@ mkdir my-calc && cd my-calc
524
524
  /rlp-desk run loop-test
525
525
  ```
526
526
 
527
+ ## Lifecycle Observability (v0.15.4+)
528
+
529
+ Set `RLP_LIFECYCLE_METRICS=1` before invoking the runner to enable structured tmux/process lifecycle telemetry. Default: OFF (zero overhead when unset).
530
+
531
+ ```bash
532
+ RLP_LIFECYCLE_METRICS=1 node ~/.claude/ralph-desk/node/run.mjs run my-slug --mode tmux
533
+ ```
534
+
535
+ When enabled, five metrics are emitted per iteration:
536
+
537
+ | Metric | Meaning |
538
+ |---|---|
539
+ | `iter_signal_write_to_read_ms` | Worker FS write → leader poll resolve |
540
+ | `verdict_write_to_read_ms` | Verifier FS write → leader poll resolve |
541
+ | `pane_eof_to_cleanup_ms` | Kill-start → `killPaneProcess` return |
542
+ | `pane_reap_latency_ms` | done-claim observe → pane shell-idle |
543
+ | `sentinel_lock_to_unlock_ms` | per sentinel type, lock vs unlock pair |
544
+
545
+ **Where they land:**
546
+ - `debug.log` — `[LIFECYCLE]` tagged lines (per emission)
547
+ - `campaign.jsonl` — batched `lifecycle_metrics` object per iteration record (the canonical source)
548
+
549
+ **When to enable:**
550
+ - Investigating tmux race windows or leader-poll latency
551
+ - Pre-merge real-LLM SV scenarios (`bug-05` / `bug-07` two-stage assertions consume this telemetry)
552
+ - Long-running campaigns where lifecycle SLO tracking matters
553
+
554
+ **See also:** `docs/rlp-desk/failure-modes.md` for known race patterns the metrics catch.
555
+
527
556
  ## Documentation
528
557
 
529
- - [Architecture](docs/architecture.md) — Design philosophy, Agent() and tmux execution modes
530
- - [Getting Started](docs/getting-started.md) — Step-by-step tutorial with the calculator example
531
- - [Protocol Reference](docs/protocol-reference.md) — Full protocol specification
532
- - [Future Plans](docs/TODO-verification-next.md) — P3 items and upcoming features
558
+ - [Architecture](docs/rlp-desk/architecture.md) — Design philosophy, Agent() and tmux execution modes
559
+ - [Getting Started](docs/rlp-desk/getting-started.md) — Step-by-step tutorial with the calculator example
560
+ - [Protocol Reference](docs/rlp-desk/protocol-reference.md) — Full protocol specification
561
+ - [Failure Modes Atlas](docs/rlp-desk/failure-modes.md) — known failure patterns + recovery procedures
562
+ - [Future Plans](docs/rlp-desk/TODO-verification-next.md) — P3 items and upcoming features
533
563
 
534
564
  ## Contributing
535
565
 
package/docs/rlp-desk/failure-modes.md ADDED
@@ -0,0 +1,191 @@
1
+ # rlp-desk Failure Modes Atlas
2
+
3
+ > Origin: 2026-05-08 (audit B-NEW-1, derived from omc-team's "Gotchas" pattern). Single canonical reference for known failure modes across the rlp-desk substrate. Each entry is FMEA-style: cause → symptom → detection → recovery.
4
+
5
+ This atlas consolidates Bug #5/6/7/8/10 + lifecycle race + sentinel contention failure patterns. New failure modes are added here once verified, with a back-link to the originating bug report or audit doc.
6
+
7
+ ---
8
+
9
+ ## §1 — Subprocess lifecycle (tmux + Worker/Verifier panes)
10
+
11
+ ### F1.1 — Worker pane idle false-positive (Bug #6)
12
+ | Field | Value |
13
+ |---|---|
14
+ | Symptom | Leader marks worker as "no progress" while iter-signal.json was already written |
15
+ | Root cause | Worker TUI returns to idle prompt after writing sentinel; capture-pane shows stasis byte-equality without observing the FS write |
16
+ | Detection | `tests/test-bug6-worker-idle-false-positive.sh`; `_worker_pane_has_signal` short-circuit in `check_no_progress` |
17
+ | Recovery | Existing fix-M short-circuits BLOCKED escalation when iter-signal.json is present. No operator action required |
18
+ | Reference | `src/scripts/run_ralph_desk.zsh` `_worker_pane_has_signal` helper |
19
+
20
+ ### F1.2 — Post-sentinel pane race (Bug #7)
21
+ | Field | Value |
22
+ |---|---|
23
+ | Symptom | Verify-verdict.json mtime drifts 30-120s after leader observes it; iter-N+1 worker dispatched while iter-N verifier's pane is still alive |
24
+ | Root cause | Without explicit teardown, claude/codex TUI continues self-reviewing after sentinel write |
25
+ | Detection | `tests/sv-real-llm/scenarios/bug-07-post-sentinel-race.test.sh` (real-LLM), `tests/node/test-sentinel-reaper-invariant.test.mjs` (unit) |
26
+ | Recovery | `_kill_pane_process` (Bug #7 Fix-Q) at zsh `lib_ralph_desk.zsh:257-272` and Node `pane-manager.mjs:91-116`. `_lock_sentinel` (Fix-R) freezes the file mtime |
27
+ | Reference | Bug #7 PR-A; v0.15.4 PR-B2-FIX extends to done-claim sentinel |
28
+
29
+ ### F1.3 — done-claim race (v0.15.4 PR-B2-FIX target)
30
+ | Field | Value |
31
+ |---|---|
32
+ | Symptom | Worker writes done-claim.json then idles 30-120s before iter-signal.json. Worker may revise done-claim mid-flight; A4 fallback synthesizes signal from a stale done-claim |
33
+ | Root cause | Original Bug #7 Fix-Q only reaped at iter-signal observation. Done-claim was unguarded |
34
+ | Detection | `tests/node/test-sentinel-reaper-invariant.test.mjs` case 5 (done-claim ALIVE pane → kill) |
35
+ | Recovery | `_kill_pane_process` + `_lock_sentinel "$DONE_CLAIM_FILE"` at 4 substrate sites (3 zsh + 1 Node). See `docs/plans/v0.15-phase-b-plan-v3.md` §B2-FIX |
36
+ | Reference | v0.15.4 commit `2b5af6c`; audit `docs/plans/v0.15.4-pre-release-audit.md` §1 C2 |
37
+
38
+ ### F1.4 — Worker dead on reuse (Bug #5)
39
+ | Field | Value |
40
+ |---|---|
41
+ | Symptom | At iter-N+1 entry, leader dispatches into a previously-killed worker pane; tmux returns "can't find pane" |
42
+ | Root cause | `_r12_check_lifecycle` not enforced strictly enough between iters |
43
+ | Detection | `tests/sv-real-llm/scenarios/bug-05-worker-dead-on-reuse.test.sh`; `[r12]` log markers |
44
+ | Recovery | R12 lifecycle monitor at iter-entry: detect dead pane within 5s budget → either replace pane OR write BLOCKED with infra_failure (no silent advance) |
45
+ | Reference | Bug #5 BOS 2026-05-05 |
46
+
47
+ ### F1.5 — Worker incomplete with leader fallback (Bug #8)
48
+ | Field | Value |
49
+ |---|---|
50
+ | Symptom | Codex worker exits without writing iter-signal.json; leader synthesizes one from done-claim, but tree may be dirty |
51
+ | Root cause | Pre-Bug-#8: leader synthesized verify signal whenever done-claim existed, regardless of git state. Caused false PASSes when worker bailed mid-write |
52
+ | Detection | `_bug8_check_synth_allowed` 3-gate (done-claim present + git OK + tree clean); 4 BLOCK_TAGS variants |
53
+ | Recovery | Refuse synthesis on Gate 1/2/3 fail; write BLOCKED sentinel with appropriate failure_category (infra_failure / metric_failure) |
54
+ | Reference | Bug #8 PR-B; src/scripts/run_ralph_desk.zsh L644-695 |
55
+
56
+ ### F1.6 — Operator-recovery artifact mismatch (Bug #10)
57
+ | Field | Value |
58
+ |---|---|
59
+ | Symptom | Operator manually clears BLOCKED sentinel + writes iter-signal/done-claim, but artifacts mismatch status.json or have stale mtime |
60
+ | Root cause | No validation pass when leader resumes from operator-cleared BLOCKED state |
61
+ | Detection | `_validate_operator_recovery_artifacts` 5-gate (file exists, parses, us_id matches, iteration matches, mtime > prompt mtime); `tests/node/test-blocked-recovery-hygiene.test.mjs` |
62
+ | Recovery | Pre-resume validator returns 0 only when all 5 gates pass; sets `RECOVERY_FAIL_REASON` for caller logging on failure |
63
+ | Reference | PR-A Bug #10; lib_ralph_desk.zsh L298-380 |
64
+
65
+ ---
66
+
67
+ ## §2 — Sentinel file contention
68
+
69
+ ### F2.1 — Concurrent write-during-read window
70
+ | Field | Value |
71
+ |---|---|
72
+ | Symptom | Leader's `jq` parse on iter-signal.json fails with "unexpected EOF" when polled mid-write |
73
+ | Root cause | Worker writes sentinel non-atomically; leader's poll catches a partial state |
74
+ | Detection | "JSON not yet valid — continue polling" log entry; `tests/node/test-sentinel-exclusive.mjs` |
75
+ | Recovery | `writeSentinelExclusive` uses O_EXCL; `_lock_sentinel` chmod 0o444 prevents post-observe rewrite; jq -e parse retried on next poll tick |
76
+ | Reference | v5.7 §4.24 file-guarantee contract; sv-gate-fast §4.24 checks |
77
+
78
+ ### F2.2 — Locked sentinel blocks next iter's writer
79
+ | Field | Value |
80
+ |---|---|
81
+ | Symptom | Iter-N+1 worker EACCES on iter-signal.json write because iter-N lock (chmod 0o444) was never released |
82
+ | Root cause | `_unlock_sentinel` not invoked at iter-start |
83
+ | Detection | "Permission denied" in worker stderr; `lib_ralph_desk.zsh` lifecycle test |
84
+ | Recovery | `unlockSentinelFile(paths.signalFile)` + `unlockSentinelFile(paths.verdictFile)` called defensively at every iter start (campaign-main-loop.mjs L1552-1555) |
85
+ | Reference | v5.7 §4.25; campaign-main-loop.mjs unlockSentinelFile call sites |
86
+
87
+ ### F2.3 — Locked file orphans across upgrades
88
+ | Field | Value |
89
+ |---|---|
90
+ | Symptom | npm install of new rlp-desk version EACCES on previously-locked installed files |
91
+ | Root cause | Installed files chmod 0o444 from prior version; postinstall.js attempts straight overwrite |
92
+ | Detection | "EACCES: permission denied" during `npm install` |
93
+ | Recovery | `scripts/postinstall.js:163-167` walks installed dir, chmod 0o644 BEFORE copy; user-facing fallback documented in S1 runbook (`npm uninstall -g` first) |
94
+ | Reference | scripts/postinstall.js unlock-walk; v0.15.4 S1 rollback runbook |
95
+
96
+ ---
97
+
98
+ ## §3 — Telemetry & observability (v0.15.4+)
99
+
100
+ ### F3.1 — lifecycle_metrics field absent (B4 telemetry regression)
101
+ | Field | Value |
102
+ |---|---|
103
+ | Symptom | `campaign.jsonl.lifecycle_metrics` is null even when `RLP_LIFECYCLE_METRICS=1` |
104
+ | Root cause | `LifecycleMetricsCollector` instantiated with wrong env (e.g. options.env shadows process.env) |
105
+ | Detection | `tests/node/test-campaign-jsonl-shape.test.mjs` AC4.3 (flag-set populated case); B3 Stage 1 presence assertion |
106
+ | Recovery | Inject explicit collector via `options.lifecycleMetrics`, OR set `env: { RLP_LIFECYCLE_METRICS: '1' }` in run() options |
107
+ | Reference | v0.15.4 PR-B4 + audit fix C2 |
108
+
109
+ ### F3.2 — Stage 2 false-PASS on absent metric
110
+ | Field | Value |
111
+ |---|---|
112
+ | Symptom | B3 Stage 2 assertion PASSes on band check even when telemetry never emitted |
113
+ | Root cause | jq query collapsed `null|empty` to `max=0`; band check `0 ≤ band` always true |
114
+ | Detection | `tests/node/test-b3-band-revalidation.test.mjs` percentile + bucket cases |
115
+ | Recovery | Pre-compute `entry_count` via flatten\|length; SKIP when 0; only run band check on non-empty data |
116
+ | Reference | v0.15.4 audit C1 fix; commit `21e12ed` |
117
+
118
+ ### F3.3 — sentinel_lock_to_unlock_ms unmeasurable for done-claim
119
+ | Field | Value |
120
+ |---|---|
121
+ | Symptom | Metric never emits for done-claim sentinel even though lock IS applied |
122
+ | Root cause | Production happy-path never calls `unlockSentinelFile(doneClaimFile)`; only signalFile + verdictFile are unlocked at iter start |
123
+ | Detection | Inspection: `lifecycle_metrics.sentinel_lock_to_unlock_ms` array contains iter-signal.json + verify-verdict.json entries but never done-claim.json |
124
+ | Recovery | Documented in `lifecycle-metrics.mjs` markLockStart() — done-claim intentionally excluded from this metric. Future: emit at lib_ralph_desk.zsh:602 archival site if needed |
125
+ | Reference | v0.15.4 audit H2; commit `feb1701` |
126
+
127
+ ### F3.4 — Synthetic baseline drift from production
128
+ | Field | Value |
129
+ |---|---|
130
+ | Symptom | B1 §4.2 synthetic numbers differ from B3 empirical p95 by >25% |
131
+ | Root cause | Synthetic anchored to zsh leader's POLL_INTERVAL=5s; production scenarios run via Node leader (100ms poll). Different leader, different floor |
132
+ | Detection | `tests/sv-real-llm/lib/b3-band-revalidation.mjs` runs 5-iter sandbox + compares |
133
+ | Recovery | Refit `B3_BAND_*_MS` constants in `tests/sv-real-llm/lib/b3-lifecycle-assertions.sh`. See revalidation findings doc |
134
+ | Reference | v0.15.4 audit H4; revalidation doc `docs/plans/v0.15-phase-b3-revalidation-findings.md` |
135
+
136
+ ---
137
+
138
+ ## §4 — Release / packaging
139
+
140
+ ### F4.1 — A2 dry-run before version bump
141
+ | Field | Value |
142
+ |---|---|
143
+ | Symptom | `npm publish --dry-run` exits non-zero with EPUBLISHCONFLICT |
144
+ | Root cause | Plan v6 placed A2 in preflight before Step 2 (version bump). With package.json still at prior version, dry-run targets registry-existing version |
145
+ | Detection | First-time observed in 2026-05-07 release attempt; documented as plan defect |
146
+ | Recovery | Split into A2 (pre-bump: tolerate EPUBLISHCONFLICT exit; verify tarball assembled) + A2' (post-bump: strict exit-0 dry-run) |
147
+ | Reference | v0.15.4 audit C3; runbook §1 + §2 step 2.5 |
148
+
149
+ ### F4.2 — Internal docs leak in npm tarball
150
+ | Field | Value |
151
+ |---|---|
152
+ | Symptom | npm-published tarball ships `docs/plans/*` (internal audit + planning) totaling ~280KB |
153
+ | Root cause | package.json `files` glob entry `"docs/"` was overly broad |
154
+ | Detection | `npm pack --dry-run \| grep "docs/plans"` (M5 verification command) |
155
+ | Recovery | Narrow glob to `"docs/rlp-desk/"`. postinstall.js syncs only from `docs/rlp-desk/`, so this is safe |
156
+ | Reference | v0.15.4 audit M5; commit `d26421e` |
157
+
158
+ ---
159
+
160
+ ## §5 — Add new entries
161
+
162
+ When a new failure mode is identified:
163
+ 1. Pick the smallest existing §N category that fits (or §6 if none)
164
+ 2. Use the same field schema (Symptom / Root cause / Detection / Recovery / Reference)
165
+ 3. Cross-link to the originating bug doc, audit, or test
166
+ 4. Optional: add a sv-gate-fast grep guard to enforce the recovery contract
167
+
168
+ When a failure mode is permanently retired:
169
+ 1. Move to §7 "Retired" (do NOT delete — historical reference)
170
+ 2. Note retirement reason (e.g., "design changed in v0.16; replaced by ...")
171
+
172
+ ---
173
+
174
+ ## §6 — Open issues (no recovery yet)
175
+
176
+ ### F6.1 — us006 real-tmux boundary test flakiness
177
+ | Field | Value |
178
+ |---|---|
179
+ | Symptom | `tests/node/us006-campaign-main-loop.test.mjs` AC6.1 boundary case (real tmux session w/ 4 panes) intermittently fails 2-5 of 377 Node suite tests on first run; passes cleanly on retry |
180
+ | Root cause | Real tmux session creation + pane process spawn race: `tmux send-keys` may fire before pane's shell is fully ready, causing `can't find pane` warnings (which are non-fatal but timing-sensitive assertions occasionally trip) |
181
+ | Detection | Observed 2026-05-07 + 2026-05-08 in v0.15.4 release pipeline preflight; first-run fail-count varies 2-5 of 377 |
182
+ | Recovery | Runbook §2 S2 retry-once policy: re-run `npm run test:node`. Second run consistently 377/377 PASS |
183
+ | Reference | v0.15.4 release pipeline observation; runbook §7.5.3 Stage 2 INFO-band-exceeded path is unrelated but uses same retry-once mental model |
184
+
185
+ **Why "open"**: the flake is in the test, not in production code. Adding a retry-once to the `npm run test:node` script would mask actual regressions. A better fix is to redesign the AC6.1 boundary test to use `wait-for-pane-ready` synchronization before sending keys. Deferred to a future v0.15.x patch — not release-blocking.
186
+
187
+ ---
188
+
189
+ ## §7 — Retired
190
+
191
+ (none as of 2026-05-08)
package/package.json CHANGED
@@ -1,10 +1,12 @@
1
1
  {
2
2
  "name": "@ai-dev-methodologies/rlp-desk",
3
- "version": "0.15.3",
3
+ "version": "0.15.5",
4
4
  "description": "Fresh-context iterative loops for Claude Code — autonomous task completion with independent verification",
5
5
  "scripts": {
6
6
  "postinstall": "node scripts/postinstall.js",
7
7
  "uninstall": "node scripts/uninstall.js",
8
+ "manifest:check": "node scripts/build-node-manifest.js --check",
9
+ "prepublishOnly": "node scripts/build-node-manifest.js --check",
8
10
  "test:node": "node --test 'tests/node/*.mjs' 'tests/node/*.test.mjs'",
9
11
  "test:zsh": "for f in tests/test_*.sh; do echo \"=== $f ===\"; zsh \"$f\" || exit 1; done",
10
12
  "test:fast": "npm run test:node",
@@ -19,10 +21,15 @@
19
21
  "src/governance.md",
20
22
  "src/model-upgrade-table.md",
21
23
  "scripts/",
22
- "docs/",
23
- "examples/",
24
+ "docs/rlp-desk/*.md",
25
+ "docs/rlp-desk/blueprints/",
26
+ "examples/calculator/.claude/ralph-desk/context/",
27
+ "examples/calculator/.claude/ralph-desk/memos/",
28
+ "examples/calculator/.claude/ralph-desk/plans/",
29
+ "examples/calculator/.claude/ralph-desk/prompts/",
24
30
  "install.sh",
25
31
  "README.md",
32
+ "CHANGELOG.md",
26
33
  "LICENSE"
27
34
  ],
28
35
  "keywords": [
package/src/node/MANIFEST.txt CHANGED
@@ -7,9 +7,12 @@ reporting/campaign-reporting.mjs
7
7
  run.mjs
8
8
  runner/campaign-main-loop.mjs
9
9
  runner/leader-registry.mjs
10
+ runner/prompt-detector.mjs
10
11
  runner/prompt-dismisser.mjs
11
12
  shared/fs.mjs
12
13
  shared/paths.mjs
13
14
  tmux/pane-manager.mjs
14
15
  util/debug-log.mjs
16
+ util/desk-root.mjs
17
+ util/lifecycle-metrics.mjs
15
18
  util/shell-quote.mjs
package/src/node/prompts/prompt-assembler.mjs CHANGED
@@ -150,8 +150,8 @@ export async function assembleWorkerPrompt({
150
150
  } else {
151
151
  promptLines.push(`- **Test Spec**: Read \`${fullTestSpecPath}\` (full — find ${nextUs} section)`);
152
152
  }
153
- promptLines.push(`When done, signal verify with us_id="${nextUs}" (not "ALL").`);
154
- promptLines.push(`Signal format: {"iteration": N, "status": "verify", "us_id": "${nextUs}", ...}`);
153
+ promptLines.push(`When done, you MUST WRITE (not just print) the verify signal to the iter-signal FILE — Path: \`memos/<slug>-iter-signal.json\` (see the MANDATORY signal-file instruction in the base prompt).`);
154
+ promptLines.push(`Write this exact JSON to that file (us_id="${nextUs}", not "ALL"): {"iteration": N, "status": "verify", "us_id": "${nextUs}", "summary": "what was done", "timestamp": "ISO"}`);
155
155
  promptLines.push('');
156
156
  promptLines.push(`**Update the campaign memory's 'Next Iteration Contract' to reflect ${nextUs}.**`);
157
157
  } else if (verifiedUs.length > 0) {
package/src/node/run.mjs CHANGED
@@ -1,7 +1,7 @@
1
1
  import fs from 'node:fs';
2
2
  import os from 'node:os';
3
3
  import path from 'node:path';
4
- import { spawn } from 'node:child_process';
4
+ import { spawn, spawnSync } from 'node:child_process';
5
5
  import { fileURLToPath } from 'node:url';
6
6
 
7
7
  import { initCampaign } from './init/campaign-initializer.mjs';
@@ -9,6 +9,7 @@ import { readStatus } from './reporting/campaign-reporting.mjs';
9
9
  import {
10
10
  run as runCampaignMain,
11
11
  detectLegacyDeskInRunMode,
12
+ buildPaths,
12
13
  } from './runner/campaign-main-loop.mjs';
13
14
  import { isClaudeEngine } from './cli/command-builder.mjs';
14
15
 
@@ -51,7 +52,7 @@ function buildHelpText() {
51
52
  ' run <slug> [options] Run loop (tmux=zsh leader [production], agent=Node leader [deprecated alpha], native=slash-only error)',
52
53
  ' status <slug> Show loop status',
53
54
  ' logs <slug> [N] Show iteration log (not implemented in the Node rewrite yet)',
54
- ' clean <slug> [--kill-session] Reset for re-run (not implemented in the Node rewrite yet)',
55
+ ' clean <slug> [--kill-session] Reset for re-run (removes sentinels + runtime/; preserves PRD/prompts/memory)',
55
56
  ' resume <slug> Resume loop (not implemented in the Node rewrite yet)',
56
57
  '',
57
58
  'Run Options:',
@@ -217,6 +218,71 @@ async function runStatusCommand(args, deps) {
217
218
  return 0;
218
219
  }
219
220
 
221
+ // D-6 (dogfood): real `clean` for the Node leader. Previously "not implemented",
222
+ // which left a blocked campaign with NO recovery path (a transient parse error
223
+ // wrote a blocked sentinel that bricked re-runs). Removes the transient/terminal
224
+ // state (sentinels, signal/claim/verdict, runtime/) while PRESERVING the durable
225
+ // inputs (PRD, test-spec, prompts, context, memory) and the campaign report.
226
+ async function runCleanCommand(args, deps) {
227
+ if (args.length === 0 || args[0] === '--help') {
228
+ write(deps.stdout, 'Usage: node src/node/run.mjs clean <slug> [--kill-session]');
229
+ return 0;
230
+ }
231
+ const slug = args[0];
232
+ const killSession = args.includes('--kill-session');
233
+ const paths = buildPaths(deps.cwd, slug);
234
+
235
+ // --kill-session: read the session name from runtime/session-config.json
236
+ // BEFORE removing runtime, then best-effort tmux teardown.
237
+ if (killSession) {
238
+ try {
239
+ const cfgPath = path.join(paths.runtimeDir, 'session-config.json');
240
+ if (deps.fileExists(cfgPath)) {
241
+ const cfg = JSON.parse(fs.readFileSync(cfgPath, 'utf8'));
242
+ if (cfg && cfg.session_name) {
243
+ spawnSync('tmux', ['kill-session', '-t', cfg.session_name], { stdio: 'ignore' });
244
+ }
245
+ }
246
+ } catch { /* best-effort */ }
247
+ }
248
+
249
+ const transient = [
250
+ paths.blockedSentinel,
251
+ paths.blockedSentinel.replace(/\.md$/, '.json'),
252
+ paths.completeSentinel,
253
+ paths.completeSentinel.replace(/\.md$/, '.json'),
254
+ paths.signalFile,
255
+ paths.doneClaimFile,
256
+ paths.verdictFile,
257
+ paths.flywheelSignalFile,
258
+ paths.flywheelGuardVerdictFile,
259
+ ];
260
+ let removed = 0;
261
+ for (const target of transient) {
262
+ try {
263
+ if (fs.existsSync(target)) {
264
+ // Sentinels may be chmod 0o444 (write-lock); relax before unlink.
265
+ try { fs.chmodSync(target, 0o644); } catch { /* ignore */ }
266
+ fs.rmSync(target, { force: true });
267
+ removed += 1;
268
+ }
269
+ } catch { /* best-effort per-file */ }
270
+ }
271
+ try {
272
+ if (fs.existsSync(paths.runtimeDir)) {
273
+ fs.rmSync(paths.runtimeDir, { recursive: true, force: true });
274
+ removed += 1;
275
+ }
276
+ } catch { /* best-effort */ }
277
+
278
+ write(
279
+ deps.stdout,
280
+ `Cleaned ${slug}: removed ${removed} transient artifact(s) (sentinels, signal/claim/verdict, runtime/`
281
+ + `${killSession ? ', tmux session' : ''}). Preserved PRD, test-spec, prompts, context, memory, reports.`,
282
+ );
283
+ return 0;
284
+ }
285
+
220
286
  // v0.14.0: Default location of the zsh runner installed by postinstall.js
221
287
  // (Phase 3 of the v0.14.0 plan re-enables this sync). Overridable via
222
288
  // RLP_DESK_ZSH_RUNNER for development checkouts that point to src/scripts.
@@ -466,9 +532,10 @@ export async function main(argv = process.argv.slice(2), overrides = {}) {
466
532
  return await runRunCommand(rest, deps);
467
533
  case 'status':
468
534
  return await runStatusCommand(rest, deps);
535
+ case 'clean':
536
+ return await runCleanCommand(rest, deps);
469
537
  case 'brainstorm':
470
538
  case 'logs':
471
- case 'clean':
472
539
  case 'resume':
473
540
  throw new Error(`${command} is not implemented in the Node rewrite yet`);
474
541
  default: