@ai-dev-methodologies/rlp-desk 0.15.2 → 0.15.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +82 -0
- package/README.md +34 -4
- package/docs/rlp-desk/failure-modes.md +191 -0
- package/package.json +3 -2
- package/src/node/runner/campaign-main-loop.mjs +174 -11
- package/src/node/util/debug-log.mjs +10 -6
- package/src/node/util/lifecycle-metrics.mjs +102 -0
- package/src/scripts/lib_ralph_desk.zsh +141 -0
- package/src/scripts/run_ralph_desk.zsh +44 -0
- package/docs/plans/bug-report-overhaul-backlog.md +0 -49
- package/docs/plans/bug-report-overhaul-v0.md +0 -238
- package/docs/plans/bug-report-overhaul-v1.md +0 -319
- package/docs/plans/native-agent-revert.md +0 -184
- package/docs/plans/polished-gliding-toucan.md +0 -234
- package/docs/plans/spicy-booping-galaxy.md +0 -717
- package/docs/plans/strategic-review/rlp-desk-strategic-review.md +0 -125
- package/docs/plans/v0.15-stabilization-plan.md +0 -178
package/CHANGELOG.md
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to `@ai-dev-methodologies/rlp-desk` are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and the project adheres to [Semantic Versioning](https://semver.org/).
|
|
4
|
+
|
|
5
|
+
For pre-v0.15.4 versions, refer to `git log` and individual GitHub release notes.
|
|
6
|
+
|
|
7
|
+
## [Unreleased]
|
|
8
|
+
|
|
9
|
+
### Planned (not yet shipped)
|
|
10
|
+
- v0.15.5 candidate: flip `RLP_LIFECYCLE_METRICS=1` default to ON (gated on 3 consecutive nightly real-LLM SV passes per `docs/plans/v0.15.4-release-runbook.md` §7.5.2).
|
|
11
|
+
- Post-v0.15.6: remove `RLP_LIFECYCLE_METRICS` flag entirely (per plan v3 ADR follow-ups).
|
|
12
|
+
- Phase D.1 (handoff documents) + Phase D.2 (per-stage agent role specialization) — both deferred per `docs/plans/v0.15.4-release-runbook.md` §7.6.
|
|
13
|
+
|
|
14
|
+
## [0.15.4] — 2026-05-08 (pending release)
|
|
15
|
+
|
|
16
|
+
Phase B: tmux/process lifecycle hardening + observability + real-LLM SV strengthening. 4 sequential PRs (B1, B2-FIX, B4, B3) merged to main, plus pre-release audit fix branch addressing 16 findings (3 CRITICAL, 6 HIGH, 5 MEDIUM, 2 LOW).
|
|
17
|
+
|
|
18
|
+
### Added
|
|
19
|
+
- `RLP_LIFECYCLE_METRICS=1` env flag enables structured tmux/process lifecycle telemetry. Five metrics emitted per iteration:
|
|
20
|
+
- `iter_signal_write_to_read_ms` — Worker FS write → leader poll resolve
|
|
21
|
+
- `verdict_write_to_read_ms` — Verifier FS write → leader poll resolve
|
|
22
|
+
- `pane_eof_to_cleanup_ms` — kill-start → `killPaneProcess` return
|
|
23
|
+
- `pane_reap_latency_ms` — done-claim observe → pane shell-idle
|
|
24
|
+
- `sentinel_lock_to_unlock_ms` — per-type, lock vs unlock pair
|
|
25
|
+
- Default OFF; zero overhead when unset.
|
|
26
|
+
- Lands in `debug.log` (LIFECYCLE category) and batched per iter into `campaign.jsonl.lifecycle_metrics`.
|
|
27
|
+
- `RLP_DESK_NODE_PATH` env override for SV scenarios. Lets the operator point bug-05 / bug-07 at a source-tree leader (`<repo>/src/node/run.mjs`) for pre-merge AC3.1a sampling.
|
|
28
|
+
- `B3_STAGE2_BLOCKING=1` env flag. Promotes B3 Stage 2 lifecycle-band assertions from non-blocking (informational) to release-blocking. Operator opts in after a 3-night PASS streak per release runbook §7.5.2.
|
|
29
|
+
- `docs/rlp-desk/failure-modes.md` — FMEA-style consolidated failure modes atlas (origin: omc-team Gotchas pattern). 16 entries across 5 populated categories (§1–§4 and §6).
|
|
30
|
+
|
|
31
|
+
### Fixed
|
|
32
|
+
- Bug #5/7 done-claim race window (PR-B2-FIX). Worker pane is now reaped (`_kill_pane_process`) and `done-claim.json` sentinel locked (`chmod 0o444`) at the moment leader observes done-claim. Previously the pane lingered 30-120s post-write and could revise the artifact.
|
|
33
|
+
- Four substrate sites fixed: `run_ralph_desk.zsh` codex-exit synth path / A4 fallback inline path / post iter-signal reaper, `campaign-main-loop.mjs` Node leader worker reap.
|
|
34
|
+
- B3 Stage 2 jq false-PASS (audit C1). Pre-compute entry count via `jq flatten | length`; SKIP when zero entries instead of falsely matching `max=0` against the band.
|
|
35
|
+
- B3 scenarios circular pre-merge gate (audit C2). bug-05 / bug-07 honor `RLP_DESK_NODE_PATH`; pre-merge AC3.1a sample is now achievable.
|
|
36
|
+
- A2 dry-run placement (audit C3). Runbook splits into A2 (pre-bump: tolerate EPUBLISHCONFLICT, verify auth + tarball) and A2' (post-bump: strict exit-0 dry-run).
|
|
37
|
+
- A5 trigger-file oracle anchor (audit H1). Uses runtime-derived commit SHA of the prior version-bump commit, not a non-existent `vX.Y.Z` git tag.
|
|
38
|
+
- markLockStart timestamp inversion (audit H3). Moved BEFORE `lockSentinel` chmod in reapProducer so `sentinel_lock_to_unlock_ms` covers full lock duration including chmod execution.
|
|
39
|
+
|
|
40
|
+
### Strengthened
|
|
41
|
+
- Real-LLM SV scenarios bug-05 (worker-dead-on-reuse) and bug-07 (post-sentinel-race) now run two-stage assertions when invoked with `RLP_LIFECYCLE_METRICS=1`:
|
|
42
|
+
- Stage 1 (presence): `lifecycle_metrics` field non-null in `campaign.jsonl`.
|
|
43
|
+
- Stage 2 (value): observed metrics within tolerance bands. Default NON-BLOCKING; flip to BLOCKING via `B3_STAGE2_BLOCKING=1`.
|
|
44
|
+
- bug-06 retains structural-only check ($0 cost; deterministic injection deferred to PR-B5 per ADR follow-ups).
|
|
45
|
+
- New unit tests:
|
|
46
|
+
- `tests/node/test-sentinel-reaper-invariant.test.mjs` — 6 invariant cases including B2-FIX primary target (case 5: done-claim ALIVE pane → reap).
|
|
47
|
+
- `tests/node/test-lifecycle-metrics.test.mjs` — 10 LifecycleMetricsCollector cases.
|
|
48
|
+
- `tests/node/test-campaign-jsonl-shape.test.mjs` — 4 shape contract cases (flag-on/off + sentinel context).
|
|
49
|
+
- `tests/node/test-b3-band-revalidation.test.mjs` — 17 cases for revalidation harness pure helpers (audit L2).
|
|
50
|
+
- `tests/node/us006-campaign-main-loop.test.mjs` — Bug-7-C-negative case added (audit M1).
|
|
51
|
+
- `tests/test_b2fix_sentinel_lock.sh` — 9 zsh PART-A code-pattern + PART-B helper-behavior assertions.
|
|
52
|
+
- sv-gate-fast: 48 → 71 (+23 guards across B2-FIX, B4, B3, audit fixes).
|
|
53
|
+
- Node test suite: 339 → 377 (+38 cases).
|
|
54
|
+
|
|
55
|
+
### Documentation
|
|
56
|
+
- `docs/plans/v0.15-phase-b-lifecycle-audit.md` — B1 lifecycle audit (sentinel write-attribution, B4 metric proposal, ASCII diagrams). §4.5 appended with empirical revalidation update (audit H4).
|
|
57
|
+
- `docs/plans/v0.15-phase-b-plan-v3.md` — APPROVED ralplan plan v3 (Planner→Architect→Critic).
|
|
58
|
+
- `docs/plans/v0.15-phase-b3-revalidation-findings.md` — pre-merge band revalidation findings (synthetic vs empirical drift, refit table).
|
|
59
|
+
- `docs/plans/v0.15.4-pre-release-audit.md` — operator audit found 16 issues + 4 false positives. §9 per-finding fix status added.
|
|
60
|
+
- `docs/plans/v0.15.4-release-runbook.md` — release runbook with 7-phase pipeline (4 user gates), nightly schedule (§7.5), deferred follow-ups (§7.6 Phase D), failure-mode summary (§8).
|
|
61
|
+
|
|
62
|
+
### Internal (packaging)
|
|
63
|
+
- npm tarball no longer ships internal planning documents (`docs/plans/`). User-facing reference docs at `docs/rlp-desk/` continue to ship unchanged. `package.json` `files` glob narrowed from `"docs/"` to `"docs/rlp-desk/"`. Saves ~280KB per install.
|
|
64
|
+
|
|
65
|
+
### Migration notes
|
|
66
|
+
- No breaking changes. Existing 0.15.3 installations should upgrade smoothly via `npm install -g @ai-dev-methodologies/rlp-desk@0.15.4`. The postinstall script unlocks 0o444-protected files before overwriting (per CLAUDE.md upgrade-path EACCES guard).
|
|
67
|
+
- New `RLP_LIFECYCLE_METRICS` env flag defaults OFF — no behavior change for existing pipelines.
|
|
68
|
+
- Real-LLM SV scenarios accept new `RLP_DESK_NODE_PATH` env override but default to installed leader (backwards-compatible).
|
|
69
|
+
|
|
70
|
+
## [0.15.3] — earlier release
|
|
71
|
+
|
|
72
|
+
See git log: `git log e0efaba` (chore: bump version to 0.15.3) for the 0.15.3 history.
|
|
73
|
+
|
|
74
|
+
## Older versions
|
|
75
|
+
|
|
76
|
+
For changelog-style notes prior to 0.15.4, refer to:
|
|
77
|
+
- `git log <version-bump-commit>` for each `chore: bump version to X.Y.Z` commit
|
|
78
|
+
- GitHub Releases at https://github.com/ai-dev-methodologies/rlp-desk/releases
|
|
79
|
+
- `docs/plans/v0.15-stabilization-plan.md` (source repository only — `docs/plans/` is no longer shipped in the npm tarball as of 0.15.4) for v0.15.x stabilization track context
|
|
80
|
+
|
|
81
|
+
[Unreleased]: https://github.com/ai-dev-methodologies/rlp-desk/compare/v0.15.4...HEAD
|
|
82
|
+
[0.15.4]: https://github.com/ai-dev-methodologies/rlp-desk/compare/v0.15.3...v0.15.4
|
package/README.md
CHANGED
|
@@ -524,12 +524,42 @@ mkdir my-calc && cd my-calc
|
|
|
524
524
|
/rlp-desk run loop-test
|
|
525
525
|
```
|
|
526
526
|
|
|
527
|
+
## Lifecycle Observability (v0.15.4+)
|
|
528
|
+
|
|
529
|
+
Set `RLP_LIFECYCLE_METRICS=1` before invoking the runner to enable structured tmux/process lifecycle telemetry. Default: OFF (zero overhead when unset).
|
|
530
|
+
|
|
531
|
+
```bash
|
|
532
|
+
RLP_LIFECYCLE_METRICS=1 node ~/.claude/ralph-desk/node/run.mjs run my-slug --mode tmux
|
|
533
|
+
```
|
|
534
|
+
|
|
535
|
+
When enabled, five metrics are emitted per iteration:
|
|
536
|
+
|
|
537
|
+
| Metric | Meaning |
|
|
538
|
+
|---|---|
|
|
539
|
+
| `iter_signal_write_to_read_ms` | Worker FS write → leader poll resolve |
|
|
540
|
+
| `verdict_write_to_read_ms` | Verifier FS write → leader poll resolve |
|
|
541
|
+
| `pane_eof_to_cleanup_ms` | Kill-start → `killPaneProcess` return |
|
|
542
|
+
| `pane_reap_latency_ms` | done-claim observe → pane shell-idle |
|
|
543
|
+
| `sentinel_lock_to_unlock_ms` | per sentinel type, lock vs unlock pair |
|
|
544
|
+
|
|
545
|
+
**Where they land:**
|
|
546
|
+
- `debug.log` — `[LIFECYCLE]` tagged lines (per emission)
|
|
547
|
+
- `campaign.jsonl` — batched `lifecycle_metrics` object per iteration record (canonical authoritative source)
|
|
548
|
+
|
|
549
|
+
**When to enable:**
|
|
550
|
+
- Investigating tmux race windows or leader-poll latency
|
|
551
|
+
- Pre-merge real-LLM SV scenarios (`bug-05` / `bug-07` two-stage assertions consume this telemetry)
|
|
552
|
+
- Long-running campaigns where lifecycle SLO tracking matters
|
|
553
|
+
|
|
554
|
+
**See also:** `docs/rlp-desk/failure-modes.md` for known race patterns the metrics catch.
|
|
555
|
+
|
|
527
556
|
## Documentation
|
|
528
557
|
|
|
529
|
-
- [Architecture](docs/architecture.md) — Design philosophy, Agent() and tmux execution modes
|
|
530
|
-
- [Getting Started](docs/getting-started.md) — Step-by-step tutorial with the calculator example
|
|
531
|
-
- [Protocol Reference](docs/protocol-reference.md) — Full protocol specification
|
|
532
|
-
- [Future Plans](docs/TODO-verification-next.md) — P3 items and upcoming features
|
|
558
|
+
- [Architecture](docs/rlp-desk/architecture.md) — Design philosophy, Agent() and tmux execution modes
|
|
559
|
+
- [Getting Started](docs/rlp-desk/getting-started.md) — Step-by-step tutorial with the calculator example
|
|
560
|
+
- [Protocol Reference](docs/rlp-desk/protocol-reference.md) — Full protocol specification
|
|
561
|
+
- [Failure Modes Atlas](docs/rlp-desk/failure-modes.md) — known failure patterns + recovery procedures
|
|
562
|
+
- [Future Plans](docs/rlp-desk/TODO-verification-next.md) — P3 items and upcoming features
|
|
533
563
|
|
|
534
564
|
## Contributing
|
|
535
565
|
|
|
package/docs/rlp-desk/failure-modes.md
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
# rlp-desk Failure Modes Atlas
|
|
2
|
+
|
|
3
|
+
> Origin: 2026-05-08 (audit B-NEW-1, derived from omc-team's "Gotchas" pattern). Single canonical reference for known failure modes across the rlp-desk substrate. Each entry is FMEA-style: cause → symptom → detection → recovery.
|
|
4
|
+
|
|
5
|
+
This atlas consolidates Bug #5/6/7/8/10 + lifecycle race + sentinel contention failure patterns. New failure modes are added here once verified, with a back-link to the originating bug report or audit doc.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## §1 — Subprocess lifecycle (tmux + Worker/Verifier panes)
|
|
10
|
+
|
|
11
|
+
### F1.1 — Worker pane idle false-positive (Bug #6)
|
|
12
|
+
| Field | Value |
|
|
13
|
+
|---|---|
|
|
14
|
+
| Symptom | Leader marks worker as "no progress" while iter-signal.json was already written |
|
|
15
|
+
| Root cause | Worker TUI returns to idle prompt after writing sentinel; capture-pane shows stasis byte-equality without observing the FS write |
|
|
16
|
+
| Detection | `tests/test-bug6-worker-idle-false-positive.sh`; `_worker_pane_has_signal` short-circuit in `check_no_progress` |
|
|
17
|
+
| Recovery | Existing fix-M short-circuits BLOCKED escalation when iter-signal.json is present. No operator action required |
|
|
18
|
+
| Reference | `src/scripts/run_ralph_desk.zsh` `_worker_pane_has_signal` helper |
|
|
19
|
+
|
|
20
|
+
### F1.2 — Post-sentinel pane race (Bug #7)
|
|
21
|
+
| Field | Value |
|
|
22
|
+
|---|---|
|
|
23
|
+
| Symptom | Verify-verdict.json mtime drifts 30-120s after leader observes it; iter-N+1 worker dispatched while iter-N verifier's pane is still alive |
|
|
24
|
+
| Root cause | Without explicit teardown, claude/codex TUI continues self-reviewing after sentinel write |
|
|
25
|
+
| Detection | `tests/sv-real-llm/scenarios/bug-07-post-sentinel-race.test.sh` (real-LLM), `tests/node/test-sentinel-reaper-invariant.test.mjs` (unit) |
|
|
26
|
+
| Recovery | `_kill_pane_process` (Bug #7 Fix-Q) at zsh `lib_ralph_desk.zsh:257-272` and Node `pane-manager.mjs:91-116`. `_lock_sentinel` (Fix-R) freezes the file mtime |
|
|
27
|
+
| Reference | Bug #7 PR-A; v0.15.4 PR-B2-FIX extends to done-claim sentinel |
|
|
28
|
+
|
|
29
|
+
### F1.3 — done-claim race (v0.15.4 PR-B2-FIX target)
|
|
30
|
+
| Field | Value |
|
|
31
|
+
|---|---|
|
|
32
|
+
| Symptom | Worker writes done-claim.json then idles 30-120s before iter-signal.json. Worker may revise done-claim mid-flight; A4 fallback synthesizes signal from a stale done-claim |
|
|
33
|
+
| Root cause | Original Bug #7 Fix-Q only reaped at iter-signal observation. Done-claim was unguarded |
|
|
34
|
+
| Detection | `tests/node/test-sentinel-reaper-invariant.test.mjs` case 5 (done-claim ALIVE pane → kill) |
|
|
35
|
+
| Recovery | `_kill_pane_process` + `_lock_sentinel "$DONE_CLAIM_FILE"` at 4 substrate sites (3 zsh + 1 Node). See `docs/plans/v0.15-phase-b-plan-v3.md` §B2-FIX |
|
|
36
|
+
| Reference | v0.15.4 commit `2b5af6c`; audit `docs/plans/v0.15.4-pre-release-audit.md` §1 C2 |
|
|
37
|
+
|
|
38
|
+
### F1.4 — Worker dead on reuse (Bug #5)
|
|
39
|
+
| Field | Value |
|
|
40
|
+
|---|---|
|
|
41
|
+
| Symptom | At iter-N+1 entry, leader dispatches into a previously-killed worker pane; tmux returns "can't find pane" |
|
|
42
|
+
| Root cause | `_r12_check_lifecycle` not enforced strictly enough between iters |
|
|
43
|
+
| Detection | `tests/sv-real-llm/scenarios/bug-05-worker-dead-on-reuse.test.sh`; `[r12]` log markers |
|
|
44
|
+
| Recovery | R12 lifecycle monitor at iter-entry: detect dead pane within 5s budget → either replace pane OR write BLOCKED with infra_failure (no silent advance) |
|
|
45
|
+
| Reference | Bug #5 BOS 2026-05-05 |
|
|
46
|
+
|
|
47
|
+
### F1.5 — Worker incomplete with leader fallback (Bug #8)
|
|
48
|
+
| Field | Value |
|
|
49
|
+
|---|---|
|
|
50
|
+
| Symptom | Codex worker exits without writing iter-signal.json; leader synthesizes one from done-claim, but tree may be dirty |
|
|
51
|
+
| Root cause | Pre-Bug-#8: leader synthesized verify signal whenever done-claim existed, regardless of git state. Caused false PASSes when worker bailed mid-write |
|
|
52
|
+
| Detection | `_bug8_check_synth_allowed` 3-gate (done-claim present + git OK + tree clean); 4 BLOCK_TAGS variants |
|
|
53
|
+
| Recovery | Refuse synthesis on Gate 1/2/3 fail; write BLOCKED sentinel with appropriate failure_category (infra_failure / metric_failure) |
|
|
54
|
+
| Reference | Bug #8 PR-B; src/scripts/run_ralph_desk.zsh L644-695 |
|
|
55
|
+
|
|
56
|
+
### F1.6 — Operator-recovery artifact mismatch (Bug #10)
|
|
57
|
+
| Field | Value |
|
|
58
|
+
|---|---|
|
|
59
|
+
| Symptom | Operator manually clears BLOCKED sentinel + writes iter-signal/done-claim, but artifacts mismatch status.json or have stale mtime |
|
|
60
|
+
| Root cause | No validation pass when leader resumes from operator-cleared BLOCKED state |
|
|
61
|
+
| Detection | `_validate_operator_recovery_artifacts` 5-gate (file exists, parses, us_id matches, iteration matches, mtime > prompt mtime); `tests/node/test-blocked-recovery-hygiene.test.mjs` |
|
|
62
|
+
| Recovery | Pre-resume validator returns 0 only when all 5 gates pass; sets `RECOVERY_FAIL_REASON` for caller logging on failure |
|
|
63
|
+
| Reference | PR-A Bug #10; lib_ralph_desk.zsh L298-380 |
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
## §2 — Sentinel file contention
|
|
68
|
+
|
|
69
|
+
### F2.1 — Concurrent write-during-read window
|
|
70
|
+
| Field | Value |
|
|
71
|
+
|---|---|
|
|
72
|
+
| Symptom | Leader's `jq` parse on iter-signal.json fails with "unexpected EOF" when polled mid-write |
|
|
73
|
+
| Root cause | Worker writes sentinel non-atomically; leader's poll catches a partial state |
|
|
74
|
+
| Detection | "JSON not yet valid — continue polling" log entry; `tests/node/test-sentinel-exclusive.mjs` |
|
|
75
|
+
| Recovery | `writeSentinelExclusive` uses O_EXCL; `_lock_sentinel` chmod 0o444 prevents post-observe rewrite; jq -e parse retried on next poll tick |
|
|
76
|
+
| Reference | v5.7 §4.24 file-guarantee contract; sv-gate-fast §4.24 checks |
|
|
77
|
+
|
|
78
|
+
### F2.2 — Locked sentinel blocks next iter's writer
|
|
79
|
+
| Field | Value |
|
|
80
|
+
|---|---|
|
|
81
|
+
| Symptom | Iter-N+1 worker EACCES on iter-signal.json write because iter-N lock (chmod 0o444) was never released |
|
|
82
|
+
| Root cause | `_unlock_sentinel` not invoked at iter-start |
|
|
83
|
+
| Detection | "Permission denied" in worker stderr; `lib_ralph_desk.zsh` lifecycle test |
|
|
84
|
+
| Recovery | `unlockSentinelFile(paths.signalFile)` + `unlockSentinelFile(paths.verdictFile)` called defensively at every iter start (campaign-main-loop.mjs L1552-1555) |
|
|
85
|
+
| Reference | v5.7 §4.25; campaign-main-loop.mjs unlockSentinelFile call sites |
|
|
86
|
+
|
|
87
|
+
### F2.3 — Locked file orphans across upgrades
|
|
88
|
+
| Field | Value |
|
|
89
|
+
|---|---|
|
|
90
|
+
| Symptom | npm install of new rlp-desk version EACCES on previously-locked installed files |
|
|
91
|
+
| Root cause | Installed files chmod 0o444 from prior version; postinstall.js attempts straight overwrite |
|
|
92
|
+
| Detection | "EACCES: permission denied" during `npm install` |
|
|
93
|
+
| Recovery | `scripts/postinstall.js:163-167` walks installed dir, chmod 0o644 BEFORE copy; user-facing fallback documented in S1 runbook (`npm uninstall -g` first) |
|
|
94
|
+
| Reference | scripts/postinstall.js unlock-walk; v0.15.4 S1 rollback runbook |
|
|
95
|
+
|
|
96
|
+
---
|
|
97
|
+
|
|
98
|
+
## §3 — Telemetry & observability (v0.15.4+)
|
|
99
|
+
|
|
100
|
+
### F3.1 — lifecycle_metrics field absent (B4 telemetry regression)
|
|
101
|
+
| Field | Value |
|
|
102
|
+
|---|---|
|
|
103
|
+
| Symptom | `campaign.jsonl.lifecycle_metrics` is null even when `RLP_LIFECYCLE_METRICS=1` |
|
|
104
|
+
| Root cause | `LifecycleMetricsCollector` instantiated with wrong env (e.g. options.env shadows process.env) |
|
|
105
|
+
| Detection | `tests/node/test-campaign-jsonl-shape.test.mjs` AC4.3 (flag-set populated case); B3 Stage 1 presence assertion |
|
|
106
|
+
| Recovery | Inject explicit collector via `options.lifecycleMetrics`, OR set `env: { RLP_LIFECYCLE_METRICS: '1' }` in run() options |
|
|
107
|
+
| Reference | v0.15.4 PR-B4 + audit fix C2 |
|
|
108
|
+
|
|
109
|
+
### F3.2 — Stage 2 false-PASS on absent metric
|
|
110
|
+
| Field | Value |
|
|
111
|
+
|---|---|
|
|
112
|
+
| Symptom | B3 Stage 2 assertion PASSes on band check even when telemetry never emitted |
|
|
113
|
+
| Root cause | jq query collapsed `null|empty` to `max=0`; band check `0 ≤ band` always true |
|
|
114
|
+
| Detection | `tests/node/test-b3-band-revalidation.test.mjs` percentile + bucket cases |
|
|
115
|
+
| Recovery | Pre-compute `entry_count` via flatten\|length; SKIP when 0; only run band check on non-empty data |
|
|
116
|
+
| Reference | v0.15.4 audit C1 fix; commit `21e12ed` |
|
|
117
|
+
|
|
118
|
+
### F3.3 — sentinel_lock_to_unlock_ms unmeasurable for done-claim
|
|
119
|
+
| Field | Value |
|
|
120
|
+
|---|---|
|
|
121
|
+
| Symptom | Metric never emits for done-claim sentinel even though lock IS applied |
|
|
122
|
+
| Root cause | Production happy-path never calls `unlockSentinelFile(doneClaimFile)`; only signalFile + verdictFile are unlocked at iter start |
|
|
123
|
+
| Detection | Inspection: `lifecycle_metrics.sentinel_lock_to_unlock_ms` array contains iter-signal.json + verify-verdict.json entries but never done-claim.json |
|
|
124
|
+
| Recovery | Documented in `lifecycle-metrics.mjs` markLockStart() — done-claim intentionally excluded from this metric. Future: emit at lib_ralph_desk.zsh:602 archival site if needed |
|
|
125
|
+
| Reference | v0.15.4 audit H2; commit `feb1701` |
|
|
126
|
+
|
|
127
|
+
### F3.4 — Synthetic baseline drift from production
|
|
128
|
+
| Field | Value |
|
|
129
|
+
|---|---|
|
|
130
|
+
| Symptom | B1 §4.2 synthetic numbers differ from B3 empirical p95 by >25% |
|
|
131
|
+
| Root cause | Synthetic anchored to zsh leader's POLL_INTERVAL=5s; production scenarios run via Node leader (100ms poll). Different leader, different floor |
|
|
132
|
+
| Detection | `tests/sv-real-llm/lib/b3-band-revalidation.mjs` runs 5-iter sandbox + compares |
|
|
133
|
+
| Recovery | Refit `B3_BAND_*_MS` constants in `tests/sv-real-llm/lib/b3-lifecycle-assertions.sh`. See revalidation findings doc |
|
|
134
|
+
| Reference | v0.15.4 audit H4; revalidation doc `docs/plans/v0.15-phase-b3-revalidation-findings.md` |
|
|
135
|
+
|
|
136
|
+
---
|
|
137
|
+
|
|
138
|
+
## §4 — Release / packaging
|
|
139
|
+
|
|
140
|
+
### F4.1 — A2 dry-run before version bump
|
|
141
|
+
| Field | Value |
|
|
142
|
+
|---|---|
|
|
143
|
+
| Symptom | `npm publish --dry-run` exits non-zero with EPUBLISHCONFLICT |
|
|
144
|
+
| Root cause | Plan v6 placed A2 in preflight before Step 2 (version bump). With package.json still at prior version, dry-run targets registry-existing version |
|
|
145
|
+
| Detection | First-time observed in 2026-05-07 release attempt; documented as plan defect |
|
|
146
|
+
| Recovery | Split into A2 (pre-bump: tolerate EPUBLISHCONFLICT exit; verify tarball assembled) + A2' (post-bump: strict exit-0 dry-run) |
|
|
147
|
+
| Reference | v0.15.4 audit C3; runbook §1 + §2 step 2.5 |
|
|
148
|
+
|
|
149
|
+
### F4.2 — Internal docs leak in npm tarball
|
|
150
|
+
| Field | Value |
|
|
151
|
+
|---|---|
|
|
152
|
+
| Symptom | npm-published tarball ships `docs/plans/*` (internal audit + planning) totaling ~280KB |
|
|
153
|
+
| Root cause | package.json `files` glob entry `"docs/"` was overly broad |
|
|
154
|
+
| Detection | `npm pack --dry-run \| grep "docs/plans"` (M5 verification command) |
|
|
155
|
+
| Recovery | Narrow glob to `"docs/rlp-desk/"`. postinstall.js syncs only from `docs/rlp-desk/`, so this is safe |
|
|
156
|
+
| Reference | v0.15.4 audit M5; commit `d26421e` |
|
|
157
|
+
|
|
158
|
+
---
|
|
159
|
+
|
|
160
|
+
## §5 — Add new entries
|
|
161
|
+
|
|
162
|
+
When a new failure mode is identified:
|
|
163
|
+
1. Pick the smallest existing §N category that fits (or §6 if none)
|
|
164
|
+
2. Use the same field schema (Symptom / Root cause / Detection / Recovery / Reference)
|
|
165
|
+
3. Cross-link to the originating bug doc, audit, or test
|
|
166
|
+
4. Optional: add a sv-gate-fast grep guard to enforce the recovery contract
|
|
167
|
+
|
|
168
|
+
When a failure mode is permanently retired:
|
|
169
|
+
1. Move to §7 "Retired" (do NOT delete — historical reference)
|
|
170
|
+
2. Note retirement reason (e.g., "design changed in v0.16; replaced by ...")
|
|
171
|
+
|
|
172
|
+
---
|
|
173
|
+
|
|
174
|
+
## §6 — Open issues (no recovery yet)
|
|
175
|
+
|
|
176
|
+
### F6.1 — us006 real-tmux boundary test flakiness
|
|
177
|
+
| Field | Value |
|
|
178
|
+
|---|---|
|
|
179
|
+
| Symptom | `tests/node/us006-campaign-main-loop.test.mjs` AC6.1 boundary case (real tmux session w/ 4 panes) intermittently fails 2-5 of 377 Node suite tests on first run; passes cleanly on retry |
|
|
180
|
+
| Root cause | Real tmux session creation + pane process spawn race: `tmux send-keys` may fire before pane's shell is fully ready, causing `can't find pane` warnings (which are non-fatal but timing-sensitive assertions occasionally trip) |
|
|
181
|
+
| Detection | Observed 2026-05-07 + 2026-05-08 in v0.15.4 release pipeline preflight; first-run fail-count varies 2-5 of 377 |
|
|
182
|
+
| Recovery | Runbook §2 S2 retry-once policy: re-run `npm run test:node`. Second run consistently 377/377 PASS |
|
|
183
|
+
| Reference | v0.15.4 release pipeline observation; runbook §7.5.3 Stage 2 INFO-band-exceeded path is unrelated but uses same retry-once mental model |
|
|
184
|
+
|
|
185
|
+
**Why "open"**: the flake is in the test, not in production code. Adding retry-once to npm test:node script would mask actual regressions. Better fix: redesign the AC6.1 boundary test to use `wait-for-pane-ready` synchronization before sending keys. Deferred to a future v0.15.x patch — not release-blocking.
|
|
186
|
+
|
|
187
|
+
---
|
|
188
|
+
|
|
189
|
+
## §7 — Retired
|
|
190
|
+
|
|
191
|
+
(none as of 2026-05-08)
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ai-dev-methodologies/rlp-desk",
|
|
3
|
-
"version": "0.15.2",
|
|
3
|
+
"version": "0.15.4",
|
|
4
4
|
"description": "Fresh-context iterative loops for Claude Code — autonomous task completion with independent verification",
|
|
5
5
|
"scripts": {
|
|
6
6
|
"postinstall": "node scripts/postinstall.js",
|
|
@@ -19,10 +19,11 @@
|
|
|
19
19
|
"src/governance.md",
|
|
20
20
|
"src/model-upgrade-table.md",
|
|
21
21
|
"scripts/",
|
|
22
|
-
"docs/",
|
|
22
|
+
"docs/rlp-desk/",
|
|
23
23
|
"examples/",
|
|
24
24
|
"install.sh",
|
|
25
25
|
"README.md",
|
|
26
|
+
"CHANGELOG.md",
|
|
26
27
|
"LICENSE"
|
|
27
28
|
],
|
|
28
29
|
"keywords": [
|
|
package/src/node/runner/campaign-main-loop.mjs
CHANGED
|
@@ -32,6 +32,8 @@ import {
|
|
|
32
32
|
generateSVReport,
|
|
33
33
|
prepareCampaignAnalytics,
|
|
34
34
|
} from '../reporting/campaign-reporting.mjs';
|
|
35
|
+
import { LifecycleMetricsCollector } from '../util/lifecycle-metrics.mjs';
|
|
36
|
+
import { makeDebugLogger } from '../util/debug-log.mjs';
|
|
35
37
|
import {
|
|
36
38
|
createPane as defaultCreatePane,
|
|
37
39
|
killPaneProcess as defaultKillPaneProcess,
|
|
@@ -133,6 +135,10 @@ function buildPaths(rootDir, slug, env = process.env) {
|
|
|
133
135
|
flywheelGuardPromptFile: path.join(deskRoot, 'prompts', `${slug}.flywheel-guard.prompt.md`),
|
|
134
136
|
flywheelGuardVerdictFile: path.join(deskRoot, 'memos', `${slug}-flywheel-guard-verdict.json`),
|
|
135
137
|
laneAuditFile: path.join(campaignLogDir, 'lane-audit.json'),
|
|
138
|
+
// v0.15.4 PR-B4: structured debug.log. log_lifecycle_metric (zsh) and
|
|
139
|
+
// LifecycleMetricsCollector (Node) both emit here when
|
|
140
|
+
// RLP_LIFECYCLE_METRICS=1.
|
|
141
|
+
debugLogFile: path.join(campaignLogDir, 'debug.log'),
|
|
136
142
|
};
|
|
137
143
|
}
|
|
138
144
|
|
|
@@ -492,7 +498,74 @@ async function _validateOperatorRecoveryArtifacts({ paths, state }) {
|
|
|
492
498
|
return { ok: true, reason: 'all five checks passed' };
|
|
493
499
|
}
|
|
494
500
|
|
|
495
|
-
|
|
501
|
+
// PR-E (Phase C1, stabilization): operator-cleared BLOCKED recovery.
|
|
502
|
+
// When operator manually deletes <slug>-blocked.md to recover (a documented
|
|
503
|
+
// flow), counters in status.json (consecutive_failures / consecutive_blocks)
|
|
504
|
+
// stay populated. Without this branch, leader relaunches with stale counters
|
|
505
|
+
// and may immediately re-BLOCK on the first failure even though operator's
|
|
506
|
+
// intent was a fresh start. Pair to PR-A (phase=verify recovery, Bug #10).
|
|
507
|
+
//
|
|
508
|
+
// 4-check validator. Returns { ok, reason }. On any failure, caller falls
|
|
509
|
+
// through to existing behavior — defensive default, never auto-recovers
|
|
510
|
+
// against ambiguous state.
|
|
511
|
+
//
|
|
512
|
+
// Check 4 reads <slug>-blocked.json sidecar (NOT status.json), because
|
|
513
|
+
// status.json never persists `last_block_reason` (blocked-write code path
|
|
514
|
+
// at L920-968 doesn't write that field). The sidecar DOES carry
|
|
515
|
+
// `recoverable: bool` per _classifyBlock contract — that's the canonical
|
|
516
|
+
// non-recoverable signal.
|
|
517
|
+
/**
 * Decide whether an operator-cleared BLOCKED state may be auto-recovered.
 *
 * Runs four gates in order and stops at the first one that fails:
 *   1. status phase must be 'blocked'
 *   2. the blocked sentinel (.md) must no longer exist
 *   3. at least one of the consecutive failure/block counters is non-zero
 *   4. the `.json` sidecar next to the sentinel must not say `recoverable: false`
 *
 * @param {{ paths: { blockedSentinel: string }, state: object }} args
 * @returns {Promise<{ ok: boolean, reason: string }>} `ok: true` only when all
 *   four gates pass; `reason` always carries a human-readable explanation.
 */
async function _validateBlockedRecovery({ paths, state }) {
  const reject = (reason) => ({ ok: false, reason });

  // Gate 1: only a persisted 'blocked' phase is eligible.
  if (state.phase !== 'blocked') {
    return reject(`state.phase is ${state.phase}, not 'blocked'`);
  }

  // Gate 2: the operator must have removed the sentinel file.
  const sentinelStillPresent = await exists(paths.blockedSentinel);
  if (sentinelStillPresent) {
    return reject('blocked sentinel still present (operator did not clear)');
  }

  // Gate 3: missing counters count as zero; all-zero means nothing to reset.
  const counters = [state.consecutive_failures ?? 0, state.consecutive_blocks ?? 0];
  if (counters.every((count) => count === 0)) {
    return reject('counters already zero, nothing to recover');
  }

  // Gate 4: consult the sidecar. The path is derived outside the try so that
  // only the read itself is treated as a "parse error".
  const sidecarFile = paths.blockedSentinel.replace(/\.md$/, '.json');
  let sidecarData = null;
  try {
    // NOTE(review): presumably yields a falsy value when the file is absent — confirm in helper.
    sidecarData = await readJsonIfExists(sidecarFile);
  } catch (err) {
    // Unreadable sidecar: refuse rather than guess.
    return reject(`blocked.json sidecar parse error: ${err?.message ?? err}`);
  }

  if (sidecarData?.recoverable === false) {
    const category = sidecarData.reason_category ?? 'unknown';
    return reject(`non-recoverable category ${category} from sidecar (use clean to reset)`);
  }

  return { ok: true, reason: 'sidecar absent or recoverable=true; recovery permitted' };
}
|
|
549
|
+
|
|
550
|
+
// PR-E helper: rename the recovered sidecar so operator can audit what was
|
|
551
|
+
// recovered from. Best-effort — failure here is non-fatal.
|
|
552
|
+
/**
 * Best-effort archive of the blocked-state sidecar after a recovery.
 *
 * Renames `<sentinel>.json` to `<sentinel>.json.recovered-<timestamp>`, where
 * the timestamp is the current ISO-8601 time with ':' and '.' replaced by '-'.
 *
 * The rename is attempted directly instead of probing for the file first:
 * a separate existence check leaves a window in which the sidecar can vanish,
 * which would turn a harmless "nothing to archive" into a logged error.
 * A missing sidecar (ENOENT) is therefore a silent no-op; any other failure
 * is logged to stderr and swallowed, because archiving must never abort the
 * recovery itself.
 *
 * @param {{ blockedSentinel: string }} paths - campaign paths; only
 *   `blockedSentinel` (the `.md` sentinel path) is read.
 * @returns {Promise<void>}
 */
async function _archiveRecoveredSidecar(paths) {
  const sidecarPath = paths.blockedSentinel.replace(/\.md$/, '.json');
  const iso = new Date().toISOString().replace(/[:.]/g, '-');
  const archivePath = `${sidecarPath}.recovered-${iso}`;
  try {
    await fs.rename(sidecarPath, archivePath);
  } catch (err) {
    // No sidecar on disk: nothing to archive, not an error.
    if (err?.code === 'ENOENT') return;
    console.error(`[recovery] failed to archive sidecar: ${err?.message ?? err}`);
  }
}
|
|
563
|
+
|
|
564
|
+
async function appendIterationAnalytics(paths, state, usId, verdict, options, lifecycleMetrics = null) {
|
|
565
|
+
// v0.15.4 PR-B4: lifecycle_metrics field — null when flag unset (collector
|
|
566
|
+
// returns null), object grouped by metric name when flag set. Test:
|
|
567
|
+
// tests/node/test-campaign-jsonl-shape.test.mjs.
|
|
568
|
+
const lifecycleSnapshot = lifecycleMetrics ? lifecycleMetrics.flush() : null;
|
|
496
569
|
await appendCampaignAnalytics(paths.analyticsFile, {
|
|
497
570
|
iter: state.iteration,
|
|
498
571
|
us_id: usId,
|
|
@@ -501,6 +574,7 @@ async function appendIterationAnalytics(paths, state, usId, verdict, options) {
|
|
|
501
574
|
verdict,
|
|
502
575
|
duration: 0,
|
|
503
576
|
timestamp: toIso(resolveNow(options.now)),
|
|
577
|
+
lifecycle_metrics: lifecycleSnapshot,
|
|
504
578
|
});
|
|
505
579
|
}
|
|
506
580
|
|
|
@@ -1107,7 +1181,7 @@ async function runFinalSequentialVerify({
|
|
|
1107
1181
|
});
|
|
1108
1182
|
|
|
1109
1183
|
if (typeof reapProducer === 'function') {
|
|
1110
|
-
await reapProducer(verifierPaneId, paths.verdictFile);
|
|
1184
|
+
await reapProducer(verifierPaneId, paths.verdictFile, 'verify-verdict');
|
|
1111
1185
|
}
|
|
1112
1186
|
|
|
1113
1187
|
if (verdict.verdict !== 'pass') {
|
|
@@ -1305,8 +1379,20 @@ async function _runCampaignBody(slug, options, paths, rootDir) {
|
|
|
1305
1379
|
const killPaneProcess = options.killPaneProcess ?? defaultKillPaneProcess;
|
|
1306
1380
|
const lockSentinel = options.lockSentinelFile ?? defaultLockSentinelFile;
|
|
1307
1381
|
const stampAckField = options.stampAckField ?? defaultStampAckField;
|
|
1308
|
-
|
|
1382
|
+
// v0.15.4 PR-B4: lifecycle observability collector. Tests inject
|
|
1383
|
+
// options.lifecycleMetrics for shape-contract verification; production
|
|
1384
|
+
// path constructs from process.env (RLP_LIFECYCLE_METRICS=1 enables).
|
|
1385
|
+
const debugLogger = makeDebugLogger(paths.debugLogFile);
|
|
1386
|
+
const lifecycleMetrics = options.lifecycleMetrics ?? new LifecycleMetricsCollector({
|
|
1387
|
+
env: options.env ?? process.env,
|
|
1388
|
+
debugLog: (cat, fields) => debugLogger(cat, fields),
|
|
1389
|
+
});
|
|
1390
|
+
const reapProducer = async (paneId, sentinelFile, sentinelType = null) => {
|
|
1309
1391
|
if (!paneId) return;
|
|
1392
|
+
// v0.15.4 PR-B4: pane_eof_to_cleanup_ms = wallclock from kill-start to
|
|
1393
|
+
// killPaneProcess return. pane_reap_latency_ms tracks the same window
|
|
1394
|
+
// when the trigger was a sentinel observation (i.e. sentinelType set).
|
|
1395
|
+
const reapStart = Date.now();
|
|
1310
1396
|
await killPaneProcess(paneId, {
|
|
1311
1397
|
sendRawKey,
|
|
1312
1398
|
waitForExit: waitForProcessExit,
|
|
@@ -1321,7 +1407,22 @@ async function _runCampaignBody(slug, options, paths, rootDir) {
|
|
|
1321
1407
|
} catch (err) {
|
|
1322
1408
|
console.error(`[handshake] waitForProcessExit failed on ${paneId} (${err?.message ?? err}); continuing`);
|
|
1323
1409
|
}
|
|
1410
|
+
const reapMs = Date.now() - reapStart;
|
|
1411
|
+
lifecycleMetrics.record('pane_eof_to_cleanup_ms', reapMs, { pane_id: paneId });
|
|
1412
|
+
if (sentinelType) {
|
|
1413
|
+
lifecycleMetrics.record('pane_reap_latency_ms', reapMs, {
|
|
1414
|
+
pane_id: paneId,
|
|
1415
|
+
sentinel_type: sentinelType,
|
|
1416
|
+
});
|
|
1417
|
+
}
|
|
1324
1418
|
if (sentinelFile) {
|
|
1419
|
+
// v0.15.4 audit H3 fix: markLockStart BEFORE lockSentinel so the
|
|
1420
|
+
// sentinel_lock_to_unlock_ms metric covers the full lock duration
|
|
1421
|
+
// including chmod 0o444 execution time. Previous code recorded
|
|
1422
|
+
// post-chmod timestamp — sub-ms skew but semantically inverted.
|
|
1423
|
+
// v0.15.4 PR-B4: open lock-to-unlock pair tracking. markUnlock fires
|
|
1424
|
+
// at unlockSentinelFile call sites or end-of-iter for never-unlocked.
|
|
1425
|
+
lifecycleMetrics.markLockStart(path.basename(sentinelFile));
|
|
1325
1426
|
await lockSentinel(sentinelFile, { log: (msg) => console.error(msg) });
|
|
1326
1427
|
// PR-0b-narrow AC-H2: stamp the leader_ack audit field. Best-effort,
|
|
1327
1428
|
// does not block subsequent dispatch.
|
|
@@ -1414,6 +1515,33 @@ async function _runCampaignBody(slug, options, paths, rootDir) {
|
|
|
1414
1515
|
}
|
|
1415
1516
|
}
|
|
1416
1517
|
|
|
1518
|
+
// PR-E (Phase C1, stabilization): operator-cleared BLOCKED recovery.
|
|
1519
|
+
// Pair to PR-A above. PR-E runs AFTER PR-A so phase=verify takes precedence
|
|
1520
|
+
// when both apply (defensive ordering: never auto-recover phase=blocked if
|
|
1521
|
+
// the operator's actual intent was phase=verify hygiene). Does NOT use
|
|
1522
|
+
// _skipNextWorkerDispatch — counters reset is enough; worker dispatches
|
|
1523
|
+
// normally on the next iteration with a clean state.
|
|
1524
|
+
if (state.phase === 'blocked' && !state._skipNextWorkerDispatch) {
|
|
1525
|
+
const validation = await _validateBlockedRecovery({ paths, state });
|
|
1526
|
+
if (validation.ok) {
|
|
1527
|
+
const previousReason = state.last_block_reason ?? '';
|
|
1528
|
+
console.error(
|
|
1529
|
+
`[recovery] Operator-cleared BLOCKED detected (was: ${previousReason || 'unrecorded'}). Resetting counters and resuming as worker. iter=${state.iteration} us_id=${state.current_us}: ${validation.reason}`,
|
|
1530
|
+
);
|
|
1531
|
+
state.phase = 'worker';
|
|
1532
|
+
state.consecutive_failures = 0;
|
|
1533
|
+
state.consecutive_blocks = 0;
|
|
1534
|
+
state.last_block_reason = '';
|
|
1535
|
+
// Archive sidecar (rename, not delete) so operator can audit the
|
|
1536
|
+
// recovered-from state. Best-effort.
|
|
1537
|
+
await _archiveRecoveredSidecar(paths);
|
|
1538
|
+
} else {
|
|
1539
|
+
console.error(
|
|
1540
|
+
`[recovery] phase=blocked ignored, falling through to existing behavior: ${validation.reason}`,
|
|
1541
|
+
);
|
|
1542
|
+
}
|
|
1543
|
+
}
|
|
1544
|
+
|
|
1417
1545
|
// P1-E Lane Enforcement: snapshot lane mtimes before each iteration,
|
|
1418
1546
|
// compare at the top of the next iteration. Drift on read-only artifacts
|
|
1419
1547
|
// (PRD, test-spec, context) emits a lane_violation_warning event + audit
|
|
@@ -1426,13 +1554,15 @@ async function _runCampaignBody(slug, options, paths, rootDir) {
|
|
|
1426
1554
|
// iteration must not block the next producer's atomic-rename write.
|
|
1427
1555
|
// Idempotent: missing-file calls are no-ops.
|
|
1428
1556
|
await unlockSentinelFile(paths.signalFile);
|
|
1557
|
+
lifecycleMetrics.markUnlock(path.basename(paths.signalFile), { iter: state.iteration });
|
|
1429
1558
|
await unlockSentinelFile(paths.verdictFile);
|
|
1559
|
+
lifecycleMetrics.markUnlock(path.basename(paths.verdictFile), { iter: state.iteration });
|
|
1430
1560
|
// Audit drift from the prior iteration before doing anything new.
|
|
1431
1561
|
const _laneSnapshotAfter = await _snapshotLaneMtimes(paths);
|
|
1432
1562
|
const _laneViolations = await _checkLaneViolations(paths, _laneSnapshot, _laneSnapshotAfter, state, options);
|
|
1433
1563
|
if (_laneViolations) {
|
|
1434
1564
|
for (const v of _laneViolations) {
|
|
1435
|
-
await appendIterationAnalytics(paths, state, state.current_us ?? 'ALL', 'lane_violation_warning', { ...options, lane_violation: v });
|
|
1565
|
+
await appendIterationAnalytics(paths, state, state.current_us ?? 'ALL', 'lane_violation_warning', { ...options, lane_violation: v }, lifecycleMetrics);
|
|
1436
1566
|
}
|
|
1437
1567
|
if (options.laneStrict) {
|
|
1438
1568
|
// Strict mode: escalate to BLOCKED with downgrade
|
|
@@ -1568,7 +1698,7 @@ async function _runCampaignBody(slug, options, paths, rootDir) {
|
|
|
1568
1698
|
}
|
|
1569
1699
|
|
|
1570
1700
|
// Bug #7 Fix-Q/R: reap flywheel pane before consuming the signal.
|
|
1571
|
-
await reapProducer(state.flywheel_pane_id ?? state.verifier_pane_id, paths.flywheelSignalFile);
|
|
1701
|
+
await reapProducer(state.flywheel_pane_id ?? state.verifier_pane_id, paths.flywheelSignalFile, 'flywheel-signal');
|
|
1572
1702
|
|
|
1573
1703
|
state.last_flywheel_decision = flywheelSignal.decision;
|
|
1574
1704
|
// P0-A multi-mission orchestration: optionally captured from flywheel signal.
|
|
@@ -1611,7 +1741,7 @@ async function _runCampaignBody(slug, options, paths, rootDir) {
|
|
|
1611
1741
|
}
|
|
1612
1742
|
|
|
1613
1743
|
// Bug #7 Fix-Q/R: reap guard pane before mutating state.
|
|
1614
|
-
await reapProducer(guardPaneId, paths.flywheelGuardVerdictFile);
|
|
1744
|
+
await reapProducer(guardPaneId, paths.flywheelGuardVerdictFile, 'flywheel-guard-verdict');
|
|
1615
1745
|
|
|
1616
1746
|
if (!state.flywheel_guard_count[state.current_us]) {
|
|
1617
1747
|
state.flywheel_guard_count[state.current_us] = 0;
|
|
@@ -1797,10 +1927,35 @@ async function _runCampaignBody(slug, options, paths, rootDir) {
|
|
|
1797
1927
|
}
|
|
1798
1928
|
}
|
|
1799
1929
|
|
|
1930
|
+
// v0.15.4 PR-B4: iter_signal_write_to_read_ms = wallclock from worker FS
|
|
1931
|
+
// write to leader poll resolve. Sentinel mtime is the producer-side anchor;
|
|
1932
|
+
// Date.now() is the leader-side anchor. Best-effort stat — if the file
|
|
1933
|
+
// cannot be stat'ed (e.g. it vanished in a race), skip the metric.
|
|
1934
|
+
try {
|
|
1935
|
+
const sigStat = fsSync.statSync(paths.signalFile);
|
|
1936
|
+
lifecycleMetrics.record('iter_signal_write_to_read_ms', Date.now() - sigStat.mtimeMs, {
|
|
1937
|
+
iter: state.iteration,
|
|
1938
|
+
us_id: state.current_us,
|
|
1939
|
+
});
|
|
1940
|
+
} catch { /* fail-open: skip on stat error */ }
|
|
1800
1941
|
// Bug #7 Fix-Q/R: reap the worker pane the instant we accept the signal so
|
|
1801
1942
|
// claude/codex cannot self-review and rewrite iter-signal.json. Runs even
|
|
1802
1943
|
// for the codex-fallback synthesized signal (no-op on a dead pane).
|
|
1803
|
-
await reapProducer(state.worker_pane_id, paths.signalFile);
|
|
1944
|
+
await reapProducer(state.worker_pane_id, paths.signalFile, 'iter-signal');
|
|
1945
|
+
// v0.15.4 PR-B2-FIX: same worker pass produced done-claim. The pane is
|
|
1946
|
+
// already reaped above; lock done-claim so the iter-NNN-done-claim archive
|
|
1947
|
+
// and any post-iter Bug #8 gate read a snapshot the worker can no longer
|
|
1948
|
+
// revise. Symmetric with the zsh lock-on-iter-signal contract at
|
|
1949
|
+
// run_ralph_desk.zsh:3197. Best-effort: missing-file is fail-open.
|
|
1950
|
+
//
|
|
1951
|
+
// v0.15.4 audit H2 fix: NO markLockStart for done-claim. In production
|
|
1952
|
+
// happy path done-claim is locked-but-never-unlocked (only signalFile +
|
|
1953
|
+
// verdictFile receive iter-start unlockSentinelFile at L1556-1559), so
|
|
1954
|
+
// markUnlock would never fire and the metric would silently never emit.
|
|
1955
|
+
// done-claim is intentionally excluded from sentinel_lock_to_unlock_ms;
|
|
1956
|
+
// the lib_ralph_desk.zsh:602 archival step is the practical lock-end
|
|
1957
|
+
// event but is not currently instrumented (deferred — not B4 scope).
|
|
1958
|
+
await lockSentinel(paths.doneClaimFile, { log: (msg) => console.error(msg) });
|
|
1804
1959
|
|
|
1805
1960
|
// US-019 R7 P1-G: verify_partial malformed downgrade.
|
|
1806
1961
|
// verify_partial requires verified_acs[] to be a non-empty array. Otherwise the verifier
|
|
@@ -1871,10 +2026,18 @@ async function _runCampaignBody(slug, options, paths, rootDir) {
|
|
|
1871
2026
|
});
|
|
1872
2027
|
}
|
|
1873
2028
|
|
|
2029
|
+
// v0.15.4 PR-B4: verdict_write_to_read_ms parallel to iter_signal metric.
|
|
2030
|
+
try {
|
|
2031
|
+
const verdStat = fsSync.statSync(paths.verdictFile);
|
|
2032
|
+
lifecycleMetrics.record('verdict_write_to_read_ms', Date.now() - verdStat.mtimeMs, {
|
|
2033
|
+
iter: state.iteration,
|
|
2034
|
+
us_id: state.current_us,
|
|
2035
|
+
});
|
|
2036
|
+
} catch { /* fail-open */ }
|
|
1874
2037
|
// Bug #7 Fix-Q/R: reap verifier pane immediately after accepting the
|
|
1875
2038
|
// verdict — without this the codex/claude TUI keeps running for ~2min and
|
|
1876
2039
|
// can rewrite verify-verdict.json (mtime drift observed in 19th launch).
|
|
1877
|
-
await reapProducer(state.verifier_pane_id, paths.verdictFile);
|
|
2040
|
+
await reapProducer(state.verifier_pane_id, paths.verdictFile, 'verify-verdict');
|
|
1878
2041
|
|
|
1879
2042
|
if (verdict.verdict === 'pass') {
|
|
1880
2043
|
state.consecutive_failures = 0;
|
|
@@ -1883,7 +2046,7 @@ async function _runCampaignBody(slug, options, paths, rootDir) {
|
|
|
1883
2046
|
}
|
|
1884
2047
|
state.current_us = getNextUs(usList, state.verified_us, null);
|
|
1885
2048
|
fixContractPath = null;
|
|
1886
|
-
await appendIterationAnalytics(paths, state, usId, 'pass', options);
|
|
2049
|
+
await appendIterationAnalytics(paths, state, usId, 'pass', options, lifecycleMetrics);
|
|
1887
2050
|
await writeStatus(paths, state, options.onStatusChange, options.now);
|
|
1888
2051
|
|
|
1889
2052
|
if (state.verified_us.length === usList.length) {
|
|
@@ -1899,7 +2062,7 @@ async function _runCampaignBody(slug, options, paths, rootDir) {
|
|
|
1899
2062
|
const blockedReason = verdict.reason || verdict.summary || 'verifier-blocked';
|
|
1900
2063
|
const blockedClassification = _classifyBlock('verifier', { verdict, state, slug });
|
|
1901
2064
|
await writeSentinel(paths.blockedSentinel, 'blocked', usId, blockedReason, blockedClassification, paths);
|
|
1902
|
-
await appendIterationAnalytics(paths, state, usId, 'blocked', options);
|
|
2065
|
+
await appendIterationAnalytics(paths, state, usId, 'blocked', options, lifecycleMetrics);
|
|
1903
2066
|
await writeStatus(paths, state, options.onStatusChange, options.now);
|
|
1904
2067
|
let svSummary;
|
|
1905
2068
|
if (options.withSelfVerification) {
|
|
@@ -1938,7 +2101,7 @@ async function _runCampaignBody(slug, options, paths, rootDir) {
|
|
|
1938
2101
|
}
|
|
1939
2102
|
|
|
1940
2103
|
state.consecutive_failures += 1;
|
|
1941
|
-
await appendIterationAnalytics(paths, state, usId, 'fail', options);
|
|
2104
|
+
await appendIterationAnalytics(paths, state, usId, 'fail', options, lifecycleMetrics);
|
|
1942
2105
|
const upgradedModel = nextWorkerModel(options.workerModel ?? state.worker_model, state.consecutive_failures);
|
|
1943
2106
|
if (upgradedModel === 'BLOCKED') {
|
|
1944
2107
|
state.phase = 'blocked';
|