@ai-dev-methodologies/rlp-desk 0.15.3 → 0.15.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,102 @@
1
+ // v0.15.4 PR-B4 — Lifecycle observability helper.
2
+ //
3
+ // Plan: docs/plans/v0.15-phase-b-plan-v3.md §B4.
4
+ // Audit: docs/plans/v0.15-phase-b-lifecycle-audit.md §3 Table 2.
5
+ //
6
+ // Five metrics tracked, all gated on RLP_LIFECYCLE_METRICS=1 env flag:
7
+ // - iter_signal_write_to_read_ms leader-poll-resolves vs worker-FS-write
8
+ // - verdict_write_to_read_ms leader-poll-resolves vs verifier-FS-write
9
+ // - pane_eof_to_cleanup_ms pane process exit vs killPaneProcess return
10
+ // - pane_reap_latency_ms done-claim observed vs C-c×2 + waitForExit
11
+ // - sentinel_lock_to_unlock_ms per type, _lock vs _unlock (object)
12
+ //
13
+ // Emission discipline:
14
+ // - debug.log: tagged [LIFECYCLE] per record (when flag set)
15
+ // - campaign.jsonl: ONE batched lifecycle_metrics object per iteration
16
+ // (the collector accumulates, the iter-end flush emits)
17
+ // When flag is unset:
18
+ // - record() is a no-op (early return) — zero overhead beyond a boolean flag check
19
+ // - flush() returns null so analytics writer can branch on the field
20
+
21
const ENV_FLAG_NAME = 'RLP_LIFECYCLE_METRICS';

/**
 * Report whether lifecycle metrics are switched on.
 *
 * The flag is on only when the RLP_LIFECYCLE_METRICS entry is exactly the
 * string '1'; any other value (including 'true', '0', or a missing entry)
 * leaves metrics off.
 *
 * @param {object|null} [env=process.env] Environment-like map to inspect.
 *   A null env is treated as "flag not set" instead of throwing.
 * @returns {boolean} true when metrics collection is enabled.
 */
export function lifecycleMetricsEnabled(env = process.env) {
  // The default parameter only covers `undefined`; guard an explicit null too
  // so a caller forwarding `env: null` gets "disabled" rather than a TypeError.
  return env != null && env[ENV_FLAG_NAME] === '1';
}
26
+
27
/**
 * Accumulates lifecycle timing metrics for one iteration and hands them back
 * in a grouped shape on flush(). Every mutating method is a no-op when the
 * RLP_LIFECYCLE_METRICS flag was not set at construction time.
 */
export class LifecycleMetricsCollector {
  /**
   * @param {object} [options]
   * @param {object} [options.env=process.env] Environment map; the flag is
   *   read once here and cached.
   * @param {Function|null} [options.debugLog=null] Optional sink invoked as
   *   debugLog('LIFECYCLE', payload) for every accepted record.
   */
  constructor({ env = process.env, debugLog = null } = {}) {
    this._enabled = lifecycleMetricsEnabled(env);
    this._debugLog = debugLog;
    this._records = [];
    this._sentinelLockTimes = new Map();
  }

  /** @returns {boolean} whether the collector is recording. */
  get enabled() {
    return this._enabled;
  }

  /**
   * Record a single timing metric.
   *
   * @param {string} name Metric name; flush() groups records by it.
   * @param {number} valueMs Duration in milliseconds. Rounded to an integer
   *   and clamped to >= 0. Non-finite values (NaN, ±Infinity) are dropped
   *   because they cannot be represented in the JSON output.
   * @param {object} [ctx={}] Flat object of audit fields (iter, us_id,
   *   pane_id, sentinel_type, etc). Keys named metric, value_ms or ts are
   *   ignored so they cannot overwrite the record's own fields.
   * @returns {void}
   */
  record(name, valueMs, ctx = {}) {
    if (!this._enabled) return;
    const rounded = Math.round(valueMs);
    if (!Number.isFinite(rounded)) return;
    const reserved = {
      metric: name,
      value_ms: Math.max(0, rounded),
      ts: new Date().toISOString(),
    };
    // Spread reserved fields on both sides of ctx: the first spread fixes the
    // key order (metric, value_ms, ts, then ctx fields), the second restores
    // the reserved values if ctx tried to shadow them.
    const entry = { ...reserved, ...ctx, ...reserved };
    this._records.push(entry);
    if (this._debugLog) {
      const { ts: _ts, ...payload } = entry;
      // Best-effort fire-and-forget: a failing debug sink must never break
      // the code path being measured, so both a synchronous throw and a
      // rejected promise are deliberately contained here and not awaited.
      try {
        const pending = this._debugLog('LIFECYCLE', payload);
        if (pending && typeof pending.catch === 'function') {
          pending.catch(() => {});
        }
      } catch {
        // Intentionally ignored (see comment above).
      }
    }
  }

  // Convenience: pair-bookkeeping for sentinel_lock_to_unlock_ms (object-
  // valued metric keyed by sentinel type). Call markLockStart at chmod 0o444
  // time, markUnlock at chmod 0o644 time (or end-of-iter for never-unlocked).
  //
  // v0.15.4 audit H2: done-claim is intentionally NOT instrumented with this
  // pair. In production happy path done-claim is locked-but-never-unlocked
  // (campaign-main-loop unlocks only signalFile + verdictFile at iter start);
  // markUnlock for done-claim never fires, so the metric would silently never
  // emit. Future work: emit at lib_ralph_desk.zsh:602 archival site if needed.
  //
  // v0.15.4 audit H3: callers must invoke markLockStart BEFORE the chmod
  // operation, not after, so the metric covers full lock duration including
  // chmod execution time. Sub-ms skew, but semantically correct.

  /**
   * Remember when a sentinel of the given type was locked. A second call for
   * the same type before markUnlock overwrites the earlier start time.
   *
   * @param {string} sentinelType Key identifying the sentinel.
   * @param {number} [t=Date.now()] Lock timestamp in epoch milliseconds.
   * @returns {void}
   */
  markLockStart(sentinelType, t = Date.now()) {
    if (!this._enabled) return;
    this._sentinelLockTimes.set(sentinelType, t);
  }

  /**
   * Close a lock/unlock pair and record sentinel_lock_to_unlock_ms. Does
   * nothing when no matching markLockStart is pending for sentinelType.
   *
   * @param {string} sentinelType Key previously passed to markLockStart.
   * @param {object} [ctx={}] Extra audit fields; sentinel_type is always set
   *   from sentinelType and wins over any ctx.sentinel_type.
   * @param {number} [t=Date.now()] Unlock timestamp in epoch milliseconds.
   * @returns {void}
   */
  markUnlock(sentinelType, ctx = {}, t = Date.now()) {
    if (!this._enabled) return;
    const start = this._sentinelLockTimes.get(sentinelType);
    if (start === undefined) return;
    this.record('sentinel_lock_to_unlock_ms', t - start, {
      ...ctx,
      sentinel_type: sentinelType,
    });
    this._sentinelLockTimes.delete(sentinelType);
  }

  /**
   * Snapshot and reset the accumulated records for the end-of-iteration
   * flush.
   *
   * @returns {Object<string, object[]>|null} Records grouped by metric name,
   *   e.g. { iter_signal_write_to_read_ms: [{ value_ms, ts, ... }, ...] },
   *   with the metric field removed from each record. Returns null when the
   *   collector is disabled so the analytics writer can omit the field.
   */
  flush() {
    if (!this._enabled) return null;
    const records = this._records;
    this._records = [];
    // Group through a Map so metric names that collide with Object.prototype
    // members (toString, constructor, __proto__) are handled like any other
    // key; Object.fromEntries then defines each group as an own property.
    const byMetric = new Map();
    for (const { metric, ...rest } of records) {
      const key = String(metric);
      const bucket = byMetric.get(key);
      if (bucket) {
        bucket.push(rest);
      } else {
        byMetric.set(key, [rest]);
      }
    }
    return Object.fromEntries(byMetric);
  }
}
@@ -261,6 +261,19 @@ _kill_pane_process() {
261
261
  if typeset -f log_debug >/dev/null 2>&1; then
262
262
  log_debug "[bug7] kill_pane_process pane=$pane_id role=$role"
263
263
  fi
264
+ # v0.15.4 PR-B4: pane_eof_to_cleanup_ms instrumentation (flag-gated).
265
+ # Records the wallclock from kill-start to wait_for_pane_ready return so
266
+ # B3 can value-assert the substrate fix actually closes the race window.
267
+ # Uses zsh native $EPOCHREALTIME (microsec) — portable to macOS BSD where
268
+ # `date +%N` is not supported.
269
+ local _b4_t0_ms=0
270
+ if [[ "${RLP_LIFECYCLE_METRICS:-0}" == "1" ]]; then
271
+ zmodload -e zsh/datetime || zmodload zsh/datetime 2>/dev/null
272
+ if [[ -n "${EPOCHREALTIME:-}" ]]; then
273
+ local _b4_t0_str="${EPOCHREALTIME//./}"
274
+ _b4_t0_ms=${_b4_t0_str:0:13}
275
+ fi
276
+ fi
264
277
  tmux send-keys -t "$pane_id" C-c 2>/dev/null
265
278
  sleep 0.5
266
279
  tmux send-keys -t "$pane_id" C-c 2>/dev/null
@@ -268,6 +281,12 @@ _kill_pane_process() {
268
281
  if typeset -f wait_for_pane_ready >/dev/null 2>&1; then
269
282
  wait_for_pane_ready "$pane_id" 5 2>/dev/null || true
270
283
  fi
284
+ if (( _b4_t0_ms > 0 )); then
285
+ local _b4_t1_str="${EPOCHREALTIME//./}"
286
+ local _b4_t1_ms=${_b4_t1_str:0:13}
287
+ log_lifecycle_metric "pane_eof_to_cleanup_ms" $((_b4_t1_ms - _b4_t0_ms)) \
288
+ "pane=$pane_id role=$role"
289
+ fi
271
290
  return 0
272
291
  }
273
292
 
@@ -285,6 +304,53 @@ _unlock_sentinel() {
285
304
  return 0
286
305
  }
287
306
 
307
+ # =============================================================================
308
+ # v0.15.4 PR-B4: Lifecycle observability — log_lifecycle_metric
309
+ # =============================================================================
310
+ # Plan: docs/plans/v0.15-phase-b-plan-v3.md §B4 (P2.1 critic-round-2 fix).
311
+ # Helper is GATED on $RLP_LIFECYCLE_METRICS=1 (no-op when unset). Emits to
312
+ # debug.log via log_debug, in a backgrounded subshell so the caller does not
313
+ # block on the FS write. The Node-side mirror is src/node/util/lifecycle-
314
+ # metrics.mjs LifecycleMetricsCollector.
315
+ #
316
+ # v0.15.4 audit M2: concurrent-appender semantics — `( ... ) &!` spawns a
317
+ # disowned subshell per metric. Multiple metrics can fire in rapid succession
318
+ # (e.g., during iter teardown) and race on debug.log. POSIX guarantees atomic
319
+ # append for writes <= PIPE_BUF (4096 bytes). A single LIFECYCLE line is
320
+ # ~150 bytes, well under the limit, so on local filesystems (APFS, ext4, xfs)
321
+ # concurrent appends produce intact non-interleaved lines. On NFS / FUSE /
322
+ # some Docker overlay setups PIPE_BUF guarantees may not hold; in those
323
+ # environments, expect possible interleaving. This is best-effort logging
324
+ # by design — the metric values land in campaign.jsonl via the Node leader's
325
+ # batched flush as the canonical authoritative record. debug.log is an
326
+ # audit aid, not the source of truth.
327
+ #
328
+ # Args:
329
+ # $1 metric_name e.g. iter_signal_write_to_read_ms
330
+ # $2 value_ms integer milliseconds (logged verbatim, not coerced; callers must pass an integer)
331
+ # $3 context (optional, free-form key=val pairs joined with spaces)
332
+ #
333
+ # Side effects:
334
+ # - When flag unset: returns 0 immediately (no fork, no FS call).
335
+ # - When flag set: forks `( log_debug "..." ) &!` to debug.log.
336
+ #
337
+ # Examples:
338
+ # log_lifecycle_metric "iter_signal_write_to_read_ms" "$delta" \
339
+ # "iter=$ITERATION us=$us_id pane=$WORKER_PANE"
340
+ # log_lifecycle_metric "pane_reap_latency_ms" "$delta" \
341
+ # "iter=$ITERATION sentinel=done-claim"
342
log_lifecycle_metric() {
  # Flag gate first: when metrics are off nothing else runs (no fork, no FS).
  [[ "${RLP_LIFECYCLE_METRICS:-0}" == "1" ]] || return 0
  # Read every positional with a ${N:-} default, matching the existing
  # handling of $3, so a call with missing arguments falls through to the
  # silent return below instead of aborting when `setopt nounset` is active.
  local metric="${1:-}"
  local value_ms="${2:-}"
  local ctx="${3:-}"
  # A metric without a name or a value is not worth a log line.
  [[ -n "$metric" && -n "$value_ms" ]] || return 0
  if typeset -f log_debug >/dev/null 2>&1; then
    # Disowned background subshell so the caller never waits on the write.
    # value_ms and ctx are interpolated verbatim.
    ( log_debug "[LIFECYCLE] metric=$metric value_ms=$value_ms $ctx" ) &!
  fi
  return 0
}
353
+
288
354
  # PR-A (Bug #10) — validate operator-written manual recovery artifacts.
289
355
  # Returns 0 when all 5 checks pass; 1 otherwise. Sets RECOVERY_FAIL_REASON
290
356
  # (global) on failure for caller logging. Mirrors the Node-side helper
@@ -710,6 +710,10 @@ handle_worker_exit_codex() {
710
710
  dc_us_id=$(jq -r '.us_id // "unknown"' "$DONE_CLAIM_FILE" 2>/dev/null)
711
711
  log " Codex worker completed with done-claim (us_id=$dc_us_id) and clean tree. Auto-generating signal."
712
712
  echo '{"iteration":'"$iter"',"status":"verify","us_id":"'"$dc_us_id"'","summary":"auto-generated after codex exit (clean tree)","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'"}' > "$signal_file"
713
+ # v0.15.4 PR-B2-FIX: codex worker pane already exited — reaper would no-op,
714
+ # but lock done-claim as defense-in-depth so any orphaned subprocess cannot
715
+ # rewrite the file before lib_ralph_desk.zsh:602 archives it.
716
+ _lock_sentinel "$DONE_CLAIM_FILE"
713
717
  _emit_a4_fallback_audit "$dc_us_id" "$iter" "codex_exit_with_done_claim_clean"
714
718
  return 0
715
719
  }
@@ -2292,6 +2296,15 @@ poll_for_signal() {
2292
2296
  if _bug8_check_synth_allowed "$ITERATION" "$dc_us_id" "inline_polling_a4_clean"; then
2293
2297
  log " WARNING: done-claim exists for $dc_us_id but no iter-signal. Tree clean — auto-generating signal (A4 fallback)."
2294
2298
  log_debug "[GOV] iter=$ITERATION done_claim_without_signal=true us_id=$dc_us_id action=auto_generate_signal"
2299
+ # v0.15.4 PR-B2-FIX: Worker pane is alive and idling post-done-claim
2300
+ # (the canonical Bug #5/7 race window). Reap before synthesizing the
2301
+ # signal so the worker cannot revise done-claim or emit a late
2302
+ # iter-signal that races the leader's synthesized one. Mirror of
2303
+ # Bug #7 Fix-Q parity at run_ralph_desk.zsh:3181 — kill before lock,
2304
+ # lock before synth-write so the next leader read sees a frozen
2305
+ # done-claim and a fresh signal_file in that order.
2306
+ _kill_pane_process "$pane_id" "worker-a4"
2307
+ _lock_sentinel "$DONE_CLAIM_FILE"
2295
2308
  echo '{"iteration":'"$ITERATION"',"status":"verify","us_id":"'"$dc_us_id"'","summary":"auto-generated by A4 fallback (done-claim + clean tree)","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'"}' > "$signal_file"
2296
2309
  _emit_a4_fallback_audit "$dc_us_id" "$ITERATION" "inline_polling_a4_clean"
2297
2310
  return 0
@@ -3180,6 +3193,11 @@ main() {
3180
3193
  # self-review and rewrite iter-signal.json (1m43s drift observed).
3181
3194
  _kill_pane_process "$WORKER_PANE" "worker"
3182
3195
  _lock_sentinel "$SIGNAL_FILE"
3196
+ # v0.15.4 PR-B2-FIX: same worker pass also produced done-claim. Freeze
3197
+ # it alongside iter-signal so Bug #8 gates and the iter-NNN-done-claim
3198
+ # archive (lib_ralph_desk.zsh:602) read a snapshot the worker can no
3199
+ # longer revise. Symmetric with iter-signal/verdict lock contract.
3200
+ _lock_sentinel "$DONE_CLAIM_FILE"
3183
3201
  # PR-0b-narrow: stamp leader handshake ack on the iter-signal (audit-only).
3184
3202
  _stamp_ack_field "$SIGNAL_FILE"
3185
3203
  else
@@ -1,49 +0,0 @@
1
- # Bug Report Overhaul — P2/P3 Backlog
2
-
3
- > Companion to `bug-report-overhaul-v1.md` (PR-A/B/C plan).
4
- > User stop-rule: ralplan iterates only until P0+P1 = 0; P2 and below are captured here, NOT blockers.
5
- > Re-prioritize from this file in a future ralplan when the operator-minutes-saved metric from PR-A/B/C lands.
6
-
7
- ---
8
-
9
- ## P2 — should fix in a follow-up PR after PR-A/B/C land
10
-
11
- ### From v0 plan (Option C/D, deferred features)
12
-
13
- - **Heartbeat-warning sidecar (Option B from v0)** — emit `<slug>-warning.{md,json}` when heartbeat anomaly crosses 50% of `iter-timeout`. Lets operator pre-empt a BLOCKED before the 30-min wall hits. Decoupled from this PR set because (a) report-quality is the dominant pain (D1), and (b) warning sidecar adds a second sentinel surface that risks false-positive fatigue. Revisit after PR-A/B land and we measure how many BLOCKEDs would have been pre-empted.
14
- - **GitHub Issues integration (Option D from v0)** — POST blocked context to a configured GitHub repo issue. Requires per-repo authn story (token storage, network retry, rate-limits) — violates principle 3 in the current PR set. Re-evaluate after a credible authn proposal exists.
15
- - **Pattern-learning loop** — mine `~/.claude/ralph-desk/analytics/*/bug-reports/` for emerging clusters. Auto-extends `docs/bug-patterns.json` with new candidate signatures for human review.
16
- - **Cross-campaign bug-report dashboard in `/rlp-desk analytics`** — surface patterns across projects.
17
- - **Auto-suggest "this looks like Bug #N — try fix-X" inline in CLI output** — operationalize PR-C's `pattern_match` data with an inline suggestion. Held back so the deterministic Jaccard implementation can be calibrated against real campaign data first.
18
- - **Operator-CLI `/rlp-desk recover <slug> --to verify`** — write the manual recovery artifacts (`iter-signal.json`, `done-claim.json`, `status.json` patch) deterministically. Currently a hand-rolled `jq` pipeline per Bug #10 §7 workaround.
19
-
20
- ### From Codex Critic Round 2 (BACKLOG)
21
-
22
- - **[P2-1]** PR-A `_validateOperatorRecoveryArtifacts` return shape — current pseudo-code mixes `if (valid)` (boolean coercion) with `valid.reason` (object access). Resolve at implementation time to either `{ ok: bool, reason: string }` (object) or pure boolean + separate side-channel for the warning text. Affects the audit log line shape.
23
- - **[P2-2]** PR-A test summary in §5 says "5 ACs (R1–R5)" but §8 added AC-R6 (`_skipNextWorkerDispatch` cleared after one use). Update §5 to "6 ACs (R1–R6)" for consistency before PR-A merges.
24
-
25
- ### From Codex Critic Round 3 (BACKLOG)
26
-
27
- - **[P2-3]** §9 step 5 banner-aware diff command only covers `run_ralph_desk.zsh`. PR-A and PR-B both also touch `lib_ralph_desk.zsh`. Add a matching `diff <(cat src/scripts/lib_ralph_desk.zsh) <(tail -n +N ~/.claude/ralph-desk/scripts/lib_ralph_desk.zsh)` step in the implementation runbook (verify the right `tail -n +N` offset at impl time — `lib_*.zsh` is sourced and may have no shebang). Extend to `init_ralph_desk.zsh` if PR-B touches it.
28
-
29
- ## P3 — nice-to-have polish
30
-
31
- ### From Codex Critic Round 2
32
-
33
- - **[P3-1]** Option C/D/E rejection rationale in v1 §4 says "Same as v0" — acceptable because v0 is co-located, but inline one-sentence rationale would make the v1 plan self-contained for future readers who do not have the v0 file.
34
-
35
- ### From Architect Round 1 (residual notes)
36
-
37
- - Validate the `bug-patterns.json` Jaccard threshold (0.7) against actual past blocks once we have ≥20 historical reports — current threshold is hand-picked. Likely needs a small calibration script in `scripts/`.
38
- - Consider whether `bug-reports/` should ship in the npm tarball default `.gitignore` of newly initialized projects — currently the schema doc only recommends operators add it themselves.
39
-
40
- ---
41
-
42
- ## Promotion criteria (when to re-ralplan one of these)
43
-
44
- A backlog item moves back into a planner draft when **any** of these is true:
45
-
46
- 1. PR-A/B/C lands and we measure ≥3 BLOCKEDs where the deferred item would have moved D1 by ≥10 minutes (e.g. heartbeat warning would have pre-empted a 30-min wait).
47
- 2. Operator hand-files ≥2 bug reports about the same backlog gap (signal that the deferral was wrong).
48
- 3. The `bug-patterns.json` seed becomes too large for human authoring (≥30 entries) — triggers the pattern-learning loop item.
49
- 4. A user explicitly asks for one (e.g. operator-CLI `/rlp-desk recover` once they tire of hand-rolled jq pipelines).
@@ -1,238 +0,0 @@
1
- # Bug Report Mechanism Overhaul — v0 (RALPLAN-DR Planner Draft)
2
-
3
- > **Status**: Planner draft awaiting Architect → Codex Critic.
4
- > **Mode**: deliberate (auto-enabled — touches governance, runner, slash command, test infra).
5
- > **Stop rule**: iterate until codex critic returns 0 P0 + 0 P1. P2 → backlog.
6
- > **Critic instruction**: *approve unless P0 or P1 found.*
7
-
8
- ---
9
-
10
- ## 1. Problem statement
11
-
12
- 10 hand-written 200-line bug reports (`Bug #1`–`Bug #10`, BOS dev `2026-05-01..05-07`) point at one root frustration: **bugs are endless and each one costs 30+ min of operator time to package** before the rlp-desk side can even start triage. Examples:
13
-
14
- | Pain | Evidence |
15
- |---|---|
16
- | Manual context capture | Each report re-collects: env, version, command, status snapshot, pane logs, settings, gitignore — all already on disk |
17
- | No similarity search | Bug #6/#7/#8 are all "worker hang variants"; operator re-discovers the cluster each time |
18
- | Recovery is broken | Bug #10 — leader resets `phase=worker` ignoring operator's `phase=verify` manual recovery; operator's iter-signal/done-claim files deleted |
19
- | Reactive only | Bugs surface only after full BLOCKED (~30 min poll timeout); no early warning on heartbeat anomalies |
20
- | No deterministic repro pack | rlp-desk side has to chase BOS for missing context (logs, env, version) → fix latency multiplier |
21
-
22
- The blocked-sentinel JSON (`schema_version: 2.0`) already classifies (`reason_category` / `recoverable` / `suggested_action`) but stops at the campaign boundary — it does not become a *bug report*. That gap is the target.
23
-
24
- ---
25
-
26
- ## 2. Principles (5)
27
-
28
- 1. **Capture-by-default, not by-request.** When the campaign blocks, the operator should not have to gather anything that already exists on disk.
29
- 2. **One canonical schema, two consumers.** A single `bug-report.json` feeds both BOS-side templates and rlp-desk-side triage; no divergent representations.
30
- 3. **Surgical diffs over new infra.** Extend the existing `blocked.{md,json}` writer + `/rlp-desk` subcommand surface; do not introduce a new daemon, queue, or service.
31
- 4. **Recovery must be idempotent.** Manual recovery of a BLOCKED campaign must not be silently overwritten on relaunch (Bug #10 contract).
32
- 5. **Earlier is cheaper.** A heartbeat-anomaly *warning* costs nothing; a 30-min BLOCKED poll-timeout is the most expensive form of feedback.
33
-
34
- ---
35
-
36
- ## 3. Decision drivers (top 3)
37
-
38
- | # | Driver | Why it dominates |
39
- |---|---|---|
40
- | D1 | **Operator minutes per BLOCKED → first actionable report** | Today: 30+ min hand-writing + log collection. Target: ≤2 min (review + 1-line headline edit). Drives the "auto-bundle" choice below. |
41
- | D2 | **Cluster recognition (avoid duplicate `Bug #N` for same root cause)** | 5 of 10 reports cluster around "worker hang on sentinel" or "verifier post-sentinel race". Without similarity hinting we keep paying triage cost N times. |
42
- | D3 | **Zero regression on `--mode tmux` 19th launch** | Per `docs/plans/native-agent-revert.md`, the production tmux path is mid-flight. Any change must be additive there; default behavior unchanged. |
43
-
44
- ---
45
-
46
- ## 4. Viable options
47
-
48
- ### Option A — **Bundle-first**: `/rlp-desk report <slug>` + auto-emit on BLOCKED *(recommended)*
49
-
50
- Add a single subcommand and one auto-trigger. Mechanics:
51
-
52
- - **Trigger**: every `_handlePollFailure` / `_emitBlockedSentinel` call already in `campaign-main-loop.mjs` and `write_blocked_sentinel` in `run_ralph_desk.zsh` ALSO writes `bug-reports/<slug>-<UTCISO>.json` + `<...>.md` (template-rendered).
53
- - **Schema**: extends current blocked-sentinel JSON v2.0 with: `repro.command`, `repro.env_snapshot`, `repro.git_head_sha`, `pane_tail.{worker,verifier}` (last 200 lines, redacted), `recent_iter_artifacts[]` (last 3 iterations' done-claim/verdict paths), `pattern_match.{candidate_bug_ids[], score}` (similarity vs known reports).
54
- - **Subcommand**: `/rlp-desk report <slug>` to (a) regenerate from saved campaign state, (b) attach a custom headline, (c) print the markdown to stdout for paste-into-issue-tracker.
55
- - **Pattern match**: deterministic — hashed signature on `{reason_category, failure_category, suggested_action, top-level pane stem}` against a `docs/bug-patterns.json` lookup (seeded with #1–#10).
56
- - **Bug #10 fix**: leader on relaunch honors `status.phase == "verify"` + valid manual artifacts (validated against schema) and skips worker dispatch. Same surgical injection point used by P1-D classifier.
57
-
58
- **Pros**: low-surface; reuses `_classifyBlock`, `writeSentinelExclusive`, `~/.claude/ralph-desk/analytics`; produces the same output regardless of whether BLOCKED came from `--mode tmux` or `--mode native`/`--mode agent`. Operator's job collapses to "edit headline".
59
-
60
- **Cons**: pattern-match is naive (string-stem); will need iteration. Pane-tail capture risks PII/secret leak — must redact (governance §1f already has redaction precedent).
61
-
62
- ### Option B — **Heartbeat-first**: pre-BLOCKED early warning channel
63
-
64
- Introduce a `<slug>-warning.{md,json}` sidecar emitted whenever a heartbeat anomaly crosses a soft threshold (50% of `iter-timeout`, no progress). Operator can opt into pre-empting the BLOCKED with `/rlp-desk warn <slug>` before the 30-min wall hits.
65
-
66
- **Pros**: shortens the perceived "bug is endless" tail by surfacing earlier.
67
-
68
- **Cons**: orthogonal to the *report quality* problem. Adds a second sentinel surface; risks false positives that train operators to ignore. Does not solve D1 (hand-writing) or D2 (clusters).
69
-
70
- ### Option C — **External tracker integration** (GitHub Issues auto-file)
71
-
72
- Instead of file artifacts, POST blocked context to a configured GitHub repo issue.
73
-
74
- **Pros**: makes rlp-desk-side triage visible without operator handoff.
75
-
76
- **Cons**: violates principle 3 (new infra: secrets, network, retry, rate-limits). Couples to an external service. Out-of-scope per ABSOLUTE rule "NEVER push to remote without explicit user approval" — would need a per-campaign auth path. **Invalidated.**
77
-
78
- ### Option D — **Status-quo + better doc template**
79
-
80
- Just publish a clearer template under `docs/rlp-desk/bug-report-template.md` and call it done.
81
-
82
- **Pros**: zero code change.
83
-
84
- **Cons**: does not move D1 (operator minutes) or D2 (clusters) at all. Bug #10 (Recovery breakage) untouched. **Invalidated.**
85
-
86
- ### Why A wins (with optional B as future-PR)
87
-
88
- A directly addresses D1 (auto-bundle), D2 (pattern_match seeded with #1–#10), and Bug #10 (relaunch hygiene). B is complementary but orthogonal — defer to a separate PR after A lands and we measure operator minutes-saved. C/D fail principle 3 / D1 respectively.
89
-
90
- ---
91
-
92
- ## 5. Scope (this PR)
93
-
94
- ### P0 — must land
95
-
96
- 1. **`bug-report.json` writer** — extend `_emitBlockedSentinel` (Node, `campaign-main-loop.mjs:923-968`) and `write_blocked_sentinel` (zsh, `lib_ralph_desk.zsh`) to also emit a per-block bug-report under `.rlp-desk/bug-reports/<slug>-<iso>.{json,md}`. Schema documented at `docs/rlp-desk/bug-report-schema.md`.
97
- 2. **Bug #10 relaunch hygiene** — in launch-time entry of `campaign-main-loop.mjs` (currently forces `phase=worker`+`iter=1`), branch on `status.phase == 'verify'` and validate operator-written `iter-signal.json` + `done-claim.json` against existing artifact-validators. If valid → skip worker dispatch, enter verifier directly. If invalid → log warning + fall through to current behavior.
98
- 3. **Redaction pass** — `pane_tail` and `env_snapshot` go through a deny-list (governance §1f redaction precedent: any `/(api[_-]?key|token|secret|password|bearer|authorization)/i` line replaced with `<REDACTED>`).
99
-
100
- ### P1 — must land
101
-
102
- 4. **`/rlp-desk report <slug>` subcommand** — added to `src/commands/rlp-desk.md` per current command-handler patterns; reads the latest blocked-sentinel JSON + most recent `bug-reports/*.json`, prints the markdown render to stdout. Optional `--headline "..."` flag rewrites the title line in-place. No auto-publish to any remote.
103
- 5. **`pattern_match` seed** — `docs/bug-patterns.json` shipped with deterministic signatures for Bug #1–#10 (manually authored from BOS reports). Bug-report writer fills `pattern_match.candidate_bug_ids[]` + `score` (Jaccard on `{reason_category, failure_category, top-level pane-tail token bag}`).
104
- 6. **Self-Verification gate compliance** — `src/scripts/run_ralph_desk.zsh` is touched → CLAUDE.md mandates 3 self-verification scenarios (LOW + MEDIUM + CRITICAL). Spelled out in §8.
105
-
106
- ### P2+ → `docs/plans/bug-report-overhaul-backlog.md` (separate file, not this PR)
107
-
108
- - Heartbeat-warning sidecar (Option B).
109
- - GitHub Issues integration (Option C, after authn story).
110
- - Pattern-learning loop that mines `~/.claude/ralph-desk/analytics/*/bug-reports/` for emerging clusters.
111
- - Cross-campaign bug-report dashboard in `/rlp-desk analytics`.
112
- - Auto-suggest "this looks like Bug #N — try fix-X" inline in CLI output (today: `pattern_match` is data-only).
113
-
114
- ---
115
-
116
- ## 6. Files to modify
117
-
118
- | File | Change | Risk |
119
- |---|---|---|
120
- | `src/node/runner/campaign-main-loop.mjs` | Extend `_emitBlockedSentinel` to call new `writeBugReport` helper; add Bug #10 relaunch-phase-honor branch in `_runCampaignBody` entry | MED |
121
- | `src/node/shared/bug-report.mjs` (NEW) | `writeBugReport({slug, classification, reason, paths, env, paneTails, recentArtifacts})` + redaction + pattern-match | LOW (new isolated module) |
122
- | `src/scripts/lib_ralph_desk.zsh` | New `_write_bug_report` helper called from `write_blocked_sentinel` | MED |
123
- | `src/scripts/run_ralph_desk.zsh` | Wire `_write_bug_report` after each `write_blocked_sentinel` site (≈10 sites, all already in one taxonomy) | MED |
124
- | `src/commands/rlp-desk.md` | Add `## report <slug>` section; document `bug-reports/` directory + schema link; add `/rlp-desk report` to help block | LOW |
125
- | `src/governance.md` | Add §1g "Bug Report Capture" — invariant: every BLOCKED writes a bug-report; redaction rules; relaunch hygiene contract (Bug #10) | LOW (additive) |
126
- | `docs/rlp-desk/bug-report-schema.md` (NEW) | JSON schema doc + worked example | LOW |
127
- | `docs/bug-patterns.json` (NEW) | Seed with #1–#10 signatures | LOW |
128
- | `tests/node/test-bug-report-writer.test.mjs` (NEW) | Schema, redaction, pattern-match unit tests | LOW |
129
- | `tests/node/test-relaunch-phase-verify-hygiene.test.mjs` (NEW) | Bug #10 fix unit + integration | MED |
130
- | `tests/test-bug-report-zsh-emit.sh` (NEW) | zsh side bug-report emit verification | MED |
131
-
132
- Total: 5 modified + 6 new. No deletions. Single PR — review surface bounded.
133
-
134
- ---
135
-
136
- ## 7. Pre-mortem (deliberate mode — 3 scenarios)
137
-
138
- ### S1 — Pane-tail leaks a secret into a committed bug-report
139
-
140
- A worker pane prints `Authorization: Bearer eyJ...` from a vendor SDK debug log. `pane_tail` captures it; operator commits the bug-report markdown without re-reading.
141
-
142
- **Mitigation**: redaction deny-list runs on the JSON writer side (not at view-time); deny-list is unit-tested in `test-bug-report-writer.test.mjs` with a fuzz-style fixture (10+ secret-shaped strings). Markdown render reads from JSON post-redaction, so it can never out-leak. Additional belt: `bug-reports/` is added to a sample `.gitignore` snippet in the schema doc; we do not auto-add to user repo `.gitignore`.
143
-
144
- **Residual risk**: vendor-specific secret formats not in deny-list. Acceptable: schema doc tells operator to scan before committing, and pattern_match leaves an `unredacted_count` audit field that flags how many lines hit the deny-list (operator can sanity-check).
145
-
146
- ### S2 — Bug #10 fix accidentally honors a stale `phase=verify` from a CRASHED leader and re-enters verifier on garbage state
147
-
148
- If the leader crashed mid-worker after writing `phase=verify` (race), relaunch could enter verifier with an inconsistent on-disk state.
149
-
150
- **Mitigation**: validation gate is strict — both `iter-signal.json` AND `done-claim.json` must (a) exist, (b) have `us_id` matching `status.target_us`, (c) have `iteration` matching `status.iteration`, (d) have `iter_signal_quality == 'specific'`, (e) be newer than the most recent `worker-prompt.md` mtime. Failure of ANY check → fall through to current behavior + log "phase=verify ignored: <reason>". This preserves backward compat and matches `_checkBlockedHygiene` precedent.
151
-
152
- **Residual risk**: a clever filesystem race can still pass all five checks. We accept this — the existing "every relaunch resets to worker" behavior is itself a bug (#10), and Option C's stricter "operator must pass `--resume-from-verify` flag" is named in P2 backlog as an opt-in escalation if operators report false positives.
153
-
154
- ### S3 — `pattern_match` false-positive trains operators to dismiss real bugs
155
-
156
- Two unrelated `infra_failure` blocks both score ≥0.8 against the same Bug #N signature; operator stops reading.
157
-
158
- **Mitigation**: `pattern_match` is **data-only** in P1 — no inline CLI suggestion. Score + candidate IDs are written to JSON; markdown render places them in a "Possible related bugs" footer with explicit "score: 0.83 — review before assuming match". Auto-suggest is deferred to P2 backlog precisely because we have not validated the signature space yet. Also: ship with **deterministic** Jaccard over a small token bag, not ML — failures are inspectable.
159
-
160
- **Residual risk**: low — operator opt-in to act on `pattern_match`.
161
-
162
- ---
163
-
164
- ## 8. Expanded test plan (deliberate mode)
165
-
166
- ### Unit (Node)
167
-
168
- `tests/node/test-bug-report-writer.test.mjs`:
169
-
170
- - AC-W1: schema fields all present + types match `docs/rlp-desk/bug-report-schema.md`.
171
- - AC-W2: redaction — 12 secret-shaped fixtures (Bearer token, AWS key, GH PAT, OpenAI key, generic `password=...`, etc.) all replaced by `<REDACTED>`; `meta.redacted_line_count` reflects count.
172
- - AC-W3: pane-tail truncates at 200 lines; preserves last lines (most recent diagnostic value).
173
- - AC-W4: `pattern_match` against seeded `docs/bug-patterns.json` — synthetic block matching Bug #6 signature returns `score >= 0.7` + correct `candidate_bug_ids`.
174
- - AC-W5: idempotent — second call with same `(slug, classification, iso)` is a no-op (uses `writeSentinelExclusive` semantics).
175
-
176
- `tests/node/test-relaunch-phase-verify-hygiene.test.mjs`:
177
-
178
- - AC-R1: status.phase=verify + valid artifacts → verifier-only entry (no worker dispatch).
179
- - AC-R2: status.phase=verify + missing `done-claim.json` → fall through to worker, log warning.
180
- - AC-R3: status.phase=verify + `us_id` mismatch → fall through, warning.
181
- - AC-R4: status.phase=verify + `iter-signal.json` older than worker-prompt.md → fall through, warning.
182
- - AC-R5: status.phase=verify + `iter_signal_quality != 'specific'` → fall through, warning.
183
-
184
- ### Integration (Node)
185
-
186
- `tests/node/us006-campaign-main-loop.test.mjs` extension:
187
-
188
- - AC-I1: BLOCKED via `flywheel_inconclusive` → bug-report file written to `.rlp-desk/bug-reports/`; JSON parses; `reason_category == 'mission_abort'`.
189
- - AC-I2: BLOCKED via `worker_exited` → bug-report `pattern_match.candidate_bug_ids` includes `Bug-7` (worker pane death lineage).
190
- - AC-I3: relaunch with valid `phase=verify` artifacts → no `iter-002.worker-prompt.md` created; verifier dispatched directly.
191
-
192
- ### Integration (zsh)
193
-
194
- `tests/test-bug-report-zsh-emit.sh` (NEW, mirrors `test-bug7-post-sentinel-race.sh` style):
195
-
196
- - Sc-1: stub `dispatch_worker` exits 1 → `write_blocked_sentinel` runs → `<slug>-<iso>.json` exists in `bug-reports/` + parses with `jq`.
197
- - Sc-2: redaction — pre-injected pane log with `Bearer X` → `jq .pane_tail.worker` does not contain `Bearer X`.
198
-
199
- ### Self-Verification scenarios (CLAUDE.md gate, MANDATORY since `run_ralph_desk.zsh` is touched)
200
-
201
- - **LOW**: redaction unit fixture passes; existing zsh + Node regression tests green.
202
- - **MEDIUM**: real campaign with stub worker that fails → bug-report appears; markdown render contains all required sections; operator can `cat` it; `pattern_match` populated.
203
- - **CRITICAL**: 2-iter campaign with deliberate BLOCKED at iter-1, then operator manual recovery (write iter-signal/done-claim by hand, set `phase=verify`), relaunch → verifier-only path runs (no worker iter-2 dispatch); Bug #10 reproduction scenario reversed; verdict accepted; `complete.md` written.
204
-
205
- All 3 must PASS before commit. If any FAIL: fix root cause, re-run failing scenario, then re-verify all 3.
206
-
207
- ---
208
-
209
- ## 9. Verification end-to-end
210
-
211
- 1. `node --test 'tests/node/*.test.mjs'` — all green; new tests visible.
212
- 2. `bash tests/test-bug-report-zsh-emit.sh` — green.
213
- 3. `bash tests/test-bug7-post-sentinel-race.sh` + `bash tests/test-bug7-poll-partial-write.sh` — unchanged green (no Bug #7 regression).
214
- 4. CLAUDE.md self-verification gate × 3 (above) — all PASS.
215
- 5. Manual: trigger BLOCKED in a sandbox campaign; verify `.rlp-desk/bug-reports/<slug>-<iso>.md` is human-readable + has `Possible related bugs` footer.
216
- 6. Banner-aware diff `src/` ⇆ `~/.claude/ralph-desk/` after `node scripts/postinstall.js`.
217
-
218
- ---
219
-
220
- ## 10. ADR (preview — final once Critic approves)
221
-
222
- - **Decision**: Adopt Option A (bundle-first, auto-emit on BLOCKED) for v0.16.0; defer heartbeat-warning (B) and external-tracker (C) to backlog.
223
- - **Drivers**: D1 operator-minutes, D2 cluster-recognition, D3 zero `--mode tmux` regression.
224
- - **Alternatives considered**: B (orthogonal — does not solve D1/D2), C (violates principle 3, requires authn/network), D (does not move any driver).
225
- - **Why chosen**: A reuses `_classifyBlock` + `writeSentinelExclusive`; surgical-diff principle satisfied; pattern_match seeded from real history.
226
- - **Consequences**: BLOCKED writes additional artifact (`bug-reports/<slug>-<iso>.{json,md}`); operator workflow shifts from "hand-write 200 lines" to "review + edit headline"; `bug-patterns.json` becomes a living artifact maintained alongside reports.
227
- - **Follow-ups**: Backlog file lists P2+ items. Heartbeat warning revisited after we measure operator minutes-saved on first 3 BLOCKED post-land.
228
-
229
- ---
230
-
231
- ## 11. Round-by-round resolution log
232
-
233
- | Round | Reviewer | Verdict | Findings closed |
234
- |---|---|---|---|
235
- | 0 | — | Planner v0 | initial draft |
236
- | 1 | Architect | _pending_ | _to fill_ |
237
- | 2 | Codex Critic | _pending_ | _to fill_ |
238
- | ... | | | |