@ai-dev-methodologies/rlp-desk 0.14.6 → 0.15.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -244,6 +244,158 @@ atomic_write() {
244
244
  mv "$tmp" "$target"
245
245
  }
246
246
 
247
+ # =============================================================================
248
+ # Bug #7 Fix-Q/R: Post-sentinel pane reaper + sentinel write-lock
249
+ # =============================================================================
250
+ # Without explicit teardown the claude/codex TUI returns to its idle prompt and
251
+ # self-reviews for ~2min after writing iter-signal.json or verify-verdict.json.
252
+ # Observed: verdict mtime drift 1m43s post-detect; iter-N verifier overlapped
253
+ # iter-N+1 worker for 2min. _kill_pane_process closes the race; _lock_sentinel
254
+ # is defense-in-depth that freezes the file mtime. Mirror of run_ralph_desk.zsh
255
+ # verifier-cleanup pattern at L2384-2397 (Ctrl+C + /exit + wait_for_pane_ready).
256
+ # Both helpers are fail-open: pane may already be dead, FS may ignore chmod.
257
# Interrupt whatever is running in a tmux pane after its sentinel was accepted.
# Arguments:
#   $1 - tmux pane id; an empty value makes the call a no-op
#   $2 - role label used only in the debug log line (default: producer)
# Returns: always 0 (fail-open; tmux errors are discarded).
_kill_pane_process() {
  local target_pane="$1"
  local who="${2:-producer}"

  if [[ -z "$target_pane" ]]; then
    return 0
  fi

  # log_debug is optional: only call it when the function is defined.
  if typeset -f log_debug >/dev/null 2>&1; then
    log_debug "[bug7] kill_pane_process pane=$target_pane role=$who"
  fi

  # Two Ctrl-C presses: 0.5s pause after the first, 1s after the second.
  local pause
  for pause in 0.5 1; do
    tmux send-keys -t "$target_pane" C-c 2>/dev/null
    sleep "$pause"
  done

  # Give the pane up to 5 (units defined by wait_for_pane_ready) to settle,
  # when that helper is available; its failure is ignored.
  if typeset -f wait_for_pane_ready >/dev/null 2>&1; then
    wait_for_pane_ready "$target_pane" 5 2>/dev/null || true
  fi

  return 0
}
273
+
274
# Make a sentinel file read-only (mode 0444).
# Arguments: $1 - sentinel path; empty or non-regular-file paths are ignored.
# Returns: always 0, even when chmod fails.
_lock_sentinel() {
  local sentinel_path="$1"

  if [[ -z "$sentinel_path" || ! -f "$sentinel_path" ]]; then
    return 0
  fi

  chmod 0444 "$sentinel_path" 2>/dev/null || :
  return 0
}
280
+
281
# Restore owner write permission (mode 0644) on a sentinel file.
# Arguments: $1 - sentinel path; empty or non-regular-file paths are ignored.
# Returns: always 0, even when chmod fails.
_unlock_sentinel() {
  local sentinel_path="$1"

  if [[ -z "$sentinel_path" || ! -f "$sentinel_path" ]]; then
    return 0
  fi

  chmod 0644 "$sentinel_path" 2>/dev/null || :
  return 0
}
287
+
288
# Print the modification time of $1 as epoch seconds, or 0 when it cannot be
# read. GNU syntax (`stat -c %Y`) is tried first on purpose: BSD/macOS stat
# rejects `-c` without writing to stdout, whereas GNU stat accepts
# `-f %m <file>` as "filesystem status of the operands", prints a multi-line
# report on stdout and only then fails — which would pollute the captured
# value if the BSD form were tried first.
_recovery_file_mtime() {
  local path="$1" mtime=""
  mtime=$(stat -c %Y "$path" 2>/dev/null) \
    || mtime=$(stat -f %m "$path" 2>/dev/null) \
    || mtime=""
  # Anything that is not a plain non-negative integer is treated as unknown.
  case "$mtime" in
    '' | *[!0-9]*) mtime=0 ;;
  esac
  printf '%s\n' "$mtime"
}

# PR-A (Bug #10) — validate operator-written manual recovery artifacts.
# Returns 0 when all 5 checks pass; 1 otherwise. Sets RECOVERY_FAIL_REASON
# (global) on failure for caller logging; it is reset to "" on entry.
#
# Checks, in order:
#   1. iter-signal, done-claim and status files exist and parse as JSON
#      (requires jq; its absence is a validation failure).
#   2. .us_id of both artifacts equals status .current_us.
#   3. .iteration of both artifacts equals status .iteration.
#   4. iter-signal .iter_signal_quality is exactly "specific".
#   5. When the worker prompt exists, both artifacts have an mtime strictly
#      newer than the prompt (whole-second resolution).
#
# Args:
#   $1 iter-signal.json path
#   $2 done-claim.json path
#   $3 status.json path
#   $4 iter-NNN.worker-prompt.md path (may not exist for iter-1 fresh start)
_validate_operator_recovery_artifacts() {
  local sig_file="$1" done_file="$2" status_file="$3" prompt_file="$4"
  RECOVERY_FAIL_REASON=""

  # Check 1: both artifacts exist + parse as JSON
  if [[ ! -f "$sig_file" ]]; then
    RECOVERY_FAIL_REASON="iter-signal.json missing"; return 1
  fi
  if [[ ! -f "$done_file" ]]; then
    RECOVERY_FAIL_REASON="done-claim.json missing"; return 1
  fi
  if ! command -v jq >/dev/null 2>&1; then
    RECOVERY_FAIL_REASON="jq unavailable; cannot validate"; return 1
  fi
  if ! jq -e . "$sig_file" >/dev/null 2>&1; then
    RECOVERY_FAIL_REASON="iter-signal.json parse error"; return 1
  fi
  if ! jq -e . "$done_file" >/dev/null 2>&1; then
    RECOVERY_FAIL_REASON="done-claim.json parse error"; return 1
  fi
  if [[ ! -f "$status_file" ]] || ! jq -e . "$status_file" >/dev/null 2>&1; then
    RECOVERY_FAIL_REASON="status.json missing or invalid"; return 1
  fi

  # Check 2: us_id match in both artifacts
  local current_us sig_us done_us
  current_us=$(jq -r '.current_us // ""' "$status_file" 2>/dev/null)
  sig_us=$(jq -r '.us_id // ""' "$sig_file" 2>/dev/null)
  done_us=$(jq -r '.us_id // ""' "$done_file" 2>/dev/null)
  if [[ "$sig_us" != "$current_us" ]]; then
    RECOVERY_FAIL_REASON="iter-signal.us_id ($sig_us) != status.current_us ($current_us)"; return 1
  fi
  if [[ "$done_us" != "$current_us" ]]; then
    RECOVERY_FAIL_REASON="done-claim.us_id ($done_us) != status.current_us ($current_us)"; return 1
  fi

  # Check 3: iteration match in both artifacts
  local current_iter sig_iter done_iter
  current_iter=$(jq -r '.iteration // 0' "$status_file" 2>/dev/null)
  sig_iter=$(jq -r '.iteration // 0' "$sig_file" 2>/dev/null)
  done_iter=$(jq -r '.iteration // 0' "$done_file" 2>/dev/null)
  if [[ "$sig_iter" != "$current_iter" ]]; then
    RECOVERY_FAIL_REASON="iter-signal.iteration ($sig_iter) != status.iteration ($current_iter)"; return 1
  fi
  if [[ "$done_iter" != "$current_iter" ]]; then
    RECOVERY_FAIL_REASON="done-claim.iteration ($done_iter) != status.iteration ($current_iter)"; return 1
  fi

  # Check 4: iter_signal_quality must equal 'specific'
  local sig_quality
  sig_quality=$(jq -r '.iter_signal_quality // ""' "$sig_file" 2>/dev/null)
  if [[ "$sig_quality" != "specific" ]]; then
    RECOVERY_FAIL_REASON="iter-signal.iter_signal_quality ($sig_quality) != 'specific'"; return 1
  fi

  # Check 5: artifact mtimes must be strictly newer than worker-prompt mtime.
  # Vacuously passes when the prompt file does not exist (fresh iter-1 start
  # before any leader-written prompt). An unreadable mtime is reported as 0,
  # which can never be "strictly newer", so the check fails closed.
  if [[ -f "$prompt_file" ]]; then
    local prompt_mtime sig_mtime done_mtime
    prompt_mtime=$(_recovery_file_mtime "$prompt_file")
    sig_mtime=$(_recovery_file_mtime "$sig_file")
    done_mtime=$(_recovery_file_mtime "$done_file")
    if (( sig_mtime <= prompt_mtime )); then
      RECOVERY_FAIL_REASON="iter-signal.json mtime ($sig_mtime) not strictly newer than worker-prompt mtime ($prompt_mtime)"; return 1
    fi
    if (( done_mtime <= prompt_mtime )); then
      RECOVERY_FAIL_REASON="done-claim.json mtime ($done_mtime) not strictly newer than worker-prompt mtime ($prompt_mtime)"; return 1
    fi
  fi

  return 0
}
371
+
372
# PR-0b-narrow (Plan v6) — record a leader acknowledgement on a sentinel file.
# Best-effort and audit-only: every failure is swallowed and the function
# always returns 0. Does nothing when the path is empty, is not a regular
# file, or jq is not installed.
#
# Steps:
#   1. chmod 0644 the sentinel
#   2. jq-merge a .leader_ack object (acked_by / acked_at UTC / ack_pane_state)
#      into a sibling "<file>.ack.tmp"
#   3. rename the temp file over the sentinel; drop the temp file on any failure
#   4. chmod 0444 the sentinel again
# Arguments: $1 - sentinel path
_stamp_ack_field() {
  local sentinel="$1"

  if [[ -z "$sentinel" || ! -f "$sentinel" ]]; then
    return 0
  fi
  if ! command -v jq >/dev/null 2>&1; then
    return 0
  fi

  local acked_at
  acked_at=$(date -u +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || echo "")
  local scratch="${sentinel}.ack.tmp"

  chmod 0644 "$sentinel" 2>/dev/null || true

  # Merge + rename as one unit; the scratch file is removed unless both worked.
  local stamped=0
  if jq --arg ts "$acked_at" \
    '. + {leader_ack: {acked_by: "leader", acked_at: $ts, ack_pane_state: "shell"}}' \
    "$sentinel" > "$scratch" 2>/dev/null; then
    if mv "$scratch" "$sentinel" 2>/dev/null; then
      stamped=1
    fi
  fi
  if (( ! stamped )); then
    rm -f "$scratch" 2>/dev/null
  fi

  chmod 0444 "$sentinel" 2>/dev/null || true
  return 0
}
398
+
247
399
  # =============================================================================
248
400
  # Scaffold Validation
249
401
  # =============================================================================
@@ -635,27 +635,82 @@ launch_verifier_claude() {
635
635
  # On exit: check done-claim, auto-generate iter-signal.
636
636
  # Args: $1=iteration $2=signal_file
637
637
  # Returns: 0 (signal generated), 1 (error)
638
# Bug #8 PR-B: shared gate deciding whether the leader may synthesize a verify
# signal on the worker's behalf. Used by handle_worker_exit_codex and by the
# inline-polling A4 fallback so both enforce the same contract.
#
# Gates, evaluated in order (first failure wins):
#   1. $DONE_CLAIM_FILE exists.
#   2. `git rev-parse --show-toplevel` for $ROOT resolves to $ROOT itself
#      (both sides canonicalized with `pwd -P`, so symlinked paths compare equal).
#   3. `git status --porcelain` for $ROOT is empty.
#
# Globals read: DONE_CLAIM_FILE, ROOT, CURRENT_US
# Args:
#   $1 iteration number (used in log/audit output)
#   $2 user-story id (default: $CURRENT_US, then "ALL")
#   $3 audit code the caller will emit on success; accepted but not read here
# Returns:
#   0 = synthesis allowed (caller writes the signal file and emits the audit)
#   1 = BLOCKED (blocked sentinel written and audit emitted by this function)
_bug8_check_synth_allowed() {
  local iteration="$1"
  local story="${2:-${CURRENT_US:-ALL}}"
  local caller_audit_code="$3"

  # --- Gate 1: a done-claim must be on disk -------------------------------
  if ! [[ -f "$DONE_CLAIM_FILE" ]]; then
    log_error " Bug #8: no done-claim. Refusing to synthesize verify signal."
    log_debug "[GOV] iter=${iteration} bug8=block_codex_exit_no_done_claim"
    write_blocked_sentinel \
      "Codex worker exited without writing done-claim (refusing to synthesize verify signal)" \
      "$story" \
      "infra_failure"
    _emit_a4_fallback_audit "$story" "$iteration" "blocked_codex_exit_no_done_claim"
    return 1
  fi

  # --- Gate 2: $ROOT must itself be the git toplevel ----------------------
  local toplevel toplevel_real root_real
  toplevel=$(git -C "$ROOT" rev-parse --show-toplevel 2>/dev/null)
  toplevel_real=$(cd "$toplevel" 2>/dev/null && pwd -P 2>/dev/null)
  root_real=$(cd "$ROOT" 2>/dev/null && pwd -P 2>/dev/null)
  if ! [[ -n "$toplevel" && "$toplevel_real" == "$root_real" ]]; then
    log_error " Bug #8: git unverifiable at \$ROOT=${ROOT} (toplevel='${toplevel}'). Refusing synthesis."
    log_debug "[GOV] iter=${iteration} bug8=block_git_unverifiable root=${ROOT} toplevel=${toplevel}"
    write_blocked_sentinel \
      "git status unverifiable at ${ROOT} (toplevel='${toplevel}'); refusing to synthesize verify signal" \
      "$story" \
      "infra_failure"
    _emit_a4_fallback_audit "$story" "$iteration" "blocked_git_unverifiable"
    return 1
  fi

  # --- Gate 3: working tree must be clean ---------------------------------
  local porcelain
  porcelain=$(git -C "$ROOT" status --porcelain 2>/dev/null)
  if [[ -z "$porcelain" ]]; then
    # Every gate passed — synthesis is allowed.
    return 0
  fi

  # Summarize at most five dirty entries, joined with '|', for the messages.
  local preview
  preview=$(printf '%s\n' "$porcelain" | head -n 5 | tr '\n' '|' | sed 's/|$//')
  log_error " Bug #8: done-claim present but tree dirty. Refusing synthesis. dirty: ${preview}"
  log_debug "[GOV] iter=${iteration} bug8=block_dirty_tree us_id=${story} dirty='${preview}'"
  write_blocked_sentinel \
    "worker_incomplete_uncommitted: done-claim present but tree dirty (${preview})" \
    "$story" \
    "metric_failure"
  _emit_a4_fallback_audit "$story" "$iteration" "blocked_dirty_tree"
  return 1
}
697
+
638
698
# Codex worker exit handler: when the shared Bug #8 gate allows it, synthesize
# the iter-signal the worker did not write.
# Globals read: DONE_CLAIM_FILE, CURRENT_US
# Args: $1=iteration $2=signal_file
# Returns:
#   0 - signal written to $2 and audit emitted
#   1 - _bug8_check_synth_allowed refused (it has already written the BLOCKED
#       sentinel and emitted its own audit)
handle_worker_exit_codex() {
  local iter="$1"
  local signal_file="$2"

  log " Codex worker process exited. Checking for done-claim + clean tree..."

  if ! _bug8_check_synth_allowed "$iter" "${CURRENT_US:-ALL}" "codex_exit_with_done_claim"; then
    return 1
  fi

  # All gates passed: done-claim present, git OK, tree clean → synthesize.
  local dc_us_id
  dc_us_id=$(jq -r '.us_id // "unknown"' "$DONE_CLAIM_FILE" 2>/dev/null)
  # jq prints nothing when the done-claim is unparsable; keep the same
  # "unknown" placeholder the filter uses for a missing key.
  [[ -n "$dc_us_id" ]] || dc_us_id="unknown"
  log " Codex worker completed with done-claim (us_id=$dc_us_id) and clean tree. Auto-generating signal."

  local stamp payload
  stamp=$(date -u +%Y-%m-%dT%H:%M:%SZ)
  # Build the JSON with jq so a us_id containing quotes or backslashes is
  # escaped instead of corrupting the document. `tonumber? // $iter` keeps a
  # numeric iteration numeric and degrades to a string otherwise.
  if ! payload=$(jq -cn \
      --arg iter "$iter" \
      --arg us_id "$dc_us_id" \
      --arg ts "$stamp" \
      '{iteration: (($iter | tonumber?) // $iter), status: "verify", us_id: $us_id, summary: "auto-generated after codex exit (clean tree)", timestamp: $ts}' \
      2>/dev/null); then
    # jq could not build the payload (e.g. not installed): fall back to the
    # legacy literal so behaviour is never worse than before.
    payload='{"iteration":'"$iter"',"status":"verify","us_id":"'"$dc_us_id"'","summary":"auto-generated after codex exit (clean tree)","timestamp":"'"$stamp"'"}'
  fi
  printf '%s\n' "$payload" > "$signal_file"

  _emit_a4_fallback_audit "$dc_us_id" "$iter" "codex_exit_with_done_claim_clean"
  return 0
}
661
716
 
@@ -2176,8 +2231,22 @@ poll_for_signal() {
2176
2231
 
2177
2232
  # Check if signal file appeared
2178
2233
  if [[ -f "$signal_file" ]]; then
2179
- log " Signal file detected: $signal_file"
2180
- return 0 # success
2234
+ # Bug #7-extra (BOS 2026-05-06): file existence is NOT enough. Worker
2235
+ # (claude opus) writes via Claude Code's Write tool, which is not
2236
+ # guaranteed atomic — the file can appear with empty / partial JSON
2237
+ # before the write completes. Verifier was being dispatched against a
2238
+ # half-written iter-signal.json. Validate that the file holds a single
2239
+ # parseable, non-null JSON value (`jq -e .`) before accepting; any
2240
+ # failure simply continues polling (next tick re-reads). Note: `jq
2241
+ # empty` was rejected because it accepts an EMPTY file as "zero
2242
+ # documents" — the exact race window we need to reject.
2243
+ if jq -e . "$signal_file" >/dev/null 2>&1; then
2244
+ log " Signal file detected: $signal_file"
2245
+ return 0 # success
2246
+ fi
2247
+ # Empty / truncated / mid-write JSON. Stay in the polling loop and let
2248
+ # the next tick re-read once the writer has finished.
2249
+ log_debug "[bug7-extra] $role signal file present but JSON not yet valid — continue polling"
2181
2250
  fi
2182
2251
 
2183
2252
  # A4 fallback: done-claim exists but no signal → Worker forgot iter-signal
@@ -2216,11 +2285,24 @@ poll_for_signal() {
2216
2285
  local dc_us_id
2217
2286
  dc_us_id=$(jq -r '.us_id // "unknown"' "$DONE_CLAIM_FILE" 2>/dev/null)
2218
2287
  if [[ -n "$dc_us_id" && "$dc_us_id" != "null" ]]; then
2219
- log " WARNING: done-claim exists for $dc_us_id but no iter-signal. Auto-generating signal (A4 fallback)."
2220
- log_debug "[GOV] iter=$ITERATION done_claim_without_signal=true us_id=$dc_us_id action=auto_generate_signal"
2221
- echo '{"iteration":'"$ITERATION"',"status":"verify","us_id":"'"$dc_us_id"'","summary":"auto-generated by A4 fallback (done-claim without signal)","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'"}' > "$signal_file"
2222
- _emit_a4_fallback_audit "$dc_us_id" "$ITERATION" "inline_polling_a4"
2223
- return 0
2288
+ # Bug #8 PR-B: defer to shared 4-way gate (codex critic P1.2).
2289
+ # _bug8_check_synth_allowed handles done-claim/git/dirty-tree gates
2290
+ # uniformly across handle_worker_exit_codex AND this inline path so
2291
+ # both codex-exit and inline-polling A4 enforce the same contract.
2292
+ if _bug8_check_synth_allowed "$ITERATION" "$dc_us_id" "inline_polling_a4_clean"; then
2293
+ log " WARNING: done-claim exists for $dc_us_id but no iter-signal. Tree clean — auto-generating signal (A4 fallback)."
2294
+ log_debug "[GOV] iter=$ITERATION done_claim_without_signal=true us_id=$dc_us_id action=auto_generate_signal"
2295
+ echo '{"iteration":'"$ITERATION"',"status":"verify","us_id":"'"$dc_us_id"'","summary":"auto-generated by A4 fallback (done-claim + clean tree)","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'"}' > "$signal_file"
2296
+ _emit_a4_fallback_audit "$dc_us_id" "$ITERATION" "inline_polling_a4_clean"
2297
+ return 0
2298
+ else
2299
+ # Bug #8 PR-B (codex critic round-2 P2): hard-stop rc=2 so the
2300
+ # main worker loop (L3119) treats this BLOCKED as terminal,
2301
+ # matching the handle_worker_exit_codex blocked path. rc=1 is
2302
+ # ambiguous — caller may interpret it as a recoverable poll
2303
+ # failure and re-loop while the BLOCKED sentinel is on disk.
2304
+ return 2
2305
+ fi
2224
2306
  fi
2225
2307
  fi
2226
2308
  fi
@@ -2271,8 +2353,16 @@ poll_for_signal() {
2271
2353
  fi
2272
2354
  # Dispatch to engine-specific exit handler
2273
2355
  if [[ "$WORKER_ENGINE" = "codex" && "$role" != *erifier* ]]; then
2274
- handle_worker_exit_codex "$ITERATION" "$signal_file"
2275
- return 0
2356
+ # Bug #8 PR-B: handle_worker_exit_codex now returns 1 when it has
2357
+ # written a BLOCKED sentinel (no done-claim, dirty tree, git
2358
+ # unverifiable). Propagate the return so main loop stops, instead
2359
+ # of swallowing it with `return 0` and continuing as if the poll
2360
+ # had succeeded.
2361
+ if handle_worker_exit_codex "$ITERATION" "$signal_file"; then
2362
+ return 0
2363
+ else
2364
+ return 2
2365
+ fi
2276
2366
  fi
2277
2367
  # Claude path (or verifier of any engine)
2278
2368
  if handle_worker_exit_claude "$pane_id" "$ITERATION" "$trigger_file"; then
@@ -2467,8 +2557,16 @@ run_single_verifier() {
2467
2557
  fi
2468
2558
  fi
2469
2559
 
2560
+ # Bug #7 Fix-Q/R: reap verifier pane the moment we accept the verdict so
2561
+ # codex/claude cannot keep self-reviewing and rewrite verify-verdict.json.
2562
+ # Lock applied AFTER cp so the archived snapshot is also frozen at intent.
2563
+ _kill_pane_process "$VERIFIER_PANE" "verifier-${suffix}"
2564
+
2470
2565
  # Copy verdict to destination
2471
2566
  cp "$VERDICT_FILE" "$verdict_dest"
2567
+ _lock_sentinel "$VERDICT_FILE"
2568
+ # PR-0b-narrow: stamp leader handshake ack on the verdict (audit-only).
2569
+ _stamp_ack_field "$VERDICT_FILE"
2472
2570
  log " Verifier$suffix verdict saved to $verdict_dest"
2473
2571
  return 0
2474
2572
  }
@@ -2528,6 +2626,14 @@ run_sequential_final_verify() {
2528
2626
  return 1
2529
2627
  fi
2530
2628
 
2629
+ # Bug #7 Fix-Q/R: reap verifier pane between per-US final verifications so
2630
+ # the previous codex/claude TUI cannot continue running while the next per-
2631
+ # US verifier dispatch reuses the same pane.
2632
+ _kill_pane_process "$VERIFIER_PANE" "verifier-final"
2633
+ _lock_sentinel "$VERDICT_FILE"
2634
+ # PR-0b-narrow: stamp leader handshake ack on the verdict (audit-only).
2635
+ _stamp_ack_field "$VERDICT_FILE"
2636
+
2531
2637
  # Check verdict
2532
2638
  local verdict
2533
2639
  verdict=$(jq -r '.verdict' "$VERDICT_FILE" 2>/dev/null)
@@ -2939,20 +3045,50 @@ main() {
2939
3045
  return 1
2940
3046
  fi
2941
3047
 
2942
- # --- governance.md s7 step 8 (cleanup): Clean previous iteration signals ---
2943
- rm -f "$SIGNAL_FILE" "$DONE_CLAIM_FILE" "$VERDICT_FILE" 2>/dev/null
2944
- rm -f "$WORKER_HEARTBEAT" "$VERIFIER_HEARTBEAT" 2>/dev/null
3048
+ # PR-A (Bug #10): operator-recovery hygiene check.
3049
+ # When the operator hand-rolls a `phase=verify` recovery (jq-patches
3050
+ # status.json, writes manual iter-signal.json + done-claim.json, deletes
3051
+ # the blocked sentinel), the leader MUST honor that work instead of
3052
+ # deleting the artifacts and resetting to phase=worker. Mirrors the
3053
+ # Node-side guard in src/node/runner/campaign-main-loop.mjs.
3054
+ local SKIP_NEXT_WORKER=0
3055
+ local LAST_PHASE=""
3056
+ if [[ -f "$STATUS_FILE" ]] && command -v jq >/dev/null 2>&1; then
3057
+ LAST_PHASE=$(jq -r '.phase // ""' "$STATUS_FILE" 2>/dev/null)
3058
+ fi
3059
+ if [[ "$LAST_PHASE" == "verify" ]]; then
3060
+ local _iter_prompt="$LOGS_DIR/iter-$(printf '%03d' $ITERATION).worker-prompt.md"
3061
+ if _validate_operator_recovery_artifacts \
3062
+ "$SIGNAL_FILE" "$DONE_CLAIM_FILE" "$STATUS_FILE" "$_iter_prompt"; then
3063
+ log "[recovery] Resuming verify phase — operator manual recovery detected (iter=$ITERATION)"
3064
+ log_debug "[recovery] iter=$ITERATION skip_worker=true reason=manual_recovery_validated"
3065
+ SKIP_NEXT_WORKER=1
3066
+ else
3067
+ log "[recovery] phase=verify ignored: ${RECOVERY_FAIL_REASON}"
3068
+ log_debug "[recovery] iter=$ITERATION skip_worker=false reason=\"${RECOVERY_FAIL_REASON}\""
3069
+ fi
3070
+ fi
2945
3071
 
2946
- # --- Clean previous claude session in panes (one-shot lifecycle) ---
2947
- # Only needed from iteration 2 onwards (iteration 1 has fresh panes)
2948
- if (( ITERATION > 1 )); then
2949
- # Send C-c first (in case claude is mid-task), then /exit
2950
- tmux send-keys -t "$WORKER_PANE" C-c 2>/dev/null
2951
- sleep 1
2952
- tmux send-keys -t "$WORKER_PANE" "/exit" C-m 2>/dev/null
2953
- sleep 2
2954
- # Wait for shell prompt before proceeding
2955
- wait_for_pane_ready "$WORKER_PANE" 10 2>/dev/null || true
3072
+ if (( ! SKIP_NEXT_WORKER )); then
3073
+ # --- governance.md s7 step 8 (cleanup): Clean previous iteration signals ---
3074
+ # Bug #7 Fix-R cleanup: unlock 0o444 sentinels written by the previous
3075
+ # iteration's reaper before rm so cleanup does not log permission noise.
3076
+ _unlock_sentinel "$SIGNAL_FILE"
3077
+ _unlock_sentinel "$VERDICT_FILE"
3078
+ rm -f "$SIGNAL_FILE" "$DONE_CLAIM_FILE" "$VERDICT_FILE" 2>/dev/null
3079
+ rm -f "$WORKER_HEARTBEAT" "$VERIFIER_HEARTBEAT" 2>/dev/null
3080
+
3081
+ # --- Clean previous claude session in panes (one-shot lifecycle) ---
3082
+ # Only needed from iteration 2 onwards (iteration 1 has fresh panes)
3083
+ if (( ITERATION > 1 )); then
3084
+ # Send C-c first (in case claude is mid-task), then /exit
3085
+ tmux send-keys -t "$WORKER_PANE" C-c 2>/dev/null
3086
+ sleep 1
3087
+ tmux send-keys -t "$WORKER_PANE" "/exit" C-m 2>/dev/null
3088
+ sleep 2
3089
+ # Wait for shell prompt before proceeding
3090
+ wait_for_pane_ready "$WORKER_PANE" 10 2>/dev/null || true
3091
+ fi
2956
3092
  fi
2957
3093
 
2958
3094
  # Reset per-iteration state
@@ -2964,33 +3100,44 @@ main() {
2964
3100
  # --- US-004: detect PRD changes for live update + re-split ---
2965
3101
  check_prd_update
2966
3102
 
2967
- # --- governance.md s7 step 4: Build worker prompt + trigger ---
2968
- write_worker_trigger "$ITERATION"
2969
- local worker_prompt="$LOGS_DIR/iter-$(printf '%03d' $ITERATION).worker-prompt.md"
2970
-
2971
- # AC1: capture worker start timestamp
3103
+ # AC1: capture worker start timestamp (still set for downstream telemetry
3104
+ # even when the worker dispatch is skipped — recovery still consumes time).
2972
3105
  ITER_WORKER_START=$(date +%s)
2973
3106
 
2974
- update_status "worker" "running"
3107
+ local worker_launch=""
3108
+ if (( ! SKIP_NEXT_WORKER )); then
3109
+ # --- governance.md s7 step 4: Build worker prompt + trigger ---
3110
+ write_worker_trigger "$ITERATION"
3111
+ local worker_prompt="$LOGS_DIR/iter-$(printf '%03d' $ITERATION).worker-prompt.md"
2975
3112
 
2976
- # --- governance.md s7 step 5: Execute Worker (dispatched to engine-specific function) ---
2977
- log_debug "[FLOW] iter=$ITERATION phase=worker engine=$WORKER_ENGINE model=$WORKER_MODEL dispatched=true"
3113
+ update_status "worker" "running"
2978
3114
 
2979
- local worker_launch
2980
- if [[ "$WORKER_ENGINE" = "codex" ]]; then
2981
- worker_launch="${CODEX_BIN:-codex} -m $WORKER_CODEX_MODEL -c model_reasoning_effort=\"$WORKER_CODEX_REASONING\" --disable plugins --dangerously-bypass-approvals-and-sandbox"
2982
- if ! launch_worker_codex "$WORKER_PANE" "$worker_prompt" "$ITERATION" "$worker_launch"; then
2983
- write_blocked_sentinel "Worker codex failed to start in pane" "" "infra_failure"
2984
- update_status "blocked" "worker_start_failed"
2985
- return 1
3115
+ # --- governance.md s7 step 5: Execute Worker (dispatched to engine-specific function) ---
3116
+ log_debug "[FLOW] iter=$ITERATION phase=worker engine=$WORKER_ENGINE model=$WORKER_MODEL dispatched=true"
3117
+
3118
+ if [[ "$WORKER_ENGINE" = "codex" ]]; then
3119
+ worker_launch="${CODEX_BIN:-codex} -m $WORKER_CODEX_MODEL -c model_reasoning_effort=\"$WORKER_CODEX_REASONING\" --disable plugins --dangerously-bypass-approvals-and-sandbox"
3120
+ if ! launch_worker_codex "$WORKER_PANE" "$worker_prompt" "$ITERATION" "$worker_launch"; then
3121
+ write_blocked_sentinel "Worker codex failed to start in pane" "" "infra_failure"
3122
+ update_status "blocked" "worker_start_failed"
3123
+ return 1
3124
+ fi
3125
+ else
3126
+ worker_launch="$(build_claude_cmd tui "$WORKER_MODEL" "" "" "$WORKER_EFFORT")"
3127
+ if ! launch_worker_claude "$WORKER_PANE" "$worker_prompt" "$ITERATION" "$worker_launch"; then
3128
+ write_blocked_sentinel "Worker claude failed to start in pane" "" "infra_failure"
3129
+ update_status "blocked" "worker_start_failed"
3130
+ return 1
3131
+ fi
2986
3132
  fi
2987
3133
  else
2988
- worker_launch="$(build_claude_cmd tui "$WORKER_MODEL" "" "" "$WORKER_EFFORT")"
2989
- if ! launch_worker_claude "$WORKER_PANE" "$worker_prompt" "$ITERATION" "$worker_launch"; then
2990
- write_blocked_sentinel "Worker claude failed to start in pane" "" "infra_failure"
2991
- update_status "blocked" "worker_start_failed"
2992
- return 1
2993
- fi
3134
+ # PR-A (Bug #10): one-shot recovery path. The operator's iter-signal.json
3135
+ # is already on disk; polling below picks it up immediately and the loop
3136
+ # transitions cleanly into the verifier phase. Persist phase=verify so a
3137
+ # subsequent crash-and-relaunch sees the same contract. SKIP_NEXT_WORKER
3138
+ # is local to this iteration so iter-N+1 dispatches the worker normally.
3139
+ update_status "verify" "running"
3140
+ log "[recovery] Skipping worker dispatch for iter=$ITERATION (one-shot, honoring operator manual recovery)"
2994
3141
  fi
2995
3142
 
2996
3143
  # --- governance.md s7 step 5+6: Poll for Worker completion ---
@@ -3003,6 +3150,12 @@ main() {
3003
3150
  if poll_for_signal "$SIGNAL_FILE" "$WORKER_HEARTBEAT" "$WORKER_PANE" "$worker_launch" "Worker"; then
3004
3151
  worker_poll_done=1
3005
3152
  log_debug "[FLOW] iter=$ITERATION poll_signal_received=true"
3153
+ # Bug #7 Fix-Q/R: reap worker pane immediately so claude/codex cannot
3154
+ # self-review and rewrite iter-signal.json (1m43s drift observed).
3155
+ _kill_pane_process "$WORKER_PANE" "worker"
3156
+ _lock_sentinel "$SIGNAL_FILE"
3157
+ # PR-0b-narrow: stamp leader handshake ack on the iter-signal (audit-only).
3158
+ _stamp_ack_field "$SIGNAL_FILE"
3006
3159
  else
3007
3160
  worker_poll_rc=$?
3008
3161
  if (( worker_poll_rc == 2 )); then
@@ -3210,6 +3363,12 @@ main() {
3210
3363
  update_status "blocked" "verifier_dead"
3211
3364
  return 1
3212
3365
  fi
3366
+ # Bug #7 Fix-Q/R: reap verifier pane immediately so codex cannot
3367
+ # rewrite verify-verdict.json post-detect (mtime drift fix).
3368
+ _kill_pane_process "$VERIFIER_PANE" "verifier"
3369
+ _lock_sentinel "$VERDICT_FILE"
3370
+ # PR-0b-narrow: stamp leader handshake ack on the verdict (audit-only).
3371
+ _stamp_ack_field "$VERDICT_FILE"
3213
3372
  fi
3214
3373
 
3215
3374
  # AC1: capture verifier end timestamp