@ai-dev-methodologies/rlp-desk 0.17.0 → 0.18.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -164,6 +164,34 @@ _check_consecutive_blocks() {
164
164
  return 0
165
165
  }
166
166
 
167
# F-22: count one more consecutive soft-fail (request_info, unknown
# verdict/status) and report whether the circuit breaker is now tripped.
# Closes the "silently loop to MAX_ITER without ever firing the CB" gap for
# verdict/status values the case-statement did not previously account for.
#
# Globals:
#   CONSECUTIVE_FAILURES   (read + written) running count; unset counts as 0
#   EFFECTIVE_CB_THRESHOLD (read) trip point; unset is treated as 0
# Returns:
#   0 if the breaker is tripped (count >= threshold) — the caller is expected
#     to write a sentinel and return 1
#   1 if still under the threshold (the loop continues)
_bump_consecutive_failure() {
  # Assignment arithmetic, not `(( CONSECUTIVE_FAILURES++ ))`: the post-increment
  # form has exit status 1 whenever the counter was 0, which aborts the script
  # under errexit when this function is called outside a conditional. The :-0
  # defaults keep an unset variable from being fatal under nounset.
  CONSECUTIVE_FAILURES=$(( ${CONSECUTIVE_FAILURES:-0} + 1 ))
  if (( CONSECUTIVE_FAILURES >= ${EFFECTIVE_CB_THRESHOLD:-0} )); then
    return 0
  fi
  return 1
}
177
+
178
# F-22: grace policy for a worker/verifier BLOCK.
#
# Per the release note this is the call site that was missing:
# _check_consecutive_blocks was defined but never invoked, so the
# consecutive-blocks circuit breaker (governance §8) never ran and a single
# transient "blocked" (a fresh-context LLM mis-emitting the status, a
# formatting slip) terminated the whole campaign.
#
# Arguments:
#   $1 - block reason (forwarded to _check_consecutive_blocks)
#   $2 - block category; defaults to "metric_failure"
# Globals:
#   ITERATION (read; forwarded as the third argument, 0 when unset)
# Returns:
#   0 = TERMINATE (caller writes the sentinel + returns 1)
#   1 = ABSORB as a soft-fail (loop continues, Worker retries)
#
# Terminal when any of these holds, checked in this order:
#   1. _check_consecutive_blocks reports failure (presumably the same canonical
#      reason repeated >= BLOCK_CB_THRESHOLD — that helper is not in view here),
#   2. the category is "infra_failure",
#   3. the consecutive-failures breaker trips (_bump_consecutive_failure).
# Otherwise a recoverable first/transient block is absorbed.
_block_with_grace() {
  local block_reason="$1"
  local block_category="${2:-metric_failure}"

  if ! _check_consecutive_blocks "$block_reason" "$block_category" "${ITERATION:-0}"; then
    return 0
  fi

  if [[ "$block_category" == "infra_failure" ]]; then
    return 0
  fi

  # Only a block that got this far is charged to the soft-fail counter.
  if _bump_consecutive_failure; then
    return 0
  fi

  return 1
}
194
+
167
195
  # --- Engine Selection (auto-detect from model format) ---
168
196
  # claude models (haiku/sonnet/opus) with :effort → claude engine + effort
169
197
  # codex models (gpt-*/spark) with :reasoning → codex engine + reasoning
@@ -205,11 +233,17 @@ FINAL_VERIFIER_EFFORT="${FINAL_VERIFIER_EFFORT:-}"
205
233
  # Auto-detect engine from model format for env var path (CLI path uses parse_model_flag)
206
234
  _auto_detect_engine WORKER_MODEL WORKER_ENGINE WORKER_CODEX_MODEL WORKER_CODEX_REASONING WORKER_EFFORT
207
235
  _auto_detect_engine VERIFIER_MODEL VERIFIER_ENGINE VERIFIER_CODEX_MODEL VERIFIER_CODEX_REASONING VERIFIER_EFFORT
208
- _auto_detect_engine FINAL_VERIFIER_MODEL FINAL_VERIFIER_ENGINE "" "" FINAL_VERIFIER_EFFORT
236
+ _auto_detect_engine FINAL_VERIFIER_MODEL FINAL_VERIFIER_ENGINE FINAL_VERIFIER_CODEX_MODEL FINAL_VERIFIER_CODEX_REASONING FINAL_VERIFIER_EFFORT
209
237
  WORKER_CODEX_MODEL="${WORKER_CODEX_MODEL:-gpt-5.5}"
210
238
  WORKER_CODEX_REASONING="${WORKER_CODEX_REASONING:-high}" # low|medium|high
211
239
  VERIFIER_CODEX_MODEL="${VERIFIER_CODEX_MODEL:-gpt-5.5}"
212
240
  VERIFIER_CODEX_REASONING="${VERIFIER_CODEX_REASONING:-high}" # low|medium|high
241
+ # D-1: FINAL verifier codex sub-vars (auto-detected above from FINAL_VERIFIER_MODEL,
242
+ # default here when not codex). Wired so the FINAL (ALL) verify can run a stronger
243
+ # model than the per-US verifier — the "strict final" knob (FINAL_VERIFIER_MODEL
244
+ # defaults to opus). Distinct from the removed per-iteration verifier auto-upgrade.
245
+ FINAL_VERIFIER_CODEX_MODEL="${FINAL_VERIFIER_CODEX_MODEL:-gpt-5.5}"
246
+ FINAL_VERIFIER_CODEX_REASONING="${FINAL_VERIFIER_CODEX_REASONING:-high}" # low|medium|high
213
247
  CODEX_BIN="" # resolved by check_dependencies when engine=codex
214
248
 
215
249
  # --- Verify Mode ---
@@ -274,6 +308,10 @@ MEMORY_FILE="$MEMOS_DIR/${SLUG}-memory.md"
274
308
  SIGNAL_FILE="$MEMOS_DIR/${SLUG}-iter-signal.json"
275
309
  DONE_CLAIM_FILE="$MEMOS_DIR/${SLUG}-done-claim.json"
276
310
  VERDICT_FILE="$MEMOS_DIR/${SLUG}-verify-verdict.json"
311
+ # F-14: durable, structured append-only ledger of verified-pass US — the
312
+ # drift-proof source-of-truth for VERIFIED_US restore (vs the Worker's prose
313
+ # "## Completed Stories", which is fresh-context LLM output that can drift).
314
+ VERIFIED_LEDGER="$MEMOS_DIR/${SLUG}-verified.jsonl"
277
315
  # v0.14.2 Bug Report #4: codex sometimes writes the verdict file to the
278
316
  # pre-v0.13.0 legacy path despite the prompt instructing otherwise (CWD
279
317
  # heuristics inside the codex CLI). Track the legacy path so the no-progress
@@ -312,6 +350,9 @@ BASELINE_COMMIT="" # git HEAD at campaign start (captured before loop)
312
350
  CAMPAIGN_REPORT_GENERATED=0 # guard against double-generation in cleanup trap
313
351
  SV_REPORT_GENERATED=0 # guard against double-generation in generate_sv_report
314
352
  VERIFIED_US="" # comma-separated list of verified US IDs (per-us mode)
353
+ _FINALIZE_PENDING=0 # D-16: armed when the last per-US pass completes coverage;
354
+ # the next loop top synthesizes an ALL verify signal and
355
+ # skips the (fragile) worker round-trip to emit it.
315
356
  CONSENSUS_ROUND=0 # current consensus round for current US
316
357
  US_LIST="" # comma-separated US IDs from PRD (per-us mode)
317
358
  LOCKFILE_ACQUIRED=0
@@ -389,6 +430,33 @@ launch_worker_codex() {
389
430
  sleep 1
390
431
  local _pane_text
391
432
  _pane_text=$(tmux capture-pane -t "$pane_id" -p 2>/dev/null || true)
433
+ # F-1: on launch codex may show "✨ Update available!" — an arrow-menu whose
434
+ # DEFAULT highlighted option is "1. Update now" (runs `npm install -g
435
+ # @openai/codex`) with "Press enter to continue". Our subsequent Enter would
436
+ # confirm option 1 and the update REPLACES the Worker session (hijack). This
437
+ # check MUST precede the '›' ready check below because the update menu also
438
+ # renders '›'. Move the selection to "2. Skip" (Down) then confirm (Enter).
439
+ # (Guarded: only fires when the update banner is present, so it is harmless
440
+ # in any normal pane state. Key sequence pending live-codex confirmation.)
441
+ if echo "$_pane_text" | grep -qiE 'Update available|1\. Update now' 2>/dev/null; then
442
+ log " Worker codex: update prompt detected — selecting '2. Skip' (F-1)."
443
+ log_debug "[GOV] iter=$iter codex_update_prompt=skipped role=worker"
444
+ tmux send-keys -t "$pane_id" Down 2>/dev/null; sleep 0.3
445
+ tmux send-keys -t "$pane_id" C-m 2>/dev/null; sleep 1
446
+ (( _codex_wait++ )); continue
447
+ fi
448
+ # F-16: codex 0.141 shows a "Do you trust the contents of this directory?
449
+ # 1. Yes, continue / 2. No, quit" prompt at startup (project-local config/
450
+ # hooks loading). Its '›' is otherwise mis-read as "ready" below, and the
451
+ # worker instruction sent into that menu can land on "No, quit" → codex exits
452
+ # → "worker not active" BLOCK. Accept it (Enter = default "1. Yes, continue")
453
+ # before the ready check. Validated end-to-end: codex then runs the task.
454
+ if echo "$_pane_text" | grep -qiE 'Do you trust|1\. Yes, continue' 2>/dev/null; then
455
+ log " Worker codex: directory-trust prompt — accepting (F-16)."
456
+ log_debug "[GOV] iter=$iter codex_trust_prompt=accepted role=worker"
457
+ tmux send-keys -t "$pane_id" C-m 2>/dev/null; sleep 1
458
+ (( _codex_wait++ )); continue
459
+ fi
392
460
  if echo "$_pane_text" | grep -q '›' 2>/dev/null; then
393
461
  _codex_ready=1
394
462
  log_debug "Worker codex TUI ready after ${_codex_wait}s"
@@ -543,6 +611,25 @@ launch_verifier_codex() {
543
611
  sleep 1
544
612
  local _pane_text
545
613
  _pane_text=$(tmux capture-pane -t "$pane_id" -p 2>/dev/null || true)
614
+ # F-1: dismiss codex's "✨ Update available!" launch menu before it hijacks the
615
+ # pane (default option is "1. Update now"). See launch_worker_codex for detail.
616
+ if echo "$_pane_text" | grep -qiE 'Update available|1\. Update now' 2>/dev/null; then
617
+ log " Verifier codex: update prompt detected — selecting '2. Skip' (F-1)."
618
+ log_debug "[GOV] iter=$iter codex_update_prompt=skipped role=verifier"
619
+ tmux send-keys -t "$pane_id" Down 2>/dev/null; sleep 0.3
620
+ tmux send-keys -t "$pane_id" C-m 2>/dev/null; sleep 1
621
+ (( _codex_wait++ )); continue
622
+ fi
623
+ # F-16: accept codex 0.141's "Do you trust this directory?" startup prompt
624
+ # (Enter = default "1. Yes, continue") before the ready check — see
625
+ # launch_worker_codex for detail. Otherwise the instruction lands in the menu
626
+ # and can select "No, quit" → codex exits → "verifier not active".
627
+ if echo "$_pane_text" | grep -qiE 'Do you trust|1\. Yes, continue' 2>/dev/null; then
628
+ log " Verifier codex: directory-trust prompt — accepting (F-16)."
629
+ log_debug "[GOV] iter=$iter codex_trust_prompt=accepted role=verifier"
630
+ tmux send-keys -t "$pane_id" C-m 2>/dev/null; sleep 1
631
+ (( _codex_wait++ )); continue
632
+ fi
546
633
  if echo "$_pane_text" | grep -q '›' 2>/dev/null; then
547
634
  _codex_ready=1
548
635
  log_debug "Verifier codex TUI ready after ${_codex_wait}s"
@@ -639,6 +726,16 @@ launch_verifier_claude() {
639
726
  # On exit: check done-claim, auto-generate iter-signal.
640
727
  # Args: $1=iteration $2=signal_file
641
728
  # Returns: 0 (signal generated), 1 (error)
729
# F-14: append one verified-pass US to the durable ledger (the leader's
# structured record of progress). The ledger is append-only; readers are
# expected to de-duplicate.
#
# Arguments:
#   $1 - US id; an empty value or "ALL" is skipped (returns 0, writes nothing)
# Globals:
#   VERIFIED_LEDGER (read) path of the JSONL ledger file
#   ITERATION       (read) recorded as "iter"; unset or non-numeric becomes 0
# Outputs:
#   one line {"us_id":…,"iter":…,"verified_at":…} appended to VERIFIED_LEDGER
# Returns:
#   0 when skipped or appended; otherwise the status of the failed append
_append_verified_ledger() {
  local us="$1"
  if [[ -z "$us" || "$us" == "ALL" ]]; then
    return 0
  fi

  # "iter" is emitted as a bare JSON number, so anything that is not purely
  # digits would corrupt the line — record 0 instead.
  local iter="${ITERATION:-0}"
  case "$iter" in
    '' | *[!0-9]*) iter=0 ;;
  esac

  # The id is interpolated into a JSON string: escape backslash first, then
  # the double quote, so an unexpected id cannot break the record.
  us=${us//\\/\\\\}
  us=${us//\"/\\\"}

  # Portable dirname: only create a parent when the path actually has one.
  local ledger_dir="${VERIFIED_LEDGER%/*}"
  if [[ "$VERIFIED_LEDGER" == */* && -n "$ledger_dir" ]]; then
    mkdir -p -- "$ledger_dir" 2>/dev/null
  fi

  printf '{"us_id":"%s","iter":%s,"verified_at":"%s"}\n' \
    "$us" "$iter" "$(date -u +%Y-%m-%dT%H:%M:%SZ)" >> "$VERIFIED_LEDGER"
}
738
+
642
739
  # Bug #8 PR-B (codex critic P1.2 fix): shared 4-way gate used by both
643
740
  # handle_worker_exit_codex and the inline-polling A4 path. Returns:
644
741
  # 0 = synthesize allowed (caller writes signal_file + emits audit)
@@ -662,6 +759,37 @@ _bug8_check_synth_allowed() {
662
759
  return 1
663
760
  fi
664
761
 
762
+ # Gate 1b (D-2): done-claim FRESHNESS. A done-claim that lingered from a PRIOR
763
+ # run/iteration (e.g. a relaunch where the inter-iteration cleanup did not run)
764
+ # must NOT be synthesized into a verify signal for THIS iteration — it would
765
+ # credit a stale/wrong US into the durable ledger. The Worker writes its
766
+ # done-claim DURING this iteration, so a fresh claim is strictly NEWER than this
767
+ # iteration's worker-prompt; an older claim is stale. mtime-based on purpose:
768
+ # done-claim carries no reliable .iteration field (workers omit it), so an
769
+ # iteration match would false-reject every claim and break the A4 synth path.
770
+ local _dc_wp_file="$LOGS_DIR/iter-$(printf '%03d' "$iter").worker-prompt.md"
771
+ if [[ -f "$_dc_wp_file" ]]; then
772
+ # mtime, cross-platform: GNU `stat -c %Y` FIRST (on Linux `stat -f %m` means
773
+ # --file-system + %m=mount-point, returns a non-numeric path with exit 0 so a
774
+ # `-f`-first order would silently mis-read); macOS BSD `stat -c` errors → falls
775
+ # through to `stat -f %m` (the BSD mtime). Correct on both; `echo 0` = unknown.
776
+ local _dc_mt _wp_mt
777
+ _dc_mt=$(stat -c %Y "$DONE_CLAIM_FILE" 2>/dev/null || stat -f %m "$DONE_CLAIM_FILE" 2>/dev/null || echo 0)
778
+ _wp_mt=$(stat -c %Y "$_dc_wp_file" 2>/dev/null || stat -f %m "$_dc_wp_file" 2>/dev/null || echo 0)
779
+ [[ "$_dc_mt" == <-> ]] || _dc_mt=0 # guard: ignore any non-numeric stat output
780
+ [[ "$_wp_mt" == <-> ]] || _wp_mt=0
781
+ if (( _dc_mt > 0 && _wp_mt > 0 && _dc_mt < _wp_mt )); then
782
+ log_error " Bug #8: done-claim is STALE (mtime $_dc_mt < this iteration's worker-prompt $_wp_mt) — refusing to synthesize from a prior-run claim."
783
+ log_debug "[GOV] iter=$iter bug8=block_stale_done_claim dc_mt=$_dc_mt wp_mt=$_wp_mt"
784
+ write_blocked_sentinel \
785
+ "done-claim is stale (older than this iteration's worker dispatch) — refusing to synthesize a verify signal from a prior-run claim" \
786
+ "$us_id" \
787
+ "infra_failure"
788
+ _emit_a4_fallback_audit "$us_id" "$iter" "blocked_stale_done_claim"
789
+ return 1
790
+ fi
791
+ fi
792
+
665
793
  # Gate 2: git toplevel must equal $ROOT (canonicalized — macOS resolves
666
794
  # /var → /private/var, NTFS may have 8.3 short paths; compare realpaths).
667
795
  local _bug8_top _bug8_top_canon _bug8_root_canon
@@ -679,20 +807,59 @@ _bug8_check_synth_allowed() {
679
807
  return 1
680
808
  fi
681
809
 
682
- # Gate 3: tree must be clean.
810
+ # Gate 3: no UNCOMMITTED changes to TRACKED files (F-6 fix). We compare against
811
+ # HEAD with `git diff --name-only HEAD`, which lists ONLY tracked files modified
812
+ # vs HEAD — untracked cruft (logs, .DS_Store, local config, build/coverage
813
+ # output) the Worker never touched is never listed. Blocking on such cruft
814
+ # false-BLOCKed the campaign at iter 1 on ANY non-pristine repo — the single
815
+ # largest "never completes" cause found in large-campaign dogfood. The Verifier
816
+ # (test-spec) is the real correctness gate for the Worker's committed work; this
817
+ # gate only guards against a Worker that left TRACKED edits uncommitted.
683
818
  local _bug8_dirty
684
- _bug8_dirty=$(git -C "$ROOT" status --porcelain 2>/dev/null)
819
+ _bug8_dirty=$(git -C "$ROOT" diff --name-only HEAD 2>/dev/null)
685
820
  if [[ -n "$_bug8_dirty" ]]; then
686
- local _bug8_first5
687
- _bug8_first5=$(printf '%s\n' "$_bug8_dirty" | head -n 5 | tr '\n' '|' | sed 's/|$//')
688
- log_error " Bug #8: done-claim present but tree dirty. Refusing synthesis. dirty: $_bug8_first5"
689
- log_debug "[GOV] iter=$iter bug8=block_dirty_tree us_id=$us_id dirty='$_bug8_first5'"
690
- write_blocked_sentinel \
691
- "worker_incomplete_uncommitted: done-claim present but tree dirty ($_bug8_first5)" \
692
- "$us_id" \
693
- "metric_failure"
694
- _emit_a4_fallback_audit "$us_id" "$iter" "blocked_dirty_tree"
695
- return 1
821
+ # F-8 recovery (F-19 scoped): by Gate 1 a done-claim exists, so uncommitted
822
+ # TRACKED changes are most likely the Worker's own US work it failed to commit
823
+ # a frequent weak-model slip (the default haiku Worker reports "Committed ..."
824
+ # in its done-claim while the git commit never landed). Historically this
825
+ # TERMINATED the campaign, stranding completed work — the #1 weak-model "never
826
+ # completes" cause. Instead auto-commit the Worker's edits and proceed — but
827
+ # scope the commit to the Worker's OWN files: exclude any tracked file ALREADY
828
+ # dirty before the campaign (CAMPAIGN_PREEXISTING_DIRTY) so an operator's
829
+ # pre-existing uncommitted work is NEVER swept into a Worker-recovery commit.
830
+ # The Verifier (test-spec) is the real correctness gate, so a genuine mid-write
831
+ # bail still FAILs verify → fix loop; Bug #8's "no false PASS" intent is
832
+ # preserved by the Verifier, not by abort.
833
+ local _bug8_worker_files
834
+ _bug8_worker_files=$(comm -23 \
835
+ <(printf '%s\n' "$_bug8_dirty" | sort -u) \
836
+ <(printf '%s\n' "${CAMPAIGN_PREEXISTING_DIRTY:-}" | sort -u) \
837
+ | grep -v '^[[:space:]]*$')
838
+ if [[ -z "$_bug8_worker_files" ]]; then
839
+ # Every dirty tracked file was already dirty BEFORE the campaign — the Worker
840
+ # committed its own work (or made no tracked change). Nothing to recover; do
841
+ # NOT commit the operator's pre-existing edits. Allow synthesis to proceed.
842
+ log " Bug #8 F-8: only operator's pre-existing edits are dirty — Worker work already committed; proceeding without auto-commit."
843
+ log_debug "[GOV] iter=$iter bug8=preexisting_only_no_commit us_id=$us_id"
844
+ else
845
+ local _bug8_first5
846
+ _bug8_first5=$(printf '%s\n' "$_bug8_worker_files" | head -n 5 | tr '\n' '|' | sed 's/|$//')
847
+ log " Bug #8 F-8 recovery: done-claim + Worker's uncommitted tracked changes — auto-committing $us_id work (files: $_bug8_first5)."
848
+ log_debug "[GOV] iter=$iter bug8=recover_autocommit us_id=$us_id files='$_bug8_first5'"
849
+ local -a _bug8_add=("${(@f)_bug8_worker_files}")
850
+ if git -C "$ROOT" add -- "${_bug8_add[@]}" && git -C "$ROOT" commit -q -m "chore(leader-recovery): commit Worker's uncommitted $us_id changes (Bug #8 F-8)"; then
851
+ log " Leader-recovery auto-commit OK (Worker files only) — Verifier will gate correctness."
852
+ else
853
+ log_error " Bug #8: leader-recovery auto-commit failed. Refusing synthesis. files: $_bug8_first5"
854
+ log_debug "[GOV] iter=$iter bug8=block_autocommit_failed us_id=$us_id files='$_bug8_first5'"
855
+ write_blocked_sentinel \
856
+ "worker_incomplete_uncommitted: leader-recovery auto-commit failed ($_bug8_first5)" \
857
+ "$us_id" \
858
+ "metric_failure"
859
+ _emit_a4_fallback_audit "$us_id" "$iter" "blocked_autocommit_failed"
860
+ return 1
861
+ fi
862
+ fi
696
863
  fi
697
864
 
698
865
  # All gates passed — synthesize allowed.
@@ -1016,10 +1183,15 @@ check_copy_mode() {
1016
1183
  paste_to_pane() {
1017
1184
  local pane_id="$1"
1018
1185
  local text="$2"
1186
+ # D-8/D-13: per-leader+pane tmux buffer name (was a server-GLOBAL "rlp-paste").
1187
+ # Two leaders sharing one tmux server (different ROOTs) would ABA the single
1188
+ # global buffer — load-A / load-B / paste-A pastes B's text into A's pane. A
1189
+ # name keyed by leader pid + pane closes that.
1190
+ local _buf="rlp-paste-$$-${pane_id//[^0-9A-Za-z]/}"
1019
1191
  local tmpbuf="/tmp/.rlp-desk-paste-$$.tmp"
1020
1192
  echo -n "$text" > "$tmpbuf"
1021
- tmux load-buffer -b rlp-paste "$tmpbuf" 2>/dev/null
1022
- tmux paste-buffer -b rlp-paste -d -t "$pane_id" 2>/dev/null
1193
+ tmux load-buffer -b "$_buf" "$tmpbuf" 2>/dev/null
1194
+ tmux paste-buffer -b "$_buf" -d -t "$pane_id" 2>/dev/null # -d deletes the buffer after paste
1023
1195
  rm -f "$tmpbuf"
1024
1196
  }
1025
1197
 
@@ -1757,7 +1929,7 @@ restart_worker() {
1757
1929
 
1758
1930
  # Re-launch worker (tmux interactive pattern)
1759
1931
  if [[ "$WORKER_ENGINE" = "codex" ]]; then
1760
- safe_send_keys "$pane_id" "${CODEX_BIN:-codex} -m $WORKER_CODEX_MODEL -c model_reasoning_effort=\"$WORKER_CODEX_REASONING\" --disable plugins --dangerously-bypass-approvals-and-sandbox"
1932
+ safe_send_keys "$pane_id" "${CODEX_BIN:-codex} -m $WORKER_CODEX_MODEL -c model_reasoning_effort=\"$WORKER_CODEX_REASONING\" -c mcp_servers='{}' --disable plugins --dangerously-bypass-approvals-and-sandbox"
1761
1933
  else
1762
1934
  safe_send_keys "$pane_id" "$(build_claude_cmd tui "$WORKER_MODEL" "" "" "$WORKER_EFFORT")"
1763
1935
  fi
@@ -1815,6 +1987,12 @@ write_worker_trigger() {
1815
1987
  fi
1816
1988
  done
1817
1989
  fi
1990
+ # D-11: publish the in-flight US GLOBALLY so the lifecycle-path sentinels
1991
+ # (no-progress, prompt-stall, R12 watchdog) tag their BLOCKED sidecar with the
1992
+ # real us_id (they default to ${CURRENT_US:-ALL}, which was always ALL because
1993
+ # CURRENT_US was never assigned). The verify phase overwrites it with the US
1994
+ # actually under verification.
1995
+ [[ -n "$next_us" ]] && CURRENT_US="$next_us" || CURRENT_US="ALL"
1818
1996
 
1819
1997
  {
1820
1998
  # Per-US PRD injection: substitute full PRD path with per-US split path when available
@@ -2057,6 +2235,12 @@ TRIGGER_EOF
2057
2235
  # =============================================================================
2058
2236
 
2059
2237
  cleanup() {
2238
+ # D-8: re-entrancy guard. The trap is armed on EXIT INT TERM, so a TERM (cleanup
2239
+ # runs) immediately followed by process exit (EXIT fires cleanup AGAIN) would
2240
+ # double-run the non-idempotent steps — a double runner-lock release can rm a
2241
+ # relaunched leader's lock dir (ABA). Run the body at most once.
2242
+ (( ${CLEANUP_DONE:-0} )) && return 0
2243
+ CLEANUP_DONE=1
2060
2244
  log "Cleaning up..."
2061
2245
 
2062
2246
  # Remove lockfile
@@ -2066,12 +2250,15 @@ cleanup() {
2066
2250
  log_debug "cleanup: lockfile not owned by this process, skipping removal"
2067
2251
  fi
2068
2252
 
2069
- # US-026 R14 P0: remove project-scoped runner lockfile if owned by this slug
2253
+ # US-026 R14 P0 / D-9: remove the project-scoped runner lock if WE own it. The
2254
+ # lock file now holds our bare PID (acquire_slug_lock), so ownership is an exact
2255
+ # pid match — remove the lock file, the metadata sidecar, and the recovery mutex.
2070
2256
  if [[ -f "$RUNNER_LOCKFILE_PATH" ]]; then
2071
- local own_slug
2072
- own_slug=$(jq -r '.slug' "$RUNNER_LOCKFILE_PATH" 2>/dev/null)
2073
- if [[ "$own_slug" == "$SLUG" ]]; then
2074
- rm -rf "$RUNNER_LOCKDIR" "$RUNNER_LOCKFILE_PATH" 2>/dev/null
2257
+ local own_pid
2258
+ own_pid=$(cat "$RUNNER_LOCKFILE_PATH" 2>/dev/null)
2259
+ if [[ "$own_pid" == "$$" ]]; then
2260
+ rm -f "$RUNNER_LOCKFILE_PATH" "${RUNNER_LOCKFILE_PATH}.meta" 2>/dev/null
2261
+ rm -rf "${RUNNER_LOCKFILE_PATH}.recovery.d" 2>/dev/null
2075
2262
  fi
2076
2263
  fi
2077
2264
 
@@ -2423,8 +2610,19 @@ poll_for_signal() {
2423
2610
  # Dead pane detection during poll: check if claude/codex process died
2424
2611
  local poll_cmd
2425
2612
  poll_cmd=$(tmux display-message -p -t "$pane_id" '#{pane_current_command}' 2>/dev/null)
2426
- # Dead pane detection — delegates to check_dead_pane() for engine-aware logic
2427
- if check_dead_pane "$poll_cmd" "$WORKER_ENGINE" "$role"; then
2613
+ # Dead pane detection — delegates to check_dead_pane() for engine-aware logic.
2614
+ # D-10: pick the engine for the pane being polled, NOT always WORKER_ENGINE. In
2615
+ # a mixed-engine campaign (e.g. claude worker + codex verifier) the old code
2616
+ # judged the codex verifier's "bash" (codex's trigger shell) as DEAD using the
2617
+ # claude rule → false dead-pane → 3-strike → spurious BLOCK on a live verifier.
2618
+ # Derive from the role string (covers per-US, final, and consensus per-engine).
2619
+ local _dead_engine="$WORKER_ENGINE"
2620
+ if [[ "$role" == *codex* ]]; then _dead_engine="codex"
2621
+ elif [[ "$role" == *claude* ]]; then _dead_engine="claude"
2622
+ elif [[ "$role" == *inal* ]]; then _dead_engine="$FINAL_VERIFIER_ENGINE"
2623
+ elif [[ "$role" == *erifier* ]]; then _dead_engine="$VERIFIER_ENGINE"
2624
+ fi
2625
+ if check_dead_pane "$poll_cmd" "$_dead_engine" "$role"; then
2428
2626
  log " WARNING: $role pane $pane_id has bare shell ($poll_cmd) — process died during execution"
2429
2627
  log_debug "[GOV] iter=$ITERATION pane_dead_during_poll=true pane=$pane_id cmd=$poll_cmd role=$role"
2430
2628
  # Return failure so caller can handle recovery
@@ -2472,6 +2670,13 @@ run_single_verifier() {
2472
2670
  local model="$3" # model for this verifier
2473
2671
  local suffix="$4" # "-claude" or "-codex"
2474
2672
  local verdict_dest="$5" # where to copy the verdict file
2673
+ # D-1c (codex MEDIUM): claude reasoning effort for this verifier. Final
2674
+ # consensus passes FINAL_VERIFIER_EFFORT; per-US passes VERIFIER_EFFORT.
2675
+ # Defaults to VERIFIER_EFFORT so existing 5-arg callers are unchanged.
2676
+ # Single-dash (${6-...}, not ${6:-...}) so an explicitly-passed EMPTY effort
2677
+ # (e.g. final consensus with FINAL_VERIFIER_EFFORT unset) is preserved rather
2678
+ # than collapsing back to VERIFIER_EFFORT.
2679
+ local effort="${6-$VERIFIER_EFFORT}"
2475
2680
 
2476
2681
  # Write trigger for this engine
2477
2682
  write_verifier_trigger "$iter" "$engine" "$model" "$suffix"
@@ -2512,11 +2717,29 @@ run_single_verifier() {
2512
2717
  # Launch verifier — dispatch to engine-specific function
2513
2718
  local verifier_launch
2514
2719
  if [[ "$engine" = "codex" ]]; then
2515
- verifier_launch="${CODEX_BIN:-codex} -m $VERIFIER_CODEX_MODEL -c model_reasoning_effort=\"$VERIFIER_CODEX_REASONING\" --disable plugins --dangerously-bypass-approvals-and-sandbox"
2720
+ # D-1c: honor the passed-in model arg (consensus passes CONSENSUS_MODEL /
2721
+ # FINAL_CONSENSUS_MODEL as "model:reasoning") instead of always using the
2722
+ # global VERIFIER_CODEX_*; fall back to the globals when no model is given.
2723
+ local _cx_model="$VERIFIER_CODEX_MODEL" _cx_reason="$VERIFIER_CODEX_REASONING"
2724
+ if [[ -n "$model" && "$model" == *:* ]]; then
2725
+ # D-1c (codex LOW): validate "model:reasoning" before splitting. Reject an
2726
+ # empty model, an empty/unknown reasoning, or >1 colon (e.g. "gpt-5.5:",
2727
+ # ":medium", "foo:bar:baz") and fall back to the globals instead of
2728
+ # emitting a bad -m or empty reasoning_effort.
2729
+ local _m="${model%%:*}" _r="${model##*:}"
2730
+ if [[ -n "$_m" && "$model" != *:*:* && "$_r" == (minimal|low|medium|high|xhigh) ]]; then
2731
+ _cx_model="$_m"; _cx_reason="$_r"
2732
+ else
2733
+ log " WARNING: malformed consensus codex model '$model' — falling back to $_cx_model:$_cx_reason"
2734
+ fi
2735
+ elif [[ -n "$model" ]]; then
2736
+ _cx_model="$model"
2737
+ fi
2738
+ verifier_launch="${CODEX_BIN:-codex} -m $_cx_model -c model_reasoning_effort=\"$_cx_reason\" -c mcp_servers='{}' --disable plugins --dangerously-bypass-approvals-and-sandbox"
2516
2739
  launch_verifier_codex "$VERIFIER_PANE" "$prompt_file" "$iter" "$verifier_launch"
2517
- log_debug "Verifier$suffix codex TUI dispatched"
2740
+ log_debug "Verifier$suffix codex TUI dispatched (model=$_cx_model reasoning=$_cx_reason)"
2518
2741
  else
2519
- verifier_launch="$(build_claude_cmd tui "$model" "" "" "$VERIFIER_EFFORT")"
2742
+ verifier_launch="$(build_claude_cmd tui "$model" "" "" "$effort")"
2520
2743
  if ! launch_verifier_claude "$VERIFIER_PANE" "$prompt_file" "$iter" "$verifier_launch"; then
2521
2744
  log_error "Verifier$suffix failed to start"
2522
2745
  return 1
@@ -2566,12 +2789,17 @@ run_single_verifier() {
2566
2789
  else
2567
2790
  # Claude: use full poll_for_signal with heartbeat/nudge
2568
2791
  log " Polling for verify-verdict.json ($suffix)..."
2569
- if ! poll_for_signal "$VERDICT_FILE" "$VERIFIER_HEARTBEAT" "$VERIFIER_PANE" "$verifier_launch" "Verifier$suffix"; then
2570
- local verifier_poll_rc=$?
2792
+ # F-25: capture rc DIRECTLY (not inside `if ! cmd; then $?`, which yields the
2793
+ # if-statement status, not poll's rc — the same latent bug fixed at the main
2794
+ # verifier poll site). Keeps the rc==2 "sentinel already written" branch live.
2795
+ poll_for_signal "$VERDICT_FILE" "$VERIFIER_HEARTBEAT" "$VERIFIER_PANE" "$verifier_launch" "Verifier$suffix"
2796
+ local verifier_poll_rc=$?
2797
+ if (( verifier_poll_rc != 0 )); then
2571
2798
  if (( verifier_poll_rc == 2 )); then
2799
+ log_debug "[GOV] run_single_verifier poll hard-fail (rc=2, sentinel already written)"
2572
2800
  return 1
2573
2801
  fi
2574
- log_error "Verifier$suffix poll failed"
2802
+ log_error "Verifier$suffix poll failed (rc=$verifier_poll_rc)"
2575
2803
  return 1
2576
2804
  fi
2577
2805
  fi
@@ -2593,6 +2821,17 @@ run_single_verifier() {
2593
2821
  # --- Sequential final verify: run per-US scoped verifiers instead of one big ALL verify ---
2594
2822
  # Returns 0 if all US pass + integration check pass, 1 if any US fails, 2 if integration fails.
2595
2823
  # Sets FAILED_US global on failure.
2824
# D-16: succeed only when every US id in US_LIST is already present in
# VERIFIED_US. Used to arm leader-driven finalize after the last per-US pass.
#
# Globals:
#   US_LIST     (read) comma-separated US ids from the PRD
#   VERIFIED_US (read) comma-separated verified US ids
# Returns:
#   0 if US_LIST is non-empty and each of its ids appears in VERIFIED_US
#   1 if US_LIST is empty or at least one id is missing
_all_us_verified() {
  [[ -n "$US_LIST" ]] || return 1

  # Wrap the verified list in commas once so each id can be matched as a whole
  # ",id," token (US-1 must not match US-10).
  local _verified=",${VERIFIED_US:-},"
  local _us
  for _us in $(printf '%s' "$US_LIST" | tr ',' ' '); do
    # Quoted pattern segment = literal comparison. The previous grep treated
    # the id as a regex (so "US.1" matched "USx1") and forked two processes
    # per id.
    [[ "$_verified" == *",$_us,"* ]] || return 1
  done
  return 0
}
2834
+
2596
2835
  run_sequential_final_verify() {
2597
2836
  local iter="$1"
2598
2837
  FAILED_US=""
@@ -2621,29 +2860,54 @@ run_sequential_final_verify() {
2621
2860
  fi
2622
2861
  wait_for_pane_ready "$VERIFIER_PANE" 10 2>/dev/null || true
2623
2862
 
2624
- # Launch verifier
2863
+ # Launch verifier. D-1: the FINAL (ALL) verify uses FINAL_VERIFIER_* (the
2864
+ # "strict final" knob — a configured stronger model, e.g. opus, for the final
2865
+ # gate), NOT the lighter per-US VERIFIER_*. This is the configured-final-model
2866
+ # distinction, distinct from the removed per-iteration verifier auto-upgrade.
2625
2867
  local verifier_launch
2626
- if [[ "$VERIFIER_ENGINE" = "codex" ]]; then
2627
- verifier_launch="${CODEX_BIN:-codex} -m $VERIFIER_CODEX_MODEL -c model_reasoning_effort=\"$VERIFIER_CODEX_REASONING\" --disable plugins --dangerously-bypass-approvals-and-sandbox"
2868
+ if [[ "$FINAL_VERIFIER_ENGINE" = "codex" ]]; then
2869
+ verifier_launch="${CODEX_BIN:-codex} -m $FINAL_VERIFIER_CODEX_MODEL -c model_reasoning_effort=\"$FINAL_VERIFIER_CODEX_REASONING\" -c mcp_servers='{}' --disable plugins --dangerously-bypass-approvals-and-sandbox"
2628
2870
  launch_verifier_codex "$VERIFIER_PANE" "$verifier_prompt" "$iter" "$verifier_launch"
2629
2871
  else
2630
- verifier_launch="$(build_claude_cmd tui "$VERIFIER_MODEL" "" "" "$VERIFIER_EFFORT")"
2872
+ verifier_launch="$(build_claude_cmd tui "$FINAL_VERIFIER_MODEL" "" "" "$FINAL_VERIFIER_EFFORT")"
2631
2873
  launch_verifier_claude "$VERIFIER_PANE" "$verifier_prompt" "$iter" "$verifier_launch" || {
2632
- log_error "Failed to launch verifier for $us"
2874
+ log_error "Failed to launch final verifier for $us"
2633
2875
  FAILED_US="$us"
2634
2876
  return 1
2635
2877
  }
2636
2878
  fi
2637
2879
 
2638
- # Poll for verdict
2880
+ # Poll for verdict. D-4: distinguish rc==2 (hard-fail, sentinel already
2881
+ # written → terminal) from rc==1 (transient pane race/timeout) and give ONE
2882
+ # replace-pane + re-dispatch retry before failing the US — the F-10 retry
2883
+ # parity the per-US main verifier site has but this final-verify path lacked
2884
+ # (a single transient poll miss falsely failed a US at the most expensive
2885
+ # end-of-campaign moment, charging a bogus consecutive failure).
2639
2886
  rm -f "$VERDICT_FILE"
2640
2887
  local poll_rc=0
2641
2888
  poll_for_signal "$VERDICT_FILE" "$VERIFIER_HEARTBEAT" "$VERIFIER_PANE" "$verifier_launch" "Verifier-final" || poll_rc=$?
2642
- if (( poll_rc != 0 )); then
2643
- log_error "Verifier poll failed for $us (rc=$poll_rc)"
2889
+ if (( poll_rc == 2 )); then
2890
+ log_error "Verifier hard-fail (rc=2, sentinel written) for $us in final verify"
2644
2891
  FAILED_US="$us"
2645
2892
  return 1
2646
2893
  fi
2894
+ if (( poll_rc == 1 )); then
2895
+ log " Verifier-final transient poll fail for $us — replacing pane + retrying once (D-4)"
2896
+ replace_worker_pane "$VERIFIER_PANE" "verifier"
2897
+ VERIFIER_PANE=$(jq -r '.panes.verifier' "$SESSION_CONFIG")
2898
+ if [[ "$FINAL_VERIFIER_ENGINE" = "codex" ]]; then
2899
+ launch_verifier_codex "$VERIFIER_PANE" "$verifier_prompt" "$iter" "$verifier_launch"
2900
+ else
2901
+ launch_verifier_claude "$VERIFIER_PANE" "$verifier_prompt" "$iter" "$verifier_launch" || { FAILED_US="$us"; return 1; }
2902
+ fi
2903
+ rm -f "$VERDICT_FILE"; poll_rc=0
2904
+ poll_for_signal "$VERDICT_FILE" "$VERIFIER_HEARTBEAT" "$VERIFIER_PANE" "$verifier_launch" "Verifier-final" || poll_rc=$?
2905
+ if (( poll_rc != 0 )); then
2906
+ log_error "Verifier poll failed for $us after replace+retry (rc=$poll_rc)"
2907
+ FAILED_US="$us"
2908
+ return 1
2909
+ fi
2910
+ fi
2647
2911
 
2648
2912
  # Bug #7 Fix-Q/R: reap verifier pane between per-US final verifications so
2649
2913
  # the previous codex/claude TUI cannot continue running while the next per-
@@ -2699,6 +2963,26 @@ _should_use_consensus() {
2699
2963
  # --- US-004: Run consensus verification (claude + codex sequentially) ---
2700
2964
  run_consensus_verification() {
2701
2965
  local iter="$1"
2966
+ # D-15: the US under consensus (for the merged verdict's us_id, so the D-3
2967
+ # cross-check applies to consensus too). Falls back to the caller's local
2968
+ # signal_us_id (zsh dynamic scope) then ALL.
2969
+ local cons_us_id="${2:-${signal_us_id:-ALL}}"
2970
+ # D-15 fix: us_id is interpolated into the merged-verdict JSON via echo, so make it
2971
+ # JSON-safe. It is always "ALL" or "US-<digits>"; anything else → ALL (a value
2972
+ # with a quote/backslash/control char would otherwise produce invalid JSON).
2973
+ [[ "$cons_us_id" == (ALL|US-<->) ]] || cons_us_id="ALL"
2974
+ # D-1c: wire the documented consensus cross-verifier model knobs. Primary
2975
+ # (claude) uses VERIFIER_MODEL/FINAL_VERIFIER_MODEL; cross (codex) uses
2976
+ # CONSENSUS_MODEL/FINAL_CONSENSUS_MODEL ("model:reasoning"). Final (ALL)
2977
+ # picks the stricter pair; per-US picks the lighter pair.
2978
+ local _cons_claude_model _cons_codex_model _cons_claude_effort
2979
+ if [[ "$cons_us_id" == "ALL" ]]; then
2980
+ _cons_claude_model="$FINAL_VERIFIER_MODEL"; _cons_codex_model="$FINAL_CONSENSUS_MODEL"
2981
+ _cons_claude_effort="$FINAL_VERIFIER_EFFORT" # codex MEDIUM: final claude effort
2982
+ else
2983
+ _cons_claude_model="$VERIFIER_MODEL"; _cons_codex_model="$CONSENSUS_MODEL"
2984
+ _cons_claude_effort="$VERIFIER_EFFORT"
2985
+ fi
2702
2986
  local claude_verdict_file="$LOGS_DIR/iter-$(printf '%03d' $iter).verify-verdict-claude.json"
2703
2987
  local codex_verdict_file="$LOGS_DIR/iter-$(printf '%03d' $iter).verify-verdict-codex.json"
2704
2988
 
@@ -2712,7 +2996,7 @@ run_consensus_verification() {
2712
2996
 
2713
2997
  # Run claude verifier first
2714
2998
  local _claude_t0=$(date +%s)
2715
- if ! run_single_verifier "$iter" "claude" "$VERIFIER_MODEL" "-claude" "$claude_verdict_file"; then
2999
+ if ! run_single_verifier "$iter" "claude" "$_cons_claude_model" "-claude" "$claude_verdict_file" "$_cons_claude_effort"; then
2716
3000
  log_error "Claude verifier failed in consensus round $CONSENSUS_ROUND"
2717
3001
  return 1
2718
3002
  fi
@@ -2723,7 +3007,7 @@ run_consensus_verification() {
2723
3007
  log " WARNING: Claude verdict is '$CLAUDE_VERDICT' — likely interrupted. Retrying claude verifier..."
2724
3008
  log_debug "[GOV] iter=$iter phase=consensus_claude_retry reason=null_verdict"
2725
3009
  rm -f "$claude_verdict_file" 2>/dev/null
2726
- if ! run_single_verifier "$iter" "claude" "$VERIFIER_MODEL" "-claude" "$claude_verdict_file"; then
3010
+ if ! run_single_verifier "$iter" "claude" "$_cons_claude_model" "-claude" "$claude_verdict_file" "$_cons_claude_effort"; then
2727
3011
  log_error "Claude verifier retry also failed"
2728
3012
  return 1
2729
3013
  fi
@@ -2733,19 +3017,36 @@ run_consensus_verification() {
2733
3017
  return 1
2734
3018
  fi
2735
3019
  fi
2736
- log_debug "[GOV] iter=$iter phase=consensus_claude verdict=$CLAUDE_VERDICT model=$VERIFIER_MODEL"
3020
+ log_debug "[GOV] iter=$iter phase=consensus_claude verdict=$CLAUDE_VERDICT model=$_cons_claude_model"
2737
3021
 
2738
3022
  # consensus-fail-fast removed (complexity vs value too low)
2739
3023
 
2740
3024
  # Run codex verifier second
2741
3025
  local _codex_t0=$(date +%s)
2742
- if ! run_single_verifier "$iter" "codex" "$VERIFIER_CODEX_MODEL" "-codex" "$codex_verdict_file"; then
3026
+ if ! run_single_verifier "$iter" "codex" "$_cons_codex_model" "-codex" "$codex_verdict_file"; then
2743
3027
  log_error "Codex verifier failed in consensus round $CONSENSUS_ROUND"
2744
3028
  return 1
2745
3029
  fi
2746
3030
  ITER_VERIFIER_CODEX_DURATION_S=$(( $(date +%s) - _codex_t0 ))
2747
3031
  CODEX_VERDICT=$(jq -r '.verdict' "$codex_verdict_file" 2>/dev/null)
2748
- log_debug "[GOV] iter=$iter phase=consensus_codex verdict=$CODEX_VERDICT model=$VERIFIER_CODEX_MODEL reasoning=$VERIFIER_CODEX_REASONING"
3032
+ # D-14: validate codex verdict is not null/empty — retry once (symmetry with the
3033
+ # claude null-retry above). A transient codex interruption otherwise counts as a
3034
+ # non-pass, burns a consensus round, and can BLOCK after 6 rounds.
3035
+ if [[ -z "$CODEX_VERDICT" || "$CODEX_VERDICT" == "null" ]]; then
3036
+ log " WARNING: Codex verdict is '$CODEX_VERDICT' — likely interrupted. Retrying codex verifier..."
3037
+ log_debug "[GOV] iter=$iter phase=consensus_codex_retry reason=null_verdict"
3038
+ rm -f "$codex_verdict_file" 2>/dev/null
3039
+ if ! run_single_verifier "$iter" "codex" "$_cons_codex_model" "-codex" "$codex_verdict_file"; then
3040
+ log_error "Codex verifier retry also failed"
3041
+ return 1
3042
+ fi
3043
+ CODEX_VERDICT=$(jq -r '.verdict' "$codex_verdict_file" 2>/dev/null)
3044
+ if [[ -z "$CODEX_VERDICT" || "$CODEX_VERDICT" == "null" ]]; then
3045
+ log_error "Codex verdict still null after retry — consensus cannot proceed"
3046
+ return 1
3047
+ fi
3048
+ fi
3049
+ log_debug "[GOV] iter=$iter phase=consensus_codex verdict=$CODEX_VERDICT model=$_cons_codex_model"
2749
3050
 
2750
3051
  log " Consensus: claude=$CLAUDE_VERDICT codex=$CODEX_VERDICT"
2751
3052
  local _combined_action="retry"
@@ -2760,6 +3061,7 @@ run_consensus_verification() {
2760
3061
  {
2761
3062
  echo '{'
2762
3063
  echo ' "verdict": "pass",'
3064
+ echo ' "us_id": "'"$cons_us_id"'",'
2763
3065
  echo ' "verified_at_utc": "'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'",'
2764
3066
  echo ' "summary": "Consensus PASS: both claude and codex verified independently",'
2765
3067
  echo ' "recommended_state_transition": "complete",'
@@ -2852,51 +3154,38 @@ run_consensus_verification() {
2852
3154
  # =============================================================================
2853
3155
 
2854
3156
  main() {
2855
- # --- US-026 R14 P0: project-scoped runner lockfile (mkdir atomic) ---
2856
- # Prevents duplicate runners on the same project root regardless of slug.
2857
- # Different ROOT_HASH allows independent parallel runners across projects.
3157
+ # --- US-026 R14 P0: project-scoped runner lock (per-ROOT, regardless of slug) ---
3158
+ # D-9: delegate to acquire_slug_lock — the F-20-proven, race-safe primitive where
3159
+ # the PID *is* the lock (`set -C` atomic create writes the pid in one redirect),
3160
+ # so there is NO acquire/pid-write gap. The previous dir-based design (mkdir a dir
3161
+ # + a separate pid file) had a fundamental gap between acquiring the dir and
3162
+ # writing the pid that a recovery mutex alone could not close (codex D-9 R2).
3163
+ # Metadata (slug/root) goes to a sidecar for the duplicate message + audit.
3164
+ # Different ROOT_HASH → independent parallel runners across projects.
2858
3165
  mkdir -p "$(dirname "$RUNNER_LOCKFILE_PATH")" 2>/dev/null
2859
- if ! mkdir "$RUNNER_LOCKDIR" 2>/dev/null; then
3166
+ if acquire_slug_lock "$RUNNER_LOCKFILE_PATH"; then
3167
+ printf '{"pid":%s,"slug":"%s","root":"%s","started_at":"%s"}\n' \
3168
+ "$$" "$SLUG" "$ROOT" "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "${RUNNER_LOCKFILE_PATH}.meta" 2>/dev/null
3169
+ else
2860
3170
  local existing existing_slug
2861
- existing=$(jq -r '.pid' "$RUNNER_LOCKFILE_PATH" 2>/dev/null || echo 0)
2862
- existing_slug=$(jq -r '.slug // "unknown"' "$RUNNER_LOCKFILE_PATH" 2>/dev/null || echo unknown)
2863
- if [[ "$existing" -gt 0 ]] && kill -0 "$existing" 2>/dev/null; then
2864
- echo "duplicate rlp-desk runner detected on this project root. existing pid=$existing slug=$existing_slug, this attempt slug=$SLUG. exiting." >&2
2865
- echo " Recover with: rm -rf '$RUNNER_LOCKDIR' '$RUNNER_LOCKFILE_PATH' (only if pid $existing is confirmed dead)" >&2
2866
- exit 1
2867
- fi
2868
- rm -rf "$RUNNER_LOCKDIR"
2869
- mkdir "$RUNNER_LOCKDIR" 2>/dev/null || {
2870
- echo "failed to acquire runner lock after stale cleanup; another wrapper raced ahead. exit 1" >&2
2871
- exit 1
2872
- }
2873
- echo "stale runner lockfile cleaned (pid $existing dead) — acquired" >&2
3171
+ existing=$(cat "$RUNNER_LOCKFILE_PATH" 2>/dev/null)
3172
+ existing_slug=$(jq -r '.slug // "unknown"' "${RUNNER_LOCKFILE_PATH}.meta" 2>/dev/null || echo unknown)
3173
+ echo "duplicate rlp-desk runner detected on this project root. existing pid=${existing:-unknown} slug=$existing_slug, this attempt slug=$SLUG. exiting." >&2
3174
+ echo " Recover with: rm -f '$RUNNER_LOCKFILE_PATH' '${RUNNER_LOCKFILE_PATH}.meta' && rm -rf '${RUNNER_LOCKFILE_PATH}.recovery.d' (only if pid ${existing:-?} is confirmed dead)" >&2
3175
+ exit 1
2874
3176
  fi
2875
- printf '{"pid":%s,"slug":"%s","root":"%s","started_at":"%s"}\n' \
2876
- "$$" "$SLUG" "$ROOT" "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$RUNNER_LOCKFILE_PATH"
2877
3177
 
2878
- # --- Lockfile: prevent duplicate execution ---
2879
- local lockfile="$LOCKFILE_PATH"
2880
- mkdir -p "$(dirname "$lockfile")" 2>/dev/null
2881
- if ! (set -C; echo $$ > "$lockfile") 2>/dev/null; then
2882
- local lock_pid
2883
- lock_pid=$(cat "$lockfile" 2>/dev/null)
2884
- if kill -0 "$lock_pid" 2>/dev/null; then
2885
- log_error "Another instance is already running (PID $lock_pid). Kill $lock_pid or rm $lockfile"
2886
- exit 1
2887
- fi
2888
- # Stale lock — overwrite.
2889
- # NOTE (ZSH-4, deferred): a fully race-safe stale-lock recovery is a separate
2890
- # distributed-lock redesign (codex review found subtle rm/create + mutex-leak
2891
- # races in patch attempts). This finding is LOW: the outer RUNNER_LOCKDIR mkdir
2892
- # lock (keyed on the same $ROOT) already serializes runners before this inner
2893
- # path is reached, so the inner race is unreachable in practice. Left at the
2894
- # tested baseline pending a dedicated redesign.
2895
- log "Stale lock detected (PID ${lock_pid:-unknown} not running), recovering"
2896
- echo $$ > "$lockfile"
3178
+ # --- Lockfile: prevent duplicate execution (ZSH-4 race-safe, v0.17.1) ---
3179
+ # Delegates to acquire_slug_lock (lib_ralph_desk.zsh): atomic set -C fast path +
3180
+ # mkdir-mutex-serialized, PID-reaped stale recovery. Race-safe vs concurrent
3181
+ # recoverers, gap-starters, and a crashed-recoverer mutex leak.
3182
+ if acquire_slug_lock "$LOCKFILE_PATH"; then
2897
3183
  LOCKFILE_ACQUIRED=1
2898
3184
  else
2899
- LOCKFILE_ACQUIRED=1
3185
+ local lock_pid
3186
+ lock_pid=$(cat "$LOCKFILE_PATH" 2>/dev/null)
3187
+ log_error "Another instance is already running or won the lock race (PID ${lock_pid:-unknown}). Kill it or rm $LOCKFILE_PATH"
3188
+ exit 1
2900
3189
  fi
2901
3190
  # US-023 R11 P2-K: chain `_emit_final_cost_log` so cost-log.jsonl is never silently empty on exit.
2902
3191
  trap '_emit_final_cost_log; cleanup' EXIT INT TERM
@@ -2994,19 +3283,25 @@ main() {
2994
3283
  US_LIST=$(grep -oE 'US-[0-9]+' "$prd_file" | sort -u | tr '\n' ',' | sed 's/,$//')
2995
3284
  fi
2996
3285
 
2997
- # Initialize VERIFIED_US from memory's Completed Stories (carry over previous runs)
2998
- local memory_file="$DESK/memos/${SLUG}-memory.md"
2999
- if [[ -f "$memory_file" ]]; then
3000
- local completed_us
3001
- completed_us=$(sed -n '/^## Completed Stories$/,/^## /p' "$memory_file" 2>/dev/null | grep '^- US-' | sed 's/^- \(US-[0-9]*\):.*/\1/' | sort -u | tr '\n' ',' | sed 's/,$//')
3002
- if [[ -n "$completed_us" ]]; then
3003
- VERIFIED_US="$completed_us"
3004
- log " Loaded completed stories from memory: $VERIFIED_US"
3005
- log_debug "[FLOW] loaded_verified_us_from_memory=$VERIFIED_US"
3006
- fi
3286
+ # F-14 + status.json promotion (Item-4): VERIFIED_US restore precedence,
3287
+ # most-durable first —
3288
+ # 1. durable append-only ledger (leader-written, structured)
3289
+ # 2. status.json verified_us (leader serialization written EVERY phase by
3290
+ # update_status — structured, reliable; promoted ABOVE the prose parse)
3291
+ # 3. the Worker's prose "## Completed Stories" — LAST resort (fresh-context
3292
+ # LLM output that can drift; only legacy campaigns without 1 or 2 use it).
3293
+ if [[ -f "$VERIFIED_LEDGER" ]]; then
3294
+ local ledger_verified
3295
+ ledger_verified=$(jq -rR 'fromjson? | .us_id // empty' "$VERIFIED_LEDGER" 2>/dev/null | grep -E '^US-[0-9]+$' | sort -u | tr '\n' ',' | sed 's/,$//')
3296
+ if [[ -n "$ledger_verified" ]]; then
3297
+ VERIFIED_US="$ledger_verified"
3298
+ log " Restored verified_us from durable ledger: $VERIFIED_US"
3299
+ log_debug "[FLOW] restored_verified_us_from_ledger=$VERIFIED_US"
3007
3300
  fi
3301
+ fi
3008
3302
 
3009
- # D1: Fallback — restore verified_us from status.json if memory had none
3303
+ # 2nd source: status.json verified_us — structured leader serialization,
3304
+ # more reliable than the prose parse below (Item-4: promoted above prose).
3010
3305
  if [[ -z "$VERIFIED_US" && -f "$STATUS_FILE" ]]; then
3011
3306
  local status_verified
3012
3307
  status_verified=$(jq -r '.verified_us // [] | join(",")' "$STATUS_FILE" 2>/dev/null)
@@ -3016,6 +3311,82 @@ main() {
3016
3311
  log_debug "[FLOW] restored_verified_us_from_status=$VERIFIED_US"
3017
3312
  fi
3018
3313
  fi
3314
+
3315
+ # LAST resort: the Worker's prose "## Completed Stories" (drift-prone; legacy).
3316
+ local memory_file="$DESK/memos/${SLUG}-memory.md"
3317
+ if [[ -z "$VERIFIED_US" && -f "$memory_file" ]]; then
3318
+ local completed_us
3319
+ completed_us=$(sed -n '/^## Completed Stories$/,/^## /p' "$memory_file" 2>/dev/null | grep '^- US-' | sed 's/^- \(US-[0-9]*\):.*/\1/' | sort -u | tr '\n' ',' | sed 's/,$//')
3320
+ if [[ -n "$completed_us" ]]; then
3321
+ VERIFIED_US="$completed_us"
3322
+ log " Loaded completed stories from memory (last-resort prose): $VERIFIED_US"
3323
+ log_debug "[FLOW] loaded_verified_us_from_memory=$VERIFIED_US"
3324
+ fi
3325
+ fi
3326
+
3327
+ fi
3328
+
3329
+ # F-13 (batch-safe): restore the circuit-breaker counter on relaunch. This runs
3330
+ # OUTSIDE the per-us block above because consecutive_failures is meaningful in
3331
+ # EVERY verify mode — a batch-mode campaign crash-loops the same way, so nesting
3332
+ # the restore under `per-us` let a batch relaunch reset its CB to 0 and evade the
3333
+ # breaker. status.json persists the counter (lib_ralph_desk.zsh) every phase;
3334
+ # only verified_us was ever read back. (verified_us restore stays per-us: batch
3335
+ # mode has no per-US progress to rehydrate.) Normal reset-on-progress applies.
3336
+ if [[ -f "$STATUS_FILE" ]]; then
3337
+ local _status_cf
3338
+ _status_cf=$(jq -r '.consecutive_failures // 0' "$STATUS_FILE" 2>/dev/null)
3339
+ if [[ "$_status_cf" == <-> && "$_status_cf" -gt 0 ]]; then
3340
+ CONSECUTIVE_FAILURES="$_status_cf"
3341
+ log " Restored consecutive_failures from status.json: $CONSECUTIVE_FAILURES"
3342
+ log_debug "[FLOW] restored_consecutive_failures_from_status=$CONSECUTIVE_FAILURES"
3343
+ fi
3344
+ # D-5: also restore the consecutive-BLOCKS state so the now-live block CB
3345
+ # (F-22) survives a relaunch — otherwise a crash-loop resets it to 0 every
3346
+ # relaunch and the block breaker is evadable (the same durability hole F-13
3347
+ # closed for consecutive_failures). Restore last_block_reason too, else a
3348
+ # restored count is immediately reset on the next block (reason wouldn't match).
3349
+ local _status_cb _status_lbr
3350
+ _status_cb=$(jq -r '.consecutive_blocks // 0' "$STATUS_FILE" 2>/dev/null)
3351
+ _status_lbr=$(jq -r '.last_block_reason // ""' "$STATUS_FILE" 2>/dev/null)
3352
+ # D-5 fix: restore ATOMICALLY — both the count AND the reason, or neither. A
3353
+ # count without its reason is a useless half-state (the next block's reason
3354
+ # wouldn't match the empty LAST_BLOCK_REASON and would reset the count to 1
3355
+ # anyway), so require both to be present before applying.
3356
+ if [[ "$_status_cb" == <-> && "$_status_cb" -gt 0 && -n "$_status_lbr" ]]; then
3357
+ CONSECUTIVE_BLOCKS="$_status_cb"
3358
+ LAST_BLOCK_REASON="$_status_lbr"
3359
+ log " Restored consecutive_blocks from status.json: $CONSECUTIVE_BLOCKS"
3360
+ log_debug "[FLOW] restored_consecutive_blocks_from_status=$CONSECUTIVE_BLOCKS"
3361
+ fi
3362
+ # D-5b (restore-priority, user-chosen): if the Worker was AUTO-upgraded during a
3363
+ # prior segment (model_upgraded==1), restore the upgraded model + its engine
3364
+ # triple + the upgrade bookkeeping, so a crash-relaunch resumes at the upgraded
3365
+ # model (and the architecture-escalation trigger survives) instead of silently
3366
+ # reverting to the base model and re-spending iterations to re-upgrade. Gated on
3367
+ # model_upgraded==1 so it ONLY overrides for the auto-upgrade case (a fresh
3368
+ # campaign that never upgraded keeps the env/CLI model).
3369
+ local _status_mu
3370
+ _status_mu=$(jq -r '.model_upgraded // 0' "$STATUS_FILE" 2>/dev/null)
3371
+ if [[ "$_status_mu" == "1" ]]; then
3372
+ local _s_wm _s_we _s_wcm _s_wcr _s_owm _s_sufc
3373
+ _s_wm=$(jq -r '.worker_model // empty' "$STATUS_FILE" 2>/dev/null)
3374
+ _s_we=$(jq -r '.worker_engine // empty' "$STATUS_FILE" 2>/dev/null)
3375
+ _s_wcm=$(jq -r '.worker_codex_model // empty' "$STATUS_FILE" 2>/dev/null)
3376
+ _s_wcr=$(jq -r '.worker_codex_reasoning // empty' "$STATUS_FILE" 2>/dev/null)
3377
+ _s_owm=$(jq -r '.original_worker_model // empty' "$STATUS_FILE" 2>/dev/null)
3378
+ _s_sufc=$(jq -r '.same_us_fail_count // 0' "$STATUS_FILE" 2>/dev/null)
3379
+ if [[ -n "$_s_wm" && -n "$_s_we" ]]; then
3380
+ _MODEL_UPGRADED=1
3381
+ WORKER_MODEL="$_s_wm"; WORKER_ENGINE="$_s_we"
3382
+ [[ -n "$_s_wcm" ]] && WORKER_CODEX_MODEL="$_s_wcm"
3383
+ [[ -n "$_s_wcr" ]] && WORKER_CODEX_REASONING="$_s_wcr"
3384
+ [[ -n "$_s_owm" ]] && _ORIGINAL_WORKER_MODEL="$_s_owm"
3385
+ [[ "$_s_sufc" == <-> ]] && _SAME_US_FAIL_COUNT="$_s_sufc"
3386
+ log " Restored auto-upgraded Worker model: $WORKER_MODEL ($WORKER_ENGINE), orig=${_ORIGINAL_WORKER_MODEL:-?}, same_us_fails=$_SAME_US_FAIL_COUNT (D-5b restore-priority)"
3387
+ log_debug "[FLOW] restored_model_upgrade=true worker_model=$WORKER_MODEL engine=$WORKER_ENGINE same_us_fail=$_SAME_US_FAIL_COUNT"
3388
+ fi
3389
+ fi
3019
3390
  fi
3020
3391
 
3021
3392
  # Initialize PRD snapshot state for live update detection
@@ -3028,6 +3399,16 @@ main() {
3028
3399
  # Print security warning (governance.md s7: --dangerously-skip-permissions)
3029
3400
  print_security_warning
3030
3401
 
3402
+ # F-8 scope guard (F-19): snapshot the tracked files that are ALREADY dirty
3403
+ # before the campaign touches anything. The F-8 leader-recovery auto-commit
3404
+ # (Bug #8 Gate 3) must commit only the Worker's OWN edits and never sweep an
3405
+ # operator's pre-existing uncommitted work into a Worker-recovery commit.
3406
+ # `git diff --name-only HEAD` lists tracked files modified vs HEAD (staged or
3407
+ # not); untracked cruft is excluded and is never auto-committed. Empty when the
3408
+ # tree starts clean. Recorded once; excluded at recovery time in Gate 3.
3409
+ typeset -g CAMPAIGN_PREEXISTING_DIRTY
3410
+ CAMPAIGN_PREEXISTING_DIRTY=$(git -C "$ROOT" diff --name-only HEAD 2>/dev/null)
3411
+
3031
3412
  # Validate scaffold
3032
3413
  validate_scaffold
3033
3414
 
@@ -3120,6 +3501,28 @@ main() {
3120
3501
  fi
3121
3502
  fi
3122
3503
 
3504
+ # D-16: leader-driven finalize. The previous iteration's last per-US pass
3505
+ # completed coverage and armed _FINALIZE_PENDING instead of dispatching a
3506
+ # worker round-trip to emit an ALL signal. Synthesize that ALL verify signal
3507
+ # ourselves and skip the worker; the existing verify path (signal_us_id=ALL →
3508
+ # run_sequential_final_verify) handles completion AND the fix-loop on failure.
3509
+ # Operator recovery (PR-A) takes precedence — only finalize if it did not claim
3510
+ # this iteration. A crash before this point loses the flag and safely falls
3511
+ # back to the worker round-trip (the pre-D-16 path).
3512
+ if (( _FINALIZE_PENDING )) && [[ "$SKIP_NEXT_WORKER" -eq 0 ]]; then
3513
+ _FINALIZE_PENDING=0
3514
+ log " Leader finalize (D-16): all US verified ($VERIFIED_US) — synthesizing ALL verify signal, skipping worker round-trip."
3515
+ log_debug "[FLOW] iter=$ITERATION d16_finalize=true verified_us=$VERIFIED_US"
3516
+ printf '{"iteration": %d, "status": "verify", "us_id": "ALL", "summary": "leader finalize (D-16: all per-US verified)", "timestamp": "%s"}\n' \
3517
+ "$ITERATION" "$(date -u +%Y-%m-%dT%H:%M:%SZ)" | atomic_write "$SIGNAL_FILE"
3518
+ update_status "verify" "running"
3519
+ SKIP_NEXT_WORKER=1
3520
+ else
3521
+ # Any normally-dispatched iteration clears a stale arm (defensive; the flag
3522
+ # is consumed above on the immediately-following iteration in practice).
3523
+ _FINALIZE_PENDING=0
3524
+ fi
3525
+
3123
3526
  if (( ! SKIP_NEXT_WORKER )); then
3124
3527
  # --- governance.md s7 step 8 (cleanup): Clean previous iteration signals ---
3125
3528
  # Bug #7 Fix-R cleanup: unlock 0o444 sentinels written by the previous
@@ -3166,19 +3569,34 @@ main() {
3166
3569
  # --- governance.md s7 step 5: Execute Worker (dispatched to engine-specific function) ---
3167
3570
  log_debug "[FLOW] iter=$ITERATION phase=worker engine=$WORKER_ENGINE model=$WORKER_MODEL dispatched=true"
3168
3571
 
3572
+ # F-11: a pane-start failure is usually the transient F6.1 spawn race
3573
+ # (send-keys before the pane's shell is ready). Replace the pane and retry
3574
+ # ONCE before BLOCKing, instead of terminating the campaign on a transient.
3169
3575
  if [[ "$WORKER_ENGINE" = "codex" ]]; then
3170
- worker_launch="${CODEX_BIN:-codex} -m $WORKER_CODEX_MODEL -c model_reasoning_effort=\"$WORKER_CODEX_REASONING\" --disable plugins --dangerously-bypass-approvals-and-sandbox"
3576
+ worker_launch="${CODEX_BIN:-codex} -m $WORKER_CODEX_MODEL -c model_reasoning_effort=\"$WORKER_CODEX_REASONING\" -c mcp_servers='{}' --disable plugins --dangerously-bypass-approvals-and-sandbox"
3171
3577
  if ! launch_worker_codex "$WORKER_PANE" "$worker_prompt" "$ITERATION" "$worker_launch"; then
3172
- write_blocked_sentinel "Worker codex failed to start in pane" "" "infra_failure"
3173
- update_status "blocked" "worker_start_failed"
3174
- return 1
3578
+ log " Worker codex failed to start replacing pane and retrying once (F-11)."
3579
+ log_debug "[GOV] iter=$ITERATION worker_start_failed=true action=replace_retry engine=codex"
3580
+ replace_worker_pane "$WORKER_PANE" "worker"
3581
+ WORKER_PANE=$(jq -r '.panes.worker' "$SESSION_CONFIG")
3582
+ if ! launch_worker_codex "$WORKER_PANE" "$worker_prompt" "$ITERATION" "$worker_launch"; then
3583
+ write_blocked_sentinel "Worker codex failed to start in pane after replace+retry" "" "infra_failure"
3584
+ update_status "blocked" "worker_start_failed"
3585
+ return 1
3586
+ fi
3175
3587
  fi
3176
3588
  else
3177
3589
  worker_launch="$(build_claude_cmd tui "$WORKER_MODEL" "" "" "$WORKER_EFFORT")"
3178
3590
  if ! launch_worker_claude "$WORKER_PANE" "$worker_prompt" "$ITERATION" "$worker_launch"; then
3179
- write_blocked_sentinel "Worker claude failed to start in pane" "" "infra_failure"
3180
- update_status "blocked" "worker_start_failed"
3181
- return 1
3591
+ log " Worker claude failed to start replacing pane and retrying once (F-11)."
3592
+ log_debug "[GOV] iter=$ITERATION worker_start_failed=true action=replace_retry engine=claude"
3593
+ replace_worker_pane "$WORKER_PANE" "worker"
3594
+ WORKER_PANE=$(jq -r '.panes.worker' "$SESSION_CONFIG")
3595
+ if ! launch_worker_claude "$WORKER_PANE" "$worker_prompt" "$ITERATION" "$worker_launch"; then
3596
+ write_blocked_sentinel "Worker claude failed to start in pane after replace+retry" "" "infra_failure"
3597
+ update_status "blocked" "worker_start_failed"
3598
+ return 1
3599
+ fi
3182
3600
  fi
3183
3601
  fi
3184
3602
  else
@@ -3299,12 +3717,25 @@ main() {
3299
3717
  local vp_count
3300
3718
  vp_count=$(jq -r '.verified_acs // [] | length' "$SIGNAL_FILE" 2>/dev/null || echo 0)
3301
3719
  if [[ "$vp_count" -eq 0 ]]; then
3302
- log " Worker signal verify_partial but verified_acs is empty — downgrading to blocked (verify_partial_malformed)."
3720
+ # F-12: a Worker formatting slip (verify_partial with empty verified_acs)
3721
+ # is recoverable — route it back to the Worker as a soft-fail BOUNDED by
3722
+ # the consecutive-failure circuit breaker, instead of a terminal
3723
+ # mission_abort that ends the whole campaign on a single malformed signal.
3724
+ # A fresh-context Worker that keeps malforming still trips the CB and
3725
+ # blocks; one slip just costs an iteration.
3303
3726
  local vp_us_id
3304
3727
  vp_us_id=$(jq -r '.us_id // empty' "$SIGNAL_FILE" 2>/dev/null)
3305
- write_blocked_sentinel "verify_partial_malformed: empty verified_acs" "${vp_us_id:-${CURRENT_US:-ALL}}" "mission_abort"
3306
- update_status "blocked" "verify_partial_malformed"
3307
- break
3728
+ (( CONSECUTIVE_FAILURES++ ))
3729
+ log " Worker verify_partial malformed (empty verified_acs) — soft-fail retry $CONSECUTIVE_FAILURES/$EFFECTIVE_CB_THRESHOLD (bounded by CB)."
3730
+ log_debug "[GOV] iter=$ITERATION verify_partial_malformed=soft_fail consecutive_failures=$CONSECUTIVE_FAILURES threshold=$EFFECTIVE_CB_THRESHOLD"
3731
+ update_status "worker" "verify_partial_malformed_retry"
3732
+ if (( CONSECUTIVE_FAILURES >= EFFECTIVE_CB_THRESHOLD )); then
3733
+ log_error " verify_partial_malformed repeated $CONSECUTIVE_FAILURES times (>= $EFFECTIVE_CB_THRESHOLD) — blocking."
3734
+ write_blocked_sentinel "verify_partial_malformed repeated $CONSECUTIVE_FAILURES times" "${vp_us_id:-${CURRENT_US:-ALL}}" "repeat_axis"
3735
+ update_status "blocked" "verify_partial_malformed_cb"
3736
+ break
3737
+ fi
3738
+ continue
3308
3739
  fi
3309
3740
  log " Worker signal verify_partial (verified_acs count=$vp_count). Routing to verify path."
3310
3741
  signal_status="verify"
@@ -3314,6 +3745,13 @@ main() {
3314
3745
  # Read us_id from signal for per-US scoping
3315
3746
  local signal_us_id=""
3316
3747
  signal_us_id=$(jq -r '.us_id // empty' "$SIGNAL_FILE" 2>/dev/null)
3748
+ # F-23: normalize case so a Worker emitting "all"/"All" still triggers the
3749
+ # final/ALL verify + completion paths (which match "ALL" exactly). US ids
3750
+ # are already uppercase ("US-001"), so this is a no-op for well-formed ids.
3751
+ signal_us_id="${signal_us_id:u}"
3752
+ # D-11: the US under verification is the in-flight US for lifecycle sentinels
3753
+ # fired during the verify poll (no-progress / stall / R12).
3754
+ [[ -n "$signal_us_id" ]] && CURRENT_US="$signal_us_id"
3317
3755
  log " Worker claims done (us_id=${signal_us_id:-all}). Dispatching Verifier..."
3318
3756
 
3319
3757
  # AC1: capture verifier start timestamp
@@ -3348,7 +3786,7 @@ main() {
3348
3786
  if (( use_consensus )); then
3349
3787
  # US-004: Run consensus verification (claude + codex sequentially)
3350
3788
  local consensus_rc=0
3351
- run_consensus_verification "$ITERATION" || consensus_rc=$?
3789
+ run_consensus_verification "$ITERATION" "$signal_us_id" || consensus_rc=$?
3352
3790
 
3353
3791
  if (( consensus_rc == 2 )); then
3354
3792
  # Consensus disagreement — treat as fail, fix loop will handle
@@ -3389,15 +3827,35 @@ main() {
3389
3827
  fi
3390
3828
  wait_for_pane_ready "$VERIFIER_PANE" 10 2>/dev/null || true
3391
3829
 
3830
+ # D-1: a final/ALL verify reaching the single-engine path (batch mode, or
3831
+ # any ALL verify not handled by the per-us sequential path) uses the
3832
+ # stronger FINAL_VERIFIER_*; per-US verifies keep the lighter VERIFIER_*.
3833
+ # For signal_us_id != ALL, _v_* alias VERIFIER_* EXACTLY — no behavior
3834
+ # change on the per-US hot path.
3835
+ local _v_eng _v_model _v_cxm _v_cxr _v_eff _v_role
3836
+ if [[ "$signal_us_id" == "ALL" ]]; then
3837
+ _v_eng="$FINAL_VERIFIER_ENGINE"; _v_model="$FINAL_VERIFIER_MODEL"
3838
+ _v_cxm="$FINAL_VERIFIER_CODEX_MODEL"; _v_cxr="$FINAL_VERIFIER_CODEX_REASONING"; _v_eff="$FINAL_VERIFIER_EFFORT"
3839
+ # D-10 fix: an ALL verify here runs FINAL_VERIFIER_ENGINE, so the poll's
3840
+ # dead-pane check must derive FINAL_VERIFIER_ENGINE too — use the
3841
+ # "*inal*" role so poll_for_signal's engine derivation matches _v_eng
3842
+ # (else a codex final verifier's "bash" is misjudged with VERIFIER_ENGINE).
3843
+ _v_role="Verifier-final"
3844
+ else
3845
+ _v_eng="$VERIFIER_ENGINE"; _v_model="$VERIFIER_MODEL"
3846
+ _v_cxm="$VERIFIER_CODEX_MODEL"; _v_cxr="$VERIFIER_CODEX_REASONING"; _v_eff="$VERIFIER_EFFORT"
3847
+ _v_role="Verifier"
3848
+ fi
3849
+
3392
3850
  local verifier_launch
3393
- if [[ "$VERIFIER_ENGINE" = "codex" ]]; then
3394
- verifier_launch="${CODEX_BIN:-codex} -m $VERIFIER_CODEX_MODEL -c model_reasoning_effort=\"$VERIFIER_CODEX_REASONING\" --disable plugins --dangerously-bypass-approvals-and-sandbox"
3851
+ if [[ "$_v_eng" = "codex" ]]; then
3852
+ verifier_launch="${CODEX_BIN:-codex} -m $_v_cxm -c model_reasoning_effort=\"$_v_cxr\" -c mcp_servers='{}' --disable plugins --dangerously-bypass-approvals-and-sandbox"
3395
3853
  else
3396
- verifier_launch="$(build_claude_cmd tui "$VERIFIER_MODEL" "" "" "$VERIFIER_EFFORT")"
3854
+ verifier_launch="$(build_claude_cmd tui "$_v_model" "" "" "$_v_eff")"
3397
3855
  fi
3398
- log_debug "[FLOW] iter=$ITERATION phase=verifier engine=$VERIFIER_ENGINE model=$VERIFIER_MODEL scope=${signal_us_id:-all} dispatched=true"
3856
+ log_debug "[FLOW] iter=$ITERATION phase=verifier engine=$_v_eng model=$_v_model scope=${signal_us_id:-all} dispatched=true"
3399
3857
 
3400
- if [[ "$VERIFIER_ENGINE" = "codex" ]]; then
3858
+ if [[ "$_v_eng" = "codex" ]]; then
3401
3859
  launch_verifier_codex "$VERIFIER_PANE" "$verifier_prompt" "$ITERATION" "$verifier_launch"
3402
3860
  else
3403
3861
  if ! launch_verifier_claude "$VERIFIER_PANE" "$verifier_prompt" "$ITERATION" "$verifier_launch"; then
@@ -3406,16 +3864,45 @@ main() {
3406
3864
  fi
3407
3865
  fi
3408
3866
 
3409
- # Poll for verify-verdict.json
3867
+ # Poll for verify-verdict.json — F-10: 3-strike replace+re-dispatch
3868
+ # parity with the Worker's MONITOR_FAILURE_COUNT breaker. "Bug Report #5"
3869
+ # hardened the Worker poll-fail path (retry-3-then-block) but left the
3870
+ # Verifier path as an immediate terminal BLOCK, so a single transient
3871
+ # verifier death (API blip / pane-spawn race, also F-11) ended a campaign
3872
+ # the Worker path would have survived. rc==2 keeps its original meaning
3873
+ # (already-handled → return). Only 3 consecutive failures BLOCK.
3410
3874
  log " Polling for verify-verdict.json..."
3411
- if ! poll_for_signal "$VERDICT_FILE" "$VERIFIER_HEARTBEAT" "$VERIFIER_PANE" "$verifier_launch" "Verifier"; then
3875
+ local _vpoll_strike=0 _vpoll_ok=0
3876
+ while (( _vpoll_strike < 3 )); do
3877
+ # Capture poll rc DIRECTLY — `$?` after `if cmd; then…fi` is the
3878
+ # if-statement's status (0), not cmd's rc (the original `if ! poll;
3879
+ # then local rc=$?` had this latent bug, so its `rc==2` branch was
3880
+ # dead and a hard-fail double-wrote a sentinel). rc: 0=verdict,
3881
+ # 1=timeout (retryable), 2=hard-failed + infra_failure already recorded.
3882
+ poll_for_signal "$VERDICT_FILE" "$VERIFIER_HEARTBEAT" "$VERIFIER_PANE" "$verifier_launch" "$_v_role"
3412
3883
  local verifier_poll_rc=$?
3884
+ if (( verifier_poll_rc == 0 )); then
3885
+ _vpoll_ok=1; break
3886
+ fi
3413
3887
  if (( verifier_poll_rc == 2 )); then
3414
- return 1
3888
+ return 1 # hard-failed; poll already recorded infra_failure — do not retry
3889
+ fi
3890
+ (( _vpoll_strike++ ))
3891
+ log " WARNING: Verifier poll failed (strike $_vpoll_strike/3) — replacing pane and re-dispatching"
3892
+ log_debug "[GOV] iter=$ITERATION verifier_monitor_failure=$_vpoll_strike/3"
3893
+ update_status "verifier" "poll_failed"
3894
+ (( _vpoll_strike >= 3 )) && break
3895
+ replace_worker_pane "$VERIFIER_PANE" "verifier"
3896
+ VERIFIER_PANE=$(jq -r '.panes.verifier' "$SESSION_CONFIG")
3897
+ if [[ "$_v_eng" = "codex" ]]; then
3898
+ launch_verifier_codex "$VERIFIER_PANE" "$verifier_prompt" "$ITERATION" "$verifier_launch"
3899
+ else
3900
+ launch_verifier_claude "$VERIFIER_PANE" "$verifier_prompt" "$ITERATION" "$verifier_launch" || true
3415
3901
  fi
3416
- log_error "Verifier poll failed"
3417
- # Verifier is dead/stuck BLOCK and let user decide
3418
- write_blocked_sentinel "Verifier process dead/stuck (poll failed). Pane preserved for inspection." "" "infra_failure"
3902
+ done
3903
+ if (( ! _vpoll_ok )); then
3904
+ log_error "Verifier poll failed 3× (dead/stuck after retries)"
3905
+ write_blocked_sentinel "Verifier process dead/stuck after 3 retries. Pane preserved for inspection." "" "infra_failure"
3419
3906
  update_status "blocked" "verifier_dead"
3420
3907
  return 1
3421
3908
  fi
@@ -3435,6 +3922,10 @@ main() {
3435
3922
  verdict=$(jq -r '.verdict' "$VERDICT_FILE" 2>/dev/null)
3436
3923
  local recommended
3437
3924
  recommended=$(jq -r '.recommended_state_transition' "$VERDICT_FILE" 2>/dev/null)
3925
+ # F-23: normalize so a verifier's phrasing variant doesn't strand a
3926
+ # genuinely-complete campaign at MAX_ITER. "Complete"/"completed"/"done"
3927
+ # all mean complete; comparison below is lowercase-exact.
3928
+ recommended="${recommended:l}"
3438
3929
  local verdict_summary
3439
3930
  verdict_summary=$(jq -r '.summary // "no summary"' "$VERDICT_FILE" 2>/dev/null)
3440
3931
 
@@ -3445,10 +3936,20 @@ main() {
3445
3936
 
3446
3937
  case "$verdict" in
3447
3938
  pass)
3939
+ # D-3 fix: snapshot the CB BEFORE the pass-success reset so a wrong-US
3940
+ # "pass" (us_id mismatch, handled below) accumulates the CB across
3941
+ # iterations instead of restarting from 0 each time (the reset on the
3942
+ # next line would otherwise defeat the mismatch soft-fail's CB bound).
3943
+ local _cf_before_pass=$CONSECUTIVE_FAILURES
3448
3944
  CONSECUTIVE_FAILURES=0
3449
3945
  CONSENSUS_ROUND=0
3450
3946
  _SAME_US_FAIL_COUNT=0
3451
3947
  _LAST_FAILED_US=""
3948
+ # F-22b: a pass is real progress — reset the consecutive-BLOCKS state
3949
+ # too so the now-live block CB counts only blocks with NO intervening
3950
+ # success ("consecutive" in the true sense, not cumulative).
3951
+ CONSECUTIVE_BLOCKS=0
3952
+ LAST_BLOCK_REASON=""
3452
3953
  if (( _MODEL_UPGRADED )); then
3453
3954
  log " Worker model restored: ${WORKER_MODEL} → ${_ORIGINAL_WORKER_MODEL} (pass verdict)"
3454
3955
  log_debug "[DECIDE] iter=$ITERATION phase=model_select model_restore=true from=${WORKER_MODEL} to=${_ORIGINAL_WORKER_MODEL}"
@@ -3462,17 +3963,60 @@ main() {
3462
3963
 
3463
3964
  # --- Verified US tracking (both per-us and batch modes) ---
3464
3965
  if [[ -n "$signal_us_id" && "$signal_us_id" != "ALL" ]]; then
3465
- # Add this US to verified list
3466
- if [[ -n "$VERIFIED_US" ]]; then
3467
- VERIFIED_US="${VERIFIED_US},${signal_us_id}"
3966
+ # D-3: cross-check the verdict's OWN us_id against the US the leader
3967
+ # scoped this verify to. If the verifier graded a DIFFERENT US, do
3968
+ # NOT credit signal_us_id (it was not actually verified) — soft-fail
3969
+ # so the Worker re-runs the contracted US. Acts ONLY on a PRESENT
3970
+ # mismatch (absent verdict us_id = trust the scope), so a correctly-
3971
+ # scoped verifier is never affected.
3972
+ local _verdict_us_id
3973
+ _verdict_us_id=$(jq -r '.us_id // empty' "$VERDICT_FILE" 2>/dev/null)
3974
+ _verdict_us_id="${_verdict_us_id:u}"
3975
+ if [[ -n "$_verdict_us_id" && "$_verdict_us_id" != "$signal_us_id" ]]; then
3976
+ log_error " Verdict us_id mismatch: verifier graded $_verdict_us_id but leader scoped $signal_us_id — NOT crediting (soft-fail)."
3977
+ log_debug "[GOV] iter=$ITERATION verdict_us_id_mismatch verdict_us=$_verdict_us_id signal_us=$signal_us_id"
3978
+ update_status "verifier" "us_id_mismatch"
3979
+ # D-3 fix: undo the pass-entry CB reset so consecutive mismatches
3980
+ # actually accumulate toward the breaker (else each restarts at 0).
3981
+ CONSECUTIVE_FAILURES=$_cf_before_pass
3982
+ if _bump_consecutive_failure; then
3983
+ write_blocked_sentinel "${EFFECTIVE_CB_THRESHOLD} consecutive verdict us_id mismatches" "" "repeat_axis"
3984
+ update_status "blocked" "consecutive_failures"
3985
+ return 1
3986
+ fi
3468
3987
  else
3469
- VERIFIED_US="$signal_us_id"
3988
+ # Add this US to verified list. D-12: dedup — a fresh-context Worker
3989
+ # can re-submit an already-verified US (memory drift); don't
3990
+ # double-credit it (mirrors the fail/partial-progress guard, and
3991
+ # keeps VERIFIED_US + the ledger + the coverage count honest).
3992
+ if echo ",$VERIFIED_US," | grep -q ",$signal_us_id,"; then
3993
+ log " US $signal_us_id already verified — not re-crediting (dedup)."
3994
+ log_debug "[FLOW] iter=$ITERATION verified_us_dedup=$signal_us_id"
3995
+ else
3996
+ if [[ -n "$VERIFIED_US" ]]; then
3997
+ VERIFIED_US="${VERIFIED_US},${signal_us_id}"
3998
+ else
3999
+ VERIFIED_US="$signal_us_id"
4000
+ fi
4001
+ log " US $signal_us_id verified. Verified so far: $VERIFIED_US"
4002
+ log_debug "[FLOW] iter=$ITERATION verified_us_update=$signal_us_id verified_us_total=$VERIFIED_US"
4003
+ _append_verified_ledger "$signal_us_id" # F-14: durable source-of-truth
4004
+ fi
4005
+ update_status "verifier" "pass_us"
4006
+ # D-16: if this pass completed coverage (every US in US_LIST is now
4007
+ # verified), arm leader-driven finalize so the NEXT loop top runs the
4008
+ # sequential final verify DIRECTLY — instead of a worker round-trip
4009
+ # whose only job is to emit an ALL signal (a fragile extra LLM
4010
+ # iteration, observed hanging on an API rate-limit in SV CRITICAL).
4011
+ if [[ "$VERIFY_MODE" == "per-us" && -n "$US_LIST" ]] && _all_us_verified; then
4012
+ _FINALIZE_PENDING=1
4013
+ log " Coverage complete ($VERIFIED_US) — arming leader finalize (D-16, no worker round-trip)."
4014
+ log_debug "[FLOW] iter=$ITERATION d16_arm_finalize=true verified_us=$VERIFIED_US"
4015
+ else
4016
+ : # more US remain → Worker will do next US on next iteration
4017
+ fi
3470
4018
  fi
3471
- log " US $signal_us_id verified. Verified so far: $VERIFIED_US"
3472
- log_debug "[FLOW] iter=$ITERATION verified_us_update=$signal_us_id verified_us_total=$VERIFIED_US"
3473
- update_status "verifier" "pass_us"
3474
- # Worker will do next US on next iteration
3475
- elif [[ "$recommended" == "complete" || "$signal_us_id" == "ALL" ]]; then
4019
+ elif [[ "$recommended" == (complete|completed|done) || "$signal_us_id" == "ALL" ]]; then
3476
4020
  # Final full verify passed or complete recommended
3477
4021
  write_complete_sentinel "$verdict_summary"
3478
4022
  update_status "complete" "pass"
@@ -3499,6 +4043,7 @@ main() {
3499
4043
  VERIFIED_US="$_pus"
3500
4044
  fi
3501
4045
  log " Partial progress: $_pus passed (overall FAIL). Verified so far: $VERIFIED_US"
4046
+ _append_verified_ledger "$_pus" # F-14: durable source-of-truth
3502
4047
  fi
3503
4048
  done
3504
4049
  log_debug "[FLOW] iter=$ITERATION partial_progress prev=$_prev_verified now=$VERIFIED_US"
@@ -3507,6 +4052,9 @@ main() {
3507
4052
  # Partial progress resets consecutive failures (progress was made)
3508
4053
  if [[ "$VERIFIED_US" != "$_prev_verified" ]]; then
3509
4054
  CONSECUTIVE_FAILURES=0
4055
+ # F-22b: partial progress also resets the consecutive-blocks state.
4056
+ CONSECUTIVE_BLOCKS=0
4057
+ LAST_BLOCK_REASON=""
3510
4058
  log " Progress detected — consecutive_failures reset to 0"
3511
4059
  log_debug "[GOV] iter=$ITERATION consecutive_failures_reset=partial_progress"
3512
4060
  fi
@@ -3575,31 +4123,66 @@ main() {
3575
4123
  log " Questions: \"$verdict_summary_ri\""
3576
4124
  log " Treating as soft fail — Worker will see verdict in next iteration."
3577
4125
  update_status "verifier" "request_info"
4126
+ # F-22: count request_info toward the CB so a verifier looping on
4127
+ # request_info trips the breaker instead of spinning to MAX_ITER.
4128
+ if _bump_consecutive_failure; then
4129
+ write_blocked_sentinel "${EFFECTIVE_CB_THRESHOLD} consecutive non-advancing verdicts (request_info)" "" "repeat_axis"
4130
+ update_status "blocked" "consecutive_failures"
4131
+ return 1
4132
+ fi
3578
4133
  ;;
3579
4134
  blocked)
3580
4135
  local _verdict_cat
3581
4136
  _verdict_cat=$(_classify_cross_us_or_metric "$verdict_summary")
3582
- write_blocked_sentinel "Verifier verdict: blocked - $verdict_summary" "" "$_verdict_cat"
3583
- update_status "blocked" "verifier_blocked"
3584
- return 1
4137
+ # F-22: a transient/first "blocked" no longer kills the campaign —
4138
+ # absorb as a soft-fail with grace; terminate only on a genuine infra
4139
+ # block, the same reason repeated >= BLOCK_CB_THRESHOLD, or the CB.
4140
+ if _block_with_grace "Verifier verdict: blocked - $verdict_summary" "$_verdict_cat"; then
4141
+ write_blocked_sentinel "Verifier verdict: blocked - $verdict_summary" "" "$_verdict_cat"
4142
+ update_status "blocked" "verifier_blocked"
4143
+ return 1
4144
+ fi
4145
+ log " Verifier verdict=blocked absorbed as soft-fail (consecutive_failures=$CONSECUTIVE_FAILURES; reason not yet repeated ${BLOCK_CB_THRESHOLD}×) — Worker will retry."
4146
+ update_status "verifier" "blocked_softfail"
3585
4147
  ;;
3586
4148
  *)
3587
4149
  log_error "Unknown verdict: $verdict"
3588
4150
  update_status "verifier" "unknown_verdict"
4151
+ # F-22: unknown verdict is a soft-fail that counts toward the CB
4152
+ # (was: silent continue to MAX_ITER with no diagnostic BLOCK).
4153
+ if _bump_consecutive_failure; then
4154
+ write_blocked_sentinel "${EFFECTIVE_CB_THRESHOLD} consecutive unrecognized verifier verdicts" "" "repeat_axis"
4155
+ update_status "blocked" "consecutive_failures"
4156
+ return 1
4157
+ fi
3589
4158
  ;;
3590
4159
  esac
3591
4160
  ;;
3592
4161
  blocked)
3593
- # --- governance.md s7 step 6: blocked -> write sentinel ---
4162
+ # --- governance.md s7 step 6: blocked -> write sentinel (with grace) ---
3594
4163
  local _signal_cat
3595
4164
  _signal_cat=$(_classify_cross_us_or_metric "$signal_summary")
3596
- write_blocked_sentinel "Worker reported blocked: $signal_summary" "" "$_signal_cat"
3597
- update_status "blocked" "worker_blocked"
3598
- return 1
4165
+ # F-22: a transient/first Worker-reported "blocked" no longer kills the
4166
+ # campaign — absorb as a soft-fail with grace (same gate as the verifier
4167
+ # blocked path); terminate only on infra, repeated reason, or the CB.
4168
+ if _block_with_grace "Worker reported blocked: $signal_summary" "$_signal_cat"; then
4169
+ write_blocked_sentinel "Worker reported blocked: $signal_summary" "" "$_signal_cat"
4170
+ update_status "blocked" "worker_blocked"
4171
+ return 1
4172
+ fi
4173
+ log " Worker status=blocked absorbed as soft-fail (consecutive_failures=$CONSECUTIVE_FAILURES) — re-dispatching Worker."
4174
+ update_status "worker" "blocked_softfail"
3599
4175
  ;;
3600
4176
  *)
3601
4177
  log_error "Unknown signal status: $signal_status"
3602
4178
  update_status "worker" "unknown_status"
4179
+ # F-22: unknown signal status is a soft-fail that counts toward the CB
4180
+ # (was: silent continue to MAX_ITER).
4181
+ if _bump_consecutive_failure; then
4182
+ write_blocked_sentinel "${EFFECTIVE_CB_THRESHOLD} consecutive unrecognized worker signals" "" "repeat_axis"
4183
+ update_status "blocked" "consecutive_failures"
4184
+ return 1
4185
+ fi
3603
4186
  ;;
3604
4187
  esac
3605
4188