@ai-dev-methodologies/rlp-desk 0.17.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -164,6 +164,34 @@ _check_consecutive_blocks() {
164
164
  return 0
165
165
  }
166
166
 
167
+ # F-22: bump the consecutive-failure counter for a soft-fail (request_info,
168
+ # unknown verdict/status). Returns 0 if the circuit breaker is now tripped
169
+ # (caller writes a sentinel + returns 1), 1 if still under threshold (continue).
170
+ # Closes the "silently loop to MAX_ITER without ever firing the CB" gap for
171
+ # verdict/status values the case-statement did not previously account for.
172
+ _bump_consecutive_failure() {
173
+ (( CONSECUTIVE_FAILURES++ ))
174
+ (( CONSECUTIVE_FAILURES >= EFFECTIVE_CB_THRESHOLD )) && return 0
175
+ return 1
176
+ }
177
+
178
+ # F-22: decide a worker/verifier BLOCK with grace. This is the call site that
179
+ # was MISSING — _check_consecutive_blocks was dead code (defined, never invoked),
180
+ # so the consecutive-blocks circuit breaker (governance §8) never ran and a
181
+ # SINGLE transient "blocked" (a fresh-context LLM mis-emitting the status, a
182
+ # formatting slip) terminated the whole campaign. Returns 0 = TERMINATE (caller
183
+ # writes the sentinel + returns 1); 1 = ABSORB as a soft-fail (loop continues,
184
+ # Worker retries). Forced terminal when: the category is a genuine infra_failure,
185
+ # the same canonical reason repeats >= BLOCK_CB_THRESHOLD, or the consecutive-
186
+ # failures CB trips. Otherwise a recoverable first/transient block is absorbed.
187
+ _block_with_grace() {
188
+ local reason="$1" category="${2:-metric_failure}"
189
+ _check_consecutive_blocks "$reason" "$category" "${ITERATION:-0}" || return 0
190
+ [[ "$category" == "infra_failure" ]] && return 0
191
+ _bump_consecutive_failure && return 0
192
+ return 1
193
+ }
194
+
167
195
  # --- Engine Selection (auto-detect from model format) ---
168
196
  # claude models (haiku/sonnet/opus) with :effort → claude engine + effort
169
197
  # codex models (gpt-*/spark) with :reasoning → codex engine + reasoning
@@ -205,11 +233,17 @@ FINAL_VERIFIER_EFFORT="${FINAL_VERIFIER_EFFORT:-}"
205
233
  # Auto-detect engine from model format for env var path (CLI path uses parse_model_flag)
206
234
  _auto_detect_engine WORKER_MODEL WORKER_ENGINE WORKER_CODEX_MODEL WORKER_CODEX_REASONING WORKER_EFFORT
207
235
  _auto_detect_engine VERIFIER_MODEL VERIFIER_ENGINE VERIFIER_CODEX_MODEL VERIFIER_CODEX_REASONING VERIFIER_EFFORT
208
- _auto_detect_engine FINAL_VERIFIER_MODEL FINAL_VERIFIER_ENGINE "" "" FINAL_VERIFIER_EFFORT
236
+ _auto_detect_engine FINAL_VERIFIER_MODEL FINAL_VERIFIER_ENGINE FINAL_VERIFIER_CODEX_MODEL FINAL_VERIFIER_CODEX_REASONING FINAL_VERIFIER_EFFORT
209
237
  WORKER_CODEX_MODEL="${WORKER_CODEX_MODEL:-gpt-5.5}"
210
238
  WORKER_CODEX_REASONING="${WORKER_CODEX_REASONING:-high}" # low|medium|high
211
239
  VERIFIER_CODEX_MODEL="${VERIFIER_CODEX_MODEL:-gpt-5.5}"
212
240
  VERIFIER_CODEX_REASONING="${VERIFIER_CODEX_REASONING:-high}" # low|medium|high
241
+ # D-1: FINAL verifier codex sub-vars (auto-detected above from FINAL_VERIFIER_MODEL,
242
+ # default here when not codex). Wired so the FINAL (ALL) verify can run a stronger
243
+ # model than the per-US verifier — the "final strict" knob (FINAL_VERIFIER_MODEL
244
+ # defaults to opus). Distinct from the removed per-iteration verifier auto-upgrade.
245
+ FINAL_VERIFIER_CODEX_MODEL="${FINAL_VERIFIER_CODEX_MODEL:-gpt-5.5}"
246
+ FINAL_VERIFIER_CODEX_REASONING="${FINAL_VERIFIER_CODEX_REASONING:-high}" # low|medium|high
213
247
  CODEX_BIN="" # resolved by check_dependencies when engine=codex
214
248
 
215
249
  # --- Verify Mode ---
@@ -274,6 +308,10 @@ MEMORY_FILE="$MEMOS_DIR/${SLUG}-memory.md"
274
308
  SIGNAL_FILE="$MEMOS_DIR/${SLUG}-iter-signal.json"
275
309
  DONE_CLAIM_FILE="$MEMOS_DIR/${SLUG}-done-claim.json"
276
310
  VERDICT_FILE="$MEMOS_DIR/${SLUG}-verify-verdict.json"
311
+ # F-14: durable, structured append-only ledger of verified-pass US — the
312
+ # drift-proof source-of-truth for VERIFIED_US restore (vs the Worker's prose
313
+ # "## Completed Stories", which is fresh-context LLM output that can drift).
314
+ VERIFIED_LEDGER="$MEMOS_DIR/${SLUG}-verified.jsonl"
277
315
  # v0.14.2 Bug Report #4: codex sometimes writes the verdict file to the
278
316
  # pre-v0.13.0 legacy path despite the prompt instructing otherwise (CWD
279
317
  # heuristics inside the codex CLI). Track the legacy path so the no-progress
@@ -389,6 +427,33 @@ launch_worker_codex() {
389
427
  sleep 1
390
428
  local _pane_text
391
429
  _pane_text=$(tmux capture-pane -t "$pane_id" -p 2>/dev/null || true)
430
+ # F-1: on launch codex may show "✨ Update available!" — an arrow-menu whose
431
+ # DEFAULT highlighted option is "1. Update now" (runs `npm install -g
432
+ # @openai/codex`) with "Press enter to continue". Our subsequent Enter would
433
+ # confirm option 1 and the update REPLACES the Worker session (hijack). This
434
+ # check MUST precede the '›' ready check below because the update menu also
435
+ # renders '›'. Move the selection to "2. Skip" (Down) then confirm (Enter).
436
+ # (Guarded: only fires when the update banner is present, so it is harmless
437
+ # in any normal pane state. Key sequence pending live-codex confirmation.)
438
+ if echo "$_pane_text" | grep -qiE 'Update available|1\. Update now' 2>/dev/null; then
439
+ log " Worker codex: update prompt detected — selecting '2. Skip' (F-1)."
440
+ log_debug "[GOV] iter=$iter codex_update_prompt=skipped role=worker"
441
+ tmux send-keys -t "$pane_id" Down 2>/dev/null; sleep 0.3
442
+ tmux send-keys -t "$pane_id" C-m 2>/dev/null; sleep 1
443
+ (( _codex_wait++ )); continue
444
+ fi
445
+ # F-16: codex 0.141 shows a "Do you trust the contents of this directory?
446
+ # 1. Yes, continue / 2. No, quit" prompt at startup (project-local config/
447
+ # hooks loading). Its '›' is otherwise mis-read as "ready" below, and the
448
+ # worker instruction sent into that menu can land on "No, quit" → codex exits
449
+ # → "worker not active" BLOCK. Accept it (Enter = default "1. Yes, continue")
450
+ # before the ready check. Validated end-to-end: codex then runs the task.
451
+ if echo "$_pane_text" | grep -qiE 'Do you trust|1\. Yes, continue' 2>/dev/null; then
452
+ log " Worker codex: directory-trust prompt — accepting (F-16)."
453
+ log_debug "[GOV] iter=$iter codex_trust_prompt=accepted role=worker"
454
+ tmux send-keys -t "$pane_id" C-m 2>/dev/null; sleep 1
455
+ (( _codex_wait++ )); continue
456
+ fi
392
457
  if echo "$_pane_text" | grep -q '›' 2>/dev/null; then
393
458
  _codex_ready=1
394
459
  log_debug "Worker codex TUI ready after ${_codex_wait}s"
@@ -543,6 +608,25 @@ launch_verifier_codex() {
543
608
  sleep 1
544
609
  local _pane_text
545
610
  _pane_text=$(tmux capture-pane -t "$pane_id" -p 2>/dev/null || true)
611
+ # F-1: dismiss codex's "✨ Update available!" launch menu before it hijacks the
612
+ # pane (default option is "1. Update now"). See launch_worker_codex for detail.
613
+ if echo "$_pane_text" | grep -qiE 'Update available|1\. Update now' 2>/dev/null; then
614
+ log " Verifier codex: update prompt detected — selecting '2. Skip' (F-1)."
615
+ log_debug "[GOV] iter=$iter codex_update_prompt=skipped role=verifier"
616
+ tmux send-keys -t "$pane_id" Down 2>/dev/null; sleep 0.3
617
+ tmux send-keys -t "$pane_id" C-m 2>/dev/null; sleep 1
618
+ (( _codex_wait++ )); continue
619
+ fi
620
+ # F-16: accept codex 0.141's "Do you trust this directory?" startup prompt
621
+ # (Enter = default "1. Yes, continue") before the ready check — see
622
+ # launch_worker_codex for detail. Otherwise the instruction lands in the menu
623
+ # and can select "No, quit" → codex exits → "verifier not active".
624
+ if echo "$_pane_text" | grep -qiE 'Do you trust|1\. Yes, continue' 2>/dev/null; then
625
+ log " Verifier codex: directory-trust prompt — accepting (F-16)."
626
+ log_debug "[GOV] iter=$iter codex_trust_prompt=accepted role=verifier"
627
+ tmux send-keys -t "$pane_id" C-m 2>/dev/null; sleep 1
628
+ (( _codex_wait++ )); continue
629
+ fi
546
630
  if echo "$_pane_text" | grep -q '›' 2>/dev/null; then
547
631
  _codex_ready=1
548
632
  log_debug "Verifier codex TUI ready after ${_codex_wait}s"
@@ -639,6 +723,16 @@ launch_verifier_claude() {
639
723
  # On exit: check done-claim, auto-generate iter-signal.
640
724
  # Args: $1=iteration $2=signal_file
641
725
  # Returns: 0 (signal generated), 1 (error)
726
+ # F-14: append a verified-pass US to the durable ledger (the leader's structured,
727
+ # drift-proof record of progress). Skips ALL/empty; append-only, readers dedup.
728
+ _append_verified_ledger() {
729
+ local us="$1"
730
+ [[ -z "$us" || "$us" == "ALL" ]] && return 0
731
+ mkdir -p "${VERIFIED_LEDGER:h}" 2>/dev/null
732
+ printf '{"us_id":"%s","iter":%s,"verified_at":"%s"}\n' \
733
+ "$us" "${ITERATION:-0}" "$(date -u +%Y-%m-%dT%H:%M:%SZ)" >> "$VERIFIED_LEDGER"
734
+ }
735
+
642
736
  # Bug #8 PR-B (codex critic P1.2 fix): shared 4-way gate used by both
643
737
  # handle_worker_exit_codex and the inline-polling A4 path. Returns:
644
738
  # 0 = synthesize allowed (caller writes signal_file + emits audit)
@@ -662,6 +756,37 @@ _bug8_check_synth_allowed() {
662
756
  return 1
663
757
  fi
664
758
 
759
+ # Gate 1b (D-2): done-claim FRESHNESS. A done-claim that lingered from a PRIOR
760
+ # run/iteration (e.g. a relaunch where the inter-iteration cleanup did not run)
761
+ # must NOT be synthesized into a verify signal for THIS iteration — it would
762
+ # credit a stale/wrong US into the durable ledger. The Worker writes its
763
+ # done-claim DURING this iteration, so a fresh claim is strictly NEWER than this
764
+ # iteration's worker-prompt; an older claim is stale. mtime-based on purpose:
765
+ # done-claim carries no reliable .iteration field (workers omit it), so an
766
+ # iteration match would false-reject every claim and break the A4 synth path.
767
+ local _dc_wp_file="$LOGS_DIR/iter-$(printf '%03d' "$iter").worker-prompt.md"
768
+ if [[ -f "$_dc_wp_file" ]]; then
769
+ # mtime, cross-platform: GNU `stat -c %Y` FIRST (on Linux `stat -f %m` means
770
+ # --file-system + %m=mount-point, returns a non-numeric path with exit 0 so a
771
+ # `-f`-first order would silently mis-read); macOS BSD `stat -c` errors → falls
772
+ # through to `stat -f %m` (the BSD mtime). Correct on both; `echo 0` = unknown.
773
+ local _dc_mt _wp_mt
774
+ _dc_mt=$(stat -c %Y "$DONE_CLAIM_FILE" 2>/dev/null || stat -f %m "$DONE_CLAIM_FILE" 2>/dev/null || echo 0)
775
+ _wp_mt=$(stat -c %Y "$_dc_wp_file" 2>/dev/null || stat -f %m "$_dc_wp_file" 2>/dev/null || echo 0)
776
+ [[ "$_dc_mt" == <-> ]] || _dc_mt=0 # guard: ignore any non-numeric stat output
777
+ [[ "$_wp_mt" == <-> ]] || _wp_mt=0
778
+ if (( _dc_mt > 0 && _wp_mt > 0 && _dc_mt < _wp_mt )); then
779
+ log_error " Bug #8: done-claim is STALE (mtime $_dc_mt < this iteration's worker-prompt $_wp_mt) — refusing to synthesize from a prior-run claim."
780
+ log_debug "[GOV] iter=$iter bug8=block_stale_done_claim dc_mt=$_dc_mt wp_mt=$_wp_mt"
781
+ write_blocked_sentinel \
782
+ "done-claim is stale (older than this iteration's worker dispatch) — refusing to synthesize a verify signal from a prior-run claim" \
783
+ "$us_id" \
784
+ "infra_failure"
785
+ _emit_a4_fallback_audit "$us_id" "$iter" "blocked_stale_done_claim"
786
+ return 1
787
+ fi
788
+ fi
789
+
665
790
  # Gate 2: git toplevel must equal $ROOT (canonicalized — macOS resolves
666
791
  # /var → /private/var, NTFS may have 8.3 short paths; compare realpaths).
667
792
  local _bug8_top _bug8_top_canon _bug8_root_canon
@@ -679,20 +804,59 @@ _bug8_check_synth_allowed() {
679
804
  return 1
680
805
  fi
681
806
 
682
- # Gate 3: tree must be clean.
807
+ # Gate 3: no UNCOMMITTED changes to TRACKED files (F-6 fix). We compare against
808
+ # HEAD with `git diff --name-only HEAD`, which lists ONLY tracked files modified
809
+ # vs HEAD — untracked cruft (logs, .DS_Store, local config, build/coverage
810
+ # output) the Worker never touched is never listed. Blocking on such cruft
811
+ # false-BLOCKed the campaign at iter 1 on ANY non-pristine repo — the single
812
+ # largest "never completes" cause found in large-campaign dogfood. The Verifier
813
+ # (test-spec) is the real correctness gate for the Worker's committed work; this
814
+ # gate only guards against a Worker that left TRACKED edits uncommitted.
683
815
  local _bug8_dirty
684
- _bug8_dirty=$(git -C "$ROOT" status --porcelain 2>/dev/null)
816
+ _bug8_dirty=$(git -C "$ROOT" diff --name-only HEAD 2>/dev/null)
685
817
  if [[ -n "$_bug8_dirty" ]]; then
686
- local _bug8_first5
687
- _bug8_first5=$(printf '%s\n' "$_bug8_dirty" | head -n 5 | tr '\n' '|' | sed 's/|$//')
688
- log_error " Bug #8: done-claim present but tree dirty. Refusing synthesis. dirty: $_bug8_first5"
689
- log_debug "[GOV] iter=$iter bug8=block_dirty_tree us_id=$us_id dirty='$_bug8_first5'"
690
- write_blocked_sentinel \
691
- "worker_incomplete_uncommitted: done-claim present but tree dirty ($_bug8_first5)" \
692
- "$us_id" \
693
- "metric_failure"
694
- _emit_a4_fallback_audit "$us_id" "$iter" "blocked_dirty_tree"
695
- return 1
818
+ # F-8 recovery (F-19 scoped): by Gate 1 a done-claim exists, so uncommitted
819
+ # TRACKED changes are most likely the Worker's own US work it failed to commit —
820
+ # a frequent weak-model slip (the default haiku Worker reports "Committed ..."
821
+ # in its done-claim while the git commit never landed). Historically this
822
+ # TERMINATED the campaign, stranding completed work — the #1 weak-model "never
823
+ # completes" cause. Instead auto-commit the Worker's edits and proceed — but
824
+ # scope the commit to the Worker's OWN files: exclude any tracked file ALREADY
825
+ # dirty before the campaign (CAMPAIGN_PREEXISTING_DIRTY) so an operator's
826
+ # pre-existing uncommitted work is NEVER swept into a Worker-recovery commit.
827
+ # The Verifier (test-spec) is the real correctness gate, so a genuine mid-write
828
+ # bail still FAILs verify → fix loop; Bug #8's "no false PASS" intent is
829
+ # preserved by the Verifier, not by abort.
830
+ local _bug8_worker_files
831
+ _bug8_worker_files=$(comm -23 \
832
+ <(printf '%s\n' "$_bug8_dirty" | sort -u) \
833
+ <(printf '%s\n' "${CAMPAIGN_PREEXISTING_DIRTY:-}" | sort -u) \
834
+ | grep -v '^[[:space:]]*$')
835
+ if [[ -z "$_bug8_worker_files" ]]; then
836
+ # Every dirty tracked file was already dirty BEFORE the campaign — the Worker
837
+ # committed its own work (or made no tracked change). Nothing to recover; do
838
+ # NOT commit the operator's pre-existing edits. Allow synthesis to proceed.
839
+ log " Bug #8 F-8: only operator's pre-existing edits are dirty — Worker work already committed; proceeding without auto-commit."
840
+ log_debug "[GOV] iter=$iter bug8=preexisting_only_no_commit us_id=$us_id"
841
+ else
842
+ local _bug8_first5
843
+ _bug8_first5=$(printf '%s\n' "$_bug8_worker_files" | head -n 5 | tr '\n' '|' | sed 's/|$//')
844
+ log " Bug #8 F-8 recovery: done-claim + Worker's uncommitted tracked changes — auto-committing $us_id work (files: $_bug8_first5)."
845
+ log_debug "[GOV] iter=$iter bug8=recover_autocommit us_id=$us_id files='$_bug8_first5'"
846
+ local -a _bug8_add=("${(@f)_bug8_worker_files}")
847
+ if git -C "$ROOT" add -- "${_bug8_add[@]}" && git -C "$ROOT" commit -q -m "chore(leader-recovery): commit Worker's uncommitted $us_id changes (Bug #8 F-8)"; then
848
+ log " Leader-recovery auto-commit OK (Worker files only) — Verifier will gate correctness."
849
+ else
850
+ log_error " Bug #8: leader-recovery auto-commit failed. Refusing synthesis. files: $_bug8_first5"
851
+ log_debug "[GOV] iter=$iter bug8=block_autocommit_failed us_id=$us_id files='$_bug8_first5'"
852
+ write_blocked_sentinel \
853
+ "worker_incomplete_uncommitted: leader-recovery auto-commit failed ($_bug8_first5)" \
854
+ "$us_id" \
855
+ "metric_failure"
856
+ _emit_a4_fallback_audit "$us_id" "$iter" "blocked_autocommit_failed"
857
+ return 1
858
+ fi
859
+ fi
696
860
  fi
697
861
 
698
862
  # All gates passed — synthesize allowed.
@@ -1016,10 +1180,15 @@ check_copy_mode() {
1016
1180
  paste_to_pane() {
1017
1181
  local pane_id="$1"
1018
1182
  local text="$2"
1183
+ # D-8/D-13: per-leader+pane tmux buffer name (was a server-GLOBAL "rlp-paste").
1184
+ # Two leaders sharing one tmux server (different ROOTs) would ABA the single
1185
+ # global buffer — load-A / load-B / paste-A pastes B's text into A's pane. A
1186
+ # name keyed by leader pid + pane closes that.
1187
+ local _buf="rlp-paste-$$-${pane_id//[^0-9A-Za-z]/}"
1019
1188
  local tmpbuf="/tmp/.rlp-desk-paste-$$.tmp"
1020
1189
  echo -n "$text" > "$tmpbuf"
1021
- tmux load-buffer -b rlp-paste "$tmpbuf" 2>/dev/null
1022
- tmux paste-buffer -b rlp-paste -d -t "$pane_id" 2>/dev/null
1190
+ tmux load-buffer -b "$_buf" "$tmpbuf" 2>/dev/null
1191
+ tmux paste-buffer -b "$_buf" -d -t "$pane_id" 2>/dev/null # -d deletes the buffer after paste
1023
1192
  rm -f "$tmpbuf"
1024
1193
  }
1025
1194
 
@@ -1757,7 +1926,7 @@ restart_worker() {
1757
1926
 
1758
1927
  # Re-launch worker (tmux interactive pattern)
1759
1928
  if [[ "$WORKER_ENGINE" = "codex" ]]; then
1760
- safe_send_keys "$pane_id" "${CODEX_BIN:-codex} -m $WORKER_CODEX_MODEL -c model_reasoning_effort=\"$WORKER_CODEX_REASONING\" --disable plugins --dangerously-bypass-approvals-and-sandbox"
1929
+ safe_send_keys "$pane_id" "${CODEX_BIN:-codex} -m $WORKER_CODEX_MODEL -c model_reasoning_effort=\"$WORKER_CODEX_REASONING\" -c mcp_servers='{}' --disable plugins --dangerously-bypass-approvals-and-sandbox"
1761
1930
  else
1762
1931
  safe_send_keys "$pane_id" "$(build_claude_cmd tui "$WORKER_MODEL" "" "" "$WORKER_EFFORT")"
1763
1932
  fi
@@ -1815,6 +1984,12 @@ write_worker_trigger() {
1815
1984
  fi
1816
1985
  done
1817
1986
  fi
1987
+ # D-11: publish the in-flight US GLOBALLY so the lifecycle-path sentinels
1988
+ # (no-progress, prompt-stall, R12 watchdog) tag their BLOCKED sidecar with the
1989
+ # real us_id (they default to ${CURRENT_US:-ALL}, which was always ALL because
1990
+ # CURRENT_US was never assigned). The verify phase overwrites it with the US
1991
+ # actually under verification.
1992
+ [[ -n "$next_us" ]] && CURRENT_US="$next_us" || CURRENT_US="ALL"
1818
1993
 
1819
1994
  {
1820
1995
  # Per-US PRD injection: substitute full PRD path with per-US split path when available
@@ -2057,6 +2232,12 @@ TRIGGER_EOF
2057
2232
  # =============================================================================
2058
2233
 
2059
2234
  cleanup() {
2235
+ # D-8: re-entrancy guard. The trap is armed on EXIT INT TERM, so a TERM (cleanup
2236
+ # runs) immediately followed by process exit (EXIT fires cleanup AGAIN) would
2237
+ # double-run the non-idempotent steps — a double runner-lock release can rm a
2238
+ # relaunched leader's lock dir (ABA). Run the body at most once.
2239
+ (( ${CLEANUP_DONE:-0} )) && return 0
2240
+ CLEANUP_DONE=1
2060
2241
  log "Cleaning up..."
2061
2242
 
2062
2243
  # Remove lockfile
@@ -2066,12 +2247,15 @@ cleanup() {
2066
2247
  log_debug "cleanup: lockfile not owned by this process, skipping removal"
2067
2248
  fi
2068
2249
 
2069
- # US-026 R14 P0: remove project-scoped runner lockfile if owned by this slug
2250
+ # US-026 R14 P0 / D-9: remove the project-scoped runner lock if WE own it. The
2251
+ # lock file now holds our bare PID (acquire_slug_lock), so ownership is an exact
2252
+ # pid match — remove the lock file, the metadata sidecar, and the recovery mutex.
2070
2253
  if [[ -f "$RUNNER_LOCKFILE_PATH" ]]; then
2071
- local own_slug
2072
- own_slug=$(jq -r '.slug' "$RUNNER_LOCKFILE_PATH" 2>/dev/null)
2073
- if [[ "$own_slug" == "$SLUG" ]]; then
2074
- rm -rf "$RUNNER_LOCKDIR" "$RUNNER_LOCKFILE_PATH" 2>/dev/null
2254
+ local own_pid
2255
+ own_pid=$(cat "$RUNNER_LOCKFILE_PATH" 2>/dev/null)
2256
+ if [[ "$own_pid" == "$$" ]]; then
2257
+ rm -f "$RUNNER_LOCKFILE_PATH" "${RUNNER_LOCKFILE_PATH}.meta" 2>/dev/null
2258
+ rm -rf "${RUNNER_LOCKFILE_PATH}.recovery.d" 2>/dev/null
2075
2259
  fi
2076
2260
  fi
2077
2261
 
@@ -2423,8 +2607,19 @@ poll_for_signal() {
2423
2607
  # Dead pane detection during poll: check if claude/codex process died
2424
2608
  local poll_cmd
2425
2609
  poll_cmd=$(tmux display-message -p -t "$pane_id" '#{pane_current_command}' 2>/dev/null)
2426
- # Dead pane detection — delegates to check_dead_pane() for engine-aware logic
2427
- if check_dead_pane "$poll_cmd" "$WORKER_ENGINE" "$role"; then
2610
+ # Dead pane detection — delegates to check_dead_pane() for engine-aware logic.
2611
+ # D-10: pick the engine for the pane being polled, NOT always WORKER_ENGINE. In
2612
+ # a mixed-engine campaign (e.g. claude worker + codex verifier) the old code
2613
+ # judged the codex verifier's "bash" (codex's trigger shell) as DEAD using the
2614
+ # claude rule → false dead-pane → 3-strike → spurious BLOCK on a live verifier.
2615
+ # Derive from the role string (covers per-US, final, and consensus per-engine).
2616
+ local _dead_engine="$WORKER_ENGINE"
2617
+ if [[ "$role" == *codex* ]]; then _dead_engine="codex"
2618
+ elif [[ "$role" == *claude* ]]; then _dead_engine="claude"
2619
+ elif [[ "$role" == *inal* ]]; then _dead_engine="$FINAL_VERIFIER_ENGINE"
2620
+ elif [[ "$role" == *erifier* ]]; then _dead_engine="$VERIFIER_ENGINE"
2621
+ fi
2622
+ if check_dead_pane "$poll_cmd" "$_dead_engine" "$role"; then
2428
2623
  log " WARNING: $role pane $pane_id has bare shell ($poll_cmd) — process died during execution"
2429
2624
  log_debug "[GOV] iter=$ITERATION pane_dead_during_poll=true pane=$pane_id cmd=$poll_cmd role=$role"
2430
2625
  # Return failure so caller can handle recovery
@@ -2472,6 +2667,13 @@ run_single_verifier() {
2472
2667
  local model="$3" # model for this verifier
2473
2668
  local suffix="$4" # "-claude" or "-codex"
2474
2669
  local verdict_dest="$5" # where to copy the verdict file
2670
+ # D-1c (codex MEDIUM): claude reasoning effort for this verifier. Final
2671
+ # consensus passes FINAL_VERIFIER_EFFORT; per-US passes VERIFIER_EFFORT.
2672
+ # Defaults to VERIFIER_EFFORT so existing 5-arg callers are unchanged.
2673
+ # Colon-less default (${6-...}, not ${6:-...}) so an explicitly-passed EMPTY effort
2674
+ # (e.g. final consensus with FINAL_VERIFIER_EFFORT unset) is preserved rather
2675
+ # than collapsing back to VERIFIER_EFFORT.
2676
+ local effort="${6-$VERIFIER_EFFORT}"
2475
2677
 
2476
2678
  # Write trigger for this engine
2477
2679
  write_verifier_trigger "$iter" "$engine" "$model" "$suffix"
@@ -2512,11 +2714,29 @@ run_single_verifier() {
2512
2714
  # Launch verifier — dispatch to engine-specific function
2513
2715
  local verifier_launch
2514
2716
  if [[ "$engine" = "codex" ]]; then
2515
- verifier_launch="${CODEX_BIN:-codex} -m $VERIFIER_CODEX_MODEL -c model_reasoning_effort=\"$VERIFIER_CODEX_REASONING\" --disable plugins --dangerously-bypass-approvals-and-sandbox"
2717
+ # D-1c: honor the passed-in model arg (consensus passes CONSENSUS_MODEL /
2718
+ # FINAL_CONSENSUS_MODEL as "model:reasoning") instead of always using the
2719
+ # global VERIFIER_CODEX_*; fall back to the globals when no model is given.
2720
+ local _cx_model="$VERIFIER_CODEX_MODEL" _cx_reason="$VERIFIER_CODEX_REASONING"
2721
+ if [[ -n "$model" && "$model" == *:* ]]; then
2722
+ # D-1c (codex LOW): validate "model:reasoning" before splitting. Reject an
2723
+ # empty model, an empty/unknown reasoning, or >1 colon (e.g. "gpt-5.5:",
2724
+ # ":medium", "foo:bar:baz") and fall back to the globals instead of
2725
+ # emitting a bad -m or empty reasoning_effort.
2726
+ local _m="${model%%:*}" _r="${model##*:}"
2727
+ if [[ -n "$_m" && "$model" != *:*:* && "$_r" == (minimal|low|medium|high|xhigh) ]]; then
2728
+ _cx_model="$_m"; _cx_reason="$_r"
2729
+ else
2730
+ log " WARNING: malformed consensus codex model '$model' — falling back to $_cx_model:$_cx_reason"
2731
+ fi
2732
+ elif [[ -n "$model" ]]; then
2733
+ _cx_model="$model"
2734
+ fi
2735
+ verifier_launch="${CODEX_BIN:-codex} -m $_cx_model -c model_reasoning_effort=\"$_cx_reason\" -c mcp_servers='{}' --disable plugins --dangerously-bypass-approvals-and-sandbox"
2516
2736
  launch_verifier_codex "$VERIFIER_PANE" "$prompt_file" "$iter" "$verifier_launch"
2517
- log_debug "Verifier$suffix codex TUI dispatched"
2737
+ log_debug "Verifier$suffix codex TUI dispatched (model=$_cx_model reasoning=$_cx_reason)"
2518
2738
  else
2519
- verifier_launch="$(build_claude_cmd tui "$model" "" "" "$VERIFIER_EFFORT")"
2739
+ verifier_launch="$(build_claude_cmd tui "$model" "" "" "$effort")"
2520
2740
  if ! launch_verifier_claude "$VERIFIER_PANE" "$prompt_file" "$iter" "$verifier_launch"; then
2521
2741
  log_error "Verifier$suffix failed to start"
2522
2742
  return 1
@@ -2566,12 +2786,17 @@ run_single_verifier() {
2566
2786
  else
2567
2787
  # Claude: use full poll_for_signal with heartbeat/nudge
2568
2788
  log " Polling for verify-verdict.json ($suffix)..."
2569
- if ! poll_for_signal "$VERDICT_FILE" "$VERIFIER_HEARTBEAT" "$VERIFIER_PANE" "$verifier_launch" "Verifier$suffix"; then
2570
- local verifier_poll_rc=$?
2789
+ # F-25: capture rc DIRECTLY (not inside `if ! cmd; then $?`, which yields the
2790
+ # negated condition's status (always 0), not poll's rc — the same latent bug fixed at the main
2791
+ # verifier poll site). Keeps the rc==2 "sentinel already written" branch live.
2792
+ poll_for_signal "$VERDICT_FILE" "$VERIFIER_HEARTBEAT" "$VERIFIER_PANE" "$verifier_launch" "Verifier$suffix"
2793
+ local verifier_poll_rc=$?
2794
+ if (( verifier_poll_rc != 0 )); then
2571
2795
  if (( verifier_poll_rc == 2 )); then
2796
+ log_debug "[GOV] run_single_verifier poll hard-fail (rc=2, sentinel already written)"
2572
2797
  return 1
2573
2798
  fi
2574
- log_error "Verifier$suffix poll failed"
2799
+ log_error "Verifier$suffix poll failed (rc=$verifier_poll_rc)"
2575
2800
  return 1
2576
2801
  fi
2577
2802
  fi
@@ -2621,29 +2846,54 @@ run_sequential_final_verify() {
2621
2846
  fi
2622
2847
  wait_for_pane_ready "$VERIFIER_PANE" 10 2>/dev/null || true
2623
2848
 
2624
- # Launch verifier
2849
+ # Launch verifier. D-1: the FINAL (ALL) verify uses FINAL_VERIFIER_* (the
2850
+ # "final strict" knob — a configured stronger model, e.g. opus, for the final
2851
+ # gate), NOT the lighter per-US VERIFIER_*. This is the configured-final-model
2852
+ # distinction, distinct from the removed per-iteration verifier auto-upgrade.
2625
2853
  local verifier_launch
2626
- if [[ "$VERIFIER_ENGINE" = "codex" ]]; then
2627
- verifier_launch="${CODEX_BIN:-codex} -m $VERIFIER_CODEX_MODEL -c model_reasoning_effort=\"$VERIFIER_CODEX_REASONING\" --disable plugins --dangerously-bypass-approvals-and-sandbox"
2854
+ if [[ "$FINAL_VERIFIER_ENGINE" = "codex" ]]; then
2855
+ verifier_launch="${CODEX_BIN:-codex} -m $FINAL_VERIFIER_CODEX_MODEL -c model_reasoning_effort=\"$FINAL_VERIFIER_CODEX_REASONING\" -c mcp_servers='{}' --disable plugins --dangerously-bypass-approvals-and-sandbox"
2628
2856
  launch_verifier_codex "$VERIFIER_PANE" "$verifier_prompt" "$iter" "$verifier_launch"
2629
2857
  else
2630
- verifier_launch="$(build_claude_cmd tui "$VERIFIER_MODEL" "" "" "$VERIFIER_EFFORT")"
2858
+ verifier_launch="$(build_claude_cmd tui "$FINAL_VERIFIER_MODEL" "" "" "$FINAL_VERIFIER_EFFORT")"
2631
2859
  launch_verifier_claude "$VERIFIER_PANE" "$verifier_prompt" "$iter" "$verifier_launch" || {
2632
- log_error "Failed to launch verifier for $us"
2860
+ log_error "Failed to launch final verifier for $us"
2633
2861
  FAILED_US="$us"
2634
2862
  return 1
2635
2863
  }
2636
2864
  fi
2637
2865
 
2638
- # Poll for verdict
2866
+ # Poll for verdict. D-4: distinguish rc==2 (hard-fail, sentinel already
2867
+ # written → terminal) from rc==1 (transient pane race/timeout) and give ONE
2868
+ # replace-pane + re-dispatch retry before failing the US — the F-10 retry
2869
+ # parity the per-US main verifier site has but this final-verify path lacked
2870
+ # (a single transient poll miss falsely failed a US at the most expensive
2871
+ # end-of-campaign moment, charging a bogus consecutive failure).
2639
2872
  rm -f "$VERDICT_FILE"
2640
2873
  local poll_rc=0
2641
2874
  poll_for_signal "$VERDICT_FILE" "$VERIFIER_HEARTBEAT" "$VERIFIER_PANE" "$verifier_launch" "Verifier-final" || poll_rc=$?
2642
- if (( poll_rc != 0 )); then
2643
- log_error "Verifier poll failed for $us (rc=$poll_rc)"
2875
+ if (( poll_rc == 2 )); then
2876
+ log_error "Verifier hard-fail (rc=2, sentinel written) for $us in final verify"
2644
2877
  FAILED_US="$us"
2645
2878
  return 1
2646
2879
  fi
2880
+ if (( poll_rc == 1 )); then
2881
+ log " Verifier-final transient poll fail for $us — replacing pane + retrying once (D-4)"
2882
+ replace_worker_pane "$VERIFIER_PANE" "verifier"
2883
+ VERIFIER_PANE=$(jq -r '.panes.verifier' "$SESSION_CONFIG")
2884
+ if [[ "$FINAL_VERIFIER_ENGINE" = "codex" ]]; then
2885
+ launch_verifier_codex "$VERIFIER_PANE" "$verifier_prompt" "$iter" "$verifier_launch"
2886
+ else
2887
+ launch_verifier_claude "$VERIFIER_PANE" "$verifier_prompt" "$iter" "$verifier_launch" || { FAILED_US="$us"; return 1; }
2888
+ fi
2889
+ rm -f "$VERDICT_FILE"; poll_rc=0
2890
+ poll_for_signal "$VERDICT_FILE" "$VERIFIER_HEARTBEAT" "$VERIFIER_PANE" "$verifier_launch" "Verifier-final" || poll_rc=$?
2891
+ if (( poll_rc != 0 )); then
2892
+ log_error "Verifier poll failed for $us after replace+retry (rc=$poll_rc)"
2893
+ FAILED_US="$us"
2894
+ return 1
2895
+ fi
2896
+ fi
2647
2897
 
2648
2898
  # Bug #7 Fix-Q/R: reap verifier pane between per-US final verifications so
2649
2899
  # the previous codex/claude TUI cannot continue running while the next per-
@@ -2699,6 +2949,26 @@ _should_use_consensus() {
2699
2949
  # --- US-004: Run consensus verification (claude + codex sequentially) ---
2700
2950
  run_consensus_verification() {
2701
2951
  local iter="$1"
2952
+ # D-15: the US under consensus (for the merged verdict's us_id, so the D-3
2953
+ # cross-check applies to consensus too). Falls back to the caller's local
2954
+ # signal_us_id (zsh dynamic scope) then ALL.
2955
+ local cons_us_id="${2:-${signal_us_id:-ALL}}"
2956
+ # D-15 fix: us_id is interpolated into the merged-verdict JSON via echo, so make it
2957
+ # JSON-safe. It is always "ALL" or "US-<digits>"; anything else → ALL (a value
2958
+ # with a quote/backslash/control char would otherwise produce invalid JSON).
2959
+ [[ "$cons_us_id" == (ALL|US-<->) ]] || cons_us_id="ALL"
2960
+ # D-1c: wire the documented consensus cross-verifier model knobs. Primary
2961
+ # (claude) uses VERIFIER_MODEL/FINAL_VERIFIER_MODEL; cross (codex) uses
2962
+ # CONSENSUS_MODEL/FINAL_CONSENSUS_MODEL ("model:reasoning"). Final (ALL)
2963
+ # picks the stricter pair; per-US picks the lighter pair.
2964
+ local _cons_claude_model _cons_codex_model _cons_claude_effort
2965
+ if [[ "$cons_us_id" == "ALL" ]]; then
2966
+ _cons_claude_model="$FINAL_VERIFIER_MODEL"; _cons_codex_model="$FINAL_CONSENSUS_MODEL"
2967
+ _cons_claude_effort="$FINAL_VERIFIER_EFFORT" # codex MEDIUM: final claude effort
2968
+ else
2969
+ _cons_claude_model="$VERIFIER_MODEL"; _cons_codex_model="$CONSENSUS_MODEL"
2970
+ _cons_claude_effort="$VERIFIER_EFFORT"
2971
+ fi
2702
2972
  local claude_verdict_file="$LOGS_DIR/iter-$(printf '%03d' $iter).verify-verdict-claude.json"
2703
2973
  local codex_verdict_file="$LOGS_DIR/iter-$(printf '%03d' $iter).verify-verdict-codex.json"
2704
2974
 
@@ -2712,7 +2982,7 @@ run_consensus_verification() {
2712
2982
 
2713
2983
  # Run claude verifier first
2714
2984
  local _claude_t0=$(date +%s)
2715
- if ! run_single_verifier "$iter" "claude" "$VERIFIER_MODEL" "-claude" "$claude_verdict_file"; then
2985
+ if ! run_single_verifier "$iter" "claude" "$_cons_claude_model" "-claude" "$claude_verdict_file" "$_cons_claude_effort"; then
2716
2986
  log_error "Claude verifier failed in consensus round $CONSENSUS_ROUND"
2717
2987
  return 1
2718
2988
  fi
@@ -2723,7 +2993,7 @@ run_consensus_verification() {
2723
2993
  log " WARNING: Claude verdict is '$CLAUDE_VERDICT' — likely interrupted. Retrying claude verifier..."
2724
2994
  log_debug "[GOV] iter=$iter phase=consensus_claude_retry reason=null_verdict"
2725
2995
  rm -f "$claude_verdict_file" 2>/dev/null
2726
- if ! run_single_verifier "$iter" "claude" "$VERIFIER_MODEL" "-claude" "$claude_verdict_file"; then
2996
+ if ! run_single_verifier "$iter" "claude" "$_cons_claude_model" "-claude" "$claude_verdict_file" "$_cons_claude_effort"; then
2727
2997
  log_error "Claude verifier retry also failed"
2728
2998
  return 1
2729
2999
  fi
@@ -2733,19 +3003,36 @@ run_consensus_verification() {
2733
3003
  return 1
2734
3004
  fi
2735
3005
  fi
2736
- log_debug "[GOV] iter=$iter phase=consensus_claude verdict=$CLAUDE_VERDICT model=$VERIFIER_MODEL"
3006
+ log_debug "[GOV] iter=$iter phase=consensus_claude verdict=$CLAUDE_VERDICT model=$_cons_claude_model"
2737
3007
 
2738
3008
  # consensus-fail-fast removed (complexity vs value too low)
2739
3009
 
2740
3010
  # Run codex verifier second
2741
3011
  local _codex_t0=$(date +%s)
2742
- if ! run_single_verifier "$iter" "codex" "$VERIFIER_CODEX_MODEL" "-codex" "$codex_verdict_file"; then
3012
+ if ! run_single_verifier "$iter" "codex" "$_cons_codex_model" "-codex" "$codex_verdict_file"; then
2743
3013
  log_error "Codex verifier failed in consensus round $CONSENSUS_ROUND"
2744
3014
  return 1
2745
3015
  fi
2746
3016
  ITER_VERIFIER_CODEX_DURATION_S=$(( $(date +%s) - _codex_t0 ))
2747
3017
  CODEX_VERDICT=$(jq -r '.verdict' "$codex_verdict_file" 2>/dev/null)
2748
- log_debug "[GOV] iter=$iter phase=consensus_codex verdict=$CODEX_VERDICT model=$VERIFIER_CODEX_MODEL reasoning=$VERIFIER_CODEX_REASONING"
3018
+ # D-14: validate codex verdict is not null/empty — retry once (symmetry with the
3019
+ # claude null-retry above). A transient codex interruption otherwise counts as a
3020
+ # non-pass, burns a consensus round, and can BLOCK after 6 rounds.
3021
+ if [[ -z "$CODEX_VERDICT" || "$CODEX_VERDICT" == "null" ]]; then
3022
+ log " WARNING: Codex verdict is '$CODEX_VERDICT' — likely interrupted. Retrying codex verifier..."
3023
+ log_debug "[GOV] iter=$iter phase=consensus_codex_retry reason=null_verdict"
3024
+ rm -f "$codex_verdict_file" 2>/dev/null
3025
+ if ! run_single_verifier "$iter" "codex" "$_cons_codex_model" "-codex" "$codex_verdict_file"; then
3026
+ log_error "Codex verifier retry also failed"
3027
+ return 1
3028
+ fi
3029
+ CODEX_VERDICT=$(jq -r '.verdict' "$codex_verdict_file" 2>/dev/null)
3030
+ if [[ -z "$CODEX_VERDICT" || "$CODEX_VERDICT" == "null" ]]; then
3031
+ log_error "Codex verdict still null after retry — consensus cannot proceed"
3032
+ return 1
3033
+ fi
3034
+ fi
3035
+ log_debug "[GOV] iter=$iter phase=consensus_codex verdict=$CODEX_VERDICT model=$_cons_codex_model"
2749
3036
 
2750
3037
  log " Consensus: claude=$CLAUDE_VERDICT codex=$CODEX_VERDICT"
2751
3038
  local _combined_action="retry"
@@ -2760,6 +3047,7 @@ run_consensus_verification() {
2760
3047
  {
2761
3048
  echo '{'
2762
3049
  echo ' "verdict": "pass",'
3050
+ echo ' "us_id": "'"$cons_us_id"'",'
2763
3051
  echo ' "verified_at_utc": "'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'",'
2764
3052
  echo ' "summary": "Consensus PASS: both claude and codex verified independently",'
2765
3053
  echo ' "recommended_state_transition": "complete",'
@@ -2852,51 +3140,38 @@ run_consensus_verification() {
2852
3140
  # =============================================================================
2853
3141
 
2854
3142
  main() {
2855
- # --- US-026 R14 P0: project-scoped runner lockfile (mkdir atomic) ---
2856
- # Prevents duplicate runners on the same project root regardless of slug.
2857
- # Different ROOT_HASH allows independent parallel runners across projects.
3143
+ # --- US-026 R14 P0: project-scoped runner lock (per-ROOT, regardless of slug) ---
3144
+ # D-9: delegate to acquire_slug_lock — the F-20-proven, race-safe primitive where
3145
+ # the PID *is* the lock (`set -C` atomic create writes the pid in one redirect),
3146
+ # so there is NO acquire/pid-write gap. The previous dir-based design (mkdir a dir
3147
+ # + a separate pid file) had a fundamental gap between acquiring the dir and
3148
+ # writing the pid that a recovery mutex alone could not close (codex D-9 R2).
3149
+ # Metadata (slug/root) goes to a sidecar for the duplicate message + audit.
3150
+ # Different ROOT_HASH → independent parallel runners across projects.
2858
3151
  mkdir -p "$(dirname "$RUNNER_LOCKFILE_PATH")" 2>/dev/null
2859
- if ! mkdir "$RUNNER_LOCKDIR" 2>/dev/null; then
3152
+ if acquire_slug_lock "$RUNNER_LOCKFILE_PATH"; then
3153
+ printf '{"pid":%s,"slug":"%s","root":"%s","started_at":"%s"}\n' \
3154
+ "$$" "$SLUG" "$ROOT" "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "${RUNNER_LOCKFILE_PATH}.meta" 2>/dev/null
3155
+ else
2860
3156
  local existing existing_slug
2861
- existing=$(jq -r '.pid' "$RUNNER_LOCKFILE_PATH" 2>/dev/null || echo 0)
2862
- existing_slug=$(jq -r '.slug // "unknown"' "$RUNNER_LOCKFILE_PATH" 2>/dev/null || echo unknown)
2863
- if [[ "$existing" -gt 0 ]] && kill -0 "$existing" 2>/dev/null; then
2864
- echo "duplicate rlp-desk runner detected on this project root. existing pid=$existing slug=$existing_slug, this attempt slug=$SLUG. exiting." >&2
2865
- echo " Recover with: rm -rf '$RUNNER_LOCKDIR' '$RUNNER_LOCKFILE_PATH' (only if pid $existing is confirmed dead)" >&2
2866
- exit 1
2867
- fi
2868
- rm -rf "$RUNNER_LOCKDIR"
2869
- mkdir "$RUNNER_LOCKDIR" 2>/dev/null || {
2870
- echo "failed to acquire runner lock after stale cleanup; another wrapper raced ahead. exit 1" >&2
2871
- exit 1
2872
- }
2873
- echo "stale runner lockfile cleaned (pid $existing dead) — acquired" >&2
3157
+ existing=$(cat "$RUNNER_LOCKFILE_PATH" 2>/dev/null)
3158
+ existing_slug=$(jq -r '.slug // "unknown"' "${RUNNER_LOCKFILE_PATH}.meta" 2>/dev/null || echo unknown)
3159
+ echo "duplicate rlp-desk runner detected on this project root. existing pid=${existing:-unknown} slug=$existing_slug, this attempt slug=$SLUG. exiting." >&2
3160
+ echo " Recover with: rm -f '$RUNNER_LOCKFILE_PATH' '${RUNNER_LOCKFILE_PATH}.meta' && rm -rf '${RUNNER_LOCKFILE_PATH}.recovery.d' (only if pid ${existing:-?} is confirmed dead)" >&2
3161
+ exit 1
2874
3162
  fi
2875
- printf '{"pid":%s,"slug":"%s","root":"%s","started_at":"%s"}\n' \
2876
- "$$" "$SLUG" "$ROOT" "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$RUNNER_LOCKFILE_PATH"
2877
3163
 
2878
- # --- Lockfile: prevent duplicate execution ---
2879
- local lockfile="$LOCKFILE_PATH"
2880
- mkdir -p "$(dirname "$lockfile")" 2>/dev/null
2881
- if ! (set -C; echo $$ > "$lockfile") 2>/dev/null; then
2882
- local lock_pid
2883
- lock_pid=$(cat "$lockfile" 2>/dev/null)
2884
- if kill -0 "$lock_pid" 2>/dev/null; then
2885
- log_error "Another instance is already running (PID $lock_pid). Kill $lock_pid or rm $lockfile"
2886
- exit 1
2887
- fi
2888
- # Stale lock — overwrite.
2889
- # NOTE (ZSH-4, deferred): a fully race-safe stale-lock recovery is a separate
2890
- # distributed-lock redesign (codex review found subtle rm/create + mutex-leak
2891
- # races in patch attempts). This finding is LOW: the outer RUNNER_LOCKDIR mkdir
2892
- # lock (keyed on the same $ROOT) already serializes runners before this inner
2893
- # path is reached, so the inner race is unreachable in practice. Left at the
2894
- # tested baseline pending a dedicated redesign.
2895
- log "Stale lock detected (PID ${lock_pid:-unknown} not running), recovering"
2896
- echo $$ > "$lockfile"
3164
+ # --- Lockfile: prevent duplicate execution (ZSH-4 race-safe, v0.17.1) ---
3165
+ # Delegates to acquire_slug_lock (lib_ralph_desk.zsh): atomic set -C fast path +
3166
+ # mkdir-mutex-serialized, PID-reaped stale recovery. Race-safe vs concurrent
3167
+ # recoverers, gap-starters, and a crashed-recoverer mutex leak.
3168
+ if acquire_slug_lock "$LOCKFILE_PATH"; then
2897
3169
  LOCKFILE_ACQUIRED=1
2898
3170
  else
2899
- LOCKFILE_ACQUIRED=1
3171
+ local lock_pid
3172
+ lock_pid=$(cat "$LOCKFILE_PATH" 2>/dev/null)
3173
+ log_error "Another instance is already running or won the lock race (PID ${lock_pid:-unknown}). Kill it or rm $LOCKFILE_PATH"
3174
+ exit 1
2900
3175
  fi
2901
3176
  # US-023 R11 P2-K: chain `_emit_final_cost_log` so cost-log.jsonl is never silently empty on exit.
2902
3177
  trap '_emit_final_cost_log; cleanup' EXIT INT TERM
@@ -2994,19 +3269,25 @@ main() {
2994
3269
  US_LIST=$(grep -oE 'US-[0-9]+' "$prd_file" | sort -u | tr '\n' ',' | sed 's/,$//')
2995
3270
  fi
2996
3271
 
2997
- # Initialize VERIFIED_US from memory's Completed Stories (carry over previous runs)
2998
- local memory_file="$DESK/memos/${SLUG}-memory.md"
2999
- if [[ -f "$memory_file" ]]; then
3000
- local completed_us
3001
- completed_us=$(sed -n '/^## Completed Stories$/,/^## /p' "$memory_file" 2>/dev/null | grep '^- US-' | sed 's/^- \(US-[0-9]*\):.*/\1/' | sort -u | tr '\n' ',' | sed 's/,$//')
3002
- if [[ -n "$completed_us" ]]; then
3003
- VERIFIED_US="$completed_us"
3004
- log " Loaded completed stories from memory: $VERIFIED_US"
3005
- log_debug "[FLOW] loaded_verified_us_from_memory=$VERIFIED_US"
3006
- fi
3272
+ # F-14 + status.json promotion (Item-4): VERIFIED_US restore precedence,
3273
+ # most-durable first —
3274
+ # 1. durable append-only ledger (leader-written, structured)
3275
+ # 2. status.json verified_us (leader serialization written EVERY phase by
3276
+ # update_status — structured, reliable; promoted ABOVE the prose parse)
3277
+ # 3. the Worker's prose "## Completed Stories" — LAST resort (fresh-context
3278
+ # LLM output that can drift; only legacy campaigns without 1 or 2 use it).
3279
+ if [[ -f "$VERIFIED_LEDGER" ]]; then
3280
+ local ledger_verified
3281
+ ledger_verified=$(jq -rR 'fromjson? | .us_id // empty' "$VERIFIED_LEDGER" 2>/dev/null | grep -E '^US-[0-9]+$' | sort -u | tr '\n' ',' | sed 's/,$//')
3282
+ if [[ -n "$ledger_verified" ]]; then
3283
+ VERIFIED_US="$ledger_verified"
3284
+ log " Restored verified_us from durable ledger: $VERIFIED_US"
3285
+ log_debug "[FLOW] restored_verified_us_from_ledger=$VERIFIED_US"
3007
3286
  fi
3287
+ fi
3008
3288
 
3009
- # D1: Fallback — restore verified_us from status.json if memory had none
3289
+ # 2nd source: status.json verified_us — structured leader serialization,
3290
+ # more reliable than the prose parse below (Item-4: promoted above prose).
3010
3291
  if [[ -z "$VERIFIED_US" && -f "$STATUS_FILE" ]]; then
3011
3292
  local status_verified
3012
3293
  status_verified=$(jq -r '.verified_us // [] | join(",")' "$STATUS_FILE" 2>/dev/null)
@@ -3016,6 +3297,82 @@ main() {
3016
3297
  log_debug "[FLOW] restored_verified_us_from_status=$VERIFIED_US"
3017
3298
  fi
3018
3299
  fi
3300
+
3301
+ # LAST resort: the Worker's prose "## Completed Stories" (drift-prone; legacy).
3302
+ local memory_file="$DESK/memos/${SLUG}-memory.md"
3303
+ if [[ -z "$VERIFIED_US" && -f "$memory_file" ]]; then
3304
+ local completed_us
3305
+ completed_us=$(sed -n '/^## Completed Stories$/,/^## /p' "$memory_file" 2>/dev/null | grep '^- US-' | sed 's/^- \(US-[0-9]*\):.*/\1/' | sort -u | tr '\n' ',' | sed 's/,$//')
3306
+ if [[ -n "$completed_us" ]]; then
3307
+ VERIFIED_US="$completed_us"
3308
+ log " Loaded completed stories from memory (last-resort prose): $VERIFIED_US"
3309
+ log_debug "[FLOW] loaded_verified_us_from_memory=$VERIFIED_US"
3310
+ fi
3311
+ fi
3312
+
3313
+ fi
3314
+
3315
+ # F-13 (batch-safe): restore the circuit-breaker counter on relaunch. This runs
3316
+ # OUTSIDE the per-us block above because consecutive_failures is meaningful in
3317
+ # EVERY verify mode — a batch-mode campaign crash-loops the same way, so nesting
3318
+ # the restore under `per-us` let a batch relaunch reset its CB to 0 and evade the
3319
+ # breaker. status.json persists the counter (lib_ralph_desk.zsh) every phase;
3320
+ # only verified_us was ever read back. (verified_us restore stays per-us: batch
3321
+ # mode has no per-US progress to rehydrate.) Normal reset-on-progress applies.
3322
+ if [[ -f "$STATUS_FILE" ]]; then
3323
+ local _status_cf
3324
+ _status_cf=$(jq -r '.consecutive_failures // 0' "$STATUS_FILE" 2>/dev/null)
3325
+ if [[ "$_status_cf" == <-> && "$_status_cf" -gt 0 ]]; then
3326
+ CONSECUTIVE_FAILURES="$_status_cf"
3327
+ log " Restored consecutive_failures from status.json: $CONSECUTIVE_FAILURES"
3328
+ log_debug "[FLOW] restored_consecutive_failures_from_status=$CONSECUTIVE_FAILURES"
3329
+ fi
3330
+ # D-5: also restore the consecutive-BLOCKS state so the now-live block CB
3331
+ # (F-22) survives a relaunch — otherwise a crash-loop resets it to 0 every
3332
+ # relaunch and the block breaker is evadable (the same durability hole F-13
3333
+ # closed for consecutive_failures). Restore last_block_reason too, else a
3334
+ # restored count is immediately reset on the next block (reason wouldn't match).
3335
+ local _status_cb _status_lbr
3336
+ _status_cb=$(jq -r '.consecutive_blocks // 0' "$STATUS_FILE" 2>/dev/null)
3337
+ _status_lbr=$(jq -r '.last_block_reason // ""' "$STATUS_FILE" 2>/dev/null)
3338
+ # D-5 fix: restore ATOMICALLY — both the count AND the reason, or neither. A
3339
+ # count without its reason is a useless half-state (the next block's reason
3340
+ # wouldn't match the empty LAST_BLOCK_REASON and would reset the count to 1
3341
+ # anyway), so require both to be present before applying.
3342
+ if [[ "$_status_cb" == <-> && "$_status_cb" -gt 0 && -n "$_status_lbr" ]]; then
3343
+ CONSECUTIVE_BLOCKS="$_status_cb"
3344
+ LAST_BLOCK_REASON="$_status_lbr"
3345
+ log " Restored consecutive_blocks from status.json: $CONSECUTIVE_BLOCKS"
3346
+ log_debug "[FLOW] restored_consecutive_blocks_from_status=$CONSECUTIVE_BLOCKS"
3347
+ fi
3348
+ # D-5b (restore-priority, user-chosen): if the Worker was AUTO-upgraded during a
3349
+ # prior segment (model_upgraded==1), restore the upgraded model + its engine
3350
+ # triple + the upgrade bookkeeping, so a crash-relaunch resumes at the upgraded
3351
+ # model (and the architecture-escalation trigger survives) instead of silently
3352
+ # reverting to the base model and re-spending iterations to re-upgrade. Gated on
3353
+ # model_upgraded==1 so it ONLY overrides for the auto-upgrade case (a fresh
3354
+ # campaign that never upgraded keeps the env/CLI model).
3355
+ local _status_mu
3356
+ _status_mu=$(jq -r '.model_upgraded // 0' "$STATUS_FILE" 2>/dev/null)
3357
+ if [[ "$_status_mu" == "1" ]]; then
3358
+ local _s_wm _s_we _s_wcm _s_wcr _s_owm _s_sufc
3359
+ _s_wm=$(jq -r '.worker_model // empty' "$STATUS_FILE" 2>/dev/null)
3360
+ _s_we=$(jq -r '.worker_engine // empty' "$STATUS_FILE" 2>/dev/null)
3361
+ _s_wcm=$(jq -r '.worker_codex_model // empty' "$STATUS_FILE" 2>/dev/null)
3362
+ _s_wcr=$(jq -r '.worker_codex_reasoning // empty' "$STATUS_FILE" 2>/dev/null)
3363
+ _s_owm=$(jq -r '.original_worker_model // empty' "$STATUS_FILE" 2>/dev/null)
3364
+ _s_sufc=$(jq -r '.same_us_fail_count // 0' "$STATUS_FILE" 2>/dev/null)
3365
+ if [[ -n "$_s_wm" && -n "$_s_we" ]]; then
3366
+ _MODEL_UPGRADED=1
3367
+ WORKER_MODEL="$_s_wm"; WORKER_ENGINE="$_s_we"
3368
+ [[ -n "$_s_wcm" ]] && WORKER_CODEX_MODEL="$_s_wcm"
3369
+ [[ -n "$_s_wcr" ]] && WORKER_CODEX_REASONING="$_s_wcr"
3370
+ [[ -n "$_s_owm" ]] && _ORIGINAL_WORKER_MODEL="$_s_owm"
3371
+ [[ "$_s_sufc" == <-> ]] && _SAME_US_FAIL_COUNT="$_s_sufc"
3372
+ log " Restored auto-upgraded Worker model: $WORKER_MODEL ($WORKER_ENGINE), orig=${_ORIGINAL_WORKER_MODEL:-?}, same_us_fails=$_SAME_US_FAIL_COUNT (D-5b restore-priority)"
3373
+ log_debug "[FLOW] restored_model_upgrade=true worker_model=$WORKER_MODEL engine=$WORKER_ENGINE same_us_fail=$_SAME_US_FAIL_COUNT"
3374
+ fi
3375
+ fi
3019
3376
  fi
3020
3377
 
3021
3378
  # Initialize PRD snapshot state for live update detection
@@ -3028,6 +3385,16 @@ main() {
3028
3385
  # Print security warning (governance.md s7: --dangerously-skip-permissions)
3029
3386
  print_security_warning
3030
3387
 
3388
+ # F-8 scope guard (F-19): snapshot the tracked files that are ALREADY dirty
3389
+ # before the campaign touches anything. The F-8 leader-recovery auto-commit
3390
+ # (Bug #8 Gate 3) must commit only the Worker's OWN edits and never sweep an
3391
+ # operator's pre-existing uncommitted work into a Worker-recovery commit.
3392
+ # `git diff --name-only HEAD` lists tracked files modified vs HEAD (staged or
3393
+ # not); untracked cruft is excluded and is never auto-committed. Empty when the
3394
+ # tree starts clean. Recorded once; excluded at recovery time in Gate 3.
3395
+ typeset -g CAMPAIGN_PREEXISTING_DIRTY
3396
+ CAMPAIGN_PREEXISTING_DIRTY=$(git -C "$ROOT" diff --name-only HEAD 2>/dev/null)
3397
+
3031
3398
  # Validate scaffold
3032
3399
  validate_scaffold
3033
3400
 
@@ -3166,19 +3533,34 @@ main() {
3166
3533
  # --- governance.md s7 step 5: Execute Worker (dispatched to engine-specific function) ---
3167
3534
  log_debug "[FLOW] iter=$ITERATION phase=worker engine=$WORKER_ENGINE model=$WORKER_MODEL dispatched=true"
3168
3535
 
3536
+ # F-11: a pane-start failure is usually the transient F6.1 spawn race
3537
+ # (send-keys before the pane's shell is ready). Replace the pane and retry
3538
+ # ONCE before BLOCKing, instead of terminating the campaign on a transient.
3169
3539
  if [[ "$WORKER_ENGINE" = "codex" ]]; then
3170
- worker_launch="${CODEX_BIN:-codex} -m $WORKER_CODEX_MODEL -c model_reasoning_effort=\"$WORKER_CODEX_REASONING\" --disable plugins --dangerously-bypass-approvals-and-sandbox"
3540
+ worker_launch="${CODEX_BIN:-codex} -m $WORKER_CODEX_MODEL -c model_reasoning_effort=\"$WORKER_CODEX_REASONING\" -c mcp_servers='{}' --disable plugins --dangerously-bypass-approvals-and-sandbox"
3171
3541
  if ! launch_worker_codex "$WORKER_PANE" "$worker_prompt" "$ITERATION" "$worker_launch"; then
3172
- write_blocked_sentinel "Worker codex failed to start in pane" "" "infra_failure"
3173
- update_status "blocked" "worker_start_failed"
3174
- return 1
3542
+ log " Worker codex failed to start replacing pane and retrying once (F-11)."
3543
+ log_debug "[GOV] iter=$ITERATION worker_start_failed=true action=replace_retry engine=codex"
3544
+ replace_worker_pane "$WORKER_PANE" "worker"
3545
+ WORKER_PANE=$(jq -r '.panes.worker' "$SESSION_CONFIG")
3546
+ if ! launch_worker_codex "$WORKER_PANE" "$worker_prompt" "$ITERATION" "$worker_launch"; then
3547
+ write_blocked_sentinel "Worker codex failed to start in pane after replace+retry" "" "infra_failure"
3548
+ update_status "blocked" "worker_start_failed"
3549
+ return 1
3550
+ fi
3175
3551
  fi
3176
3552
  else
3177
3553
  worker_launch="$(build_claude_cmd tui "$WORKER_MODEL" "" "" "$WORKER_EFFORT")"
3178
3554
  if ! launch_worker_claude "$WORKER_PANE" "$worker_prompt" "$ITERATION" "$worker_launch"; then
3179
- write_blocked_sentinel "Worker claude failed to start in pane" "" "infra_failure"
3180
- update_status "blocked" "worker_start_failed"
3181
- return 1
3555
+ log " Worker claude failed to start replacing pane and retrying once (F-11)."
3556
+ log_debug "[GOV] iter=$ITERATION worker_start_failed=true action=replace_retry engine=claude"
3557
+ replace_worker_pane "$WORKER_PANE" "worker"
3558
+ WORKER_PANE=$(jq -r '.panes.worker' "$SESSION_CONFIG")
3559
+ if ! launch_worker_claude "$WORKER_PANE" "$worker_prompt" "$ITERATION" "$worker_launch"; then
3560
+ write_blocked_sentinel "Worker claude failed to start in pane after replace+retry" "" "infra_failure"
3561
+ update_status "blocked" "worker_start_failed"
3562
+ return 1
3563
+ fi
3182
3564
  fi
3183
3565
  fi
3184
3566
  else
@@ -3299,12 +3681,25 @@ main() {
3299
3681
  local vp_count
3300
3682
  vp_count=$(jq -r '.verified_acs // [] | length' "$SIGNAL_FILE" 2>/dev/null || echo 0)
3301
3683
  if [[ "$vp_count" -eq 0 ]]; then
3302
- log " Worker signal verify_partial but verified_acs is empty — downgrading to blocked (verify_partial_malformed)."
3684
+ # F-12: a Worker formatting slip (verify_partial with empty verified_acs)
3685
+ # is recoverable — route it back to the Worker as a soft-fail BOUNDED by
3686
+ # the consecutive-failure circuit breaker, instead of a terminal
3687
+ # mission_abort that ends the whole campaign on a single malformed signal.
3688
+ # A fresh-context Worker that keeps malforming still trips the CB and
3689
+ # blocks; one slip just costs an iteration.
3303
3690
  local vp_us_id
3304
3691
  vp_us_id=$(jq -r '.us_id // empty' "$SIGNAL_FILE" 2>/dev/null)
3305
- write_blocked_sentinel "verify_partial_malformed: empty verified_acs" "${vp_us_id:-${CURRENT_US:-ALL}}" "mission_abort"
3306
- update_status "blocked" "verify_partial_malformed"
3307
- break
3692
+ (( CONSECUTIVE_FAILURES++ ))
3693
+ log " Worker verify_partial malformed (empty verified_acs) — soft-fail retry $CONSECUTIVE_FAILURES/$EFFECTIVE_CB_THRESHOLD (bounded by CB)."
3694
+ log_debug "[GOV] iter=$ITERATION verify_partial_malformed=soft_fail consecutive_failures=$CONSECUTIVE_FAILURES threshold=$EFFECTIVE_CB_THRESHOLD"
3695
+ update_status "worker" "verify_partial_malformed_retry"
3696
+ if (( CONSECUTIVE_FAILURES >= EFFECTIVE_CB_THRESHOLD )); then
3697
+ log_error " verify_partial_malformed repeated $CONSECUTIVE_FAILURES times (>= $EFFECTIVE_CB_THRESHOLD) — blocking."
3698
+ write_blocked_sentinel "verify_partial_malformed repeated $CONSECUTIVE_FAILURES times" "${vp_us_id:-${CURRENT_US:-ALL}}" "repeat_axis"
3699
+ update_status "blocked" "verify_partial_malformed_cb"
3700
+ break
3701
+ fi
3702
+ continue
3308
3703
  fi
3309
3704
  log " Worker signal verify_partial (verified_acs count=$vp_count). Routing to verify path."
3310
3705
  signal_status="verify"
@@ -3314,6 +3709,13 @@ main() {
3314
3709
  # Read us_id from signal for per-US scoping
3315
3710
  local signal_us_id=""
3316
3711
  signal_us_id=$(jq -r '.us_id // empty' "$SIGNAL_FILE" 2>/dev/null)
3712
+ # F-23: normalize case so a Worker emitting "all"/"All" still triggers the
3713
+ # final/ALL verify + completion paths (which match "ALL" exactly). US ids
3714
+ # are already uppercase ("US-001"), so this is a no-op for well-formed ids.
3715
+ signal_us_id="${signal_us_id:u}"
3716
+ # D-11: the US under verification is the in-flight US for lifecycle sentinels
3717
+ # fired during the verify poll (no-progress / stall / R12).
3718
+ [[ -n "$signal_us_id" ]] && CURRENT_US="$signal_us_id"
3317
3719
  log " Worker claims done (us_id=${signal_us_id:-all}). Dispatching Verifier..."
3318
3720
 
3319
3721
  # AC1: capture verifier start timestamp
@@ -3348,7 +3750,7 @@ main() {
3348
3750
  if (( use_consensus )); then
3349
3751
  # US-004: Run consensus verification (claude + codex sequentially)
3350
3752
  local consensus_rc=0
3351
- run_consensus_verification "$ITERATION" || consensus_rc=$?
3753
+ run_consensus_verification "$ITERATION" "$signal_us_id" || consensus_rc=$?
3352
3754
 
3353
3755
  if (( consensus_rc == 2 )); then
3354
3756
  # Consensus disagreement — treat as fail, fix loop will handle
@@ -3389,15 +3791,35 @@ main() {
3389
3791
  fi
3390
3792
  wait_for_pane_ready "$VERIFIER_PANE" 10 2>/dev/null || true
3391
3793
 
3794
+ # D-1: a final/ALL verify reaching the single-engine path (batch mode, or
3795
+ # any ALL verify not handled by the per-us sequential path) uses the
3796
+ # stronger FINAL_VERIFIER_*; per-US verifies keep the lighter VERIFIER_*.
3797
+ # For signal_us_id != ALL, _v_* alias VERIFIER_* EXACTLY — no behavior
3798
+ # change on the per-US hot path.
3799
+ local _v_eng _v_model _v_cxm _v_cxr _v_eff _v_role
3800
+ if [[ "$signal_us_id" == "ALL" ]]; then
3801
+ _v_eng="$FINAL_VERIFIER_ENGINE"; _v_model="$FINAL_VERIFIER_MODEL"
3802
+ _v_cxm="$FINAL_VERIFIER_CODEX_MODEL"; _v_cxr="$FINAL_VERIFIER_CODEX_REASONING"; _v_eff="$FINAL_VERIFIER_EFFORT"
3803
+ # D-10 fix: an ALL verify here runs FINAL_VERIFIER_ENGINE, so the poll's
3804
+ # dead-pane check must derive FINAL_VERIFIER_ENGINE too — use the
3805
+ # "*inal*" role so poll_for_signal's engine derivation matches _v_eng
3806
+ # (else a codex final verifier's "bash" is misjudged with VERIFIER_ENGINE).
3807
+ _v_role="Verifier-final"
3808
+ else
3809
+ _v_eng="$VERIFIER_ENGINE"; _v_model="$VERIFIER_MODEL"
3810
+ _v_cxm="$VERIFIER_CODEX_MODEL"; _v_cxr="$VERIFIER_CODEX_REASONING"; _v_eff="$VERIFIER_EFFORT"
3811
+ _v_role="Verifier"
3812
+ fi
3813
+
3392
3814
  local verifier_launch
3393
- if [[ "$VERIFIER_ENGINE" = "codex" ]]; then
3394
- verifier_launch="${CODEX_BIN:-codex} -m $VERIFIER_CODEX_MODEL -c model_reasoning_effort=\"$VERIFIER_CODEX_REASONING\" --disable plugins --dangerously-bypass-approvals-and-sandbox"
3815
+ if [[ "$_v_eng" = "codex" ]]; then
3816
+ verifier_launch="${CODEX_BIN:-codex} -m $_v_cxm -c model_reasoning_effort=\"$_v_cxr\" -c mcp_servers='{}' --disable plugins --dangerously-bypass-approvals-and-sandbox"
3395
3817
  else
3396
- verifier_launch="$(build_claude_cmd tui "$VERIFIER_MODEL" "" "" "$VERIFIER_EFFORT")"
3818
+ verifier_launch="$(build_claude_cmd tui "$_v_model" "" "" "$_v_eff")"
3397
3819
  fi
3398
- log_debug "[FLOW] iter=$ITERATION phase=verifier engine=$VERIFIER_ENGINE model=$VERIFIER_MODEL scope=${signal_us_id:-all} dispatched=true"
3820
+ log_debug "[FLOW] iter=$ITERATION phase=verifier engine=$_v_eng model=$_v_model scope=${signal_us_id:-all} dispatched=true"
3399
3821
 
3400
- if [[ "$VERIFIER_ENGINE" = "codex" ]]; then
3822
+ if [[ "$_v_eng" = "codex" ]]; then
3401
3823
  launch_verifier_codex "$VERIFIER_PANE" "$verifier_prompt" "$ITERATION" "$verifier_launch"
3402
3824
  else
3403
3825
  if ! launch_verifier_claude "$VERIFIER_PANE" "$verifier_prompt" "$ITERATION" "$verifier_launch"; then
@@ -3406,16 +3828,45 @@ main() {
3406
3828
  fi
3407
3829
  fi
3408
3830
 
3409
- # Poll for verify-verdict.json
3831
+ # Poll for verify-verdict.json — F-10: 3-strike replace+re-dispatch
3832
+ # parity with the Worker's MONITOR_FAILURE_COUNT breaker. "Bug Report #5"
3833
+ # hardened the Worker poll-fail path (retry-3-then-block) but left the
3834
+ # Verifier path as an immediate terminal BLOCK, so a single transient
3835
+ # verifier death (API blip / pane-spawn race, also F-11) ended a campaign
3836
+ # the Worker path would have survived. rc==2 keeps its original meaning
3837
+ # (already-handled → return). Only 3 consecutive failures BLOCK.
3410
3838
  log " Polling for verify-verdict.json..."
3411
- if ! poll_for_signal "$VERDICT_FILE" "$VERIFIER_HEARTBEAT" "$VERIFIER_PANE" "$verifier_launch" "Verifier"; then
3839
+ local _vpoll_strike=0 _vpoll_ok=0
3840
+ while (( _vpoll_strike < 3 )); do
3841
+ # Capture poll rc DIRECTLY — `$?` after `if cmd; then…fi` is the
3842
+ # if-statement's status (0), not cmd's rc (the original `if ! poll;
3843
+ # then local rc=$?` had this latent bug, so its `rc==2` branch was
3844
+ # dead and a hard-fail double-wrote a sentinel). rc: 0=verdict,
3845
+ # 1=timeout (retryable), 2=hard-failed + infra_failure already recorded.
3846
+ poll_for_signal "$VERDICT_FILE" "$VERIFIER_HEARTBEAT" "$VERIFIER_PANE" "$verifier_launch" "$_v_role"
3412
3847
  local verifier_poll_rc=$?
3848
+ if (( verifier_poll_rc == 0 )); then
3849
+ _vpoll_ok=1; break
3850
+ fi
3413
3851
  if (( verifier_poll_rc == 2 )); then
3414
- return 1
3852
+ return 1 # hard-failed; poll already recorded infra_failure — do not retry
3853
+ fi
3854
+ (( _vpoll_strike++ ))
3855
+ log " WARNING: Verifier poll failed (strike $_vpoll_strike/3) — replacing pane and re-dispatching"
3856
+ log_debug "[GOV] iter=$ITERATION verifier_monitor_failure=$_vpoll_strike/3"
3857
+ update_status "verifier" "poll_failed"
3858
+ (( _vpoll_strike >= 3 )) && break
3859
+ replace_worker_pane "$VERIFIER_PANE" "verifier"
3860
+ VERIFIER_PANE=$(jq -r '.panes.verifier' "$SESSION_CONFIG")
3861
+ if [[ "$_v_eng" = "codex" ]]; then
3862
+ launch_verifier_codex "$VERIFIER_PANE" "$verifier_prompt" "$ITERATION" "$verifier_launch"
3863
+ else
3864
+ launch_verifier_claude "$VERIFIER_PANE" "$verifier_prompt" "$ITERATION" "$verifier_launch" || true
3415
3865
  fi
3416
- log_error "Verifier poll failed"
3417
- # Verifier is dead/stuck BLOCK and let user decide
3418
- write_blocked_sentinel "Verifier process dead/stuck (poll failed). Pane preserved for inspection." "" "infra_failure"
3866
+ done
3867
+ if (( ! _vpoll_ok )); then
3868
+ log_error "Verifier poll failed 3× (dead/stuck after retries)"
3869
+ write_blocked_sentinel "Verifier process dead/stuck after 3 retries. Pane preserved for inspection." "" "infra_failure"
3419
3870
  update_status "blocked" "verifier_dead"
3420
3871
  return 1
3421
3872
  fi
@@ -3435,6 +3886,10 @@ main() {
3435
3886
  verdict=$(jq -r '.verdict' "$VERDICT_FILE" 2>/dev/null)
3436
3887
  local recommended
3437
3888
  recommended=$(jq -r '.recommended_state_transition' "$VERDICT_FILE" 2>/dev/null)
3889
+ # F-23: normalize so a verifier's phrasing variant doesn't strand a
3890
+ # genuinely-complete campaign at MAX_ITER. "Complete"/"completed"/"done"
3891
+ # all mean complete; comparison below is lowercase-exact.
3892
+ recommended="${recommended:l}"
3438
3893
  local verdict_summary
3439
3894
  verdict_summary=$(jq -r '.summary // "no summary"' "$VERDICT_FILE" 2>/dev/null)
3440
3895
 
@@ -3445,10 +3900,20 @@ main() {
3445
3900
 
3446
3901
  case "$verdict" in
3447
3902
  pass)
3903
+ # D-3 fix: snapshot the CB BEFORE the pass-success reset so a wrong-US
3904
+ # "pass" (us_id mismatch, handled below) accumulates the CB across
3905
+ # iterations instead of restarting from 0 each time (the reset on the
3906
+ # next line would otherwise defeat the mismatch soft-fail's CB bound).
3907
+ local _cf_before_pass=$CONSECUTIVE_FAILURES
3448
3908
  CONSECUTIVE_FAILURES=0
3449
3909
  CONSENSUS_ROUND=0
3450
3910
  _SAME_US_FAIL_COUNT=0
3451
3911
  _LAST_FAILED_US=""
3912
+ # F-22b: a pass is real progress — reset the consecutive-BLOCKS state
3913
+ # too so the now-live block CB counts only blocks with NO intervening
3914
+ # success ("consecutive" in the true sense, not cumulative).
3915
+ CONSECUTIVE_BLOCKS=0
3916
+ LAST_BLOCK_REASON=""
3452
3917
  if (( _MODEL_UPGRADED )); then
3453
3918
  log " Worker model restored: ${WORKER_MODEL} → ${_ORIGINAL_WORKER_MODEL} (pass verdict)"
3454
3919
  log_debug "[DECIDE] iter=$ITERATION phase=model_select model_restore=true from=${WORKER_MODEL} to=${_ORIGINAL_WORKER_MODEL}"
@@ -3462,17 +3927,49 @@ main() {
3462
3927
 
3463
3928
  # --- Verified US tracking (both per-us and batch modes) ---
3464
3929
  if [[ -n "$signal_us_id" && "$signal_us_id" != "ALL" ]]; then
3465
- # Add this US to verified list
3466
- if [[ -n "$VERIFIED_US" ]]; then
3467
- VERIFIED_US="${VERIFIED_US},${signal_us_id}"
3930
+ # D-3: cross-check the verdict's OWN us_id against the US the leader
3931
+ # scoped this verify to. If the verifier graded a DIFFERENT US, do
3932
+ # NOT credit signal_us_id (it was not actually verified) — soft-fail
3933
+ # so the Worker re-runs the contracted US. Acts ONLY on a PRESENT
3934
+ # mismatch (absent verdict us_id = trust the scope), so a correctly-
3935
+ # scoped verifier is never affected.
3936
+ local _verdict_us_id
3937
+ _verdict_us_id=$(jq -r '.us_id // empty' "$VERDICT_FILE" 2>/dev/null)
3938
+ _verdict_us_id="${_verdict_us_id:u}"
3939
+ if [[ -n "$_verdict_us_id" && "$_verdict_us_id" != "$signal_us_id" ]]; then
3940
+ log_error " Verdict us_id mismatch: verifier graded $_verdict_us_id but leader scoped $signal_us_id — NOT crediting (soft-fail)."
3941
+ log_debug "[GOV] iter=$ITERATION verdict_us_id_mismatch verdict_us=$_verdict_us_id signal_us=$signal_us_id"
3942
+ update_status "verifier" "us_id_mismatch"
3943
+ # D-3 fix: undo the pass-entry CB reset so consecutive mismatches
3944
+ # actually accumulate toward the breaker (else each restarts at 0).
3945
+ CONSECUTIVE_FAILURES=$_cf_before_pass
3946
+ if _bump_consecutive_failure; then
3947
+ write_blocked_sentinel "${EFFECTIVE_CB_THRESHOLD} consecutive verdict us_id mismatches" "" "repeat_axis"
3948
+ update_status "blocked" "consecutive_failures"
3949
+ return 1
3950
+ fi
3468
3951
  else
3469
- VERIFIED_US="$signal_us_id"
3952
+ # Add this US to verified list. D-12: dedup — a fresh-context Worker
3953
+ # can re-submit an already-verified US (memory drift); don't
3954
+ # double-credit it (mirrors the fail/partial-progress guard, and
3955
+ # keeps VERIFIED_US + the ledger + the coverage count honest).
3956
+ if echo ",$VERIFIED_US," | grep -q ",$signal_us_id,"; then
3957
+ log " US $signal_us_id already verified — not re-crediting (dedup)."
3958
+ log_debug "[FLOW] iter=$ITERATION verified_us_dedup=$signal_us_id"
3959
+ else
3960
+ if [[ -n "$VERIFIED_US" ]]; then
3961
+ VERIFIED_US="${VERIFIED_US},${signal_us_id}"
3962
+ else
3963
+ VERIFIED_US="$signal_us_id"
3964
+ fi
3965
+ log " US $signal_us_id verified. Verified so far: $VERIFIED_US"
3966
+ log_debug "[FLOW] iter=$ITERATION verified_us_update=$signal_us_id verified_us_total=$VERIFIED_US"
3967
+ _append_verified_ledger "$signal_us_id" # F-14: durable source-of-truth
3968
+ fi
3969
+ update_status "verifier" "pass_us"
3970
+ # Worker will do next US on next iteration
3470
3971
  fi
3471
- log " US $signal_us_id verified. Verified so far: $VERIFIED_US"
3472
- log_debug "[FLOW] iter=$ITERATION verified_us_update=$signal_us_id verified_us_total=$VERIFIED_US"
3473
- update_status "verifier" "pass_us"
3474
- # Worker will do next US on next iteration
3475
- elif [[ "$recommended" == "complete" || "$signal_us_id" == "ALL" ]]; then
3972
+ elif [[ "$recommended" == (complete|completed|done) || "$signal_us_id" == "ALL" ]]; then
3476
3973
  # Final full verify passed or complete recommended
3477
3974
  write_complete_sentinel "$verdict_summary"
3478
3975
  update_status "complete" "pass"
@@ -3499,6 +3996,7 @@ main() {
3499
3996
  VERIFIED_US="$_pus"
3500
3997
  fi
3501
3998
  log " Partial progress: $_pus passed (overall FAIL). Verified so far: $VERIFIED_US"
3999
+ _append_verified_ledger "$_pus" # F-14: durable source-of-truth
3502
4000
  fi
3503
4001
  done
3504
4002
  log_debug "[FLOW] iter=$ITERATION partial_progress prev=$_prev_verified now=$VERIFIED_US"
@@ -3507,6 +4005,9 @@ main() {
3507
4005
  # Partial progress resets consecutive failures (progress was made)
3508
4006
  if [[ "$VERIFIED_US" != "$_prev_verified" ]]; then
3509
4007
  CONSECUTIVE_FAILURES=0
4008
+ # F-22b: partial progress also resets the consecutive-blocks state.
4009
+ CONSECUTIVE_BLOCKS=0
4010
+ LAST_BLOCK_REASON=""
3510
4011
  log " Progress detected — consecutive_failures reset to 0"
3511
4012
  log_debug "[GOV] iter=$ITERATION consecutive_failures_reset=partial_progress"
3512
4013
  fi
@@ -3575,31 +4076,66 @@ main() {
3575
4076
  log " Questions: \"$verdict_summary_ri\""
3576
4077
  log " Treating as soft fail — Worker will see verdict in next iteration."
3577
4078
  update_status "verifier" "request_info"
4079
+ # F-22: count request_info toward the CB so a verifier looping on
4080
+ # request_info trips the breaker instead of spinning to MAX_ITER.
4081
+ if _bump_consecutive_failure; then
4082
+ write_blocked_sentinel "${EFFECTIVE_CB_THRESHOLD} consecutive non-advancing verdicts (request_info)" "" "repeat_axis"
4083
+ update_status "blocked" "consecutive_failures"
4084
+ return 1
4085
+ fi
3578
4086
  ;;
3579
4087
  blocked)
3580
4088
  local _verdict_cat
3581
4089
  _verdict_cat=$(_classify_cross_us_or_metric "$verdict_summary")
3582
- write_blocked_sentinel "Verifier verdict: blocked - $verdict_summary" "" "$_verdict_cat"
3583
- update_status "blocked" "verifier_blocked"
3584
- return 1
4090
+ # F-22: a transient/first "blocked" no longer kills the campaign —
4091
+ # absorb as a soft-fail with grace; terminate only on a genuine infra
4092
+ # block, the same reason repeated >= BLOCK_CB_THRESHOLD, or the CB.
4093
+ if _block_with_grace "Verifier verdict: blocked - $verdict_summary" "$_verdict_cat"; then
4094
+ write_blocked_sentinel "Verifier verdict: blocked - $verdict_summary" "" "$_verdict_cat"
4095
+ update_status "blocked" "verifier_blocked"
4096
+ return 1
4097
+ fi
4098
+ log " Verifier verdict=blocked absorbed as soft-fail (consecutive_failures=$CONSECUTIVE_FAILURES; reason not yet repeated ${BLOCK_CB_THRESHOLD}×) — Worker will retry."
4099
+ update_status "verifier" "blocked_softfail"
3585
4100
  ;;
3586
4101
  *)
3587
4102
  log_error "Unknown verdict: $verdict"
3588
4103
  update_status "verifier" "unknown_verdict"
4104
+ # F-22: unknown verdict is a soft-fail that counts toward the CB
4105
+ # (was: silent continue to MAX_ITER with no diagnostic BLOCK).
4106
+ if _bump_consecutive_failure; then
4107
+ write_blocked_sentinel "${EFFECTIVE_CB_THRESHOLD} consecutive unrecognized verifier verdicts" "" "repeat_axis"
4108
+ update_status "blocked" "consecutive_failures"
4109
+ return 1
4110
+ fi
3589
4111
  ;;
3590
4112
  esac
3591
4113
  ;;
3592
4114
  blocked)
3593
- # --- governance.md s7 step 6: blocked -> write sentinel ---
4115
+ # --- governance.md s7 step 6: blocked -> write sentinel (with grace) ---
3594
4116
  local _signal_cat
3595
4117
  _signal_cat=$(_classify_cross_us_or_metric "$signal_summary")
3596
- write_blocked_sentinel "Worker reported blocked: $signal_summary" "" "$_signal_cat"
3597
- update_status "blocked" "worker_blocked"
3598
- return 1
4118
+ # F-22: a transient/first Worker-reported "blocked" no longer kills the
4119
+ # campaign — absorb as a soft-fail with grace (same gate as the verifier
4120
+ # blocked path); terminate only on infra, repeated reason, or the CB.
4121
+ if _block_with_grace "Worker reported blocked: $signal_summary" "$_signal_cat"; then
4122
+ write_blocked_sentinel "Worker reported blocked: $signal_summary" "" "$_signal_cat"
4123
+ update_status "blocked" "worker_blocked"
4124
+ return 1
4125
+ fi
4126
+ log " Worker status=blocked absorbed as soft-fail (consecutive_failures=$CONSECUTIVE_FAILURES) — re-dispatching Worker."
4127
+ update_status "worker" "blocked_softfail"
3599
4128
  ;;
3600
4129
  *)
3601
4130
  log_error "Unknown signal status: $signal_status"
3602
4131
  update_status "worker" "unknown_status"
4132
+ # F-22: unknown signal status is a soft-fail that counts toward the CB
4133
+ # (was: silent continue to MAX_ITER).
4134
+ if _bump_consecutive_failure; then
4135
+ write_blocked_sentinel "${EFFECTIVE_CB_THRESHOLD} consecutive unrecognized worker signals" "" "repeat_axis"
4136
+ update_status "blocked" "consecutive_failures"
4137
+ return 1
4138
+ fi
3603
4139
  ;;
3604
4140
  esac
3605
4141