@ai-dev-methodologies/rlp-desk 0.17.0 → 0.18.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3725 @@
1
+ #!/bin/zsh
2
+ set -uo pipefail
3
+ # NOTE: We use set -u (undefined var check) and pipefail, but NOT set -e
4
+ # because the main loop uses explicit error checks throughout.
5
+
6
+ # =============================================================================
7
+ # Ralph Desk Tmux Runner
8
+ #
9
+ # Implements the Leader loop from governance.md section 7 as a shell script.
10
+ # Uses tmux proven patterns: write-then-notify, pane IDs (%N),
11
+ # copy-mode guards, verification-based retry, heartbeat monitoring,
12
+ # idle pane nudging, exponential backoff restarts, atomic file writes.
13
+ #
14
+ # Usage:
15
+ # LOOP_NAME=<slug> ./run_ralph_desk.zsh
16
+ #
17
+ # Required env:
18
+ # LOOP_NAME - slug identifier for the campaign
19
+ #
20
+ # Optional env:
21
+ # ROOT - project root (default: $PWD)
22
+ # MAX_ITER - max iterations (default: 20)
23
+ # WORKER_MODEL - claude model for Worker (default: haiku)
24
+ # VERIFIER_MODEL - claude model for Verifier (default: sonnet)
25
+ # POLL_INTERVAL - seconds between signal checks (default: 5)
26
+ # ITER_TIMEOUT - per-iteration timeout in seconds (default: 600)
27
+ # HEARTBEAT_STALE_THRESHOLD - seconds before heartbeat is stale (default: 120)
28
+ # MAX_RESTARTS - max restart attempts per worker (default: 3)
29
+ # IDLE_NUDGE_THRESHOLD - seconds of idle before nudge (default: 30)
30
+ # MAX_NUDGES - max nudges per pane per iteration (default: 3)
31
+ #
32
+ # Per-role codex config:
33
+ # WORKER_CODEX_MODEL - codex model for Worker (default: gpt-5.5)
34
+ # WORKER_CODEX_REASONING - codex reasoning for Worker (default: high)
35
+ # VERIFIER_CODEX_MODEL - codex model for Verifier (default: gpt-5.5)
36
+ # VERIFIER_CODEX_REASONING - codex reasoning for Verifier (default: high)
37
+ #
38
+ # Consensus scope:
39
+ # CONSENSUS_SCOPE - when consensus applies (default: all)
40
+ # all=every verify, final-only=final ALL only
41
+ #
42
+ # Dependencies: tmux, claude CLI, jq
43
+ # Optional: codex CLI (required when WORKER_ENGINE=codex, VERIFIER_ENGINE=codex, or VERIFY_CONSENSUS=1)
44
+ # =============================================================================
45
+
46
+ # --- Environment Variables ---
47
+ SLUG="${LOOP_NAME:?ERROR: LOOP_NAME is required. Set it to the campaign slug.}"
48
# Campaign tunables. Every value honours a pre-set environment variable and
# falls back to the default below only when that variable is unset or empty.
: "${ROOT:=$PWD}"
: "${MAX_ITER:=20}"
: "${WORKER_MODEL:=haiku}"
: "${VERIFIER_MODEL:=sonnet}"
: "${FINAL_VERIFIER_MODEL:=opus}"
: "${POLL_INTERVAL:=5}"
: "${ITER_TIMEOUT:=600}"
: "${HEARTBEAT_STALE_THRESHOLD:=120}"
: "${MAX_RESTARTS:=3}"
: "${IDLE_NUDGE_THRESHOLD:=30}"
: "${MAX_NUDGES:=3}"
: "${WITH_SELF_VERIFICATION:=0}"
# Snapshot of the user's original request, kept for traceability (governance §1f).
WITH_SELF_VERIFICATION_REQUESTED="$WITH_SELF_VERIFICATION"
# Populated when SV ends up disabled even though the user asked for it.
SV_SKIPPED_REASON=""

# v0.14.0 — the zsh runner is again the primary path for tmux mode.
# The deprecation gate from v5.7 §4.2 (which rejected --flywheel,
# --flywheel-guard and --with-self-verification) has been removed: the Node
# port shipped without zsh-equivalent safety nets (heartbeat, copy-mode guard,
# prompt-stall, no-progress, stale-context, claude model upgrade chain, etc.),
# so the Node leader is reserved for `--mode agent` (LLM-driven) only.
# `--mode tmux` invocations from src/node/run.mjs delegate to this script as a
# subprocess, passing configuration through env vars.
# ARCH Wave C / ADR-001: FLYWHEEL and FLYWHEEL_GUARD are NOT implemented in the
# zsh leader (there is no dispatch site) and are deprecated — do NOT claim
# otherwise. WITH_SELF_VERIFICATION is forwarded for traceability only; the SV
# report is produced by the Node post-pass in run.mjs runTmuxViaZsh after this
# script exits (this script keeps its $TMUX early-return to avoid the
# `claude --print` no-TTY hang).

# 1 = do not stop on ambiguity; the PRD is authoritative.
: "${AUTONOMOUS_MODE:=0}"

# P1-E lane enforcement (governance §7¾): "warn" by default. --lane-strict opts
# into "strict", which trips BLOCKED with reason_category=infra_failure and
# recoverable=true (a downgrade from terminal_alert) so that an inaccurate
# mtime audit cannot terminally kill a campaign.
: "${LANE_MODE:=warn}"

# US-018 R6 P1-F test density (governance §7f): "warn" by default.
# --test-density-strict makes init exit non-zero when any AC has < 3 tests.
: "${TEST_DENSITY_MODE:=warn}"

# US-021 R9 P2-I consecutive_blocks circuit breaker (governance §8): when the
# same canonical block reason fires N times in a row, the runner writes
# .sisyphus/mission-abort.json and exits non-zero so contract defects do not
# loop silently. The infra_failure category and the very first iteration are
# exempt.
: "${BLOCK_CB_THRESHOLD:=3}"
CONSECUTIVE_BLOCKS=0
LAST_BLOCK_REASON=""
93
+
94
# US-021 R9 P2-I (applies to _check_consecutive_blocks, defined further down):
# repeated same-reason blocks are tracked; the infra_failure category and the
# very first iteration are exempt (mission setup blocks should not trip the
# abort). That function returns 0 if the loop should continue, 1 (after writing
# mission-abort.json) once the threshold is reached.

# US-023 R11 P2-K: guarantee at least one cost-log.jsonl entry per campaign.
# An empty cost log could mean "no usage recorded" or "logging broken"; always
# emitting a final entry on exit makes the two cases distinguishable.
# COST_LOG_FINAL_WRITTEN makes the call idempotent. Intended to be called from
# the cleanup trap.
COST_LOG_FINAL_WRITTEN=0

# _emit_final_cost_log() — write the final cost-log entry at most once.
# Globals: COST_LOG_FINAL_WRITTEN (read/written), ITERATION, LOGS_DIR (read).
# Returns: always 0; failures of write_cost_log are deliberately ignored.
_emit_final_cost_log() {
  # A previous call already handled this — nothing more to do.
  [[ "${COST_LOG_FINAL_WRITTEN:-0}" -eq 0 ]] || return 0
  COST_LOG_FINAL_WRITTEN=1

  # Without an iteration number or a logs directory there is nothing to log.
  if [[ -z "${ITERATION:-}" || -z "${LOGS_DIR:-}" ]]; then
    return 0
  fi

  # Best effort: a logging failure must never break the exit path.
  write_cost_log "${ITERATION:-0}" 2>/dev/null || true
}
112
+
113
# US-024 R12 P0: tmux pane/session lifecycle monitor.
# Polls the session and the leader/worker/verifier panes; gives up on the
# fifth consecutive failed poll, sleeping 1s between polls.
# Per the original notes it is invoked at 3 sites: create_session post-finish,
# main loop iteration entry, and after every send-keys/paste before the
# wait-loop.
# Args:    $1 = label of the call site (default: unknown), used in messages.
# Globals: SESSION_NAME, LEADER_PANE, WORKER_PANE, VERIFIER_PANE, DEBUG_LOG,
#          CURRENT_US (all read).
# Returns: 0 when everything is alive. On give-up it logs, appends a pane
#          listing to DEBUG_LOG, writes an infra_failure BLOCKED sentinel and
#          terminates the script with exit 1.
_r12_check_lifecycle() {
  local site="${1:-unknown}"
  local _failed_polls=0

  until _verify_session_alive "$SESSION_NAME" \
      && _verify_pane_alive "$LEADER_PANE" \
      && _verify_pane_alive "$WORKER_PANE" \
      && _verify_pane_alive "$VERIFIER_PANE"; do
    _failed_polls=$(( _failed_polls + 1 ))

    # Budget not exhausted yet: wait a second and poll again.
    if (( _failed_polls < 5 )); then
      sleep 1
      continue
    fi

    log_error "[r12:$site] tmux session/pane dead after 5x1s polling (5s authoritative budget). session=$SESSION_NAME panes leader=$LEADER_PANE worker=$WORKER_PANE verifier=$VERIFIER_PANE"
    # Record what tmux currently sees to help post-mortem debugging.
    tmux list-panes -a -F '#{session_name}:#{pane_id} dead=#{pane_dead}' 2>&1 | head -20 >> "${DEBUG_LOG:-/dev/null}"
    write_blocked_sentinel "tmux session/pane dead during $site" "${CURRENT_US:-ALL}" "infra_failure"
    exit 1
  done

  return 0
}
136
+
137
# _check_consecutive_blocks() — circuit breaker for repeated identical blocks.
# Counts how many times in a row the same canonical block reason was reported.
# Once the count reaches BLOCK_CB_THRESHOLD it writes
# $DESK/.sisyphus/mission-abort.json and tells the caller to stop.
# Args:    $1 = raw block reason
#          $2 = reason category (default: metric_failure)
#          $3 = iteration number (default: $ITERATION, else 0)
# Globals: CONSECUTIVE_BLOCKS, LAST_BLOCK_REASON (read/written),
#          BLOCK_CB_THRESHOLD, DESK, ITERATION (read).
# Returns: 0 = keep looping, 1 = threshold reached (abort file written when
#          possible).
_check_consecutive_blocks() {
  local reason="$1"
  local category="${2:-metric_failure}"
  local iter="${3:-${ITERATION:-0}}"

  # Exempt cases reset the streak: infra failures and the first iteration.
  if [[ "$category" == "infra_failure" ]] || (( iter <= 1 )); then
    LAST_BLOCK_REASON=""
    CONSECUTIVE_BLOCKS=0
    return 0
  fi

  local canonical
  canonical=$(_canonical_block_reason "$reason" 2>/dev/null)
  if [[ "$canonical" == "$LAST_BLOCK_REASON" && -n "$canonical" ]]; then
    CONSECUTIVE_BLOCKS=$((CONSECUTIVE_BLOCKS + 1))
  else
    # Different (or empty) reason starts a new streak.
    CONSECUTIVE_BLOCKS=1
    LAST_BLOCK_REASON="$canonical"
  fi

  if (( CONSECUTIVE_BLOCKS >= BLOCK_CB_THRESHOLD )); then
    local abort_dir="$DESK/.sisyphus"
    local abort_file="$abort_dir/mission-abort.json"

    # The reason is free text: flatten newlines/tabs and escape backslashes
    # and double quotes so the abort file is always valid JSON.
    local canonical_json
    canonical_json=$(printf '%s' "$canonical" \
      | tr '\n\r\t' '   ' \
      | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g')

    if ! mkdir -p "$abort_dir" 2>/dev/null; then
      log_error "Mission abort: cannot create $abort_dir"
    fi
    if ! printf '{"reason":"consecutive_blocks","count":%s,"last_reason":"%s","threshold":%s,"timestamp":"%s"}\n' \
        "$CONSECUTIVE_BLOCKS" "$canonical_json" "$BLOCK_CB_THRESHOLD" \
        "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$abort_file"; then
      log_error "Mission abort: cannot write $abort_file"
    fi
    log_error "Mission abort: same canonical block reason '$canonical' repeated $CONSECUTIVE_BLOCKS times (>= $BLOCK_CB_THRESHOLD)"
    return 1
  fi
  return 0
}
166
+
167
# --- Engine Selection (auto-detect from model format) ---
# "<model>:<level>" strings are split at the colon:
#   claude models (haiku/sonnet/opus) with :effort  → claude engine + effort
#   anything else (gpt-*/spark) with :reasoning     → codex engine + reasoning
#   plain name (no colon)                           → nothing is changed
#
# _auto_detect_engine() — normalise one MODEL variable in place.
# All arguments are NAMES of global variables, not values:
#   $1 = model variable (read; rewritten to the bare model name)
#   $2 = engine variable (set to claude|codex)
#   $3 = codex model variable (set for codex models; may be "")
#   $4 = codex reasoning variable (set for codex models; may be "")
#   $5 = effort variable (set for claude models; optional)
# Returns: 0 on success or when the model has no ":level" suffix,
#          1 when a variable name is missing or not a valid identifier.
#
# Values are assigned with `typeset -g` rather than `eval "$var=$value"`: the
# model string comes from the environment/CLI, and eval would execute any
# shell metacharacters embedded in it.
_auto_detect_engine() {
  local model_var="$1" engine_var="$2" codex_model_var="$3" codex_reasoning_var="$4" effort_var="${5:-}"

  # Validate every supplied name so the single guarded eval below (indirect
  # read) can only ever see a plain identifier.
  local _ade_name
  for _ade_name in "$model_var" "$engine_var" "$codex_model_var" "$codex_reasoning_var" "$effort_var"; do
    case "$_ade_name" in
      "") ;;                                        # optional slot left empty
      [!A-Za-z_]*|*[!A-Za-z0-9_]*) return 1 ;;      # not an identifier
    esac
  done
  [[ -n "$model_var" && -n "$engine_var" ]] || return 1

  local model_val=""
  eval "model_val=\${${model_var}-}"

  # No ":level" suffix → leave everything untouched.
  [[ "$model_val" == *:* ]] || return 0

  local model_part="${model_val%%:*}"
  local level_part="${model_val##*:}"
  case "$model_part" in
    haiku|sonnet|opus)
      # Claude model with effort — engine is claude, remember the effort.
      typeset -g "${engine_var}=claude"
      typeset -g "${model_var}=${model_part}"
      if [[ -n "$effort_var" ]]; then
        typeset -g "${effort_var}=${level_part}"
      fi
      ;;
    *)
      # Codex model with reasoning; "spark" is shorthand for the full name.
      [[ "$model_part" == "spark" ]] && model_part="gpt-5.3-codex-spark"
      typeset -g "${engine_var}=codex"
      typeset -g "${model_var}=${model_part}"
      if [[ -n "$codex_model_var" ]]; then
        typeset -g "${codex_model_var}=${model_part}"
      fi
      if [[ -n "$codex_reasoning_var" ]]; then
        typeset -g "${codex_reasoning_var}=${level_part}"
      fi
      ;;
  esac
  return 0
}
195
+
196
# Engine per role; claude unless the environment says otherwise.
: "${WORKER_ENGINE:=claude}"
: "${VERIFIER_ENGINE:=claude}"
: "${FINAL_VERIFIER_ENGINE:=claude}"

# Effort levels for Claude models. Empty by default; filled in either by
# _auto_detect_engine or by the CLI (e.g. --worker-model opus:max).
: "${WORKER_EFFORT:=}"
: "${VERIFIER_EFFORT:=}"
: "${FINAL_VERIFIER_EFFORT:=}"

# Env-var path: derive engine/effort/reasoning from "<model>:<level>" values.
# (The CLI path goes through parse_model_flag instead.)
# The final verifier has no dedicated codex model/reasoning variables, hence
# the two empty slots.
_auto_detect_engine \
  WORKER_MODEL WORKER_ENGINE WORKER_CODEX_MODEL WORKER_CODEX_REASONING WORKER_EFFORT
_auto_detect_engine \
  VERIFIER_MODEL VERIFIER_ENGINE VERIFIER_CODEX_MODEL VERIFIER_CODEX_REASONING VERIFIER_EFFORT
_auto_detect_engine \
  FINAL_VERIFIER_MODEL FINAL_VERIFIER_ENGINE "" "" FINAL_VERIFIER_EFFORT
209
# Per-role codex settings (reasoning is one of low|medium|high). These defaults
# apply only when neither the environment nor _auto_detect_engine set a value.
: "${WORKER_CODEX_MODEL:=gpt-5.5}"
: "${WORKER_CODEX_REASONING:=high}"
: "${VERIFIER_CODEX_MODEL:=gpt-5.5}"
: "${VERIFIER_CODEX_REASONING:=high}"
# Resolved later by check_dependencies when an engine is codex.
CODEX_BIN=""

# --- Verify Mode ---
# per-us|batch
: "${VERIFY_MODE:=per-us}"
# Consensus: off|all|final-only. Replaces the older VERIFY_CONSENSUS,
# FINAL_CONSENSUS and CONSENSUS_SCOPE switches.
: "${CONSENSUS_MODE:=off}"
# Cross-verifier used per US (lighter) and for the final pass (stricter).
: "${CONSENSUS_MODEL:=gpt-5.5:medium}"
: "${FINAL_CONSENSUS_MODEL:=gpt-5.5:high}"

# Legacy compatibility: translate the old flags into CONSENSUS_MODE.
# VERIFY_CONSENSUS=1 takes precedence over FINAL_CONSENSUS=1.
if [[ "${VERIFY_CONSENSUS:-0}" == "1" ]]; then
  CONSENSUS_MODE="${CONSENSUS_SCOPE:-all}"
else
  [[ "${FINAL_CONSENSUS:-0}" == "1" ]] && CONSENSUS_MODE="final-only"
fi
: "${CONSENSUS_SCOPE:=$CONSENSUS_MODE}"

# Consecutive failures tolerated before BLOCKED (default: 6). The effective
# threshold is doubled whenever consensus mode is active.
: "${CB_THRESHOLD:=6}"
EFFECTIVE_CB_THRESHOLD=$CB_THRESHOLD
[[ "$CONSENSUS_MODE" == "off" ]] || EFFECTIVE_CB_THRESHOLD=$(( CB_THRESHOLD * 2 ))

: "${_API_MAX_RETRIES:=5}"
: "${_API_RETRY_INTERVAL_S:=30}"
237
+
238
# --- Derived Paths ---
DESK="${ROOT}/${RLP_DESK_RUNTIME_DIR:-.rlp-desk}"

# v0.13.0 legacy detection: refuse to run while .claude/ralph-desk/ still
# exists. init mode auto-migrates; run mode does not, to protect in-flight
# campaigns.
if [[ -d "${ROOT}/.claude/ralph-desk" ]]; then
  print -u2 "ERROR: Legacy .claude/ralph-desk/ detected at $ROOT/.claude/ralph-desk."
  print -u2 "Run mode does not auto-migrate to protect in-flight campaigns."
  print -u2 "Run: mv .claude/ralph-desk ${RLP_DESK_RUNTIME_DIR:-.rlp-desk} then re-run."
  exit 1
fi

# US-026 R14 P0: the runner lockfile name embeds a hash of the project root, so
# duplicate runners on the same root are prevented while different projects can
# run in parallel. shasum is the macOS default, sha1sum the Linux one, and
# cksum the POSIX fallback of last resort. Only the first 8 characters of the
# first output field are kept.
ROOT_HASH=$(
  printf '%s' "$ROOT" \
    | { shasum 2>/dev/null || sha1sum 2>/dev/null || cksum; } \
    | awk '{print substr($1,1,8)}'
)
RUNNER_LOCKFILE_PATH="${DESK}/logs/.rlp-desk-runner-${ROOT_HASH}.lock"
RUNNER_LOCKDIR="${RUNNER_LOCKFILE_PATH}.d"

# Top-level working directories and per-campaign plan files.
PROMPTS_DIR="${DESK}/prompts"
CONTEXT_DIR="${DESK}/context"
MEMOS_DIR="${DESK}/memos"
LOGS_DIR="${DESK}/logs/${SLUG}"
RUNTIME_DIR="${LOGS_DIR}/runtime"
PRD_FILE="${DESK}/plans/prd-${SLUG}.md"
TEST_SPEC_FILE="${DESK}/plans/test-spec-${SLUG}.md"

# --- Analytics Directory (v5.7 §4.11.b: project-local) ---
# Formerly $HOME/.claude/ralph-desk/analytics/<slug>--<hash> (cross-project
# rollup). Since v0.12.0 the canonical location is project-local; cross-project
# rollup is the Leader's job via ~/.claude/ralph-desk/registry.jsonl
# (Worker/Verifier prompts never reference the registry path — see §4.11.c).
# NOTE(review): the md5 branch hashes $ROOT without a trailing newline while
# the md5sum fallback (here-string) hashes it with one, so the two branches
# yield different directory suffixes for the same root. Left as-is so existing
# analytics directories keep their names.
ANALYTICS_SLUG_HASH=$(echo -n "$ROOT" | md5 -q 2>/dev/null || md5sum <<< "$ROOT" | cut -d' ' -f1)
ANALYTICS_DIR="${DESK}/analytics/${SLUG}--${ANALYTICS_SLUG_HASH:0:8}"
CAMPAIGN_JSONL="${ANALYTICS_DIR}/campaign.jsonl"
METADATA_FILE="${ANALYTICS_DIR}/metadata.json"

# Prompt, context and memo files for this campaign.
WORKER_PROMPT_BASE="${PROMPTS_DIR}/${SLUG}.worker.prompt.md"
VERIFIER_PROMPT_BASE="${PROMPTS_DIR}/${SLUG}.verifier.prompt.md"
CONTEXT_FILE="${CONTEXT_DIR}/${SLUG}-latest.md"
MEMORY_FILE="${MEMOS_DIR}/${SLUG}-memory.md"
SIGNAL_FILE="${MEMOS_DIR}/${SLUG}-iter-signal.json"
DONE_CLAIM_FILE="${MEMOS_DIR}/${SLUG}-done-claim.json"
VERDICT_FILE="${MEMOS_DIR}/${SLUG}-verify-verdict.json"

# v0.14.2 Bug Report #4: codex sometimes writes the verdict file to the
# pre-v0.13.0 legacy path even though the prompt says otherwise (CWD heuristics
# inside the codex CLI). The legacy path is tracked so the no-progress watcher
# and the harvest step can both fall back to it before BLOCKing the campaign.
# The auto-migration logic lives in _migrate_legacy_verdict().
LEGACY_VERDICT_FILE="${ROOT}/.claude/ralph-desk/memos/${SLUG}-verify-verdict.json"

# Sentinels, locks and runtime status files.
COMPLETE_SENTINEL="${MEMOS_DIR}/${SLUG}-complete.md"
BLOCKED_SENTINEL="${MEMOS_DIR}/${SLUG}-blocked.md"
LOCKFILE_PATH="${DESK}/logs/.rlp-desk-${SLUG}.lock"
STATUS_FILE="${RUNTIME_DIR}/status.json"
SESSION_CONFIG="${RUNTIME_DIR}/session-config.json"
WORKER_HEARTBEAT="${RUNTIME_DIR}/worker-heartbeat.json"
VERIFIER_HEARTBEAT="${RUNTIME_DIR}/verifier-heartbeat.json"
COST_LOG="${LOGS_DIR}/cost-log.jsonl"
291
+
292
# --- Session Naming ---
# One tmux session per run, made unique by a start-time stamp.
TIMESTAMP="$(date '+%Y%m%d-%H%M%S')"
SESSION_NAME="rlp-desk-${SLUG}-${TIMESTAMP}"

# --- State Tracking ---
# Associative arrays used for per-pane / per-worker / per-US bookkeeping.
typeset -A LAST_PANE_CONTENT PANE_IDLE_SINCE WORKER_RESTARTS US_FAIL_HISTORY

# Counters, all starting at zero.
STALE_CONTEXT_COUNT=0
HEARTBEAT_STALE_COUNT=0
MONITOR_FAILURE_COUNT=0
CONSECUTIVE_FAILURES=0

# Change detection for the context file and the PRD.
PREV_CONTEXT_HASH=""
PREV_PRD_HASH=""
PREV_PRD_US_LIST=""
_PRD_CHANGED=0

ITERATION=0
START_TIME="$(date '+%s')"
# git HEAD at campaign start (captured before the loop).
BASELINE_COMMIT=""
# Guards against double-generation in the cleanup trap / generate_sv_report.
CAMPAIGN_REPORT_GENERATED=0
SV_REPORT_GENERATED=0
# Comma-separated list of verified US IDs (per-us mode).
VERIFIED_US=""
# Current consensus round for the current US.
CONSENSUS_ROUND=0
# Comma-separated US IDs taken from the PRD (per-us mode).
US_LIST=""
LOCKFILE_ACQUIRED=0
# 0|1 — set by --lock-worker-model; disables the progressive model upgrade.
: "${LOCK_WORKER_MODEL:=0}"
# Consecutive same-US fail counter (upgrade triggers at >= 2) and the US it
# refers to.
_SAME_US_FAIL_COUNT=0
_LAST_FAILED_US=""
# 1 once the Worker model has been auto-upgraded during the campaign.
_MODEL_UPGRADED=0
# Values saved before the first upgrade so they can be restored on pass.
_ORIGINAL_WORKER_MODEL=""
_ORIGINAL_WORKER_CODEX_REASONING=""

# =============================================================================
# Utility Functions
# =============================================================================

: "${DEBUG:=0}"
DEBUG_LOG="${ANALYTICS_DIR}/debug.log"
331
+
332
# Source shared business logic (lib_ralph_desk.zsh).
# The library directory is resolved in this order:
#   1. $RLP_DESK_LIB_DIR, when set (explicit override);
#   2. the directory containing this script (${0:A:h});
#   3. the original hard-coded path, kept so existing installs keep working.
# NOTE(review): assumes the library ships next to this script — confirm against
# the package layout.
LIB_DIR="${RLP_DESK_LIB_DIR:-${0:A:h}}"
if [[ ! -r "$LIB_DIR/lib_ralph_desk.zsh" ]]; then
  LIB_DIR="/Users/kyjin/dev/own/ai-dev-methodologies/ai-dev-methodologies-hq/workspace/rlp-desk/src/scripts"
fi
# Fail fast: every helper used below comes from the library, so continuing
# without it would only produce a cascade of "command not found" errors.
if [[ ! -r "$LIB_DIR/lib_ralph_desk.zsh" ]]; then
  print -u2 "ERROR: shared library not found or unreadable: $LIB_DIR/lib_ralph_desk.zsh"
  exit 1
fi
source "$LIB_DIR/lib_ralph_desk.zsh"
335
+
336
# A16: when RLP_BACKGROUND is unset or empty the runner is assumed to be in the
# foreground, which may conflict with Claude Code's pane — warn on stderr.
if [[ -z "${RLP_BACKGROUND:-}" ]]; then
  printf '%s\n' \
    "⚠ WARNING: Running in foreground. This may conflict with Claude Code's pane." \
    " Recommended: launch via Bash tool with run_in_background: true" \
    " Set RLP_BACKGROUND=1 to suppress this warning." >&2
fi
342
+
343
# check_dead_pane() — decide from a pane's current command whether the agent
# process in it has exited.
# Engine-aware: a bare `bash` is normal for codex workers (the trigger runs in
# bash) but means the pane is dead for claude workers.
# Args:    $1 = pane_current_command
#          $2 = engine (claude|codex, default: claude)
#          $3 = role (worker|verifier, default: worker) — accepted but not
#               consulted by the current logic
# Returns: 0 if dead, 1 if alive
check_dead_pane() {
  local poll_cmd="$1"
  local engine="${2:-claude}"
  local role="${3:-worker}"

  case "$poll_cmd" in
    "" | zsh)
      # Nothing reported, or back at a bare zsh prompt.
      return 0
      ;;
    bash)
      # Only codex legitimately sits in bash.
      [[ "$engine" == "codex" ]] || return 0
      ;;
  esac
  return 1
}
362
+
363
# launch_worker_codex() — launch the codex Worker TUI in a pane, send it the
# prompt-file instruction, and poll the pane until it looks busy.
# Mirrors launch_worker_claude() so both engines run visibly in tmux.
# Args:    $1=pane_id $2=prompt_file $3=iteration $4=worker_launch_cmd
# Returns: 0 once the instruction has been sent (a warning is logged if no
#          activity marker was seen within 15 submit checks),
#          1 if the codex prompt (›) does not appear within 30 polls.
launch_worker_codex() {
  local pane_id="$1"
  local prompt_file="$2"
  local iter="$3"
  local worker_launch="$4"
  # All locals are declared once, up front. Re-running `local name` inside a
  # loop makes zsh print the already-set variable to stdout on every later
  # iteration unless TYPESET_SILENT is set.
  local _pre_cmd _pane_text pane_check
  local _codex_ready=0 _codex_wait=0 submit_attempts=0
  local worker_instruction="Read and execute the instructions in $prompt_file"

  log " Launching Worker codex TUI in pane $pane_id..."
  # Clean pane before launch: interrupt any lingering process so the launch
  # command lands in a fresh shell.
  _pre_cmd=$(tmux display-message -p -t "$pane_id" '#{pane_current_command}' 2>/dev/null || echo "")
  if [[ "$_pre_cmd" != "zsh" && "$_pre_cmd" != "bash" && -n "$_pre_cmd" ]]; then
    log_debug "Worker pane has lingering process ($_pre_cmd), cleaning..."
    tmux send-keys -t "$pane_id" C-c 2>/dev/null; sleep 0.5
    tmux send-keys -t "$pane_id" C-c 2>/dev/null; sleep 1
  fi
  paste_to_pane "$pane_id" "$worker_launch"
  tmux send-keys -t "$pane_id" C-m

  # Wait for the codex TUI prompt (›) rather than a shell prompt.
  while (( _codex_wait < 30 )); do
    sleep 1
    _pane_text=$(tmux capture-pane -t "$pane_id" -p 2>/dev/null || true)
    if grep -q '›' <<< "$_pane_text" 2>/dev/null; then
      _codex_ready=1
      log_debug "Worker codex TUI ready after ${_codex_wait}s"
      break
    fi
    _codex_wait=$(( _codex_wait + 1 ))
  done
  if (( ! _codex_ready )); then
    log_error "Worker codex TUI not ready after 30s"
    return 1
  fi

  # Send the instruction to the codex TUI.
  sleep 1
  paste_to_pane "$pane_id" "$worker_instruction"
  tmux send-keys -t "$pane_id" C-m
  log_debug "Worker codex instruction sent (${#worker_instruction} chars)"

  # Submit loop — look for an activity marker proving codex started working.
  while (( submit_attempts < 15 )); do
    sleep 2
    pane_check=$(tmux capture-pane -t "$pane_id" -p 2>/dev/null)
    if grep -qi "working\|thinking\|Exploring\|Running\|reading\|searching\|editing\|writing" <<< "$pane_check" 2>/dev/null; then
      log_debug "Worker codex started working after $((submit_attempts + 1)) checks"
      break
    fi
    # One adaptive retry: clear the input line and paste the instruction again.
    if (( submit_attempts == 8 )); then
      log_debug "Adaptive instruction retry: clearing line and re-typing"
      tmux send-keys -t "$pane_id" C-u 2>/dev/null
      sleep 0.1
      paste_to_pane "$pane_id" "$worker_instruction"
      tmux send-keys -t "$pane_id" C-m
    fi
    # Nudge with Enter in case the text is sitting unsubmitted in the input box.
    tmux send-keys -t "$pane_id" C-m 2>/dev/null
    sleep 0.3
    tmux send-keys -t "$pane_id" C-m 2>/dev/null
    submit_attempts=$(( submit_attempts + 1 ))
  done

  # Exhausting the loop is not treated as fatal here, but it must not be silent.
  if (( submit_attempts >= 15 )); then
    log " WARNING: Worker codex showed no activity marker after 15 submit checks"
  fi
  return 0
}
435
+
436
# launch_worker_claude() — launch the claude Worker TUI in a pane, send it the
# prompt-file instruction, and verify that it was picked up.
# Steps: TUI startup, wait_for_pane_ready, instruction send, 15-iteration
# submit loop, then one restart-and-resend recovery if the loop is exhausted.
# Args:    $1=pane_id $2=prompt_file $3=iteration $4=worker_launch_cmd
# Returns: 0 on success,
#          1 on fatal failure (claude never became ready, either at first
#          launch or after the recovery restart) — caller writes BLOCKED.
launch_worker_claude() {
  local pane_id="$1"
  local prompt_file="$2"
  local iter="$3"
  local worker_launch="$4"
  # Declared once, outside the loop: re-running `local name` inside a loop
  # makes zsh print the already-set variable to stdout unless TYPESET_SILENT
  # is set.
  local pane_check
  local submit_attempts=0
  local worker_instruction="Read and execute the instructions in $prompt_file"

  log " Launching Worker claude in pane $pane_id..."
  paste_to_pane "$pane_id" "$worker_launch"
  tmux send-keys -t "$pane_id" C-m

  # Wait for the claude TUI to be ready.
  if ! wait_for_pane_ready "$pane_id" 30; then
    log_error "Worker claude failed to start"
    return 1
  fi

  # Send the instruction to the claude TUI.
  sleep 3
  paste_to_pane "$pane_id" "$worker_instruction"
  tmux send-keys -t "$pane_id" C-m
  log_debug "Worker instruction sent directly (${#worker_instruction} chars)"

  # 15-iteration submit loop — look for an activity marker in the pane.
  while (( submit_attempts < 15 )); do
    sleep 2
    pane_check=$(tmux capture-pane -t "$pane_id" -p 2>/dev/null)
    if grep -qi "esc to interrupt\|thinking\|working\|kneading\|crunching\|clauding\|billowing\|brewing\|tinkering\|burrowing\|saut\|Exploring\|Running\|exec\|Explored\|Prestidigitating\|Undulating\|Reading\|Bash\|Edit\|Write\|Grep\|Glob" <<< "$pane_check" 2>/dev/null; then
      log_debug "Worker started working after $((submit_attempts + 1)) submit checks"
      log_debug "[FLOW] iter=$iter worker_submit_check=OK attempts=$((submit_attempts + 1))"
      break
    fi
    # Every 3 failed attempts, clear the line and re-send the full instruction.
    if (( submit_attempts > 0 && submit_attempts % 3 == 0 )); then
      log_debug "Re-sending full worker instruction (attempt $submit_attempts)"
      tmux send-keys -t "$pane_id" C-u 2>/dev/null
      sleep 0.2
      paste_to_pane "$pane_id" "$worker_instruction"
      sleep 0.15
      tmux send-keys -t "$pane_id" C-m
      sleep 1
    fi
    # Nudge with Enter in case the text is sitting unsubmitted in the input box.
    tmux send-keys -t "$pane_id" C-m 2>/dev/null
    sleep 0.3
    tmux send-keys -t "$pane_id" C-m 2>/dev/null
    submit_attempts=$(( submit_attempts + 1 ))
  done

  # All 15 attempts failed: restart claude and send the instruction once more.
  if (( submit_attempts >= 15 )); then
    log " WARNING: Worker instruction not consumed after 15 attempts — restarting claude"
    log_debug "[GOV] iter=$iter worker_instruction_failed=true attempts=15 action=restart_claude"
    tmux send-keys -t "$pane_id" C-c 2>/dev/null
    sleep 0.5
    tmux send-keys -t "$pane_id" "/exit" C-m 2>/dev/null
    sleep 2
    # Best effort: give the pane a moment to settle before relaunching.
    wait_for_pane_ready "$pane_id" 10 2>/dev/null || true
    paste_to_pane "$pane_id" "$worker_launch"
    tmux send-keys -t "$pane_id" C-m
    if wait_for_pane_ready "$pane_id" 30; then
      sleep 3
      paste_to_pane "$pane_id" "$worker_instruction"
      tmux send-keys -t "$pane_id" C-m
      log " Worker restarted and instruction re-sent"
      log_debug "[FLOW] iter=$iter worker_restart_recovery=success"
    else
      log_error "Worker restart failed — pane not ready"
      log_debug "[FLOW] iter=$iter worker_restart_recovery=failed"
      # No running claude and no instruction delivered: report the failure
      # instead of claiming success.
      return 1
    fi
  fi

  return 0
}
516
+
517
# launch_verifier_codex() — launch the codex Verifier TUI in a pane, send it
# the prompt-file instruction, and poll the pane until it looks busy.
# Mirrors launch_verifier_claude() so both engines run visibly in tmux.
# Args:    $1=pane_id $2=prompt_file $3=iteration $4=launch_cmd
# Returns: 0 once the instruction has been sent (a warning is logged if no
#          activity marker was seen within 15 submit checks),
#          1 if the codex prompt (›) does not appear within 30 polls.
launch_verifier_codex() {
  local pane_id="$1"
  local prompt_file="$2"
  local iter="$3"
  local verifier_launch="$4"
  # All locals are declared once, up front. Re-running `local name` inside a
  # loop makes zsh print the already-set variable to stdout on every later
  # iteration unless TYPESET_SILENT is set.
  local _pre_cmd _pane_text vs_check
  local _codex_ready=0 _codex_wait=0 submit_attempts=0
  local verifier_instruction="Read and execute the instructions in $prompt_file"

  log " Launching Verifier codex TUI in pane $pane_id..."
  # Clean pane before launch: interrupt any lingering process so the launch
  # command lands in a fresh shell.
  _pre_cmd=$(tmux display-message -p -t "$pane_id" '#{pane_current_command}' 2>/dev/null || echo "")
  if [[ "$_pre_cmd" != "zsh" && "$_pre_cmd" != "bash" && -n "$_pre_cmd" ]]; then
    log_debug "Verifier pane has lingering process ($_pre_cmd), cleaning..."
    tmux send-keys -t "$pane_id" C-c 2>/dev/null; sleep 0.5
    tmux send-keys -t "$pane_id" C-c 2>/dev/null; sleep 1
  fi
  paste_to_pane "$pane_id" "$verifier_launch"
  tmux send-keys -t "$pane_id" C-m

  # Wait for the codex TUI prompt (›) rather than a shell prompt.
  while (( _codex_wait < 30 )); do
    sleep 1
    _pane_text=$(tmux capture-pane -t "$pane_id" -p 2>/dev/null || true)
    if grep -q '›' <<< "$_pane_text" 2>/dev/null; then
      _codex_ready=1
      log_debug "Verifier codex TUI ready after ${_codex_wait}s"
      break
    fi
    _codex_wait=$(( _codex_wait + 1 ))
  done
  if (( ! _codex_ready )); then
    log_error "Verifier codex TUI not ready after 30s"
    return 1
  fi

  # Send the instruction to the codex TUI.
  sleep 1
  paste_to_pane "$pane_id" "$verifier_instruction"
  tmux send-keys -t "$pane_id" C-m
  log_debug "Verifier codex instruction sent"

  # Submit loop — look for an activity marker proving codex started working.
  while (( submit_attempts < 15 )); do
    sleep 2
    vs_check=$(tmux capture-pane -t "$pane_id" -p 2>/dev/null)
    if grep -qi "working\|thinking\|Exploring\|Running\|reading\|searching\|editing\|writing" <<< "$vs_check" 2>/dev/null; then
      log_debug "Verifier codex started working after $((submit_attempts + 1)) checks"
      break
    fi
    # One adaptive retry: clear the input line and paste the instruction again.
    if (( submit_attempts == 8 )); then
      log_debug "Adaptive instruction retry: clearing line and re-typing"
      tmux send-keys -t "$pane_id" C-u 2>/dev/null
      sleep 0.1
      paste_to_pane "$pane_id" "$verifier_instruction"
      tmux send-keys -t "$pane_id" C-m
    fi
    # Nudge with Enter in case the text is sitting unsubmitted in the input box.
    tmux send-keys -t "$pane_id" C-m 2>/dev/null
    sleep 0.3
    tmux send-keys -t "$pane_id" C-m 2>/dev/null
    submit_attempts=$(( submit_attempts + 1 ))
  done

  # Exhausting the loop is not treated as fatal here, but it must not be silent.
  if (( submit_attempts >= 15 )); then
    log " WARNING: Verifier codex showed no activity marker after 15 submit checks"
  fi
  return 0
}
588
+
589
# launch_verifier_claude() — launch the claude Verifier TUI in a pane, send it
# the prompt-file instruction, and poll the pane until it looks busy.
# Args:    $1=pane_id $2=prompt_file $3=iteration $4=launch_cmd
# Returns: 0 once the instruction has been sent (a warning is logged if no
#          activity marker was seen within 15 submit checks),
#          1 if wait_for_pane_ready reports the TUI never became ready.
launch_verifier_claude() {
  local pane_id="$1"
  local prompt_file="$2"
  local iter="$3"
  local verifier_launch="$4"
  # Declared once, outside the loop: re-running `local name` inside a loop
  # makes zsh print the already-set variable to stdout unless TYPESET_SILENT
  # is set.
  local vs_check
  local submit_attempts=0
  local verifier_instruction="Read and execute the instructions in $prompt_file"

  log " Launching Verifier claude in pane $pane_id..."
  paste_to_pane "$pane_id" "$verifier_launch"
  tmux send-keys -t "$pane_id" C-m

  if ! wait_for_pane_ready "$pane_id" 30; then
    log_error "Verifier failed to start"
    return 1
  fi

  # Send the instruction to the claude TUI.
  sleep 3
  paste_to_pane "$pane_id" "$verifier_instruction"
  tmux send-keys -t "$pane_id" C-m
  log_debug "Verifier instruction sent directly"

  # Submit loop — look for an activity marker proving the verifier started.
  while (( submit_attempts < 15 )); do
    sleep 2
    vs_check=$(tmux capture-pane -t "$pane_id" -p 2>/dev/null)
    if grep -qi "esc to interrupt\|thinking\|working\|kneading\|crunching\|clauding\|billowing\|brewing\|tinkering\|burrowing\|saut\|Exploring\|Running\|exec\|Explored" <<< "$vs_check" 2>/dev/null; then
      log_debug "Verifier started working after $((submit_attempts + 1)) checks"
      break
    fi
    # One adaptive retry: clear the input line and paste the instruction again.
    if (( submit_attempts == 8 )); then
      log_debug "Adaptive instruction retry: clearing line and re-typing"
      tmux send-keys -t "$pane_id" C-u 2>/dev/null
      sleep 0.1
      paste_to_pane "$pane_id" "$verifier_instruction"
      tmux send-keys -t "$pane_id" C-m
    fi
    # Nudge with Enter in case the text is sitting unsubmitted in the input box.
    tmux send-keys -t "$pane_id" C-m 2>/dev/null
    sleep 0.3
    tmux send-keys -t "$pane_id" C-m 2>/dev/null
    submit_attempts=$(( submit_attempts + 1 ))
  done

  # Exhausting the loop is not treated as fatal here, but it must not be silent.
  if (( submit_attempts >= 15 )); then
    log " WARNING: Verifier claude showed no activity marker after 15 submit checks"
  fi
  return 0
}
637
+
638
+ # handle_worker_exit_codex() — handle codex worker process exit (1-shot exec)
639
+ # On exit: check done-claim, auto-generate iter-signal.
640
+ # Args: $1=iteration $2=signal_file
641
+ # Returns: 0 (signal generated), 1 (error)
642
# Bug #8 PR-B (codex critic P1.2 fix): shared gate consulted by both
# handle_worker_exit_codex and the inline-polling A4 path before a verify
# signal may be synthesized on the worker's behalf.
# Three checks run in order; the first one that fails blocks:
#   1. the done-claim file exists;
#   2. git resolves a toplevel for $ROOT and it is $ROOT itself;
#   3. the working tree is clean.
# Args:    $1 = iteration
#          $2 = us_id (default: $CURRENT_US, else ALL)
#          $3 = audit code for the clean case (e.g. codex_exit_with_done_claim
#               or inline_polling_a4_clean); stored but not used in this body
# Returns: 0 = synthesis allowed (caller writes signal_file + emits audit)
#          1 = BLOCKED (sentinel already written and audit already emitted here)
_bug8_check_synth_allowed() {
  local iter="$1"
  local us_id="${2:-${CURRENT_US:-ALL}}"
  local audit_clean="$3"

  # Gate 1: done-claim must exist.
  if ! [[ -f "$DONE_CLAIM_FILE" ]]; then
    log_error " Bug #8: no done-claim. Refusing to synthesize verify signal."
    log_debug "[GOV] iter=$iter bug8=block_codex_exit_no_done_claim"
    write_blocked_sentinel "Codex worker exited without writing done-claim (refusing to synthesize verify signal)" "$us_id" "infra_failure"
    _emit_a4_fallback_audit "$us_id" "$iter" "blocked_codex_exit_no_done_claim"
    return 1
  fi

  # Gate 2: git toplevel must equal $ROOT. Both sides are canonicalized with
  # `pwd -P` before comparing (macOS resolves /var → /private/var, NTFS may
  # have 8.3 short paths).
  local git_top git_top_real root_real
  git_top=$(git -C "$ROOT" rev-parse --show-toplevel 2>/dev/null)
  git_top_real=$(cd "$git_top" 2>/dev/null && pwd -P 2>/dev/null)
  root_real=$(cd "$ROOT" 2>/dev/null && pwd -P 2>/dev/null)
  if [[ -z "$git_top" || "$git_top_real" != "$root_real" ]]; then
    log_error " Bug #8: git unverifiable at \$ROOT=$ROOT (toplevel='$git_top'). Refusing synthesis."
    log_debug "[GOV] iter=$iter bug8=block_git_unverifiable root=$ROOT toplevel=$git_top"
    write_blocked_sentinel "git status unverifiable at $ROOT (toplevel='$git_top'); refusing to synthesize verify signal" "$us_id" "infra_failure"
    _emit_a4_fallback_audit "$us_id" "$iter" "blocked_git_unverifiable"
    return 1
  fi

  # Gate 3: tree must be clean.
  local porcelain
  porcelain=$(git -C "$ROOT" status --porcelain 2>/dev/null)
  if [[ -z "$porcelain" ]]; then
    # All gates passed — synthesis is allowed.
    return 0
  fi

  # Dirty tree: report the first five porcelain lines, joined with '|'.
  local dirty_head
  dirty_head=$(printf '%s\n' "$porcelain" | head -n 5 | tr '\n' '|' | sed 's/|$//')
  log_error " Bug #8: done-claim present but tree dirty. Refusing synthesis. dirty: $dirty_head"
  log_debug "[GOV] iter=$iter bug8=block_dirty_tree us_id=$us_id dirty='$dirty_head'"
  write_blocked_sentinel "worker_incomplete_uncommitted: done-claim present but tree dirty ($dirty_head)" "$us_id" "metric_failure"
  _emit_a4_fallback_audit "$us_id" "$iter" "blocked_dirty_tree"
  return 1
}
701
+
702
# handle_worker_exit_codex() — handle codex worker process exit (1-shot exec).
# Runs the shared _bug8_check_synth_allowed gate; when it passes, writes an
# auto-generated "verify" iter-signal on the worker's behalf.
# Args: $1=iteration $2=signal_file
# Globals read: DONE_CLAIM_FILE, CURRENT_US
# Returns: 0 (signal generated), 1 (gate blocked, or signal could not be built)
handle_worker_exit_codex() {
  local iter="$1"
  local signal_file="$2"

  log " Codex worker process exited. Checking for done-claim + clean tree..."

  if ! _bug8_check_synth_allowed "$iter" "${CURRENT_US:-ALL}" "codex_exit_with_done_claim"; then
    return 1
  fi

  # All 3 gates passed: done-claim present, git OK, tree clean → synthesize.
  local dc_us_id
  dc_us_id=$(jq -r '.us_id // "unknown"' "$DONE_CLAIM_FILE" 2>/dev/null)
  # jq prints nothing when the done-claim is unparsable; keep the field non-empty.
  [[ -n "$dc_us_id" ]] || dc_us_id="unknown"
  log " Codex worker completed with done-claim (us_id=$dc_us_id) and clean tree. Auto-generating signal."

  # Let jq do the quoting: a us_id containing '"' or '\' would otherwise
  # yield an unparsable signal file.
  local payload
  if ! payload=$(jq -cn \
      --arg iter "$iter" \
      --arg us_id "$dc_us_id" \
      --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
      '{iteration: ($iter | tonumber), status: "verify", us_id: $us_id,
        summary: "auto-generated after codex exit (clean tree)", timestamp: $ts}' \
      2>/dev/null); then
    log_error "Could not build verify signal for iteration '$iter'"
    return 1
  fi
  printf '%s\n' "$payload" > "$signal_file"

  # v0.15.4 PR-B2-FIX: codex worker pane already exited — reaper would no-op,
  # but lock done-claim as defense-in-depth so any orphaned subprocess cannot
  # rewrite the file before lib_ralph_desk.zsh:602 archives it.
  _lock_sentinel "$DONE_CLAIM_FILE"
  _emit_a4_fallback_audit "$dc_us_id" "$iter" "codex_exit_with_done_claim_clean"
  return 0
}
724
+
725
# handle_worker_exit_claude() — a claude worker exited before writing its
# signal file; log the failure and delegate recovery to restart_worker.
# Args: $1=pane_id $2=iteration $3=trigger_file
# Returns: 0 when restart_worker succeeds, 1 for any restart_worker failure
#          (the original header describes that case as "max restarts exceeded")
handle_worker_exit_claude() {
  local pane_id="$1" iter="$2" trigger_file="$3"

  log_error "Worker exited without writing signal file"

  # Collapse every non-zero restart_worker status to a plain 1.
  restart_worker "$pane_id" "$iter" "$trigger_file" && return 0
  return 1
}
740
+
741
# --- omc-teams pattern: Kill-and-replace dead/stuck worker panes ---
# replace_worker_pane() — kill a pane and create a fresh one in the same
# layout slot: worker (top-right) / verifier (bottom-right).
# Args: $1=old pane id $2=role ("worker" or "verifier")
# Globals read: WORKER_PANE, VERIFIER_PANE, LEADER_PANE, ROOT, ITERATION,
#               SESSION_CONFIG
# Outputs: the new pane id on stdout (printed last)
# Returns: 0 on success, 1 when tmux could not create the replacement pane
#          (nothing is printed and the session config is left untouched)
replace_worker_pane() {
  local old_pane="$1"
  local role="$2" # "worker" or "verifier"

  log " Replacing dead $role pane $old_pane..."
  tmux kill-pane -t "$old_pane" 2>/dev/null

  # Pick the sibling pane to split from and the matching split flags:
  # the verifier goes below the worker, the worker goes above the verifier (-b).
  local sibling
  local -a split_args
  if [[ "$role" == "verifier" ]]; then
    sibling="$WORKER_PANE"
    split_args=(-v -d)
  else
    sibling="$VERIFIER_PANE"
    split_args=(-v -b -d)
  fi

  local new_pane=""
  if tmux display-message -t "$sibling" -p '#{pane_id}' &>/dev/null; then
    new_pane=$(tmux split-window "${split_args[@]}" -t "$sibling" -P -F '#{pane_id}' -c "$ROOT")
  else
    # Fallback: sibling pane also dead, split horizontally from leader
    new_pane=$(tmux split-window -h -d -t "$LEADER_PANE" -P -F '#{pane_id}' -c "$ROOT")
  fi

  # Never record or hand back an empty pane id.
  if [[ -z "$new_pane" ]]; then
    log_error "Failed to create replacement $role pane (old pane $old_pane)"
    return 1
  fi

  log " New $role pane: $new_pane (replaced $old_pane)"
  log_debug "[FLOW] iter=${ITERATION:-} pane_replaced=${role} old=$old_pane new=$new_pane"

  # Update session-config.json with new pane ID. Build the new document first
  # so a jq failure cannot feed empty input to atomic_write.
  if [[ -f "$SESSION_CONFIG" ]]; then
    local updated
    if updated=$(jq --arg role "$role" --arg pane "$new_pane" \
        '.panes[$role] = $pane' "$SESSION_CONFIG" 2>/dev/null) && [[ -n "$updated" ]]; then
      printf '%s\n' "$updated" | atomic_write "$SESSION_CONFIG"
      log_debug "Updated session-config.json: $role pane → $new_pane"
    else
      log_error "Could not update $SESSION_CONFIG with new $role pane $new_pane"
    fi
  fi

  printf '%s\n' "$new_pane"
}
781
+
782
+ # =============================================================================
783
+ # Dependency Checks
784
+ # =============================================================================
785
+
786
# --- governance.md s7 step 1: Validate prerequisites before starting ---
# check_dependencies() — verify that the CLIs this run needs are on PATH and
# remember where the engine binaries live.
# Globals read: WORKER_ENGINE, VERIFIER_ENGINE, CONSENSUS_MODE
# Globals written: CLAUDE_BIN (when a claude engine is in use),
#                  CODEX_BIN (when codex or consensus verification is in use)
# Exits 1 after logging every missing tool.
check_dependencies() {
  local missing=0 need_claude=0 need_codex=0

  # claude required only when claude engine is used for Worker or Verifier execution;
  # codex-only campaigns can run without claude — generate_sv_report degrades gracefully
  [[ "$WORKER_ENGINE" != "codex" || "$VERIFIER_ENGINE" != "codex" ]] && need_claude=1
  # Codex binary required only when engine=codex or consensus verification is enabled
  [[ "$WORKER_ENGINE" = "codex" || "$VERIFIER_ENGINE" = "codex" || "$CONSENSUS_MODE" != "off" ]] && need_codex=1

  command -v tmux >/dev/null 2>&1 || {
    log_error "tmux is required but not found. Install with: brew install tmux"
    missing=1
  }

  if (( need_claude )) && ! command -v claude >/dev/null 2>&1; then
    log_error "claude CLI is required but not found. See: https://docs.anthropic.com/en/docs/claude-cli"
    missing=1
  fi

  command -v jq >/dev/null 2>&1 || {
    log_error "jq is required but not found. Install with: brew install jq"
    missing=1
  }

  if (( need_codex )) && ! command -v codex >/dev/null 2>&1; then
    log_error "codex CLI not found. Install: npm install -g @openai/codex"
    missing=1
  fi

  # Report everything that is missing before giving up.
  if (( missing )); then
    exit 1
  fi

  # Resolve full path to claude binary when claude engine is in use
  if (( need_claude )); then
    CLAUDE_BIN=$(command -v claude 2>/dev/null || echo "claude")
    log " Claude binary: $CLAUDE_BIN"
  fi

  # Resolve codex binary if needed
  if (( need_codex )); then
    CODEX_BIN=$(command -v codex 2>/dev/null || echo "codex")
    log " Codex binary: $CODEX_BIN"
  fi
}
833
+
834
+ # =============================================================================
835
+ # Session Management (tmux pattern: pane IDs)
836
+ # =============================================================================
837
+
838
# --- governance.md s7 step 1: Check for existing sessions ---
# check_existing_sessions() — refuse to start when another tmux session for
# this slug (name prefix "rlp-desk-<SLUG>-") already exists. The session the
# script is currently running in is not counted.
# Globals read: SLUG
# Returns: 0 when no conflicting session exists; otherwise lists the
#          sessions on stdout and exits 1.
check_existing_sessions() {
  local current_session
  current_session=$(tmux display-message -p '#{session_name}' 2>/dev/null || echo "")

  # Compare names literally: the slug and session name are data, not regexes
  # (a '.' in the slug must not match arbitrary characters).
  local prefix="rlp-desk-${SLUG}-"
  local name
  local -a existing
  existing=()
  while IFS= read -r name; do
    [[ "$name" == "$prefix"* ]] || continue
    [[ "$name" == "$current_session" ]] && continue
    existing+=("$name")
  done < <(tmux list-sessions -F '#{session_name}' 2>/dev/null)

  if (( ${#existing[@]} == 0 )); then
    return 0
  fi

  log_error "Existing tmux session(s) found for slug '$SLUG':"
  for name in "${existing[@]}"; do
    echo " - $name"
  done
  echo ""
  echo "Kill existing session first:"
  echo " tmux kill-session -t <session-name>"
  exit 1
}
855
+
856
# --- governance.md s7 step 1: Create tmux session with pane IDs (%N) ---
# create_session() — build the three-pane layout (leader | worker over
# verifier), label and colour the panes, truncate the cost log, and record
# the layout plus run settings in $SESSION_CONFIG.
# Inside tmux ($TMUX set) the current pane is split in place; outside tmux a
# new detached session named $SESSION_NAME is created first.
# Globals written: LEADER_PANE, WORKER_PANE, VERIFIER_PANE, SESSION_NAME,
#                  BASELINE_COMMIT
# Exits 1 when the tmux session cannot be created or the session-config JSON
# cannot be built.
create_session() {
  log "Creating tmux session: $SESSION_NAME"

  # tmux split-pane pattern
  if [[ -n "${TMUX:-}" ]]; then
    # Inside tmux: split CURRENT pane in place
    # Current pane stays as-is (leader/user stays here)
    # Worker/Verifier appear on the RIGHT, user sees them immediately
    LEADER_PANE=$(tmux display-message -p '#{pane_id}')
    SESSION_NAME=$(tmux display-message -p '#{session_name}')
    log " Splitting current pane in session: $SESSION_NAME"

    # -h off current pane → right column (worker)
    WORKER_PANE=$(tmux split-window -h -d -t "$LEADER_PANE" -P -F '#{pane_id}' -c "$ROOT")
    # -v off worker → stacked below on right (verifier)
    VERIFIER_PANE=$(tmux split-window -v -d -t "$WORKER_PANE" -P -F '#{pane_id}' -c "$ROOT")
  else
    # Outside tmux: wrap current terminal into a new tmux session and attach
    # tmux pattern: user sees panes immediately, no separate attach needed
    # US-025 R13 P0: verify tmux new-session exit code; if collision + RLP_BACKGROUND,
    # disambiguate with -bg-<epoch>-<pid> suffix and a residual has-session loop.
    if ! tmux new-session -d -s "$SESSION_NAME" -x 200 -y 50 -c "$ROOT" 2>/dev/null; then
      if tmux has-session -t "$SESSION_NAME" 2>/dev/null; then
        if [[ "${RLP_BACKGROUND:-0}" == "1" ]]; then
          SESSION_NAME="${SESSION_NAME}-bg-$(date +%s)-$$"
          while tmux has-session -t "$SESSION_NAME" 2>/dev/null; do
            SESSION_NAME="${SESSION_NAME}-$(awk 'BEGIN{srand();print int(1000+rand()*9000)}')"
          done
          tmux new-session -d -s "$SESSION_NAME" -x 200 -y 50 -c "$ROOT" || {
            log_error "tmux new-session retry failed for $SESSION_NAME"
            exit 1
          }
        else
          log_error "tmux new-session failed: session $SESSION_NAME already exists (set RLP_BACKGROUND=1 to auto-rename)"
          exit 1
        fi
      else
        log_error "tmux new-session failed and session does not exist: $SESSION_NAME"
        exit 1
      fi
    fi
    # destroy-unattached off keeps the session alive when no tmux client is attached.
    # Best-effort only: it does NOT survive manual `tmux kill-session` or tmux server restart.
    # If either happens, R12 (lifecycle monitor) detects it and writes infra_failure BLOCKED.
    if [[ "${RLP_BACKGROUND:-0}" == "1" ]]; then
      tmux set-option -t "$SESSION_NAME" destroy-unattached off 2>/dev/null
    fi
    LEADER_PANE=$(tmux display-message -p -t "$SESSION_NAME" '#{pane_id}')
    WORKER_PANE=$(tmux split-window -h -d -t "$LEADER_PANE" -P -F '#{pane_id}' -c "$ROOT")
    VERIFIER_PANE=$(tmux split-window -v -d -t "$WORKER_PANE" -P -F '#{pane_id}' -c "$ROOT")
  fi

  # Set pane titles and enable border labels for visual distinction
  local worker_label="Worker ($WORKER_ENGINE:$WORKER_MODEL)"
  local verifier_label="Verifier ($VERIFIER_ENGINE:$VERIFIER_MODEL)"
  [[ "$CONSENSUS_MODE" != "off" ]] && verifier_label="Verifier ($VERIFIER_ENGINE:$VERIFIER_MODEL + consensus)"
  tmux select-pane -t "$LEADER_PANE" -T "Leader" 2>/dev/null
  tmux select-pane -t "$WORKER_PANE" -T "$worker_label" 2>/dev/null
  tmux select-pane -t "$VERIFIER_PANE" -T "$verifier_label" 2>/dev/null
  # Color-coded pane borders: green=leader, blue=worker, yellow=verifier
  tmux set-option -p -t "$LEADER_PANE" pane-border-style "fg=green" 2>/dev/null
  tmux set-option -p -t "$WORKER_PANE" pane-border-style "fg=blue" 2>/dev/null
  tmux set-option -p -t "$VERIFIER_PANE" pane-border-style "fg=yellow" 2>/dev/null
  # Show pane titles in border
  tmux set-option pane-border-status top 2>/dev/null
  tmux set-option pane-border-format "#{?pane_active,#[fg=white bold],#[fg=grey]} #{pane_title} " 2>/dev/null

  log " Leader pane: $LEADER_PANE"
  log " Worker pane: $WORKER_PANE"
  log " Verifier pane: $VERIFIER_PANE"

  # US-024 R12 P0: lifecycle check site #1 — verify all panes/session alive after creation.
  _r12_check_lifecycle "create_session"

  # AC12: Capture baseline commit before writing session config
  BASELINE_COMMIT=$(git -C "$ROOT" rev-parse HEAD 2>/dev/null || echo "none")

  # Truncate cost-log for fresh run (previous data in versioned campaign reports)
  # NOTE: ': >' not bare '>' — in zsh a bare redirect with no command runs $NULLCMD
  # (=cat), which blocks reading stdin when the leader has an open TTY (D-1 dogfood hang).
  : > "$COST_LOG"

  # v5.7 §4.2: WITH_SELF_VERIFICATION=1 is hard-rejected at script entry now,
  # so by the time we reach create_session() the flag is guaranteed to be 0.
  # The legacy "NOTE: Agent-mode only; disabling" log line was removed because
  # the deprecation banner at startup is more honest (we exit 2, we don't
  # silently disable).

  # Write session config (atomic write). jq does the string escaping so paths,
  # session names and model ids containing '"' or '\' cannot corrupt the file.
  # --arg passes strings; --argjson passes values that were previously
  # interpolated unquoted (numbers / JSON literals).
  local session_json
  if ! session_json=$(jq -n \
      --arg session_name "$SESSION_NAME" \
      --arg slug "$SLUG" \
      --arg created_at "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
      --arg baseline_commit "$BASELINE_COMMIT" \
      --arg leader "$LEADER_PANE" \
      --arg worker "$WORKER_PANE" \
      --arg verifier "$VERIFIER_PANE" \
      --argjson pid "$$" \
      --arg root "$ROOT" \
      --arg worker_model "$WORKER_MODEL" \
      --arg verifier_model "$VERIFIER_MODEL" \
      --arg worker_engine "$WORKER_ENGINE" \
      --arg verifier_engine "$VERIFIER_ENGINE" \
      --arg worker_codex_model "$WORKER_CODEX_MODEL" \
      --arg worker_codex_reasoning "$WORKER_CODEX_REASONING" \
      --arg verifier_codex_model "$VERIFIER_CODEX_MODEL" \
      --arg verifier_codex_reasoning "$VERIFIER_CODEX_REASONING" \
      --arg verify_mode "$VERIFY_MODE" \
      --arg consensus_mode "$CONSENSUS_MODE" \
      --argjson max_iter "$MAX_ITER" \
      --argjson poll_interval "$POLL_INTERVAL" \
      --argjson iter_timeout "$ITER_TIMEOUT" \
      --argjson heartbeat_stale_threshold "$HEARTBEAT_STALE_THRESHOLD" \
      --argjson max_restarts "$MAX_RESTARTS" \
      --argjson idle_nudge_threshold "$IDLE_NUDGE_THRESHOLD" \
      --argjson max_nudges "$MAX_NUDGES" \
      --argjson cb_threshold "$CB_THRESHOLD" \
      --argjson effective_cb_threshold "$EFFECTIVE_CB_THRESHOLD" \
      --argjson with_self_verification "$WITH_SELF_VERIFICATION" \
      --argjson with_self_verification_requested "$WITH_SELF_VERIFICATION_REQUESTED" \
      --arg sv_skipped_reason "$SV_SKIPPED_REASON" \
      --arg lane_mode "$LANE_MODE" \
      --argjson autonomous_mode "$AUTONOMOUS_MODE" \
      '{
        session_name: $session_name,
        slug: $slug,
        created_at: $created_at,
        baseline_commit: $baseline_commit,
        panes: {leader: $leader, worker: $worker, verifier: $verifier},
        pid: $pid,
        root: $root,
        models: {worker: $worker_model, verifier: $verifier_model},
        engines: {
          worker: $worker_engine,
          verifier: $verifier_engine,
          worker_codex_model: $worker_codex_model,
          worker_codex_reasoning: $worker_codex_reasoning,
          verifier_codex_model: $verifier_codex_model,
          verifier_codex_reasoning: $verifier_codex_reasoning
        },
        verification: {verify_mode: $verify_mode, consensus_mode: $consensus_mode},
        config: {
          max_iter: $max_iter,
          poll_interval: $poll_interval,
          iter_timeout: $iter_timeout,
          heartbeat_stale_threshold: $heartbeat_stale_threshold,
          max_restarts: $max_restarts,
          idle_nudge_threshold: $idle_nudge_threshold,
          max_nudges: $max_nudges,
          cb_threshold: $cb_threshold,
          effective_cb_threshold: $effective_cb_threshold,
          with_self_verification: $with_self_verification,
          with_self_verification_requested: $with_self_verification_requested,
          sv_skipped_reason: $sv_skipped_reason,
          lane_mode: $lane_mode,
          autonomous_mode: $autonomous_mode
        }
      }'); then
    log_error "Failed to build session config JSON for $SESSION_CONFIG (non-JSON value in a numeric setting?)"
    exit 1
  fi
  printf '%s\n' "$session_json" | atomic_write "$SESSION_CONFIG"

  log " Session config: $SESSION_CONFIG"
}
995
+
996
+ # =============================================================================
997
+ # Copy-Mode Guard (tmux pattern)
998
+ # =============================================================================
999
+
1000
# --- governance.md s7 step 5: Check pane_in_mode before every send-keys ---
# check_copy_mode() — ask tmux whether the pane is currently in a mode
# (#{pane_in_mode}), in which case keystrokes would not reach the program.
# Args: $1=pane_id
# Returns: 0 when keys can be sent; 1 when the pane is in a mode or the
#          tmux query itself failed.
check_copy_mode() {
  local pane_id="$1" in_mode

  if ! in_mode=$(tmux display-message -p -t "$pane_id" '#{pane_in_mode}' 2>/dev/null); then
    return 1
  fi

  # pane is in copy mode, cannot send keys
  [[ "$in_mode" -eq 1 ]] && return 1
  return 0
}
1010
+
1011
+ # =============================================================================
1012
+ # Verification-Based Send Retry (tmux pattern)
1013
+ # =============================================================================
1014
+
1015
# --- Reliable text paste via tmux buffer (avoids send-keys -l char-by-char issues) ---
# paste_to_pane() — stage the text in a private temp file, load it into the
# tmux buffer "rlp-paste", and paste (then delete) that buffer into the pane.
# Args: $1=pane_id $2=text (pasted verbatim, no trailing newline added)
# Returns: 0 on success; non-zero when the temp file could not be written,
#          or when tmux load-buffer / paste-buffer failed.
paste_to_pane() {
  local pane_id="$1"
  local text="$2"
  local tmpbuf rc=0

  # mktemp gives an unpredictable, exclusively created file instead of a
  # guessable /tmp name.
  tmpbuf=$(mktemp "${TMPDIR:-/tmp}/.rlp-desk-paste.XXXXXX") || return 1

  # printf '%s' writes the text verbatim: echo -n would swallow text such as
  # "-n" and (in zsh) expand backslash escapes.
  if printf '%s' "$text" > "$tmpbuf" \
    && tmux load-buffer -b rlp-paste "$tmpbuf" 2>/dev/null; then
    tmux paste-buffer -b rlp-paste -d -t "$pane_id" 2>/dev/null || rc=$?
  else
    # Nothing was loaded, so there is nothing valid to paste.
    rc=1
  fi

  rm -f -- "$tmpbuf"
  return "$rc"
}
1025
+
1026
# --- governance.md s7 step 5: Send with copy-mode guard and retry ---
# safe_send_keys() — paste text into a pane and press Enter until the text no
# longer shows in the last lines of the pane (taken as "submitted").
# Sequence: copy-mode guard → dismiss known prompts → paste → up to 6 submit
# rounds → clear line (C-u), re-paste, up to 4 more rounds → final nudge.
# Args: $1=pane_id $2=text
# Returns: 0 when the text was consumed, or after the final fail-open nudge
#          (submission not confirmed); 1 when the pane is in copy mode.
safe_send_keys() {
  local pane_id="$1"
  local text="$2"
  # Declared once, outside the loops: repeating a bare `local name` on a
  # variable that already has a value makes zsh print it (TYPESET_SILENT is
  # off by default), which would leak pane contents to stdout.
  local initial_capture check_capture retry_capture
  local pane_busy=0 round=0 retry_round=0

  # --- Exact tmux sendToWorker pattern (tmux-session.js:527-626) ---

  # Guard: copy-mode captures keys; skip entirely
  if ! check_copy_mode "$pane_id"; then
    log_debug " Pane $pane_id in copy mode, skipping send"
    return 1
  fi

  # Check for trust prompt and auto-dismiss.
  # printf (not echo) feeds grep so escapes in the capture stay literal.
  initial_capture=$(tmux capture-pane -t "$pane_id" -p -S -20 2>/dev/null)
  if printf '%s\n' "$initial_capture" | grep -q "esc to interrupt" 2>/dev/null; then
    pane_busy=1
  fi
  if printf '%s\n' "$initial_capture" | grep -q "Do you trust" 2>/dev/null; then
    log_debug " Trust prompt detected, dismissing"
    tmux send-keys -t "$pane_id" C-m
    sleep 0.12
  fi
  # Auto-approve permission prompts ("Do you want to create/overwrite X?")
  if printf '%s\n' "$initial_capture" | grep -q "Do you want to" 2>/dev/null; then
    log_debug " Permission prompt detected, auto-approving"
    tmux send-keys -t "$pane_id" C-m
    sleep 0.3
  fi
  # Auto-dismiss codex update prompt (select Skip)
  if printf '%s\n' "$initial_capture" | grep -qi "new version\|update.*codex\|codex.*update" 2>/dev/null; then
    log_debug " Codex update prompt detected, selecting Skip"
    tmux send-keys -t "$pane_id" "2" C-m
    sleep 0.2
  fi

  # Send text via buffer paste (reliable for long strings)
  log_debug " Pasting text to pane $pane_id (${#text} chars)"
  paste_to_pane "$pane_id" "$text"

  # Allow input buffer to settle (tmux: 150ms)
  sleep 0.15

  # Submit: up to 6 rounds of C-m double-press
  while (( round < 6 )); do
    sleep 0.1
    if (( round == 0 && pane_busy )); then
      # Busy pane: just C-m (DO NOT send Tab — it toggles Claude Code permission mode)
      tmux send-keys -t "$pane_id" C-m
    else
      tmux send-keys -t "$pane_id" C-m
      sleep 0.2
      tmux send-keys -t "$pane_id" C-m
    fi
    sleep 0.14

    # Check if text was consumed. `--` keeps text that starts with '-' from
    # being parsed as grep options (which would look like "consumed").
    check_capture=$(tmux capture-pane -t "$pane_id" -p 2>/dev/null | tail -5)
    if ! printf '%s\n' "$check_capture" | grep -qF -- "$text" 2>/dev/null; then
      log_debug " Text consumed after round $((round + 1))"
      return 0
    fi
    sleep 0.14
    (( round++ ))
  done

  # Safety gate: copy-mode check
  if ! check_copy_mode "$pane_id"; then
    log_debug " Copy mode activated during send, aborting"
    return 1
  fi

  # Adaptive fallback: C-u clear line, resend (tmux pattern)
  log_debug " Adaptive retry — clearing line and resending"
  tmux send-keys -t "$pane_id" C-u
  sleep 0.08
  if ! check_copy_mode "$pane_id"; then
    return 1
  fi
  paste_to_pane "$pane_id" "$text"
  sleep 0.12
  while (( retry_round < 4 )); do
    tmux send-keys -t "$pane_id" C-m
    sleep 0.18
    tmux send-keys -t "$pane_id" C-m
    sleep 0.14
    retry_capture=$(tmux capture-pane -t "$pane_id" -p 2>/dev/null | tail -5)
    if ! printf '%s\n' "$retry_capture" | grep -qF -- "$text" 2>/dev/null; then
      log_debug " Text consumed after adaptive retry round $((retry_round + 1))"
      return 0
    fi
    (( retry_round++ ))
  done

  # Fail-open: one last nudge
  if ! check_copy_mode "$pane_id"; then
    return 1
  fi
  tmux send-keys -t "$pane_id" C-m
  sleep 0.12
  tmux send-keys -t "$pane_id" C-m
  log_debug " Fail-open — text may or may not have been submitted"
  return 0
}
1135
+
1136
+ # =============================================================================
1137
+ # Wait for Pane Ready (tmux pattern: paneLooksReady)
1138
+ # =============================================================================
1139
+
1140
# wait_for_pane_ready() — poll a pane until it shows an input prompt and no
# running task, auto-answering known startup prompts along the way.
# Args: $1=pane_id $2=timeout in seconds (default 10)
# Returns: always 0 — on timeout it proceeds anyway (fail-open) and leaves
#          submission to safe_send_keys.
wait_for_pane_ready() {
  local pane_id="$1"
  local timeout="${2:-10}" # tmux default: 10s
  # Loop variables are declared once here: a bare `local name` repeated inside
  # the loop makes zsh print the variable's current value on each later pass
  # (TYPESET_SILENT is off by default).
  local start captured line trimmed ready
  start=$(date +%s)
  log " Waiting for pane $pane_id ready..."
  while (( $(date +%s) - start < timeout )); do
    captured=$(tmux capture-pane -t "$pane_id" -p -S -20 2>/dev/null)

    # Auto-dismiss trust prompt (tmux pattern: paneHasTrustPrompt)
    if printf '%s\n' "$captured" | grep -q "Do you trust" 2>/dev/null; then
      log " Trust prompt detected, auto-dismissing..."
      tmux send-keys -t "$pane_id" C-m
      sleep 0.12
      tmux send-keys -t "$pane_id" C-m
      sleep 2
      continue
    fi

    # Auto-approve permission prompts ("Do you want to create/overwrite X?")
    if printf '%s\n' "$captured" | grep -q "Do you want to" 2>/dev/null; then
      log " Permission prompt detected, auto-approving..."
      tmux send-keys -t "$pane_id" C-m
      sleep 0.5
      continue
    fi

    # Auto-dismiss codex update prompt (select Skip = option 2)
    if printf '%s\n' "$captured" | grep -qi "new version\|update.*codex\|codex.*update" 2>/dev/null; then
      log " Codex update prompt detected, selecting Skip..."
      tmux send-keys -t "$pane_id" "2" C-m
      sleep 0.5
      continue
    fi

    # tmux paneLooksReady: check each line for prompt char at line start.
    # All leading whitespace is stripped (the previous `${line## }` removed a
    # single space only), which also covers what the former `tail | grep`
    # fallbacks matched. A here-string keeps the loop in the current shell so
    # `ready` survives in every shell, not just zsh.
    ready=0
    while IFS= read -r line; do
      trimmed="${line#"${line%%[![:space:]]*}"}"
      if [[ "$trimmed" == ❯* || "$trimmed" == \>* || "$trimmed" == ›* || "$trimmed" == »* ]]; then
        ready=1
        break
      fi
    done <<< "$captured"

    if (( ready )); then
      # Check no active task running
      if ! printf '%s\n' "$captured" | grep -q "esc to interrupt" 2>/dev/null; then
        log " Pane $pane_id is ready."
        return 0
      fi
    fi
    sleep 0.25
  done
  # Timeout — return success anyway (fail-open, let safe_send_keys handle it)
  log " Pane $pane_id ready timeout after ${timeout}s (proceeding anyway)"
  return 0
}
1203
+
1204
+ # =============================================================================
1205
+ # Heartbeat Monitoring (tmux pattern)
1206
+ # =============================================================================
1207
+
1208
# --- governance.md s7 step 5+6: Check heartbeat freshness ---
# check_heartbeat() — succeed when the heartbeat file's `.epoch` field is
# younger than $HEARTBEAT_STALE_THRESHOLD seconds.
# Args: $1=heartbeat file (JSON)
# Globals read: HEARTBEAT_STALE_THRESHOLD
# Returns: 0 = fresh; 1 = stale, file missing, or `.epoch` absent/not a number
check_heartbeat() {
  local hb_file="$1"
  local threshold="$HEARTBEAT_STALE_THRESHOLD"
  local hb_epoch now_epoch
  # Kept in a variable so bash and zsh both treat it as a regex.
  local epoch_re='^[0-9]+(\.[0-9]+)?$'

  [[ -f "$hb_file" ]] || return 1

  # Read epoch seconds directly (avoids timezone parsing bugs)
  hb_epoch=$(jq -r '.epoch // empty' "$hb_file" 2>/dev/null) || return 1

  # The value comes from a file written by another process. Only a plain
  # number may reach arithmetic evaluation, where arbitrary text would be
  # re-evaluated as an expression.
  [[ "$hb_epoch" =~ $epoch_re ]] || return 1
  hb_epoch="${hb_epoch%%.*}" # drop any fractional part

  now_epoch=$(date +%s)
  # 10# forces base 10 regardless of leading zeros.
  (( now_epoch - 10#$hb_epoch < threshold ))
}
1228
+
1229
# Check if heartbeat indicates process has exited
# check_heartbeat_exited() — succeed only when the heartbeat file exists and
# its `.status` field reads "exited".
# Args: $1=heartbeat file (JSON)
# Returns: 0 = status is "exited"; 1 = file missing or any other status
check_heartbeat_exited() {
  local hb_file="$1" hb_status

  [[ -f "$hb_file" ]] || return 1

  hb_status=$(jq -r '.status // empty' "$hb_file" 2>/dev/null)
  [[ "$hb_status" == "exited" ]]
}
1239
+
1240
+ # =============================================================================
1241
+ # Idle Pane Nudging (tmux pattern)
1242
+ # =============================================================================
1243
+
1244
+ # --- v5.7 §4.13.a: Mid-execution permission-prompt auto-dismiss (Bug 4 fix) ---
1245
+ # claude CLI v2.1.114+ surfaces TUI-layer prompts ("Do you want to create...")
1246
+ # even with --dangerously-skip-permissions on certain Write paths. Without this
1247
+ # helper, Workers/Verifiers hang until IDLE_NUDGE_THRESHOLD timeout.
1248
+ #
1249
+ # Window-bounded match (codex Critic v5.7): require both a prompt phrase AND a
1250
+ # TUI affordance marker on the SAME, PREVIOUS, or NEXT line. Whole-capture dual
1251
+ # grep would let unrelated text trigger Enter (R-V5-9 false-positive).
1252
+ # Per-pane 3-second debounce prevents rapid double-Enter.
1253
# Clock helper: use $EPOCHSECONDS from zsh/datetime when the module loads,
# otherwise fall back to date(1).
zmodload zsh/datetime 2>/dev/null || true
_now_s() { print -- "${EPOCHSECONDS:-$(date +%s)}"; }

# ---- Per-pane bookkeeping (associative arrays keyed by pane id) ------------
# Timestamp of the last auto-approve keypress (debounce).
typeset -gA LAST_AUTO_APPROVE_TS
# v5.7 §4.16: when each pane FIRST entered a prompt-stuck state; cleared on
# the first capture without a visible prompt. Drives bounded prompt-stall
# escalation (BLOCKED `prompt_stall`) so alive-but-stuck Workers can't
# infinite-wait (codex Critic HIGH finding).
typeset -gA PANE_PROMPT_STUCK_SINCE
typeset -gA PANE_DISMISS_FAILED_COUNT
# v5.7 §4.17: progress tracking for the generic no-progress timeout.
typeset -gA PANE_LAST_CHANGE_TS # epoch when content last changed
typeset -gA PANE_LAST_CONTENT_FOR_PROGRESS # captured content for diff
# v0.14.1: remembers that a pane already received its codex idle grace, so it
# is not granted repeatedly. Bug Report #3 (BOS 2026-05-04).
typeset -gA PANE_CODEX_IDLE_GRACED
# v0.14.2: per-verifier-pane trace flag — log the verdict-lookup outcome
# exactly once per byte-stasis transition. Bug Report #4 (BOS 2026-05-05).
typeset -gA PANE_VERIFIER_TRACE_LOGGED

# ---- Tunables (each may be preset through the environment) -----------------
# v5.7 §4.16 prompt-stall escalation limits.
: "${PROMPT_STALL_TIMEOUT:=300}" # 5 min default
: "${PROMPT_DISMISS_FAIL_LIMIT:=20}" # ~100s of fruitless dismiss attempts
# v5.7 §4.17: generic no-progress timeout (codex Critic HIGH — closes the gap
# where an undetected prompt or alive-but-frozen Worker bypasses Layer 4).
# Independent of prompt detection: if pane content stops changing for this
# many seconds AND the signal file is still missing, BLOCKED `infra_failure`
# with reason `worker_no_progress` is written so silent infinite-wait is
# impossible.
: "${PROGRESS_NO_CHANGE_TIMEOUT:=600}" # 10 min default
# v0.14.1: codex post-work idle UI grace. When a verifier pane shows codex's
# "Worked for Xm Ys" idle line at byte-stasis time, grant one extra
# CODEX_IDLE_GRACE_S (default 120s) before BLOCK.
: "${CODEX_IDLE_GRACE_S:=120}"

# ---- Prompt-detection patterns (extended regexes) --------------------------
# v5.7 §4.17: default-No prompt detection. Pressing Enter on these means
# CANCEL/REJECT, not approve — so we BLOCK with traceability instead of
# silently auto-dismissing the wrong way.
typeset -g _DEFAULT_NO_RE='\[y/N\]|\(yes/no, default no\)|default[: ]+no|^[[:space:]]*N\)'

# v5.7 §4.16: broadened prompt detection (codex Critic MEDIUM).
# v5.7 §4.20 (E2E real-claude-CLI finding): claude v2.1.114+ uses a new trust
# prompt format ("Quick safety check: Is this a project you ... trust?") and a
# numbered picker with the `❯` cursor adjacent to the digit ("❯1.Yes"). The
# old patterns ("Do you trust") missed it entirely → Worker hung 5min until
# iter-timeout. Hence the "Quick safety check" / "trust this (folder|directory)"
# alternatives in _PROMPT_RE, and ❯<optional space><digits>. plus
# "Enter to confirm" / "1) (Yes|No)" in _AFFORDANCE_RE.
typeset -g _PROMPT_RE='Do you (want to|trust)|Confirm execution|Are you sure|Continue\?|Proceed\?|Allow this|Approve this|Press y to|Choose an option|Select \[|Quick safety check|trust this (folder|directory)|Is this a project you'
typeset -g _AFFORDANCE_RE='\(y/n\)|\[Y/n\]|\[y/N\]|\(yes/no|❯[[:space:]]*[0-9]+\.|(^|[[:space:]])1\) (Yes|No)|(^|[[:space:]])[YyNn]\)|press (y|enter) to|Enter to confirm'

# v5.7 §4.18 (E2E real-tmux + omc benchmarking): "active task" markers that
# tell a Worker busy producing output (which may legitimately print "(y/n)"
# inside its body text) from a Worker *idle at an unrecognized prompt*.
# Mirrors omc-team's `paneHasActiveTask` heuristic
# (src/team/tmux-session.ts:659). When ANY marker is in the recent pane tail
# the Worker is alive — auto_dismiss must NOT fast-fail on a suspected-unknown
# prompt because the affordance text is just transcript.
typeset -g _ACTIVE_TASK_RE='esc to interrupt|background terminal running|^[[:space:]]*[·✻][[:space:]]+[A-Za-z]+(\.{3}|…)'
1309
+
1310
+ auto_dismiss_prompts() {
1311
+ local pane_id="$1"
1312
+ local now
1313
+ now=$(_now_s)
1314
+ local last=${LAST_AUTO_APPROVE_TS[$pane_id]:-0}
1315
+
1316
+ local capture
1317
+ # v5.7 §4.21 (E2E real-claude-CLI finding): claude v2.x trust prompt wraps
1318
+ # to ~30 lines on narrow panes. -S -10 missed the question header. -50
1319
+ # covers the full prompt.
1320
+ capture=$(tmux capture-pane -t "$pane_id" -p -S -50 2>/dev/null) || return 0
1321
+
1322
+ # v5.7 §4.21 (E2E real-claude-CLI finding): claude v2.x trust prompt is
1323
+ # multi-line and wraps narrowly, so per-line PROMPT_RE+AFFORDANCE adjacency
1324
+ # misses it. Special-case the signature ("Quick safety check ... Enter to
1325
+ # confirm" with `❯N.Yes` cursor on option 1). This is default-Yes — Enter
1326
+ # approves trust.
1327
+ # §4.21.b: tmux narrow-pane wrap breaks the question phrase across lines
1328
+ # (`Quick safety\n check`). Normalize all whitespace to single spaces so
1329
+ # substring matching works regardless of pane width.
1330
+ local _norm_capture="${capture//[$'\n\r\t']/ }"
1331
+ while [[ "$_norm_capture" == *" "* ]]; do _norm_capture="${_norm_capture// / }"; done
1332
+ if { [[ "$_norm_capture" == *"Quick safety check"* ]] || [[ "$_norm_capture" == *"trust this folder"* ]] || [[ "$_norm_capture" == *"trust this directory"* ]]; } \
1333
+ && [[ "$_norm_capture" == *"Enter to confirm"* ]] \
1334
+ && [[ "$_norm_capture" =~ '❯ ?[0-9]+\. ?Yes' ]]; then
1335
+ if (( now - last >= 3 )); then
1336
+ log " Claude v2.x trust prompt detected in pane $pane_id, auto-approving (Enter)"
1337
+ log_debug "[FLOW] claude_trust_prompt_auto_approved=true pane=$pane_id"
1338
+ tmux send-keys -t "$pane_id" Enter 2>/dev/null
1339
+ LAST_AUTO_APPROVE_TS[$pane_id]=$now
1340
+ fi
1341
+ return 0
1342
+ fi
1343
+ # Older claude trust prompt format (omc-team parity).
1344
+ if [[ "$_norm_capture" == *"Do you trust the contents of this directory"* ]] \
1345
+ && { [[ "$_norm_capture" =~ 'Yes,[[:space:]]*continue' ]] || [[ "$_norm_capture" == *"Press enter to continue"* ]]; }; then
1346
+ if (( now - last >= 3 )); then
1347
+ log " Claude (legacy) trust prompt detected in pane $pane_id, auto-approving (Enter)"
1348
+ log_debug "[FLOW] claude_trust_prompt_auto_approved=true pane=$pane_id"
1349
+ tmux send-keys -t "$pane_id" Enter 2>/dev/null
1350
+ LAST_AUTO_APPROVE_TS[$pane_id]=$now
1351
+ fi
1352
+ return 0
1353
+ fi
1354
+
1355
+ local -a lines
1356
+ lines=("${(@f)capture}")
1357
+ local i n=${#lines[@]} prompt_visible=0
1358
+ # v5.7 §4.23 (E2E real-claude-CLI finding): tmux narrow-pane wrap breaks
1359
+ # multi-line prompts (e.g. "Do you want to\nmake this edit to\nfile.md?\n
1360
+ # ❯ 1. Yes") so PROMPT+AFFORDANCE±1 line-adjacency misses them. Fix: run
1361
+ # the match against the LAST 15 normalized lines (whitespace collapsed)
1362
+ # — where the active prompt sits — as a single string. PROMPT_RE +
1363
+ # AFFORDANCE_RE both present → auto-Enter unless DEFAULT_NO_RE present
1364
+ # (BLOCK). §4.17.b is preserved: full-capture default-No scan protects
1365
+ # against scrollback contamination.
1366
+ local _tail_start=$((n > 15 ? n - 14 : 1))
1367
+ local _tail_normalized=""
1368
+ for ((i=_tail_start; i <= n; i++)); do
1369
+ _tail_normalized+="${lines[i]} "
1370
+ done
1371
+ while [[ "$_tail_normalized" == *" "* ]]; do _tail_normalized="${_tail_normalized// / }"; done
1372
+ local default_no_seen=0
1373
+ local sample_pattern="${_tail_normalized:0:120}"
1374
+ if [[ "$_tail_normalized" =~ $_PROMPT_RE ]] && [[ "$_tail_normalized" =~ $_AFFORDANCE_RE ]]; then
1375
+ prompt_visible=1
1376
+ fi
1377
+ # Default-No scan: full capture, not just tail (scrollback contamination guard).
1378
+ if [[ "$capture" =~ $_DEFAULT_NO_RE ]]; then
1379
+ default_no_seen=1
1380
+ fi
1381
+
1382
+ if (( default_no_seen )); then
1383
+ # v5.7 §4.17 + §4.17.b: default-No prompts ([y/N], "default: no") cannot
1384
+ # be auto-Enter'd safely — pressing Enter would CANCEL the operation.
1385
+ # If the pane has ANY default-No prompt visible (even alongside older
1386
+ # default-Yes prompts in scrollback), BLOCK with traceability.
1387
+ log_error "Default-No prompt detected in pane $pane_id — cannot safely auto-dismiss"
1388
+ log_debug "[GOV] default_no_prompt_detected=true pane=$pane_id action=block"
1389
+ write_blocked_sentinel \
1390
+ "Pane shows a default-No / explicit-No-default permission prompt. Auto-Enter would CANCEL the operation rather than approve it. Operator must manually respond with 'y' or extend prompt-handling logic. Pattern: $sample_pattern" \
1391
+ "${CURRENT_US:-ALL}" \
1392
+ "infra_failure"
1393
+ return 0
1394
+ fi
1395
+
1396
+ if (( prompt_visible )); then
1397
+ # All visible prompts are default-Yes-equivalent — safe to auto-Enter.
1398
+ if [[ -z "${PANE_PROMPT_STUCK_SINCE[$pane_id]:-}" ]]; then
1399
+ PANE_PROMPT_STUCK_SINCE[$pane_id]=$now
1400
+ fi
1401
+ if (( now - last >= 3 )); then
1402
+ log " Permission prompt detected in pane $pane_id, auto-approving (Enter)"
1403
+ log_debug "[FLOW] permission_prompt_auto_approved=true pane=$pane_id"
1404
+ tmux send-keys -t "$pane_id" Enter 2>/dev/null
1405
+ LAST_AUTO_APPROVE_TS[$pane_id]=$now
1406
+ PANE_DISMISS_FAILED_COUNT[$pane_id]=$((${PANE_DISMISS_FAILED_COUNT[$pane_id]:-0} + 1))
1407
+ fi
1408
+ return 0
1409
+ fi
1410
+
1411
+ # v5.7 §4.18: unknown-prompt fast-fail (E2E + omc benchmarking finding).
1412
+ # If pane has an affordance marker (y/n bracket etc.) but NO recognized
1413
+ # PROMPT_RE phrasing, the Worker is likely awaiting an unknown variant of
1414
+ # a yes/no prompt. omc-team's principle (tmux-session.ts:639): never
1415
+ # auto-Enter on unknown prompts — pressing Enter could approve OR cancel
1416
+ # depending on default. BLOCK immediately so the operator can extend the
1417
+ # PROMPT_RE catalog, instead of waiting 10 min for the freeze timeout.
1418
+ #
1419
+ # False-positive guard: skip if any "active task" marker is present
1420
+ # (esc to interrupt / background terminal / spinner) — that means the
1421
+ # Worker is producing output and the affordance text is just transcript.
1422
+ local active=0
1423
+ local affordance_seen=0
1424
+ local sample=""
1425
+ for ((i=1; i <= n; i++)); do
1426
+ if [[ "${lines[i]}" =~ $_ACTIVE_TASK_RE ]]; then
1427
+ active=1
1428
+ break
1429
+ fi
1430
+ done
1431
+ if (( ! active )); then
1432
+ # Only check the last 5 non-empty lines (where an idle prompt would sit).
1433
+ local -a tail_lines
1434
+ tail_lines=()
1435
+ local k
1436
+ for ((k=n; k >= 1 && ${#tail_lines[@]} < 5; k--)); do
1437
+ [[ -z "${lines[k]}" ]] && continue
1438
+ tail_lines=("${lines[k]}" "${tail_lines[@]}")
1439
+ done
1440
+ for line in "${tail_lines[@]}"; do
1441
+ if [[ "$line" =~ $_AFFORDANCE_RE ]]; then
1442
+ affordance_seen=1
1443
+ sample="${line:0:120}"
1444
+ break
1445
+ fi
1446
+ done
1447
+ fi
1448
+ if (( affordance_seen )); then
1449
+ # Re-check default-No (could be the active prompt's bracket — must BLOCK).
1450
+ local default_no_in_tail=0
1451
+ for line in "${tail_lines[@]}"; do
1452
+ if [[ "$line" =~ $_DEFAULT_NO_RE ]]; then
1453
+ default_no_in_tail=1
1454
+ break
1455
+ fi
1456
+ done
1457
+ local reason
1458
+ if (( default_no_in_tail )); then
1459
+ reason="Pane shows a default-No affordance ([y/N], 'default: no') but the surrounding prompt phrasing is not in PROMPT_RE. Auto-Enter would CANCEL. Operator must respond manually or extend PROMPT_RE. Sample: $sample"
1460
+ else
1461
+ reason="Pane shows a y/n affordance marker without a recognized prompt phrasing — likely an unknown CLI prompt variant. Refusing to guess auto-Enter (which could be the wrong default). Operator must respond manually or extend PROMPT_RE. Sample: $sample"
1462
+ fi
1463
+ log_error "Unknown-prompt affordance detected in pane $pane_id — fast-fail BLOCK"
1464
+ log_debug "[GOV] unknown_prompt_detected=true pane=$pane_id action=block default_no=$default_no_in_tail"
1465
+ write_blocked_sentinel "$reason" "${CURRENT_US:-ALL}" "infra_failure"
1466
+ return 0
1467
+ fi
1468
+ # No prompt visible — clear stall tracking so re-entry is fresh.
1469
+ if [[ -n "${PANE_PROMPT_STUCK_SINCE[$pane_id]:-}" ]]; then
1470
+ log_debug "[FLOW] prompt_cleared=true pane=$pane_id"
1471
+ # zsh: unset assoc-array member via reset to empty + delete key.
1472
+ PANE_PROMPT_STUCK_SINCE[$pane_id]=""
1473
+ PANE_DISMISS_FAILED_COUNT[$pane_id]=""
1474
+ unset "PANE_PROMPT_STUCK_SINCE[$pane_id]"
1475
+ unset "PANE_DISMISS_FAILED_COUNT[$pane_id]"
1476
+ fi
1477
+ }
1478
+
1479
# v5.7 §4.16: bounded prompt-stall escalation (codex Critic HIGH finding).
#
# Prevents an alive-but-prompt-stuck pane from extending the wait forever.
# A pane is considered stalled once it has been in the prompt-visible state
# for at least PROMPT_STALL_TIMEOUT seconds, or once the auto-dismiss counter
# has reached PROMPT_DISMISS_FAIL_LIMIT.
#
# Args:
#   $1 - tmux pane id
#   $2 - user-story id for the sentinel (default: $CURRENT_US, else "ALL")
# Globals read:
#   PANE_PROMPT_STUCK_SINCE, PANE_DISMISS_FAILED_COUNT,
#   PROMPT_STALL_TIMEOUT, PROMPT_DISMISS_FAIL_LIMIT, ITERATION
# Returns:
#   0 - pane is fine (no stall recorded, or still under both limits)
#   1 - stall limit hit; a BLOCKED sentinel has been written and the caller
#       should propagate the failure
check_prompt_stall() {
  local pane_id="$1"
  local us_id="${2:-${CURRENT_US:-ALL}}"

  # Unset, empty, or 0 all mean "no prompt currently being tracked".
  local first_seen=${PANE_PROMPT_STUCK_SINCE[$pane_id]:-0}
  if (( first_seen == 0 )); then
    return 0
  fi

  local now
  now=$(_now_s)
  local elapsed=$(( now - first_seen ))
  local attempts=${PANE_DISMISS_FAILED_COUNT[$pane_id]:-0}

  # Still inside both budgets → keep waiting.
  if (( elapsed < PROMPT_STALL_TIMEOUT && attempts < PROMPT_DISMISS_FAIL_LIMIT )); then
    return 0
  fi

  log_error "Pane $pane_id stuck on prompt for ${elapsed}s ($attempts dismiss attempts) — escalating to BLOCKED"
  log_debug "[GOV] iter=${ITERATION:-0} prompt_stall_escalated=true pane=$pane_id stuck_for=${elapsed}s dismiss_attempts=$attempts threshold=${PROMPT_STALL_TIMEOUT}s"
  write_blocked_sentinel \
    "Pane stuck on TUI prompt for ${elapsed}s after ${attempts} dismiss attempts. Auto-dismiss patterns may need to be widened (see ~/.claude/ralph-desk/known-prompts.txt convention) or the underlying claude CLI prompt is genuinely unsupported. No documentation produced for this iteration." \
    "$us_id" \
    "infra_failure"
  return 1
}
1509
+
1510
# v0.14.1 / v0.14.2: codex post-work idle UI detector.
#
# After codex finishes a task it parks at an idle screen: a status line like
# "─ Worked for 5m 36s ──", a "› " prompt, "Context X% left", the model line
# and a canned suggestion ("Improve documentation in @filename"). That is a
# successful idle state, NOT a permission prompt and NOT a frozen pane.
# v0.14.2 Bug Report #4 showed the original strict dash-bracket regex was too
# narrow, so detection uses several independent markers; ANY one fires idle.
#
# The pane text is fed to grep through a here-string rather than `print |`:
# `print` without -r interprets backslash escapes found in the capture (a
# literal "\c" silently truncates the text), and a single grep avoids up to
# four pipelines per poll.
#
# Args:
#   $1 - captured pane text
# Returns:
#   0 if any codex idle marker is present, 1 otherwise.
is_codex_idle_ui() {
  local pane_text="$1"
  # Markers, in order:
  #   1. "Worked for Xm Ys"        — most reliable idle marker.
  #   2. "Context X% left"         — status bar shown while waiting at prompt.
  #   3. model + reasoning line    — e.g. "gpt-5.5 high · feature/...".
  #   4. default suggestion text   — accepted verbatim because the leading
  #                                  "›" can be wrapped away by narrow panes.
  grep -qE \
    -e 'Worked for [0-9]+m [0-9]+s' \
    -e 'Context [0-9]+%[[:space:]]*left' \
    -e 'gpt-[0-9]+(\.[0-9]+)? (low|medium|high|xhigh) ·' \
    -e 'Improve documentation in @|Summarize recent commits|Explain (this )?code' \
    <<<"$pane_text" && return 0
  return 1
}
1537
+
1538
# v0.14.2 Bug Report #4 H1: codex sometimes lands the verdict at the
# pre-v0.13.0 legacy path (`<root>/.claude/ralph-desk/memos/...`) instead of
# `.rlp-desk/memos/`, even when the prompt instructs otherwise. When the
# legacy file exists and parses as JSON, move it to the canonical path so the
# rest of the pipeline sees a single location.
#
# Best-effort: on any failure the legacy file is left untouched and the
# function returns 1 so the campaign keeps polling.
#
# Globals read:
#   LEGACY_VERDICT_FILE - legacy verdict path (may be unset)
#   VERDICT_FILE        - canonical verdict path (destination)
#   ITERATION           - used in the debug trace only
# Returns:
#   0 - verdict was moved to VERDICT_FILE
#   1 - nothing to migrate, invalid JSON, no destination, or move failed
_migrate_legacy_verdict() {
  [[ -n "${LEGACY_VERDICT_FILE:-}" && -f "$LEGACY_VERDICT_FILE" ]] || return 1
  # Without a destination there is nothing safe to do; the guard also keeps
  # the expansions below from tripping `set -u` when VERDICT_FILE is unset.
  [[ -n "${VERDICT_FILE:-}" ]] || return 1
  # Only migrate complete, parseable JSON (a partial write fails here).
  jq -e . "$LEGACY_VERDICT_FILE" >/dev/null 2>&1 || return 1

  log "Verdict file found at legacy path ${LEGACY_VERDICT_FILE} — moving to ${VERDICT_FILE}"
  mkdir -p "$(dirname "$VERDICT_FILE")" 2>/dev/null
  if mv -f "$LEGACY_VERDICT_FILE" "$VERDICT_FILE" 2>/dev/null; then
    # Record success only after the move actually happened.
    log_debug "[GOV] iter=${ITERATION:-0} legacy_verdict_migrated=true from=${LEGACY_VERDICT_FILE} to=${VERDICT_FILE}"
    return 0
  fi
  log_debug "[GOV] iter=${ITERATION:-0} legacy_verdict_migrated=false from=${LEGACY_VERDICT_FILE} to=${VERDICT_FILE}"
  return 1
}
1554
+
1555
# v0.14.1 / v0.14.2: verdict-aware short-circuit (Bug Reports #3 and #4).
#
# When the polled pane is a verifier pane AND a valid verdict file is already
# on disk (canonical path, or a legacy path that gets auto-migrated), the
# verifier has finished. The harvest step should observe the verdict, not the
# generic no-progress watcher, so returning 0 lets the caller keep polling
# instead of escalating BLOCKED.
#
# Args:
#   $1 - tmux pane id being polled
# Globals read:
#   VERIFIER_PANE, FINAL_VERIFIER_PANE, VERDICT_FILE
# Returns:
#   0 - pane is a verifier pane and a JSON-valid verdict is in place
#   1 - otherwise
_verifier_pane_has_verdict() {
  local pane_id="$1"
  # An empty id would otherwise compare equal to an unset/empty verifier pane
  # variable; mirror the non-empty guard used by _worker_pane_has_signal.
  [[ -n "$pane_id" ]] || return 1
  [[ "$pane_id" == "${VERIFIER_PANE:-}" || "$pane_id" == "${FINAL_VERIFIER_PANE:-}" ]] || return 1

  # Canonical path first.
  if [[ -n "${VERDICT_FILE:-}" && -f "$VERDICT_FILE" ]]; then
    jq -e . "$VERDICT_FILE" >/dev/null 2>&1 && return 0
  fi

  # v0.14.2 Fix-D: codex may have written to the legacy path. Try to
  # migrate; success means the canonical file is now in place.
  _migrate_legacy_verdict && return 0
  return 1
}
1575
+
1576
# v0.14.5 Bug Report #6 Fix-M (worker mirror of Fix-A/Fix-D).
#
# The Worker writes its commit plus the iter-signal.json verify signal and
# then the claude CLI parks at its idle prompt. check_no_progress would see
# byte-stasis on that pane and BLOCK even though the signal is on disk. This
# predicate lets the watcher defer to the signal harvest step instead.
#
# Args:
#   $1 - tmux pane id being polled
# Globals read:
#   WORKER_PANE, SIGNAL_FILE
# Returns:
#   0 - pane is the worker pane and SIGNAL_FILE is non-empty, parses as JSON,
#       has an all-digit .iteration, a non-empty .us_id, and .status of
#       "verify" or "verify_partial"
#   1 - otherwise
_worker_pane_has_signal() {
  local pane_id="$1"

  # Must be polling the (known) worker pane.
  if [[ -z "${WORKER_PANE:-}" || "$pane_id" != "${WORKER_PANE}" ]]; then
    return 1
  fi
  # Signal file must exist with content and be parseable JSON.
  if [[ -z "${SIGNAL_FILE:-}" || ! -s "$SIGNAL_FILE" ]]; then
    return 1
  fi
  if ! jq -e . "$SIGNAL_FILE" >/dev/null 2>&1; then
    return 1
  fi

  local signal_iter signal_us signal_status
  signal_iter=$(jq -r '.iteration // empty' "$SIGNAL_FILE" 2>/dev/null)
  signal_us=$(jq -r '.us_id // empty' "$SIGNAL_FILE" 2>/dev/null)
  signal_status=$(jq -r '.status // empty' "$SIGNAL_FILE" 2>/dev/null)

  [[ "$signal_iter" =~ ^[0-9]+$ && -n "$signal_us" ]] || return 1

  # Only verify-type signals count as "work finished, awaiting harvest".
  case "$signal_status" in
    verify|verify_partial) return 0 ;;
    *) return 1 ;;
  esac
}
1597
+
1598
# v5.7 §4.17 (codex Critic HIGH): generic no-progress timeout — independent
# of prompt detection. Closes the gap where an undetected prompt or an
# alive-but-frozen Worker can bypass Layer 4 and wait forever.
#
# Strategy: capture the pane each call and compare it with the previous
# capture. If it stays unchanged for PROGRESS_NO_CHANGE_TIMEOUT seconds,
# write BLOCKED.
#
# Args:
#   $1 - tmux pane id
#   $2 - user-story id for the sentinel (default: $CURRENT_US, else "ALL")
# Globals read:
#   PROGRESS_NO_CHANGE_TIMEOUT, CODEX_IDLE_GRACE_S, VERIFIER_PANE,
#   FINAL_VERIFIER_PANE, VERDICT_FILE, LEGACY_VERDICT_FILE, SIGNAL_FILE,
#   ITERATION
# Globals written (keyed by pane id):
#   PANE_LAST_CONTENT_FOR_PROGRESS, PANE_LAST_CHANGE_TS,
#   PANE_CODEX_IDLE_GRACED, PANE_VERIFIER_TRACE_LOGGED
# Returns:
#   0 - pane is progressing, was just baselined, is covered by a finished
#       verdict/signal, is inside the codex idle grace, or capture failed
#   1 - stasis threshold exceeded; BLOCKED sentinel has been written
check_no_progress() {
  local pane_id="$1"
  local us_id="${2:-${CURRENT_US:-ALL}}"
  local now
  now=$(_now_s)
  local capture
  # A failed capture is treated as "cannot judge" rather than as a freeze.
  capture=$(tmux capture-pane -t "$pane_id" -p -S -20 2>/dev/null) || return 0

  # v0.14.1 Fix-A / v0.14.2 Fix-D: codex verifier writes its verdict, then
  # sits at the "Worked for Xm Ys" idle UI. When a valid verdict is on disk
  # (canonical, or legacy that gets auto-migrated) defer to the harvest step
  # and keep the stasis clock fresh.
  if _verifier_pane_has_verdict "$pane_id"; then
    PANE_LAST_CONTENT_FOR_PROGRESS[$pane_id]="$capture"
    PANE_LAST_CHANGE_TS[$pane_id]=$now
    return 0
  fi

  # v0.14.5 Bug Report #6 Fix-M: worker mirror of the branch above — the
  # signal file is already valid, so defer to the signal harvest.
  if _worker_pane_has_signal "$pane_id"; then
    PANE_LAST_CONTENT_FOR_PROGRESS[$pane_id]="$capture"
    PANE_LAST_CHANGE_TS[$pane_id]=$now
    log_debug "[GOV] iter=${ITERATION:-0} worker_progress_check=signal_present pane=$pane_id signal=${SIGNAL_FILE}"
    return 0
  fi

  # v0.14.2: root-cause tracing for Bug Report #4. For a verifier pane with
  # no verdict yet, emit one trace line (guarded by
  # PANE_VERIFIER_TRACE_LOGGED) so a post-mortem can tell whether the verdict
  # was missing entirely. VERDICT_FILE is defaulted so an unset value cannot
  # trip `set -u` inside a diagnostic line.
  if [[ "$pane_id" == "${VERIFIER_PANE:-}" || "$pane_id" == "${FINAL_VERIFIER_PANE:-}" ]]; then
    if [[ -z "${PANE_VERIFIER_TRACE_LOGGED[$pane_id]:-}" ]]; then
      PANE_VERIFIER_TRACE_LOGGED[$pane_id]=1
      log_debug "[GOV] iter=${ITERATION:-0} verifier_progress_check=miss pane=$pane_id verdict_canonical=${VERDICT_FILE:-unset} verdict_canonical_exists=$([[ -f "${VERDICT_FILE:-/nonexistent}" ]] && echo true || echo false) verdict_legacy=${LEGACY_VERDICT_FILE:-unset} verdict_legacy_exists=$([[ -f "${LEGACY_VERDICT_FILE:-/nonexistent}" ]] && echo true || echo false)"
    fi
  fi

  # Any change in the capture counts as progress and restarts the clock.
  local last_content="${PANE_LAST_CONTENT_FOR_PROGRESS[$pane_id]:-}"
  if [[ "$capture" != "$last_content" ]]; then
    PANE_LAST_CONTENT_FOR_PROGRESS[$pane_id]="$capture"
    PANE_LAST_CHANGE_TS[$pane_id]=$now
    return 0
  fi

  # Unchanged content but no timestamp yet: start the clock now.
  local last_change=${PANE_LAST_CHANGE_TS[$pane_id]:-0}
  if (( last_change == 0 )); then
    PANE_LAST_CHANGE_TS[$pane_id]=$now
    return 0
  fi

  local frozen_for=$(( now - last_change ))
  if (( frozen_for < PROGRESS_NO_CHANGE_TIMEOUT )); then
    return 0
  fi

  # v0.14.1 Fix-B: even without a verdict file, codex sometimes parks at its
  # idle UI mid-run (e.g. partial-write window before the atomic mv). Grant a
  # one-time grace of CODEX_IDLE_GRACE_S seconds before escalating.
  if is_codex_idle_ui "$capture"; then
    local already_graced="${PANE_CODEX_IDLE_GRACED[$pane_id]:-0}"
    if (( already_graced == 0 )); then
      PANE_CODEX_IDLE_GRACED[$pane_id]=1
      # Rewind the clock so the threshold is crossed again after exactly
      # CODEX_IDLE_GRACE_S more seconds. Resetting it to `now` would silently
      # grant a whole extra PROGRESS_NO_CHANGE_TIMEOUT instead. Side effect:
      # the frozen time reported at escalation excludes the grace window.
      PANE_LAST_CHANGE_TS[$pane_id]=$(( now - PROGRESS_NO_CHANGE_TIMEOUT + CODEX_IDLE_GRACE_S ))
      log "Pane $pane_id at codex idle UI for ${frozen_for}s — granting +${CODEX_IDLE_GRACE_S}s grace before BLOCK escalation"
      log_debug "[GOV] iter=${ITERATION:-0} codex_idle_grace=true pane=$pane_id grace_s=${CODEX_IDLE_GRACE_S}"
      return 0
    fi
  fi

  log_error "Pane $pane_id has not changed for ${frozen_for}s — alive but frozen. Escalating to BLOCKED."
  log_debug "[GOV] iter=${ITERATION:-0} no_progress_escalated=true pane=$pane_id frozen_for=${frozen_for}s threshold=${PROGRESS_NO_CHANGE_TIMEOUT}s"
  write_blocked_sentinel \
    "Pane content has been unchanged for ${frozen_for}s (>= ${PROGRESS_NO_CHANGE_TIMEOUT}s threshold). Worker process may be alive but stuck on an undetected prompt, hung network call, or genuine deadlock. No documentation produced; manual inspection required." \
    "$us_id" \
    "infra_failure"
  return 1
}
1685
+
1686
# --- governance.md s7 step 5+6: Nudge idle panes ---
#
# Compares the last three lines of the pane with the previous observation.
# When they have been identical for more than IDLE_NUDGE_THRESHOLD seconds
# and the pane does not look busy, sends an empty nudge via safe_send_keys,
# at most MAX_NUDGES times.
#
# Args:
#   $1 - tmux pane id
#   $2 - NAME of the caller's nudge-counter variable (read and incremented
#        by indirection)
# Globals read:
#   IDLE_NUDGE_THRESHOLD, MAX_NUDGES
# Globals written (keyed by pane id):
#   LAST_PANE_CONTENT, PANE_IDLE_SINCE
# Returns:
#   0 always; outcomes are reported through logs and the counter variable.
check_and_nudge_idle_pane() {
  local pane_id="$1"
  local nudge_count_var="$2"

  # v5.7 §4.13.a: auto-dismiss permission prompts before idle check.
  # Otherwise Worker hangs at "Do you want to create..." until nudge timeout.
  auto_dismiss_prompts "$pane_id"

  local current_content
  current_content=$(tmux capture-pane -t "$pane_id" -p 2>/dev/null | tail -3)

  local now
  now=$(date +%s)

  # Content moved: record the new baseline and restart the idle clock.
  if [[ "$current_content" != "${LAST_PANE_CONTENT[$pane_id]:-}" ]]; then
    LAST_PANE_CONTENT[$pane_id]="$current_content"
    PANE_IDLE_SINCE[$pane_id]=$now
    return 0
  fi

  # Content equals the baseline but no idle start was ever recorded. This
  # happens when the first observed tail matches the unset baseline, e.g. the
  # bottom pane lines are blank. Start the clock now; defaulting to "now" on
  # every poll would mean such a pane is never nudged.
  local idle_since="${PANE_IDLE_SINCE[$pane_id]:-}"
  if [[ -z "$idle_since" ]]; then
    PANE_IDLE_SINCE[$pane_id]=$now
    return 0
  fi

  if (( now - idle_since <= IDLE_NUDGE_THRESHOLD )); then
    return 0
  fi

  # A12 fix: NEVER nudge if pane is busy (thinking/working) — nudge
  # interrupts claude. The capture goes to grep through a here-string because
  # zsh `echo` interprets backslash escapes in the text; ERE alternation is
  # used instead of the non-portable BRE `\|`.
  local recent_output
  recent_output=$(tmux capture-pane -t "$pane_id" -p -S -5 2>/dev/null)
  if grep -qiE 'esc to interrupt|thinking|working|kneading|crunching|clauding|billowing|brewing|tinkering|burrowing|saut|razzle|bunning|zesting|fermenting|actualizing|composing|evaporating|churning' <<<"$recent_output" 2>/dev/null; then
    log_debug " Pane $pane_id appears busy (thinking/working), skipping nudge"
    return 0
  fi

  local count=${(P)nudge_count_var}
  if (( count < MAX_NUDGES )); then
    log " Nudging idle pane $pane_id (nudge $((count + 1))/$MAX_NUDGES)"
    safe_send_keys "$pane_id" ""
    count=$(( count + 1 ))
    # Write back through the caller-supplied variable name. The name comes
    # from this script's own call sites, never from external input.
    eval "$nudge_count_var=$count"
  fi
  return 0
}
1723
+
1724
+ # =============================================================================
1725
+ # Exponential Backoff Restart (tmux pattern)
1726
+ # =============================================================================
1727
+
1728
# --- governance.md s7 step 5: Restart dead workers with backoff ---
#
# Interrupts whatever is running in the worker pane and relaunches the claude
# TUI, waiting 5s / 10s / 20s / 60s (cap) before attempts 1 / 2 / 3 / 4+.
#
# Args:
#   $1 - tmux pane id of the worker
#   $2 - iteration number (key into WORKER_RESTARTS)
#   $3 - trigger file path (accepted for call-site compatibility; not used)
# Globals read:
#   WORKER_ENGINE, MAX_RESTARTS, WORKER_MODEL, WORKER_EFFORT
# Globals written:
#   WORKER_RESTARTS[$iter] - incremented after a relaunch
# Returns:
#   0 - relaunch command was sent
#   1 - engine is codex (restart not applicable) or MAX_RESTARTS reached;
#       the caller is expected to write BLOCKED
restart_worker() {
  local pane_id="$1"
  local iter="$2"
  local trigger_file="$3"

  # Codex workers are 1-shot exec; restart is not applicable. Everything
  # below therefore only ever runs for the claude engine.
  if [[ "$WORKER_ENGINE" = "codex" ]]; then
    log_debug "restart_worker called for codex engine — no-op (1-shot exec)"
    return 1
  fi

  local restart_count="${WORKER_RESTARTS[$iter]:-0}"
  if (( restart_count >= MAX_RESTARTS )); then
    log_error "Worker exceeded max restarts ($MAX_RESTARTS) for iteration $iter"
    return 1 # caller writes BLOCKED
  fi

  # Exponential backoff: 5s, 10s, 20s, 60s (cap). A case table keeps the
  # mapping independent of the shell's array index base.
  local delay
  case "$restart_count" in
    0) delay=5 ;;
    1) delay=10 ;;
    2) delay=20 ;;
    *) delay=60 ;;
  esac
  log " Restarting worker (attempt $((restart_count + 1))/$MAX_RESTARTS) after ${delay}s backoff..."
  sleep "$delay"

  # Kill existing claude, then give the pane a moment to return to the shell.
  tmux send-keys -t "$pane_id" C-c 2>/dev/null
  tmux send-keys -t "$pane_id" "/exit" C-m 2>/dev/null
  sleep 2

  # Re-launch worker (tmux interactive pattern).
  safe_send_keys "$pane_id" "$(build_claude_cmd tui "$WORKER_MODEL" "" "" "$WORKER_EFFORT")"
  WORKER_RESTARTS[$iter]=$((restart_count + 1))
  return 0
}
1767
+
1768
+ # =============================================================================
1769
+ # Write-Then-Notify: Trigger Script Generation (tmux CRITICAL pattern)
1770
+ # =============================================================================
1771
+
1772
# Per-US PRD injection helper.
#
# Prints the Worker prompt base to stdout with every occurrence of the full
# PRD path replaced by the per-US split path. Falls back to printing the
# prompt base unchanged (with a stderr warning) when a split path was
# requested but the file is missing.
#
# The replacement is a literal string substitution done in awk. Both paths
# are handed over through the environment so that characters such as `.`,
# `&`, `|` or `\` in a path are never interpreted as regex or replacement
# syntax.
#
# Args:
#   $1 - prompt base file
#   $2 - full PRD path (text to search for)
#   $3 - per-US PRD path (empty = no substitution)
# Outputs:
#   stdout - resulting prompt text
#   stderr - warning when $3 is set but the file does not exist
inject_per_us_prd() {
  local prompt_base="$1"
  local full_prd="$2"
  local per_us_prd="${3:-}"

  if [[ -n "$per_us_prd" && -f "$per_us_prd" ]]; then
    RLP_PRD_FROM="$full_prd" RLP_PRD_TO="$per_us_prd" awk '
      BEGIN {
        from = ENVIRON["RLP_PRD_FROM"]
        to = ENVIRON["RLP_PRD_TO"]
        n = length(from)
      }
      {
        rest = $0
        out = ""
        # Literal, left-to-right, non-overlapping replacement on this line.
        while (n > 0 && (i = index(rest, from)) > 0) {
          out = out substr(rest, 1, i - 1) to
          rest = substr(rest, i + n)
        }
        print out rest
      }
    ' < "$prompt_base"
  else
    if [[ -n "$per_us_prd" ]]; then
      echo "WARNING: per-US split file not found: $per_us_prd — falling back to full PRD injection" >&2
    fi
    cat "$prompt_base"
  fi
}
1790
+
1791
# --- governance.md s7 step 4+5: Write prompt and trigger to files ---
# NEVER send prompt content through tmux send-keys.
# Write payloads to files, send only short trigger commands (<200 chars).
#
# Assembles the Worker prompt for one iteration (prompt base + iteration
# context + optional fix contract + mode-specific instructions) and a zsh
# trigger script that runs the engine while a background loop writes a
# heartbeat file every 15 seconds. Both files go through atomic_write.
#
# Args:
#   $1 - iteration number
# Globals read:
#   LOGS_DIR, MEMORY_FILE, VERIFY_MODE, US_LIST, VERIFIED_US, DESK, SLUG,
#   WORKER_PROMPT_BASE, SIGNAL_FILE, _PRD_CHANGED, AUTONOMOUS_MODE,
#   WORKER_ENGINE, CODEX_BIN, WORKER_CODEX_MODEL, WORKER_CODEX_REASONING,
#   WORKER_MODEL, WORKER_EFFORT, WORKER_HEARTBEAT
# Files written:
#   $LOGS_DIR/iter-NNN.worker-prompt.md
#   $LOGS_DIR/iter-NNN.worker-trigger.sh (made executable)
write_worker_trigger() {
  local iter="$1"
  local iter_tag
  iter_tag=$(printf '%03d' "$iter")
  local prompt_file="$LOGS_DIR/iter-${iter_tag}.worker-prompt.md"
  local trigger_file="$LOGS_DIR/iter-${iter_tag}.worker-trigger.sh"
  local output_log="$LOGS_DIR/iter-${iter_tag}.worker-output.log"

  # Build the worker prompt: base prompt + iteration context
  local contract
  contract=$(sed -n '/^## Next Iteration Contract$/,/^## /{ /^## Next/d; /^## [^N]/d; p; }' "$MEMORY_FILE" 2>/dev/null | head -5)

  # Check for fix contract from previous verifier failure
  local prev_iter=$((iter - 1))
  local fix_contract_file="$LOGS_DIR/iter-$(printf '%03d' "$prev_iter").fix-contract.md"

  # Compute next unverified US before prompt assembly (required for per-US
  # PRD injection). `us` is declared local so the loop variable does not leak
  # into the script's global scope, and membership is a literal substring
  # test so a US id is never interpreted as a regular expression.
  local next_us="" us
  if [[ "$VERIFY_MODE" = "per-us" && -n "$US_LIST" ]]; then
    for us in $(tr ',' ' ' <<<"$US_LIST"); do
      if [[ ",$VERIFIED_US," != *",$us,"* ]]; then
        next_us="$us"
        break
      fi
    done
  fi

  # Per-US PRD injection: substitute full PRD path with per-US split path
  # when available (inject_per_us_prd falls back to the full PRD otherwise).
  local per_us_prd=""
  if [[ -n "$next_us" ]]; then
    per_us_prd="$DESK/plans/prd-${SLUG}-${next_us}.md"
  fi
  # Only consulted in the per-US branch below, where next_us is non-empty.
  local per_us_test_spec="$DESK/plans/test-spec-${SLUG}-${next_us}.md"

  {
    inject_per_us_prd "$WORKER_PROMPT_BASE" "$DESK/plans/prd-${SLUG}.md" "$per_us_prd"
    echo ""
    echo "---"
    echo "## Iteration Context"
    echo "- **Iteration**: $iter"
    echo "- **Memory Stop Status**: $(sed -n '/^## Stop Status$/,/^$/{ /^## /d; /^$/d; p; }' "$MEMORY_FILE" 2>/dev/null | head -1)"
    echo "- **Next Iteration Contract**: ${contract:-Start from the beginning}"
    if (( _PRD_CHANGED )); then
      echo "NOTE: PRD was updated since last iteration. New/changed US may exist."
    fi

    # Include fix contract if previous verifier failed
    if [[ -f "$fix_contract_file" ]]; then
      echo ""
      echo "---"
      echo "## IMPORTANT: Fix Contract from Verifier (iteration $prev_iter)"
      echo "The Verifier REJECTED your previous work. You MUST fix the issues below."
      echo "Do NOT just resubmit — actually change the code to address each issue."
      echo ""
      cat "$fix_contract_file"
    fi

    # Per-US mode: tell Worker exactly which US to work on
    if [[ "$VERIFY_MODE" = "per-us" && -n "$US_LIST" ]]; then
      if [[ -n "$next_us" ]]; then
        echo ""
        echo "---"
        echo "## PER-US SCOPE LOCK (this iteration) — OVERRIDES memory contract"
        echo "**IGNORE the 'Next Iteration Contract' from memory if it references a different story.**"
        echo "The Leader has determined that **${next_us}** is the next unverified story."
        echo "You MUST implement ONLY **${next_us}** in this iteration."
        echo "Do NOT implement any other user stories."
        # Per-US test-spec injection: point Worker to scoped test-spec if available
        if [[ -f "$per_us_test_spec" ]]; then
          echo "- **Test Spec**: Read ONLY \`$per_us_test_spec\` (scoped to ${next_us})"
        else
          echo "- **Test Spec**: Read \`$DESK/plans/test-spec-${SLUG}.md\` (full — find ${next_us} section)"
        fi
        echo "When done, you MUST WRITE (not just print) the verify signal to the iter-signal FILE at: ${SIGNAL_FILE}"
        echo "Write this exact JSON to that file (us_id=\"${next_us}\", not \"ALL\"): {\"iteration\": N, \"status\": \"verify\", \"us_id\": \"${next_us}\", \"summary\": \"what was done\", \"timestamp\": \"ISO\"}"
        echo ""
        echo "**Update the campaign memory's 'Next Iteration Contract' to reflect ${next_us}.**"
      elif [[ -n "$VERIFIED_US" ]]; then
        # All individual US verified — this is the final full verify iteration
        echo ""
        echo "---"
        echo "## FINAL VERIFICATION ITERATION"
        echo "All individual US have been verified: $VERIFIED_US"
        echo "Run all tests and verification commands to confirm everything works together."
        echo "Signal verify with us_id=\"ALL\" for the final full verification."
      fi
    elif [[ "$VERIFY_MODE" = "batch" ]]; then
      echo ""
      echo "---"
      if [[ -n "$VERIFIED_US" ]]; then
        echo "## BATCH MODE — CONTINUE FROM PARTIAL PROGRESS"
        echo "The following US have already been verified: **$VERIFIED_US**"
        echo "- Do NOT re-implement these — they are done."
        echo "- Focus ONLY on the remaining unverified user stories."
        echo '- Signal verify with us_id="ALL" when the remaining stories are complete.'
      else
        echo "## BATCH MODE OVERRIDE"
        echo "Ignore any per-US signal instructions above. In batch mode:"
        echo "- Implement ALL user stories in this iteration"
        echo '- Signal verify with us_id="ALL" only when ALL stories are complete'
        echo "- Do NOT signal verify after individual stories"
      fi
    fi

    # Autonomous mode: don't stop on ambiguity, PRD is authoritative
    if (( AUTONOMOUS_MODE )); then
      echo ""
      echo "---"
      echo "## AUTONOMOUS MODE"
      echo "Do NOT stop or ask questions when encountering ambiguity or document conflicts."
      echo "**Resolution priority**: PRD > test-spec > context > memory"
      echo "If documents disagree, follow PRD and proceed. Log any conflict you find by"
      echo "appending to \`$LOGS_DIR/conflict-log.jsonl\` in format:"
      echo ' {"iteration":N,"us_id":"US-NNN","source_a":"prd","source_b":"test-spec","conflict":"description","resolution":"followed PRD"}'
      echo "Do NOT wait for human input. Keep working."
    fi
  } | atomic_write "$prompt_file"

  # Write trigger script (DO NOT use exec -- breaks heartbeat cleanup)
  # Engine-specific launch command (expanded at write time).
  # NOTE(review): the codex command embeds $prompt_file unquoted inside the
  # generated `$(cat ...)`; a LOGS_DIR containing whitespace would break the
  # generated script. Left as-is so the generated text stays unchanged.
  local engine_cmd engine_comment
  if [[ "$WORKER_ENGINE" = "codex" ]]; then
    engine_cmd="${CODEX_BIN:-codex} \\
-m $WORKER_CODEX_MODEL \\
-c model_reasoning_effort=\"$WORKER_CODEX_REASONING\" \\
--disable plugins --dangerously-bypass-approvals-and-sandbox \\
\"\$(cat $prompt_file)\""
    engine_comment="# Run codex with fresh context (fallback trigger — TUI primary launch via launch_worker_codex)"
  else
    engine_cmd=$(build_claude_cmd print "$WORKER_MODEL" "$prompt_file" "$output_log" "$WORKER_EFFORT")
    engine_comment="# Run claude with fresh context, no MCP/skills (governance.md s7 step 5)"
  fi

  # Unquoted heredoc delimiter: $iter, $WORKER_HEARTBEAT, $engine_comment and
  # $engine_cmd expand now; every \$ survives into the generated script.
  {
    cat <<TRIGGER_EOF
#!/bin/zsh
# Trigger for iteration $iter worker - generated by run_ralph_desk.zsh
# DO NOT use exec here -- it breaks heartbeat cleanup

HEARTBEAT_FILE="$WORKER_HEARTBEAT"

# Background heartbeat writer (tmux pattern)
(
while true; do
echo '{"epoch":'\$(date +%s)',"pid":'"\$\$"'}' > "\${HEARTBEAT_FILE}.tmp.\$\$"
mv "\${HEARTBEAT_FILE}.tmp.\$\$" "\$HEARTBEAT_FILE"
sleep 15
done
) &
HEARTBEAT_PID=\$!

$engine_comment
$engine_cmd

# Cleanup heartbeat writer
kill \$HEARTBEAT_PID 2>/dev/null
wait \$HEARTBEAT_PID 2>/dev/null
echo '{"epoch":'\$(date +%s)',"status":"exited"}' > "\${HEARTBEAT_FILE}.tmp.\$\$"
mv "\${HEARTBEAT_FILE}.tmp.\$\$" "\$HEARTBEAT_FILE"
TRIGGER_EOF
  } | atomic_write "$trigger_file"
  chmod +x "$trigger_file"

  log " Worker prompt: $prompt_file"
  log " Worker trigger: $trigger_file"
}
1955
+
1956
+ write_verifier_trigger() {
1957
+ local iter="$1"
1958
+ local verifier_engine="${2:-$VERIFIER_ENGINE}" # allow override for consensus
1959
+ local verifier_model="${3:-$VERIFIER_MODEL}"
1960
+ local suffix="${4:-}" # optional suffix for consensus (e.g., "-claude", "-codex")
1961
+ local prompt_file="$LOGS_DIR/iter-$(printf '%03d' $iter).verifier${suffix}-prompt.md"
1962
+ local trigger_file="$LOGS_DIR/iter-$(printf '%03d' $iter).verifier${suffix}-trigger.sh"
1963
+ local output_log="$LOGS_DIR/iter-$(printf '%03d' $iter).verifier${suffix}-output.log"
1964
+
1965
+ # Read us_id from iter-signal.json for per-US scoping
1966
+ local us_id=""
1967
+ if [[ -f "$SIGNAL_FILE" ]]; then
1968
+ us_id=$(jq -r '.us_id // empty' "$SIGNAL_FILE" 2>/dev/null)
1969
+ fi
1970
+
1971
+ # Build verifier prompt from base with US scope
1972
+ {
1973
+ cat "$VERIFIER_PROMPT_BASE"
1974
+ echo ""
1975
+ echo "---"
1976
+ echo "## Verification Context"
1977
+ echo "- **Iteration**: $iter"
1978
+ echo "- **Done Claim**: $DONE_CLAIM_FILE"
1979
+ echo "- **Verify Mode**: $VERIFY_MODE"
1980
+ if [[ -n "$us_id" ]]; then
1981
+ if [[ "$us_id" = "ALL" ]]; then
1982
+ echo "- **Scope**: FULL VERIFY — check ALL acceptance criteria from the PRD"
1983
+ else
1984
+ echo "- **Scope**: Verify ONLY the acceptance criteria for **${us_id}**"
1985
+ fi
1986
+ if [[ -n "$VERIFIED_US" ]]; then
1987
+ echo "- **Previously verified US**: $VERIFIED_US"
1988
+ echo "- **Note**: Skip re-verifying the above US. Focus on unverified stories."
1989
+ fi
1990
+ fi
1991
+
1992
+ # Autonomous mode: don't stop on ambiguity, PRD is authoritative
1993
+ if (( AUTONOMOUS_MODE )); then
1994
+ echo ""
1995
+ echo "---"
1996
+ echo "## AUTONOMOUS MODE"
1997
+ echo "Do NOT stop or ask questions when encountering ambiguity or document conflicts."
1998
+ echo "**Resolution priority**: PRD > test-spec > context > memory"
1999
+ echo "If documents disagree, follow PRD and proceed. Log any conflict by"
2000
+ echo "appending to \`$LOGS_DIR/conflict-log.jsonl\` in format:"
2001
+ echo ' {"iteration":N,"us_id":"US-NNN","source_a":"prd","source_b":"test-spec","conflict":"description","resolution":"followed PRD"}'
2002
+ echo "Do NOT wait for human input. Keep verifying."
2003
+ fi
2004
+ } | atomic_write "$prompt_file"
2005
+
2006
+ # Write trigger script (DO NOT use exec -- breaks heartbeat cleanup)
2007
+ # Engine-specific launch command (expanded at write time)
2008
+ if [[ "$verifier_engine" = "codex" ]]; then
2009
+ local engine_cmd="${CODEX_BIN:-codex} -m $VERIFIER_CODEX_MODEL \\
2010
+ -c model_reasoning_effort=\"$VERIFIER_CODEX_REASONING\" \\
2011
+ --disable plugins --dangerously-bypass-approvals-and-sandbox \\
2012
+ \"\$(cat $prompt_file)\" \\
2013
+ > >(tee $output_log) 2>&1"
2014
+ local engine_comment="# Run codex with fresh context (governance.md s7 step 7) — process substitution preserves tty"
2015
+ else
2016
+ local engine_cmd
2017
+ engine_cmd=$(build_claude_cmd print "$verifier_model" "$prompt_file" "$output_log" "$VERIFIER_EFFORT")
2018
+ local engine_comment="# Run claude with fresh context, no MCP/skills (governance.md s7 step 7)"
2019
+ fi
2020
+
2021
+ {
2022
+ cat <<TRIGGER_EOF
2023
+ #!/bin/zsh
2024
+ # Trigger for iteration $iter verifier${suffix} - generated by run_ralph_desk.zsh
2025
+ # DO NOT use exec here -- it breaks heartbeat cleanup
2026
+
2027
+ HEARTBEAT_FILE="$VERIFIER_HEARTBEAT"
2028
+
2029
+ # Background heartbeat writer (tmux pattern)
2030
+ (
2031
+ while true; do
2032
+ echo '{"epoch":'\$(date +%s)',"pid":'"\$\$"'}' > "\${HEARTBEAT_FILE}.tmp.\$\$"
2033
+ mv "\${HEARTBEAT_FILE}.tmp.\$\$" "\$HEARTBEAT_FILE"
2034
+ sleep 15
2035
+ done
2036
+ ) &
2037
+ HEARTBEAT_PID=\$!
2038
+
2039
+ $engine_comment
2040
+ $engine_cmd
2041
+
2042
+ # Cleanup heartbeat writer
2043
+ kill \$HEARTBEAT_PID 2>/dev/null
2044
+ wait \$HEARTBEAT_PID 2>/dev/null
2045
+ echo '{"epoch":'\$(date +%s)',"status":"exited"}' > "\${HEARTBEAT_FILE}.tmp.\$\$"
2046
+ mv "\${HEARTBEAT_FILE}.tmp.\$\$" "\$HEARTBEAT_FILE"
2047
+ TRIGGER_EOF
2048
+ } | atomic_write "$trigger_file"
2049
+ chmod +x "$trigger_file"
2050
+
2051
+ log " Verifier prompt: $prompt_file"
2052
+ log " Verifier trigger: $trigger_file"
2053
+ }
2054
+
2055
+ # =============================================================================
2056
+ # Cleanup (trap handler)
2057
+ # =============================================================================
2058
+
2059
# _cleanup_count_matches PATTERN [FILE]
# Print, as exactly one integer, how many lines match PATTERN in FILE (stdin
# when FILE is omitted). `grep -c` already prints "0" when nothing matches but
# also exits 1, so the `$(grep -c ... || echo 0)` idiom produced the two-line
# value "0\n0" — which corrupted the [FLOW] log lines and broke the arithmetic
# comparison in cleanup(). When grep prints nothing at all, fall back to 0.
_cleanup_count_matches() {
  local pattern="$1" count
  if (( $# > 1 )); then
    count=$(grep -c -- "$pattern" "$2" 2>/dev/null)
  else
    count=$(grep -c -- "$pattern" 2>/dev/null)
  fi
  echo "${count:-0}"
}

# cleanup — tear the campaign down and report on it (trap handler).
#
# Steps, in order:
#   1. Remove the loop lockfile (only when LOCKFILE_ACQUIRED is non-zero) and
#      the project-scoped runner lock (only when its recorded slug == $SLUG).
#   2. Send C-c and "/exit" to the Worker/Verifier panes, then kill them.
#   3. Delete leftover *.tmp.* files under $LOGS_DIR and $MEMOS_DIR.
#   4. Call generate_campaign_report and generate_sv_report.
#   5. Write campaign_status / end_time into $METADATA_FILE when it exists.
#   6. When DEBUG is non-zero, emit the [FLOW] execution-validation lines.
#   7. Print the session summary to stdout.
#
# Arguments: none.
# Final status is derived from the sentinels: COMPLETE if $COMPLETE_SENTINEL
# exists, else BLOCKED if $BLOCKED_SENTINEL exists, else TIMEOUT.
cleanup() {
  log "Cleaning up..."

  # Remove lockfile
  if (( LOCKFILE_ACQUIRED )); then
    rm -f "$LOCKFILE_PATH" 2>/dev/null
  else
    log_debug "cleanup: lockfile not owned by this process, skipping removal"
  fi

  # US-026 R14 P0: remove project-scoped runner lockfile if owned by this slug
  if [[ -f "$RUNNER_LOCKFILE_PATH" ]]; then
    local own_slug
    own_slug=$(jq -r '.slug' "$RUNNER_LOCKFILE_PATH" 2>/dev/null)
    if [[ "$own_slug" == "$SLUG" ]]; then
      rm -rf "$RUNNER_LOCKDIR" "$RUNNER_LOCKFILE_PATH" 2>/dev/null
    fi
  fi

  # Kill claude processes then kill panes
  log_debug "cleanup: WORKER_PANE=${WORKER_PANE:-unset} VERIFIER_PANE=${VERIFIER_PANE:-unset}"
  if [[ -n "${WORKER_PANE:-}" ]]; then
    tmux send-keys -t "$WORKER_PANE" C-c 2>/dev/null
    tmux send-keys -t "$WORKER_PANE" "/exit" C-m 2>/dev/null
  fi
  if [[ -n "${VERIFIER_PANE:-}" ]]; then
    tmux send-keys -t "$VERIFIER_PANE" C-c 2>/dev/null
    tmux send-keys -t "$VERIFIER_PANE" "/exit" C-m 2>/dev/null
  fi
  sleep 2
  # Kill panes on completion
  if [[ -n "${WORKER_PANE:-}" ]]; then
    tmux kill-pane -t "$WORKER_PANE" 2>/dev/null
  fi
  if [[ -n "${VERIFIER_PANE:-}" ]]; then
    tmux kill-pane -t "$VERIFIER_PANE" 2>/dev/null
  fi
  log " Panes cleaned up."

  # Remove any leftover tmp files (setopt nonomatch to avoid zsh glob errors)
  setopt local_options nonomatch 2>/dev/null
  rm -f "$LOGS_DIR"/*.tmp.* "$MEMOS_DIR"/*.tmp.* 2>/dev/null

  # AC4: Generate campaign report on all terminal states (always-on)
  generate_campaign_report

  # US-001: Generate SV report after campaign report (tmux mode)
  generate_sv_report

  # Print summary
  local end_time
  end_time=$(date +%s)
  local elapsed=$(( end_time - START_TIME ))
  local minutes=$(( elapsed / 60 ))
  local seconds=$(( elapsed % 60 ))

  local final_status="UNKNOWN"
  if [[ -f "$COMPLETE_SENTINEL" ]]; then final_status="COMPLETE"
  elif [[ -f "$BLOCKED_SENTINEL" ]]; then final_status="BLOCKED"
  else final_status="TIMEOUT"; fi

  # --- Update metadata.json with final status ---
  if [[ -f "$METADATA_FILE" ]]; then
    # Write to a sibling tmp file and rename; drop the tmp file if jq fails so
    # a broken metadata.json does not leave residue behind.
    if jq --arg status "$final_status" --arg end_time "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
         '.campaign_status = $status | .end_time = $end_time' \
         "$METADATA_FILE" > "${METADATA_FILE}.tmp"; then
      mv "${METADATA_FILE}.tmp" "$METADATA_FILE"
    else
      rm -f "${METADATA_FILE}.tmp" 2>/dev/null
    fi
  fi

  if (( DEBUG )); then
    # `elapsed` computed for the summary above is reused here.
    log_debug "[FLOW] final status=$final_status iterations=$ITERATION elapsed=${elapsed}s"

    # --- Validation ---
    log_debug "[FLOW] === Execution Validation ==="

    # 1. Did the correct verify mode run?
    log_debug "[FLOW] verify_mode=$VERIFY_MODE configured=true"

    # 2. Per-US: were all US individually verified?
    if [[ "$VERIFY_MODE" = "per-us" ]]; then
      local prd_file="$DESK/plans/prd-$SLUG.md"
      local expected_us=""
      if [[ -f "$prd_file" ]]; then
        expected_us=$(grep -oE 'US-[0-9]+' "$prd_file" | sort -u | tr '\n' ',' | sed 's/,$//')
      fi
      local verified_count expected_count
      verified_count=$(echo "$VERIFIED_US" | tr ',' '\n' | _cleanup_count_matches 'US-')
      expected_count=$(echo "$expected_us" | tr ',' '\n' | _cleanup_count_matches 'US-')

      if [[ "$final_status" = "COMPLETE" ]]; then
        if (( verified_count >= expected_count )); then
          log_debug "[FLOW] per_us_coverage=PASS verified=$verified_count/$expected_count us=$VERIFIED_US"
        else
          log_debug "[FLOW] per_us_coverage=FAIL verified=$verified_count/$expected_count expected=$expected_us got=$VERIFIED_US"
        fi
      else
        log_debug "[FLOW] per_us_coverage=INCOMPLETE verified=$verified_count/$expected_count status=$final_status"
      fi
    fi

    # 3. Consensus: were both engines used?
    if [[ "$CONSENSUS_MODE" != "off" ]]; then
      if [[ -n "${CLAUDE_VERDICT:-}" && -n "${CODEX_VERDICT:-}" ]]; then
        log_debug "[FLOW] consensus=USED mode=$CONSENSUS_MODE claude=$CLAUDE_VERDICT codex=$CODEX_VERDICT rounds=$CONSENSUS_ROUND"
      else
        log_debug "[FLOW] consensus=NOT_TRIGGERED mode=$CONSENSUS_MODE claude=${CLAUDE_VERDICT:-none} codex=${CODEX_VERDICT:-none}"
      fi
    fi

    # 4. Engine match: did the configured engines actually run?
    local worker_dispatches verifier_dispatches
    worker_dispatches=$(_cleanup_count_matches '\[FLOW\].*phase=worker.*dispatched=true' "$DEBUG_LOG")
    verifier_dispatches=$(_cleanup_count_matches '\[FLOW\].*phase=verifier.*dispatched=true' "$DEBUG_LOG")
    log_debug "[FLOW] dispatches worker=$worker_dispatches verifier=$verifier_dispatches"

    # 5. Fix loops: how many fix contracts were generated?
    local fix_count
    fix_count=$(_cleanup_count_matches '\[DECIDE\].*phase=fix_loop' "$DEBUG_LOG")
    log_debug "[FLOW] fix_loops=$fix_count consecutive_failures=$CONSECUTIVE_FAILURES"

    # 6. Circuit breakers: any triggered?
    local cb_count
    cb_count=$(_cleanup_count_matches '\[GOV\].*circuit_breaker=' "$DEBUG_LOG")
    log_debug "[FLOW] circuit_breakers_triggered=$cb_count"

    # 7. Overall result
    log_debug "[FLOW] result=$final_status iterations=$ITERATION elapsed=${elapsed}s verified_us=$VERIFIED_US"
  fi

  echo ""
  echo "============================================================"
  echo " Ralph Desk Tmux Runner - Session Complete"
  echo "============================================================"
  echo " Session: $SESSION_NAME"
  echo " Slug: $SLUG"
  echo " Iterations: $ITERATION / $MAX_ITER"
  echo " Elapsed: ${minutes}m ${seconds}s"
  echo ""

  if [[ -f "$COMPLETE_SENTINEL" ]]; then
    echo " Final State: COMPLETE"
  elif [[ -f "$BLOCKED_SENTINEL" ]]; then
    echo " Final State: BLOCKED"
  else
    echo " Final State: STOPPED (interrupted or timeout)"
  fi

  echo ""
  echo " Tmux session left alive for inspection:"
  echo " tmux attach -t $SESSION_NAME"
  echo " tmux kill-session -t $SESSION_NAME"
  echo "============================================================"
}
2210
+
2211
+ # =============================================================================
2212
+ # Poll Loop (used for both Worker and Verifier)
2213
+ # =============================================================================
2214
+
2215
# --- governance.md s7 step 5+6: Poll for signal file with heartbeat monitoring ---
#
# Arguments:
#   $1 signal_file    - file whose appearance (holding valid JSON) ends the poll
#   $2 heartbeat_file - heartbeat JSON for the process being watched
#   $3 pane_id        - tmux pane running the role
#   $4 trigger_file   - forwarded to handle_worker_exit_claude / restart_worker
#   $5 role           - "worker" or "verifier"; callers in this file also pass
#                       "Verifier-..." labels, and only the substring
#                       "erifier" is tested
# Returns:
#   0  signal accepted (or synthesized by the A4 fallback / codex exit handler)
#   1  timeout, restarts exhausted, heartbeat circuit breaker, or dead pane
#   2  hard failure that the caller must not retry (API unavailable, A4 gate
#      refused, codex exit handler failed, prompt stall, no progress)
poll_for_signal() {
  local signal_file="$1"
  local heartbeat_file="$2"
  local pane_id="$3"
  local trigger_file="$4"
  local role="$5"  # "worker" or "verifier"
  local nudge_count=0
  local api_retry_count=0
  local poll_start
  poll_start=$(date +%s)

  # Initialize idle tracking for this pane
  LAST_PANE_CONTENT[$pane_id]=""
  PANE_IDLE_SINCE[$pane_id]=$(date +%s)

  while true; do
    local now
    now=$(date +%s)
    local elapsed=$(( now - poll_start ))

    # Per-iteration timeout check
    if (( elapsed >= ITER_TIMEOUT )); then
      log_error "$role timed out after ${ITER_TIMEOUT}s for iteration $ITERATION"
      return 1  # timeout
    fi

    # Check if signal file appeared
    if [[ -f "$signal_file" ]]; then
      # Bug #7-extra (BOS 2026-05-06): file existence is NOT enough. Worker
      # (claude opus) writes via Claude Code's Write tool, which is not
      # guaranteed atomic — the file can appear with empty / partial JSON
      # before the write completes. Verifier was being dispatched against a
      # half-written iter-signal.json. Validate that the file holds a single
      # parseable, non-null JSON value (`jq -e .`) before accepting; any
      # failure simply continues polling (next tick re-reads). Note: `jq
      # empty` was rejected because it accepts an EMPTY file as "zero
      # documents" — the exact race window we need to reject.
      if jq -e . "$signal_file" >/dev/null 2>&1; then
        log " Signal file detected: $signal_file"
        return 0  # success
      fi
      # Empty / truncated / mid-write JSON. Stay in the polling loop and let
      # the next tick re-read once the writer has finished.
      log_debug "[bug7-extra] $role signal file present but JSON not yet valid — continue polling"
    fi

    # A4 fallback: done-claim exists but no signal → Worker forgot iter-signal
    # ONLY for Worker polling — Verifier waits for verdict file, not done-claim
    #
    # v5.7 §4.14 (Bug 5 fix, CRITICAL): if Worker pane shows a pending TUI
    # permission prompt (`Do you want to ...` with `(y/n)` / `❯ 1.` affordance),
    # Worker is NOT done — it's stuck mid-write after the first done-claim pass.
    # Suspending A4 fallback in this case prevents premature Verifier dispatch
    # against partial Worker output. auto_dismiss_prompts() will already have
    # tried to clear the prompt; if it's still visible the worker is in a
    # multi-prompt sequence and needs more time, not an A4 short-circuit.
    if [[ "$role" != *erifier* && -f "$DONE_CLAIM_FILE" && ! -f "$signal_file" ]]; then
      local _a4_capture
      _a4_capture=$(tmux capture-pane -t "$pane_id" -p -S -50 2>/dev/null || true)
      local -a _a4_lines
      _a4_lines=("${(@f)_a4_capture}")
      local _a4_i _a4_n=${#_a4_lines[@]} _a4_blocked=0
      # A prompt line only counts when an affordance appears on the line
      # before it, on it, or after it.
      for ((_a4_i=1; _a4_i <= _a4_n; _a4_i++)); do
        if [[ "${_a4_lines[_a4_i]}" =~ $_PROMPT_RE ]]; then
          local _a4_prev="${_a4_lines[_a4_i-1]:-}"
          local _a4_cur="${_a4_lines[_a4_i]}"
          local _a4_next="${_a4_lines[_a4_i+1]:-}"
          if [[ "$_a4_prev" =~ $_AFFORDANCE_RE || "$_a4_cur" =~ $_AFFORDANCE_RE || "$_a4_next" =~ $_AFFORDANCE_RE ]]; then
            _a4_blocked=1
            break
          fi
        fi
      done
      if (( _a4_blocked )); then
        log " Worker pane has pending permission prompt — A4 fallback suspended (Bug 5 guard)"
        log_debug "[GOV] iter=$ITERATION a4_fallback_suspended=true reason=worker_prompt_pending pane=$pane_id"
        # Continue polling; do NOT auto-generate signal. auto_dismiss_prompts will
        # try to dismiss on the next loop iteration.
      else
        local dc_us_id
        dc_us_id=$(jq -r '.us_id // "unknown"' "$DONE_CLAIM_FILE" 2>/dev/null)
        if [[ -n "$dc_us_id" && "$dc_us_id" != "null" ]]; then
          # Bug #8 PR-B: defer to shared 4-way gate (codex critic P1.2).
          # _bug8_check_synth_allowed handles done-claim/git/dirty-tree gates
          # uniformly across handle_worker_exit_codex AND this inline path so
          # both codex-exit and inline-polling A4 enforce the same contract.
          if _bug8_check_synth_allowed "$ITERATION" "$dc_us_id" "inline_polling_a4_clean"; then
            log " WARNING: done-claim exists for $dc_us_id but no iter-signal. Tree clean — auto-generating signal (A4 fallback)."
            log_debug "[GOV] iter=$ITERATION done_claim_without_signal=true us_id=$dc_us_id action=auto_generate_signal"
            # v0.15.4 PR-B2-FIX: Worker pane is alive and idling post-done-claim
            # (the canonical Bug #5/7 race window). Reap before synthesizing the
            # signal so the worker cannot revise done-claim or emit a late
            # iter-signal that races the leader's synthesized one. Mirror of
            # Bug #7 Fix-Q parity at run_ralph_desk.zsh:3181 — kill before lock,
            # lock before synth-write so the next leader read sees a frozen
            # done-claim and a fresh signal_file in that order.
            _kill_pane_process "$pane_id" "worker-a4"
            _lock_sentinel "$DONE_CLAIM_FILE"
            echo '{"iteration":'"$ITERATION"',"status":"verify","us_id":"'"$dc_us_id"'","summary":"auto-generated by A4 fallback (done-claim + clean tree)","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'"}' | atomic_write "$signal_file"
            _emit_a4_fallback_audit "$dc_us_id" "$ITERATION" "inline_polling_a4_clean"
            return 0
          else
            # Bug #8 PR-B (codex critic round-2 P2): hard-stop rc=2 so the
            # main worker loop (L3119) treats this BLOCKED as terminal,
            # matching the handle_worker_exit_codex blocked path. rc=1 is
            # ambiguous — caller may interpret it as a recoverable poll
            # failure and re-loop while the BLOCKED sentinel is on disk.
            return 2
          fi
        fi
      fi
    fi

    # API transient-error recovery with bounded backoff
    local pane_output_for_retry
    pane_output_for_retry=$(tmux capture-pane -t "$pane_id" -p 2>/dev/null || true)
    local is_api_text_retry=0
    if [[ -n "$pane_output_for_retry" ]] &&
       ( echo "$pane_output_for_retry" | grep -qiE '(^|[^[:digit:]])500([^[:digit:]]|$)' \
         || echo "$pane_output_for_retry" | grep -qiE '(^|[^[:digit:]])529([^[:digit:]]|$)' \
         || echo "$pane_output_for_retry" | grep -qi 'overloaded' \
         || echo "$pane_output_for_retry" | grep -qi 'too many requests' \
         || echo "$pane_output_for_retry" | grep -qi 'service unavailable' ); then
      is_api_text_retry=1
    fi

    if (( is_api_text_retry )) || is_api_error "$pane_id"; then
      (( api_retry_count++ ))
      log_debug "[FLOW] iter=$ITERATION api_retry=${api_retry_count}/${_API_MAX_RETRIES} role=${role} reason=tmux_pane_api_error"
      if (( api_retry_count >= _API_MAX_RETRIES )); then
        log_error "API unavailable after ${_API_MAX_RETRIES} retries"
        write_blocked_sentinel "API unavailable after ${_API_MAX_RETRIES} retries" "" "infra_failure"
        return 2
      fi
      # A5: If pane shows "queued messages" or rate-limit corruption, restart pane
      if echo "$pane_output_for_retry" | grep -qi 'queued messages'; then
        log " A5: Rate-limited pane shows 'queued messages' — restarting $role pane"
        log_debug "[GOV] iter=$ITERATION phase=rate_limit_pane_restart role=$role reason=queued_messages"
        tmux send-keys -t "$pane_id" C-c 2>/dev/null; sleep 0.5
        tmux send-keys -t "$pane_id" "/exit" C-m 2>/dev/null; sleep 2
        wait_for_pane_ready "$pane_id" 10 2>/dev/null || true
      fi
      sleep "$_API_RETRY_INTERVAL_S"
      continue
    else
      api_retry_count=0
    fi

    # Check heartbeat freshness (tmux pattern)
    if [[ -f "$heartbeat_file" ]]; then
      if check_heartbeat_exited "$heartbeat_file"; then
        # Process exited but no signal file -- give a brief grace period
        sleep 3
        # Apply the same `jq -e .` validation as the top-of-loop check: a
        # process that died mid-write can leave an empty or truncated file,
        # and accepting it on existence alone would bypass the Bug #7-extra
        # guard exactly when the file can no longer become valid.
        if [[ -f "$signal_file" ]] && jq -e . "$signal_file" >/dev/null 2>&1; then
          log " Signal file detected after process exit: $signal_file"
          return 0
        fi
        if [[ -f "$signal_file" ]]; then
          log_debug "[bug7-extra] $role signal file present after process exit but JSON invalid — treating as missing"
        fi
        # Dispatch to engine-specific exit handler
        if [[ "$WORKER_ENGINE" = "codex" && "$role" != *erifier* ]]; then
          # Bug #8 PR-B: handle_worker_exit_codex now returns 1 when it has
          # written a BLOCKED sentinel (no done-claim, dirty tree, git
          # unverifiable). Propagate the return so main loop stops, instead
          # of swallowing it with `return 0` and continuing as if the poll
          # had succeeded.
          if handle_worker_exit_codex "$ITERATION" "$signal_file"; then
            return 0
          else
            return 2
          fi
        fi
        # Claude path (or verifier of any engine)
        if handle_worker_exit_claude "$pane_id" "$ITERATION" "$trigger_file"; then
          # Reset poll timer for the restart
          poll_start=$(date +%s)
          nudge_count=0
          LAST_PANE_CONTENT[$pane_id]=""
          PANE_IDLE_SINCE[$pane_id]=$(date +%s)
          sleep "$POLL_INTERVAL"
          continue
        else
          return 1  # max restarts exceeded
        fi
      fi

      if ! check_heartbeat "$heartbeat_file"; then
        log " WARNING: $role heartbeat stale (>${HEARTBEAT_STALE_THRESHOLD}s)"
        (( HEARTBEAT_STALE_COUNT++ ))
        # Circuit breaker: 3 consecutive heartbeat stale events
        if (( HEARTBEAT_STALE_COUNT >= 3 )); then
          log_debug "[GOV] iter=$ITERATION circuit_breaker=heartbeat_stale detail=\"3 consecutive heartbeat stale events\""
          log_error "Circuit breaker: 3 consecutive heartbeat stale events"
          return 1
        fi
        # Attempt restart
        if restart_worker "$pane_id" "$ITERATION" "$trigger_file"; then
          poll_start=$(date +%s)
          nudge_count=0
          continue
        else
          return 1
        fi
      else
        # Heartbeat is fresh, reset stale counter
        HEARTBEAT_STALE_COUNT=0
      fi
    fi

    # Dead pane detection during poll: check if claude/codex process died
    local poll_cmd
    poll_cmd=$(tmux display-message -p -t "$pane_id" '#{pane_current_command}' 2>/dev/null)
    # Dead pane detection — delegates to check_dead_pane() for engine-aware logic
    if check_dead_pane "$poll_cmd" "$WORKER_ENGINE" "$role"; then
      log " WARNING: $role pane $pane_id has bare shell ($poll_cmd) — process died during execution"
      log_debug "[GOV] iter=$ITERATION pane_dead_during_poll=true pane=$pane_id cmd=$poll_cmd role=$role"
      # Return failure so caller can handle recovery
      return 1
    fi

    # v5.7 §4.13.a: window-bounded prompt auto-dismiss (replaces broad inline grep).
    # check_and_nudge_idle_pane also calls auto_dismiss_prompts internally, but
    # we keep this explicit call so dismiss happens BEFORE the idle/nudge check
    # and is logged with iter context.
    auto_dismiss_prompts "$pane_id"

    # v5.7 §4.16: bounded prompt-stall escalation. If pane has been prompt-stuck
    # for PROMPT_STALL_TIMEOUT (5min default) or dismiss attempts exceed
    # PROMPT_DISMISS_FAIL_LIMIT, write BLOCKED `infra_failure` and exit the poll.
    # Closes the "alive process = infinite extend" gap (codex Critic HIGH).
    if ! check_prompt_stall "$pane_id"; then
      return 2  # signal: hard-failed, do not retry
    fi

    # v5.7 §4.17 (codex Critic HIGH): generic no-progress timeout. Catches
    # undetected prompts, hung network calls, or any other alive-but-frozen
    # state. PROGRESS_NO_CHANGE_TIMEOUT defaults to 10 minutes. Independent
    # of regex prompt detection — fires whenever pane content is byte-equal
    # for too long even when Worker process is "alive".
    if ! check_no_progress "$pane_id"; then
      return 2  # hard-failed, infra_failure recorded
    fi

    # Idle pane nudging (tmux pattern)
    check_and_nudge_idle_pane "$pane_id" "nudge_count"

    sleep "$POLL_INTERVAL"
  done
}
2463
+
2464
+ # =============================================================================
2465
+ # Consensus Verification (run two verifiers sequentially in same pane)
2466
+ # =============================================================================
2467
+
2468
# --- US-004: Run a single verifier in the Verifier pane and poll for verdict ---
#
# Arguments:
#   $1 iter         - iteration number (used for trigger/prompt file names)
#   $2 engine       - claude|codex
#   $3 model        - forwarded to write_verifier_trigger and used for the
#                     claude launch command
#   $4 suffix       - "-claude" or "-codex"; keeps per-engine artifacts apart
#   $5 verdict_dest - path the accepted verdict is copied to
# Returns:
#   0 once $VERDICT_FILE has been copied to verdict_dest
#   1 if the verifier failed to start, the poll failed or timed out, or the
#     verdict could not be copied
run_single_verifier() {
  local iter="$1"
  local engine="$2"        # claude|codex
  local model="$3"         # model for this verifier
  local suffix="$4"        # "-claude" or "-codex"
  local verdict_dest="$5"  # where to copy the verdict file

  # Write trigger for this engine
  write_verifier_trigger "$iter" "$engine" "$model" "$suffix"
  local prompt_file="$LOGS_DIR/iter-$(printf '%03d' $iter).verifier${suffix}-prompt.md"

  # Clean previous Verifier session (with dead pane detection)
  local verifier_cmd
  verifier_cmd=$(tmux display-message -p -t "$VERIFIER_PANE" '#{pane_current_command}' 2>/dev/null)
  if [[ -z "$verifier_cmd" ]]; then
    log " Verifier pane $VERIFIER_PANE is gone — replacing..."
    log_debug "[GOV] iter=$iter pane_dead=true pane_id=$VERIFIER_PANE action=replace_pane"
    replace_worker_pane "$VERIFIER_PANE" "verifier"
    VERIFIER_PANE=$(jq -r '.panes.verifier' "$SESSION_CONFIG")
    log " New verifier pane: $VERIFIER_PANE"
  elif [[ "$verifier_cmd" == "zsh" || "$verifier_cmd" == "bash" ]]; then
    log " Verifier pane $VERIFIER_PANE has bare shell ($verifier_cmd) — resetting..."
    log_debug "[GOV] iter=$iter pane_dead=true pane_id=$VERIFIER_PANE cmd=$verifier_cmd action=reset_shell"
    tmux send-keys -t "$VERIFIER_PANE" C-c C-u 2>/dev/null
    sleep 0.2
    tmux send-keys -t "$VERIFIER_PANE" "clear" C-m 2>/dev/null
    sleep 0.3
  elif [[ "$verifier_cmd" == "node" || "$verifier_cmd" == "claude" || "$verifier_cmd" == "codex" ]]; then
    tmux send-keys -t "$VERIFIER_PANE" C-c 2>/dev/null
    sleep 0.5
    tmux send-keys -t "$VERIFIER_PANE" "/exit" C-m 2>/dev/null
    sleep 2
  fi
  # Always ensure clean shell state before launching new verifier
  wait_for_pane_ready "$VERIFIER_PANE" 10 2>/dev/null || true
  # Clear pane to avoid residual text interference
  tmux send-keys -t "$VERIFIER_PANE" C-l 2>/dev/null
  sleep 0.5

  # Remove previous verdict file
  rm -f "$VERDICT_FILE" 2>/dev/null

  # Launch verifier — dispatch to engine-specific function
  local verifier_launch
  if [[ "$engine" = "codex" ]]; then
    verifier_launch="${CODEX_BIN:-codex} -m $VERIFIER_CODEX_MODEL -c model_reasoning_effort=\"$VERIFIER_CODEX_REASONING\" --disable plugins --dangerously-bypass-approvals-and-sandbox"
    launch_verifier_codex "$VERIFIER_PANE" "$prompt_file" "$iter" "$verifier_launch"
    log_debug "Verifier$suffix codex TUI dispatched"
  else
    verifier_launch="$(build_claude_cmd tui "$model" "" "" "$VERIFIER_EFFORT")"
    if ! launch_verifier_claude "$VERIFIER_PANE" "$prompt_file" "$iter" "$verifier_launch"; then
      log_error "Verifier$suffix failed to start"
      return 1
    fi
    log_debug "Verifier$suffix claude dispatched"
  fi

  # Poll for verdict
  if [[ "$engine" = "codex" ]]; then
    # Codex exec: file poll + short grace period after verdict detected
    log " Polling for verify-verdict.json ($suffix, codex TUI)..."
    local codex_poll_start
    codex_poll_start=$(date +%s)
    local _verdict_detected_at=0
    while true; do
      # Wait for verdict file with valid JSON. `jq -e .` (not plain `jq .`)
      # because plain jq exits 0 on an EMPTY file, which would start the grace
      # period on a verdict that has not been written yet.
      if [[ -f "$VERDICT_FILE" ]] && jq -e . "$VERDICT_FILE" >/dev/null 2>&1; then
        if (( _verdict_detected_at == 0 )); then
          _verdict_detected_at=$(date +%s)
          log " Verdict file detected. Grace period (30s) for codex to finalize..."
        fi
        # Grace period: 30s after verdict detection, proceed regardless of pane state
        local _grace_elapsed=$(( $(date +%s) - _verdict_detected_at ))
        if (( _grace_elapsed >= 30 )); then
          log " Grace period complete. Proceeding."
          break
        fi
        # Early exit: if pane returned to shell, no need to wait
        local _pane_cmd
        _pane_cmd=$(tmux display-message -p -t "$VERIFIER_PANE" '#{pane_current_command}' 2>/dev/null || echo "")
        if [[ "$_pane_cmd" = "zsh" || "$_pane_cmd" = "bash" || -z "$_pane_cmd" ]]; then
          log " Codex verifier$suffix process exited. Proceeding."
          break
        fi
      fi
      local codex_elapsed=$(( $(date +%s) - codex_poll_start ))
      if (( codex_elapsed >= ITER_TIMEOUT )); then
        if (( _verdict_detected_at > 0 )); then
          log " Codex verifier$suffix timed out waiting, but verdict exists. Proceeding."
          break
        fi
        log_error "Codex verifier$suffix timed out after ${ITER_TIMEOUT}s"
        return 1
      fi
      sleep "$POLL_INTERVAL"
    done
  else
    # Claude: use full poll_for_signal with heartbeat/nudge
    log " Polling for verify-verdict.json ($suffix)..."
    # Capture the status with `|| rc=$?`. Inside `if ! cmd; then`, `$?` is the
    # NEGATED status (always 0), so the rc=2 branch could never be taken.
    local verifier_poll_rc=0
    poll_for_signal "$VERDICT_FILE" "$VERIFIER_HEARTBEAT" "$VERIFIER_PANE" "$verifier_launch" "Verifier$suffix" || verifier_poll_rc=$?
    if (( verifier_poll_rc == 2 )); then
      # Hard failure: return without the generic error log below.
      return 1
    elif (( verifier_poll_rc != 0 )); then
      log_error "Verifier$suffix poll failed"
      return 1
    fi
  fi

  # Bug #7 Fix-Q/R: reap verifier pane the moment we accept the verdict so
  # codex/claude cannot keep self-reviewing and rewrite verify-verdict.json.
  # Lock applied AFTER cp so the archived snapshot is also frozen at intent.
  _kill_pane_process "$VERIFIER_PANE" "verifier-${suffix}"

  # Copy verdict to destination; without the copy the caller has nothing to
  # read, so report the failure instead of claiming the verdict was saved.
  if ! cp "$VERDICT_FILE" "$verdict_dest"; then
    log_error "Verifier$suffix verdict could not be copied to $verdict_dest"
    return 1
  fi
  _lock_sentinel "$VERDICT_FILE"
  # PR-0b-narrow: stamp leader handshake ack on the verdict (audit-only).
  _stamp_ack_field "$VERDICT_FILE"
  log " Verifier$suffix verdict saved to $verdict_dest"
  return 0
}
2592
+
2593
# --- Sequential final verify: run per-US scoped verifiers instead of one big ALL verify ---
# Iterates over the comma-separated $US_LIST; for each US it rewrites
# $SIGNAL_FILE to scope the verifier, launches the configured verifier engine
# in $VERIFIER_PANE, polls for $VERDICT_FILE and requires `.verdict == "pass"`.
# After all US pass, runs $VERIFICATION_CMD (when set) as an integration check.
#
# Arguments:
#   $1 iter - iteration number (used for prompt and archive file names)
# Returns 0 if all US pass + integration check pass, 1 if any US fails, 2 if integration fails.
# Sets FAILED_US global on failure (the failing US id, or "integration").
run_sequential_final_verify() {
  local iter="$1"
  FAILED_US=""

  log " Sequential final verify: ${US_LIST} (${VERIFY_MODE} mode)"
  log_debug "[FLOW] iter=$iter phase=sequential_final_verify us_list=$US_LIST"

  for us in $(echo "$US_LIST" | tr ',' ' '); do
    log " Final verify: checking $us..."

    # Override signal file to scope verifier to this US.
    # NOTE(review): the previous signal content is not restored afterwards, so
    # $SIGNAL_FILE is left pointing at the last US checked — confirm callers
    # expect that before adding a restore step.
    echo "{\"status\":\"verify\",\"us_id\":\"$us\",\"summary\":\"sequential final verify\"}" | atomic_write "$SIGNAL_FILE"

    # Write scoped verifier trigger
    write_verifier_trigger "$iter"
    local verifier_prompt="$LOGS_DIR/iter-$(printf '%03d' $iter).verifier-prompt.md"

    # Clean verifier pane
    local verifier_cmd
    verifier_cmd=$(tmux display-message -p -t "$VERIFIER_PANE" '#{pane_current_command}' 2>/dev/null)
    if [[ "$verifier_cmd" == "node" || "$verifier_cmd" == "claude" || "$verifier_cmd" == "codex" ]]; then
      tmux send-keys -t "$VERIFIER_PANE" C-c 2>/dev/null; sleep 0.5
      tmux send-keys -t "$VERIFIER_PANE" "/exit" C-m 2>/dev/null; sleep 2
    fi
    wait_for_pane_ready "$VERIFIER_PANE" 10 2>/dev/null || true

    # Remove the previous verdict BEFORE launching (same order as
    # run_single_verifier). Removing it after launch leaves the prior US's
    # verdict on disk while the new verifier starts, and can delete a fresh
    # verdict written by a fast verifier.
    rm -f "$VERDICT_FILE"

    # Launch verifier
    local verifier_launch
    if [[ "$VERIFIER_ENGINE" = "codex" ]]; then
      verifier_launch="${CODEX_BIN:-codex} -m $VERIFIER_CODEX_MODEL -c model_reasoning_effort=\"$VERIFIER_CODEX_REASONING\" --disable plugins --dangerously-bypass-approvals-and-sandbox"
      launch_verifier_codex "$VERIFIER_PANE" "$verifier_prompt" "$iter" "$verifier_launch"
    else
      verifier_launch="$(build_claude_cmd tui "$VERIFIER_MODEL" "" "" "$VERIFIER_EFFORT")"
      launch_verifier_claude "$VERIFIER_PANE" "$verifier_prompt" "$iter" "$verifier_launch" || {
        log_error "Failed to launch verifier for $us"
        FAILED_US="$us"
        return 1
      }
    fi

    # Poll for verdict
    local poll_rc=0
    poll_for_signal "$VERDICT_FILE" "$VERIFIER_HEARTBEAT" "$VERIFIER_PANE" "$verifier_launch" "Verifier-final" || poll_rc=$?
    if (( poll_rc != 0 )); then
      log_error "Verifier poll failed for $us (rc=$poll_rc)"
      FAILED_US="$us"
      return 1
    fi

    # Bug #7 Fix-Q/R: reap verifier pane between per-US final verifications so
    # the previous codex/claude TUI cannot continue running while the next per-
    # US verifier dispatch reuses the same pane.
    _kill_pane_process "$VERIFIER_PANE" "verifier-final"
    _lock_sentinel "$VERDICT_FILE"
    # PR-0b-narrow: stamp leader handshake ack on the verdict (audit-only).
    _stamp_ack_field "$VERDICT_FILE"

    # Check verdict
    local verdict
    verdict=$(jq -r '.verdict' "$VERDICT_FILE" 2>/dev/null)
    if [[ "$verdict" != "pass" ]]; then
      FAILED_US="$us"
      log " Sequential final verify FAILED at $us"
      log_debug "[FLOW] iter=$iter phase=sequential_final_verify failed_us=$us verdict=$verdict"
      return 1
    fi
    log " Sequential final verify: $us PASSED"

    # Archive per-US final verdict (best-effort)
    cp "$VERDICT_FILE" "$LOGS_DIR/iter-$(printf '%03d' $iter).final-verdict-${us}.json" 2>/dev/null
  done

  # Integration check: run tests if VERIFICATION_CMD is set
  if [[ -n "${VERIFICATION_CMD:-}" ]]; then
    log " Running integration test suite after sequential verify..."
    log_debug "[FLOW] iter=$iter phase=integration_check cmd=$VERIFICATION_CMD"
    # VERIFICATION_CMD is evaluated as shell code; it must only ever come from
    # trusted campaign configuration.
    if ! eval "$VERIFICATION_CMD" > /dev/null 2>&1; then
      log " Integration test suite FAILED"
      FAILED_US="integration"
      return 2
    fi
    log " Integration test suite PASSED"
  fi

  log " Sequential final verify: ALL PASSED"
  return 0
}
2686
+
2687
# --- US-005: Decide whether this signal gets consensus verification ---
# Reads the unified CONSENSUS_MODE global (off|all|final-only).
#   $1 - us_id taken from the signal (optional; defaults to empty)
# Returns 0 when consensus should run, 1 when a single engine is enough:
#   all        -> always 0
#   final-only -> 0 only when the us_id is exactly "ALL"
#   anything else (including off) -> 1
_should_use_consensus() {
  local us_scope="${1:-}"

  if [[ "$CONSENSUS_MODE" == "all" ]]; then
    return 0
  fi

  if [[ "$CONSENSUS_MODE" == "final-only" && "$us_scope" == "ALL" ]]; then
    return 0
  fi

  return 1
}
2698
+
2699
# --- US-004: Run consensus verification (claude + codex sequentially) ---

# Print one markdown bullet per entry of .issues[] in the verdict file $1.
# Prints a placeholder bullet when the file is unreadable or lists no issues.
_rcv_issue_bullets() {
  local verdict_file="$1"
  local bullets
  bullets=$(jq -r '.issues[]? | "- [\(.severity // "unknown")] \(.criterion // "?"): \(.description // "no description")\(if .fix_hint then " (hint: \(.fix_hint))" else "" end)"' "$verdict_file" 2>/dev/null)
  if [[ -n "$bullets" ]]; then
    printf '%s\n' "$bullets"
  else
    # jq exits 0 with no output when .issues is missing or empty, so the
    # placeholder has to key off the captured text, not the exit status.
    echo "- (no structured issues)"
  fi
}

# Print a compact JSON array holding the issues of the claude verdict file $1
# and the codex verdict file $2, each tagged with a "source" field.
# Always prints a valid JSON array ('[]' when nothing could be read).
_rcv_merged_issues_json() {
  local claude_file="$1"
  local codex_file="$2"
  local claude_issues codex_issues merged
  claude_issues=$(jq -c '[.issues[]? | . + {"source": "claude"}]' "$claude_file" 2>/dev/null || echo '[]')
  codex_issues=$(jq -c '[.issues[]? | . + {"source": "codex"}]' "$codex_file" 2>/dev/null || echo '[]')
  merged=$(printf '%s %s\n' "$claude_issues" "$codex_issues" | jq -c -s 'add // []' 2>/dev/null)
  [[ -n "$merged" ]] || merged='[]'
  printf '%s\n' "$merged"
}

# Write a merged "fail" verdict to $VERDICT_FILE via atomic_write.
#   $1 - summary text
#   $2 - recommended_state_transition value
#   $3 - consensus round number
#   $4 - JSON array of merged issues
# Reads CLAUDE_VERDICT / CODEX_VERDICT. All values are passed through jq so
# they are JSON-escaped instead of being spliced into the document as text.
_rcv_write_fail_verdict() {
  local summary="$1"
  local transition="$2"
  local round="$3"
  local issues_json="$4"
  jq -n \
    --arg verified_at "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
    --arg summary "$summary" \
    --argjson issues "$issues_json" \
    --arg transition "$transition" \
    --arg claude "$CLAUDE_VERDICT" \
    --arg codex "$CODEX_VERDICT" \
    --argjson round "$round" \
    '{verdict: "fail", verified_at_utc: $verified_at, summary: $summary, issues: $issues, recommended_state_transition: $transition, consensus: {claude: $claude, codex: $codex, round: $round}}' \
    | atomic_write "$VERDICT_FILE"
}

# Run the claude verifier, then the codex verifier, and merge both verdicts
# into $VERDICT_FILE.
#
# Arguments:
#   $1 - iteration number (used in the iter-NNN.* artifact names in $LOGS_DIR)
# Globals read:
#   LOGS_DIR, VERDICT_FILE, VERIFIER_MODEL, VERIFIER_CODEX_MODEL,
#   VERIFIER_CODEX_REASONING
# Globals written:
#   CONSENSUS_ROUND, CLAUDE_VERDICT, CODEX_VERDICT,
#   ITER_VERIFIER_CLAUDE_DURATION_S, ITER_VERIFIER_CODEX_DURATION_S
# Returns:
#   0 - both engines returned "pass"; merged pass verdict written
#   1 - a verifier run failed, the claude verdict stayed null/empty after one
#       retry, or the max-rounds block was reached (blocked verdict written)
#   2 - consensus disagreement; fix contract and merged fail verdict written
run_consensus_verification() {
  local iter="$1"
  local max_rounds=6
  local iter_tag
  iter_tag=$(printf '%03d' "$iter")
  local claude_verdict_file="$LOGS_DIR/iter-${iter_tag}.verify-verdict-claude.json"
  local codex_verdict_file="$LOGS_DIR/iter-${iter_tag}.verify-verdict-codex.json"
  local fix_contract="$LOGS_DIR/iter-${iter_tag}.fix-contract.md"
  # Declared once up front: under zsh, re-running a value-less `local` for an
  # existing name inside a loop prints the variable instead of staying silent.
  local _claude_t0 _codex_t0 _combined_action merged_issues

  CONSENSUS_ROUND=0
  CLAUDE_VERDICT=""
  CODEX_VERDICT=""

  # NOTE(review): CONSENSUS_ROUND is reset on every call and every path through
  # round 1 returns, so a single call never gets past round 1 and the
  # max-rounds block after the loop is not reached. Confirm whether the caller
  # is meant to carry the round counter across calls.
  while (( CONSENSUS_ROUND < max_rounds )); do
    CONSENSUS_ROUND=$(( CONSENSUS_ROUND + 1 ))
    log " Consensus round $CONSENSUS_ROUND/$max_rounds..."

    # Run claude verifier first
    _claude_t0=$(date +%s)
    if ! run_single_verifier "$iter" "claude" "$VERIFIER_MODEL" "-claude" "$claude_verdict_file"; then
      log_error "Claude verifier failed in consensus round $CONSENSUS_ROUND"
      return 1
    fi
    ITER_VERIFIER_CLAUDE_DURATION_S=$(( $(date +%s) - _claude_t0 ))
    CLAUDE_VERDICT=$(jq -r '.verdict' "$claude_verdict_file" 2>/dev/null)
    # A12 fix: validate claude verdict is not null/empty — if so, retry once before proceeding
    if [[ -z "$CLAUDE_VERDICT" || "$CLAUDE_VERDICT" == "null" ]]; then
      log " WARNING: Claude verdict is '$CLAUDE_VERDICT' — likely interrupted. Retrying claude verifier..."
      log_debug "[GOV] iter=$iter phase=consensus_claude_retry reason=null_verdict"
      rm -f "$claude_verdict_file" 2>/dev/null
      if ! run_single_verifier "$iter" "claude" "$VERIFIER_MODEL" "-claude" "$claude_verdict_file"; then
        log_error "Claude verifier retry also failed"
        return 1
      fi
      CLAUDE_VERDICT=$(jq -r '.verdict' "$claude_verdict_file" 2>/dev/null)
      if [[ -z "$CLAUDE_VERDICT" || "$CLAUDE_VERDICT" == "null" ]]; then
        log_error "Claude verdict still null after retry — consensus cannot proceed"
        return 1
      fi
    fi
    log_debug "[GOV] iter=$iter phase=consensus_claude verdict=$CLAUDE_VERDICT model=$VERIFIER_MODEL"

    # consensus-fail-fast removed (complexity vs value too low)

    # Run codex verifier second
    _codex_t0=$(date +%s)
    if ! run_single_verifier "$iter" "codex" "$VERIFIER_CODEX_MODEL" "-codex" "$codex_verdict_file"; then
      log_error "Codex verifier failed in consensus round $CONSENSUS_ROUND"
      return 1
    fi
    ITER_VERIFIER_CODEX_DURATION_S=$(( $(date +%s) - _codex_t0 ))
    CODEX_VERDICT=$(jq -r '.verdict' "$codex_verdict_file" 2>/dev/null)
    log_debug "[GOV] iter=$iter phase=consensus_codex verdict=$CODEX_VERDICT model=$VERIFIER_CODEX_MODEL reasoning=$VERIFIER_CODEX_REASONING"

    log " Consensus: claude=$CLAUDE_VERDICT codex=$CODEX_VERDICT"
    _combined_action="retry"
    if [[ "$CLAUDE_VERDICT" = "pass" && "$CODEX_VERDICT" = "pass" ]]; then
      _combined_action="pass"
    elif (( CONSENSUS_ROUND >= max_rounds )); then
      _combined_action="blocked"
    fi
    log_debug "[GOV] iter=$iter phase=consensus round=$CONSENSUS_ROUND claude=$CLAUDE_VERDICT codex=$CODEX_VERDICT combined_action=$_combined_action"

    # Both pass → success
    if [[ "$_combined_action" = "pass" ]]; then
      # Create merged verdict with per-engine details. Built with jq so the
      # verdict-file paths are JSON-escaped.
      jq -n \
        --arg verified_at "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
        --arg claude_file "$claude_verdict_file" \
        --arg codex_file "$codex_verdict_file" \
        --argjson round "$CONSENSUS_ROUND" \
        '{verdict: "pass", verified_at_utc: $verified_at, summary: "Consensus PASS: both claude and codex verified independently", recommended_state_transition: "complete", consensus: {claude: {verdict: "pass", file: $claude_file}, codex: {verdict: "pass", file: $codex_file}, round: $round}}' \
        | atomic_write "$VERDICT_FILE"
      return 0
    fi

    # Consensus disagreement
    log_debug "[GOV] iter=$iter phase=consensus_disagreement round=$CONSENSUS_ROUND claude=$CLAUDE_VERDICT codex=$CODEX_VERDICT action=fix_contract"

    # NOTE: pre_existing_failure heuristic was removed (v0.3.5).
    # It used unreliable grep-in-description string matching to classify
    # consensus failures as "pre-existing", bypassing the consensus rule.
    # Consensus disagreement now ALWAYS flows to fix contract.
    # Codex CLI crash (no verdict file) is handled upstream via run_single_verifier return 1 → BLOCKED.

    # --- Consensus disagreement: build fix contract ---
    {
      echo "# Fix Contract (Consensus Round $CONSENSUS_ROUND, iteration $iter)"
      echo ""
      echo "## Claude Verdict: $CLAUDE_VERDICT"
      if [[ "$CLAUDE_VERDICT" = "fail" ]]; then
        echo "### Claude Issues"
        _rcv_issue_bullets "$claude_verdict_file"
      fi
      echo ""
      echo "## Codex Verdict: $CODEX_VERDICT"
      if [[ "$CODEX_VERDICT" = "fail" ]]; then
        echo "### Codex Issues"
        _rcv_issue_bullets "$codex_verdict_file"
      fi
      echo ""
      echo "## Traceability"
      echo "Only changes that resolve a listed issue are allowed."
    } | atomic_write "$fix_contract"

    log " Combined fix contract: $fix_contract"

    # If this is not the last round, the caller dispatches the Worker with the
    # fix contract; write a merged fail verdict (issues from BOTH engines) so
    # the main loop can drive the fix loop.
    if (( CONSENSUS_ROUND < max_rounds )); then
      merged_issues=$(_rcv_merged_issues_json "$claude_verdict_file" "$codex_verdict_file")
      _rcv_write_fail_verdict \
        "Consensus disagreement (round $CONSENSUS_ROUND/$max_rounds): claude=$CLAUDE_VERDICT codex=$CODEX_VERDICT" \
        "continue" "$CONSENSUS_ROUND" "$merged_issues"
      return 2 # special return: consensus disagreement, needs retry
    fi
  done

  # Max consensus rounds exceeded — include issues from both verdicts
  log_error "Consensus failed after $max_rounds rounds"
  merged_issues=$(_rcv_merged_issues_json "$claude_verdict_file" "$codex_verdict_file")
  _rcv_write_fail_verdict \
    "Consensus failed after $max_rounds rounds: claude=$CLAUDE_VERDICT codex=$CODEX_VERDICT" \
    "blocked" "$max_rounds" "$merged_issues"
  return 1
}
2849
+
2850
+ # =============================================================================
2851
+ # Main Leader Loop
2852
+ # =============================================================================
2853
+
2854
+ main() {
2855
+ # --- US-026 R14 P0: project-scoped runner lockfile (mkdir atomic) ---
2856
+ # Prevents duplicate runners on the same project root regardless of slug.
2857
+ # Different ROOT_HASH allows independent parallel runners across projects.
2858
+ mkdir -p "$(dirname "$RUNNER_LOCKFILE_PATH")" 2>/dev/null
2859
+ if ! mkdir "$RUNNER_LOCKDIR" 2>/dev/null; then
2860
+ local existing existing_slug
2861
+ existing=$(jq -r '.pid' "$RUNNER_LOCKFILE_PATH" 2>/dev/null || echo 0)
2862
+ existing_slug=$(jq -r '.slug // "unknown"' "$RUNNER_LOCKFILE_PATH" 2>/dev/null || echo unknown)
2863
+ if [[ "$existing" -gt 0 ]] && kill -0 "$existing" 2>/dev/null; then
2864
+ echo "duplicate rlp-desk runner detected on this project root. existing pid=$existing slug=$existing_slug, this attempt slug=$SLUG. exiting." >&2
2865
+ echo " Recover with: rm -rf '$RUNNER_LOCKDIR' '$RUNNER_LOCKFILE_PATH' (only if pid $existing is confirmed dead)" >&2
2866
+ exit 1
2867
+ fi
2868
+ rm -rf "$RUNNER_LOCKDIR"
2869
+ mkdir "$RUNNER_LOCKDIR" 2>/dev/null || {
2870
+ echo "failed to acquire runner lock after stale cleanup; another wrapper raced ahead. exit 1" >&2
2871
+ exit 1
2872
+ }
2873
+ echo "stale runner lockfile cleaned (pid $existing dead) — acquired" >&2
2874
+ fi
2875
+ printf '{"pid":%s,"slug":"%s","root":"%s","started_at":"%s"}\n' \
2876
+ "$$" "$SLUG" "$ROOT" "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$RUNNER_LOCKFILE_PATH"
2877
+
2878
+ # --- Lockfile: prevent duplicate execution (ZSH-4 race-safe, v0.17.1) ---
2879
+ # Delegates to acquire_slug_lock (lib_ralph_desk.zsh): atomic set -C fast path +
2880
+ # mkdir-mutex-serialized, PID-reaped stale recovery. Race-safe vs concurrent
2881
+ # recoverers, gap-starters, and a crashed-recoverer mutex leak.
2882
+ if acquire_slug_lock "$LOCKFILE_PATH"; then
2883
+ LOCKFILE_ACQUIRED=1
2884
+ else
2885
+ local lock_pid
2886
+ lock_pid=$(cat "$LOCKFILE_PATH" 2>/dev/null)
2887
+ log_error "Another instance is already running or won the lock race (PID ${lock_pid:-unknown}). Kill it or rm $LOCKFILE_PATH"
2888
+ exit 1
2889
+ fi
2890
+ # US-023 R11 P2-K: chain `_emit_final_cost_log` so cost-log.jsonl is never silently empty on exit.
2891
+ trap '_emit_final_cost_log; cleanup' EXIT INT TERM
2892
+ mkdir -p "$LOGS_DIR" "$RUNTIME_DIR" 2>/dev/null
2893
+
2894
+ # --- Analytics directory: always create (campaign.jsonl + metadata.json are always-on) ---
2895
+ mkdir -p "$ANALYTICS_DIR" 2>/dev/null
2896
+
2897
+ # --- debug.log versioning (in analytics dir, --debug only) ---
2898
+ if (( DEBUG )) && [[ -f "$DEBUG_LOG" ]]; then
2899
+ local dbg_n=1
2900
+ while [[ -f "${DEBUG_LOG%.log}-v${dbg_n}.log" ]]; do
2901
+ (( dbg_n++ ))
2902
+ done
2903
+ mv "$DEBUG_LOG" "${DEBUG_LOG%.log}-v${dbg_n}.log"
2904
+ fi
2905
+
2906
+ # --- campaign.jsonl versioning (always-on) ---
2907
+ if [[ -f "$CAMPAIGN_JSONL" ]]; then
2908
+ local cj_n=1
2909
+ while [[ -f "${CAMPAIGN_JSONL%.jsonl}-v${cj_n}.jsonl" ]]; do
2910
+ (( cj_n++ ))
2911
+ done
2912
+ mv "$CAMPAIGN_JSONL" "${CAMPAIGN_JSONL%.jsonl}-v${cj_n}.jsonl"
2913
+ fi
2914
+
2915
+ # --- metadata.json: always write at campaign start (cross-project identification) ---
2916
+ jq -n \
2917
+ --arg slug "$SLUG" \
2918
+ --arg project_root "$ROOT" \
2919
+ --arg project_name "$(basename "$ROOT")" \
2920
+ --arg campaign_status "running" \
2921
+ --arg start_time "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
2922
+ --arg end_time "" \
2923
+ --arg worker_model "$WORKER_MODEL" \
2924
+ --arg verifier_model "$VERIFIER_MODEL" \
2925
+ --argjson debug "$DEBUG" \
2926
+ --argjson with_sv "$WITH_SELF_VERIFICATION" \
2927
+ --argjson with_sv_requested "$WITH_SELF_VERIFICATION_REQUESTED" \
2928
+ --arg sv_skipped_reason "$SV_SKIPPED_REASON" \
2929
+ --arg lane_mode "$LANE_MODE" \
2930
+ --argjson consensus "${VERIFY_CONSENSUS:-0}" \
2931
+ '{slug: $slug, project_root: $project_root, project_name: $project_name, campaign_status: $campaign_status, start_time: $start_time, end_time: $end_time, worker_model: $worker_model, verifier_model: $verifier_model, debug: $debug, with_self_verification: $with_sv, with_self_verification_requested: $with_sv_requested, sv_skipped_reason: $sv_skipped_reason, lane_mode: $lane_mode, consensus: $consensus}' \
2932
+ > "$METADATA_FILE"
2933
+
2934
+ # --- Startup ---
2935
+ log "Ralph Desk Tmux Runner starting..."
2936
+ log " Slug: $SLUG"
2937
+ log " Root: $ROOT"
2938
+ log " Max iterations: $MAX_ITER"
2939
+ log " Worker model: $WORKER_MODEL"
2940
+ log " Verifier model: $VERIFIER_MODEL (per-US) / $FINAL_VERIFIER_MODEL (final)"
2941
+ log " Verify mode: $VERIFY_MODE"
2942
+ log " Consensus mode: $CONSENSUS_MODE"
2943
+ log " Consensus model: $CONSENSUS_MODEL (per-US) / $FINAL_CONSENSUS_MODEL (final)"
2944
+ log " Poll interval: ${POLL_INTERVAL}s"
2945
+ log " Iter timeout: ${ITER_TIMEOUT}s"
2946
+ # --- Debug: Log execution plan ---
2947
+ if (( DEBUG )); then
2948
+ # Extract US IDs from PRD
2949
+ local prd_file="$DESK/plans/prd-$SLUG.md"
2950
+ local us_list=""
2951
+ if [[ -f "$prd_file" ]]; then
2952
+ us_list=$(grep -oE 'US-[0-9]+' "$prd_file" | sort -u | tr '\n' ',' | sed 's/,$//')
2953
+ fi
2954
+ local us_count=$(echo "$us_list" | tr ',' '\n' | grep -c 'US-')
2955
+
2956
+ log_debug "[OPTION] slug=$SLUG us_count=$us_count us_list=$us_list"
2957
+ log_debug "[OPTION] worker_engine=$WORKER_ENGINE worker_model=$WORKER_MODEL"
2958
+ log_debug "[OPTION] verifier_engine=$VERIFIER_ENGINE verifier_model=$VERIFIER_MODEL"
2959
+ log_debug "[OPTION] verify_mode=$VERIFY_MODE consensus_mode=$CONSENSUS_MODE max_iter=$MAX_ITER"
2960
+ log_debug "[OPTION] cb_threshold=$CB_THRESHOLD effective_cb_threshold=$EFFECTIVE_CB_THRESHOLD iter_timeout=$ITER_TIMEOUT with_self_verification=$WITH_SELF_VERIFICATION (requested=$WITH_SELF_VERIFICATION_REQUESTED skipped=${SV_SKIPPED_REASON:-none}) debug=$DEBUG"
2961
+
2962
+ if [[ "$VERIFY_MODE" = "per-us" ]]; then
2963
+ # Build expected flow
2964
+ local expected_flow=""
2965
+ for us in $(echo "$us_list" | tr ',' ' '); do
2966
+ expected_flow="${expected_flow}worker->verify($us)->"
2967
+ done
2968
+ expected_flow="${expected_flow}verify(ALL)->COMPLETE"
2969
+ log_debug "[OPTION] expected_flow=$expected_flow"
2970
+ else
2971
+ log_debug "[OPTION] expected_flow=worker(all)->verify(ALL)->COMPLETE"
2972
+ fi
2973
+
2974
+ if [[ "${VERIFY_CONSENSUS:-0}" = "1" ]]; then
2975
+ log_debug "[OPTION] consensus_flow=each_verify_runs_claude+codex_both_must_pass"
2976
+ fi
2977
+ fi
2978
+
2979
+ # Extract US list for per-US sequencing
2980
+ if [[ "$VERIFY_MODE" = "per-us" ]]; then
2981
+ local prd_file="$DESK/plans/prd-$SLUG.md"
2982
+ if [[ -f "$prd_file" ]]; then
2983
+ US_LIST=$(grep -oE 'US-[0-9]+' "$prd_file" | sort -u | tr '\n' ',' | sed 's/,$//')
2984
+ fi
2985
+
2986
+ # Initialize VERIFIED_US from memory's Completed Stories (carry over previous runs)
2987
+ local memory_file="$DESK/memos/${SLUG}-memory.md"
2988
+ if [[ -f "$memory_file" ]]; then
2989
+ local completed_us
2990
+ completed_us=$(sed -n '/^## Completed Stories$/,/^## /p' "$memory_file" 2>/dev/null | grep '^- US-' | sed 's/^- \(US-[0-9]*\):.*/\1/' | sort -u | tr '\n' ',' | sed 's/,$//')
2991
+ if [[ -n "$completed_us" ]]; then
2992
+ VERIFIED_US="$completed_us"
2993
+ log " Loaded completed stories from memory: $VERIFIED_US"
2994
+ log_debug "[FLOW] loaded_verified_us_from_memory=$VERIFIED_US"
2995
+ fi
2996
+ fi
2997
+
2998
+ # D1: Fallback — restore verified_us from status.json if memory had none
2999
+ if [[ -z "$VERIFIED_US" && -f "$STATUS_FILE" ]]; then
3000
+ local status_verified
3001
+ status_verified=$(jq -r '.verified_us // [] | join(",")' "$STATUS_FILE" 2>/dev/null)
3002
+ if [[ -n "$status_verified" ]]; then
3003
+ VERIFIED_US="$status_verified"
3004
+ log " Restored verified_us from status.json: $VERIFIED_US"
3005
+ log_debug "[FLOW] restored_verified_us_from_status=$VERIFIED_US"
3006
+ fi
3007
+ fi
3008
+ fi
3009
+
3010
+ # Initialize PRD snapshot state for live update detection
3011
+ PREV_PRD_HASH=$(compute_prd_hash)
3012
+ PREV_PRD_US_LIST=$(count_prd_us)
3013
+
3014
+ # Dependency checks
3015
+ check_dependencies
3016
+
3017
+ # Print security warning (governance.md s7: --dangerously-skip-permissions)
3018
+ print_security_warning
3019
+
3020
+ # Validate scaffold
3021
+ validate_scaffold
3022
+
3023
+ # Check for existing sessions
3024
+ check_existing_sessions
3025
+
3026
+ # Create tmux session with pane IDs (governance.md s7 step 1)
3027
+ create_session
3028
+
3029
+ # Set trap for cleanup on exit/error
3030
+ # US-023 R11 P2-K: chain `_emit_final_cost_log` so cost-log.jsonl is never silently empty.
3031
+ trap '_emit_final_cost_log; cleanup' EXIT
3032
+
3033
+ # Initialize context hash for stale detection
3034
+ PREV_CONTEXT_HASH=$(compute_context_hash)
3035
+
3036
+ # --- governance.md s7: Leader Loop ---
3037
+ local HARD_CEILING=$(( ITER_TIMEOUT * 3 )) # logged but NOT enforced — Worker extends indefinitely when active
3038
+
3039
+ for (( ITERATION = 1; ITERATION <= MAX_ITER; ITERATION++ )); do
3040
+ # US-024 R12 P0: lifecycle check site #2 — verify session/panes alive at iter entry.
3041
+ _r12_check_lifecycle "iter_start"
3042
+ log ""
3043
+ log "========== Iteration $ITERATION / $MAX_ITER =========="
3044
+ local ITER_START_TIME
3045
+ ITER_START_TIME=$(date +%s)
3046
+ local _iter_contract=""
3047
+ _iter_contract=$(sed -n '/^## Next Iteration Contract$/,/^## /{ /^## Next/d; /^## [^N]/d; p; }' "$MEMORY_FILE" 2>/dev/null | head -1 | tr '\n' ' ')
3048
+ log_debug "[FLOW] iter=$ITERATION start contract=\"${_iter_contract:-none}\""
3049
+
3050
+ # --- governance.md s7 step 1: Check sentinels ---
3051
+ if [[ -f "$COMPLETE_SENTINEL" ]]; then
3052
+ log "COMPLETE sentinel found. Campaign succeeded."
3053
+ update_status "complete" "complete"
3054
+ return 0
3055
+ fi
3056
+ if [[ -f "$BLOCKED_SENTINEL" ]]; then
3057
+ log "BLOCKED sentinel found. Campaign blocked."
3058
+ update_status "blocked" "blocked"
3059
+ return 1
3060
+ fi
3061
+
3062
+ # PR-A (Bug #10): operator-recovery hygiene check.
3063
+ # When the operator hand-rolls a `phase=verify` recovery (jq-patches
3064
+ # status.json, writes manual iter-signal.json + done-claim.json, deletes
3065
+ # the blocked sentinel), the leader MUST honor that work instead of
3066
+ # deleting the artifacts and resetting to phase=worker. Mirrors the
3067
+ # Node-side guard in src/node/runner/campaign-main-loop.mjs.
3068
+ local SKIP_NEXT_WORKER=0
3069
+ local LAST_PHASE=""
3070
+ if [[ -f "$STATUS_FILE" ]] && command -v jq >/dev/null 2>&1; then
3071
+ LAST_PHASE=$(jq -r '.phase // ""' "$STATUS_FILE" 2>/dev/null)
3072
+ fi
3073
+ if [[ "$LAST_PHASE" == "verify" ]]; then
3074
+ local _iter_prompt="$LOGS_DIR/iter-$(printf '%03d' $ITERATION).worker-prompt.md"
3075
+ if _validate_operator_recovery_artifacts \
3076
+ "$SIGNAL_FILE" "$DONE_CLAIM_FILE" "$STATUS_FILE" "$_iter_prompt"; then
3077
+ log "[recovery] Resuming verify phase — operator manual recovery detected (iter=$ITERATION)"
3078
+ log_debug "[recovery] iter=$ITERATION skip_worker=true reason=manual_recovery_validated"
3079
+ SKIP_NEXT_WORKER=1
3080
+ else
3081
+ log "[recovery] phase=verify ignored: ${RECOVERY_FAIL_REASON}"
3082
+ log_debug "[recovery] iter=$ITERATION skip_worker=false reason=\"${RECOVERY_FAIL_REASON}\""
3083
+ fi
3084
+ fi
3085
+
3086
+ # PR-E (Phase C1, stabilization): operator-cleared BLOCKED recovery.
3087
+ # Pair to PR-A above. Runs AFTER PR-A (so phase=verify wins) and skipped
3088
+ # when SKIP_NEXT_WORKER=1 (PR-A already honored). Resets stale counters
3089
+ # in status.json when operator manually deleted the BLOCKED sentinel.
3090
+ # Mirrors Node `_validateBlockedRecovery` + branch in campaign-main-loop.mjs.
3091
+ if [[ "$LAST_PHASE" == "blocked" && "$SKIP_NEXT_WORKER" -eq 0 ]]; then
3092
+ local _blocked_sidecar="$MEMOS_DIR/${SLUG}-blocked.json"
3093
+ if _validate_blocked_recovery \
3094
+ "$BLOCKED_SENTINEL" "$_blocked_sidecar" "$STATUS_FILE"; then
3095
+ local _prev_reason
3096
+ _prev_reason=$(jq -r '.last_block_reason // ""' "$STATUS_FILE" 2>/dev/null)
3097
+ log "[recovery] Operator-cleared BLOCKED detected (was: ${_prev_reason:-unrecorded}). Resetting counters and resuming as worker. iter=$ITERATION"
3098
+ log_debug "[recovery] iter=$ITERATION blocked_recovery=applied reason=\"${BLOCKED_RECOVERY_FAIL_REASON:-sidecar absent or recoverable=true}\""
3099
+ # Reset counters in-process. update_status writes fresh status when
3100
+ # next phase transition fires. Operator's intent was a clean restart.
3101
+ CONSECUTIVE_FAILURES=0
3102
+ CONSECUTIVE_BLOCKS=0
3103
+ LAST_BLOCK_REASON=""
3104
+ # Archive sidecar (rename, not delete) for audit trail.
3105
+ _archive_recovered_sidecar "$_blocked_sidecar"
3106
+ else
3107
+ log "[recovery] phase=blocked ignored: ${BLOCKED_RECOVERY_FAIL_REASON}"
3108
+ log_debug "[recovery] iter=$ITERATION blocked_recovery=skipped reason=\"${BLOCKED_RECOVERY_FAIL_REASON}\""
3109
+ fi
3110
+ fi
3111
+
3112
+ if (( ! SKIP_NEXT_WORKER )); then
3113
+ # --- governance.md s7 step 8 (cleanup): Clean previous iteration signals ---
3114
+ # Bug #7 Fix-R cleanup: unlock 0o444 sentinels written by the previous
3115
+ # iteration's reaper before rm so cleanup does not log permission noise.
3116
+ _unlock_sentinel "$SIGNAL_FILE"
3117
+ _unlock_sentinel "$VERDICT_FILE"
3118
+ rm -f "$SIGNAL_FILE" "$DONE_CLAIM_FILE" "$VERDICT_FILE" 2>/dev/null
3119
+ rm -f "$WORKER_HEARTBEAT" "$VERIFIER_HEARTBEAT" 2>/dev/null
3120
+
3121
+ # --- Clean previous claude session in panes (one-shot lifecycle) ---
3122
+ # Only needed from iteration 2 onwards (iteration 1 has fresh panes)
3123
+ if (( ITERATION > 1 )); then
3124
+ # Send C-c first (in case claude is mid-task), then /exit
3125
+ tmux send-keys -t "$WORKER_PANE" C-c 2>/dev/null
3126
+ sleep 1
3127
+ tmux send-keys -t "$WORKER_PANE" "/exit" C-m 2>/dev/null
3128
+ sleep 2
3129
+ # Wait for shell prompt before proceeding
3130
+ wait_for_pane_ready "$WORKER_PANE" 10 2>/dev/null || true
3131
+ fi
3132
+ fi
3133
+
3134
+ # Reset per-iteration state
3135
+ local worker_nudge_count=0
3136
+ local verifier_nudge_count=0
3137
+ ITER_VERIFIER_START=""
3138
+ ITER_VERIFIER_END=""
3139
+
3140
+ # --- US-004: detect PRD changes for live update + re-split ---
3141
+ check_prd_update
3142
+
3143
+ # AC1: capture worker start timestamp (still set for downstream telemetry
3144
+ # even when the worker dispatch is skipped — recovery still consumes time).
3145
+ ITER_WORKER_START=$(date +%s)
3146
+
3147
+ local worker_launch=""
3148
+ if (( ! SKIP_NEXT_WORKER )); then
3149
+ # --- governance.md s7 step 4: Build worker prompt + trigger ---
3150
+ write_worker_trigger "$ITERATION"
3151
+ local worker_prompt="$LOGS_DIR/iter-$(printf '%03d' $ITERATION).worker-prompt.md"
3152
+
3153
+ update_status "worker" "running"
3154
+
3155
+ # --- governance.md s7 step 5: Execute Worker (dispatched to engine-specific function) ---
3156
+ log_debug "[FLOW] iter=$ITERATION phase=worker engine=$WORKER_ENGINE model=$WORKER_MODEL dispatched=true"
3157
+
3158
+ if [[ "$WORKER_ENGINE" = "codex" ]]; then
3159
+ worker_launch="${CODEX_BIN:-codex} -m $WORKER_CODEX_MODEL -c model_reasoning_effort=\"$WORKER_CODEX_REASONING\" --disable plugins --dangerously-bypass-approvals-and-sandbox"
3160
+ if ! launch_worker_codex "$WORKER_PANE" "$worker_prompt" "$ITERATION" "$worker_launch"; then
3161
+ write_blocked_sentinel "Worker codex failed to start in pane" "" "infra_failure"
3162
+ update_status "blocked" "worker_start_failed"
3163
+ return 1
3164
+ fi
3165
+ else
3166
+ worker_launch="$(build_claude_cmd tui "$WORKER_MODEL" "" "" "$WORKER_EFFORT")"
3167
+ if ! launch_worker_claude "$WORKER_PANE" "$worker_prompt" "$ITERATION" "$worker_launch"; then
3168
+ write_blocked_sentinel "Worker claude failed to start in pane" "" "infra_failure"
3169
+ update_status "blocked" "worker_start_failed"
3170
+ return 1
3171
+ fi
3172
+ fi
3173
+ else
3174
+ # PR-A (Bug #10): one-shot recovery path. The operator's iter-signal.json
3175
+ # is already on disk; polling below picks it up immediately and the loop
3176
+ # transitions cleanly into the verifier phase. Persist phase=verify so a
3177
+ # subsequent crash-and-relaunch sees the same contract. SKIP_NEXT_WORKER
3178
+ # is local to this iteration so iter-N+1 dispatches the worker normally.
3179
+ update_status "verify" "running"
3180
+ log "[recovery] Skipping worker dispatch for iter=$ITERATION (one-shot, honoring operator manual recovery)"
3181
+ fi
3182
+
3183
+ # --- governance.md s7 step 5+6: Poll for Worker completion ---
3184
+ # US-024 R12 P0: lifecycle check site #3 — verify panes alive after worker dispatch, before wait-loop.
3185
+ _r12_check_lifecycle "post_send"
3186
+ log " Polling for iter-signal.json..."
3187
+ local worker_poll_done=0
3188
+ while (( ! worker_poll_done )); do
3189
+ local worker_poll_rc=0
3190
+ if poll_for_signal "$SIGNAL_FILE" "$WORKER_HEARTBEAT" "$WORKER_PANE" "$worker_launch" "Worker"; then
3191
+ worker_poll_done=1
3192
+ log_debug "[FLOW] iter=$ITERATION poll_signal_received=true"
3193
+ # Bug #7 Fix-Q/R: reap worker pane immediately so claude/codex cannot
3194
+ # self-review and rewrite iter-signal.json (1m43s drift observed).
3195
+ _kill_pane_process "$WORKER_PANE" "worker"
3196
+ _lock_sentinel "$SIGNAL_FILE"
3197
+ # v0.15.4 PR-B2-FIX: same worker pass also produced done-claim. Freeze
3198
+ # it alongside iter-signal so Bug #8 gates and the iter-NNN-done-claim
3199
+ # archive (lib_ralph_desk.zsh:602) read a snapshot the worker can no
3200
+ # longer revise. Symmetric with iter-signal/verdict lock contract.
3201
+ _lock_sentinel "$DONE_CLAIM_FILE"
3202
+ # PR-0b-narrow: stamp leader handshake ack on the iter-signal (audit-only).
3203
+ _stamp_ack_field "$SIGNAL_FILE"
3204
+ else
3205
+ worker_poll_rc=$?
3206
+ if (( worker_poll_rc == 2 )); then
3207
+ return 1
3208
+ fi
3209
+ # Check if Worker is still actively running (not stuck)
3210
+ local worker_cmd
3211
+ worker_cmd=$(tmux display-message -p -t "$WORKER_PANE" '#{pane_current_command}' 2>/dev/null)
3212
+ if [[ "$worker_cmd" == "node" || "$worker_cmd" == "claude" || "$worker_cmd" == "codex" ]]; then
3213
+ # Process alive — extend indefinitely (no hard ceiling kill)
3214
+ # Stale-context breaker and nudge system handle truly stuck workers
3215
+ local iter_elapsed=$(( $(date +%s) - ITER_START_TIME ))
3216
+ local ceiling_exceeded=""
3217
+ if (( iter_elapsed >= HARD_CEILING )); then
3218
+ ceiling_exceeded=" [EXCEEDED hard_ceiling=${HARD_CEILING}s — not enforced, logged only]"
3219
+ log " WARNING: Worker exceeded soft hard-ceiling (${iter_elapsed}s >= ${HARD_CEILING}s) but still active. Continuing..."
3220
+ log_debug "[GOV] iter=$ITERATION hard_ceiling_exceeded=true elapsed=${iter_elapsed}s ceiling=${HARD_CEILING}s process=$worker_cmd action=log_only_no_kill"
3221
+ fi
3222
+ log " Worker timed out but still active ($worker_cmd). Extending poll... (${iter_elapsed}s, no ceiling)${ceiling_exceeded}"
3223
+ log_debug "[GOV] iter=$ITERATION timeout_active=true process=$worker_cmd elapsed=${iter_elapsed}s action=extend_indefinitely"
3224
+ log_debug "[FLOW] iter=$ITERATION poll_extended=true worker_cmd=$worker_cmd"
3225
+ update_status "worker" "slow"
3226
+ # Loop continues — re-poll same iteration
3227
+ else
3228
+ # Worker is truly dead/stuck
3229
+ (( MONITOR_FAILURE_COUNT++ ))
3230
+ log_debug "[GOV] iter=$ITERATION monitor_failure=$MONITOR_FAILURE_COUNT/3"
3231
+ if (( MONITOR_FAILURE_COUNT >= 3 )); then
3232
+ log_debug "[GOV] iter=$ITERATION circuit_breaker=monitor_failures detail=\"3 consecutive monitor failures\""
3233
+ write_blocked_sentinel "3 consecutive monitor failures (worker not active)" "" "infra_failure"
3234
+ update_status "blocked" "monitor_failures"
3235
+ return 1
3236
+ fi
3237
+ log " WARNING: Worker poll failed (monitor failure $MONITOR_FAILURE_COUNT/3) — will retry"
3238
+ update_status "worker" "poll_failed"
3239
+ log_debug "[FLOW] iter=$ITERATION poll_worker_dead=true worker_cmd=$worker_cmd retry=true"
3240
+ # v0.14.3 P0-5 (Bug Report #5): previously this branch wrote BLOCKED
3241
+ # unconditionally even at counter 1/3, so a single transient
3242
+ # worker-dead detection halted the campaign in 5s instead of
3243
+ # honoring the 3-strike circuit breaker above (L3001-3006). Removed
3244
+ # the unconditional sentinel write; the loop now continues so the
3245
+ # next polling tick can either confirm the dead state (counter
3246
+ # eventually reaches 3 → BLOCKED) or recover (worker resumes →
3247
+ # MONITOR_FAILURE_COUNT reset on success at L3025).
3248
+ fi
3249
+ fi
3250
+ done
3251
+
3252
+ if [[ ! -f "$SIGNAL_FILE" ]]; then
3253
+ log_debug "[FLOW] iter=$ITERATION no_signal_after_poll=true continuing"
3254
+ # No signal — monitor failure, go to next iteration
3255
+ continue
3256
+ fi
3257
+
3258
+ # Reset monitor failure count on success
3259
+ MONITOR_FAILURE_COUNT=0
3260
+
3261
+ # AC1: capture worker end timestamp; reset consensus timing
3262
+ ITER_WORKER_END=$(date +%s)
3263
+ ITER_VERIFIER_CLAUDE_DURATION_S=""
3264
+ ITER_VERIFIER_CODEX_DURATION_S=""
3265
+
3266
+ # --- governance.md s7 step 6: Read iter-signal.json via jq (JSON only, no markdown) ---
3267
+ local signal_status
3268
+ signal_status=$(jq -r '.status' "$SIGNAL_FILE" 2>/dev/null)
3269
+ local signal_summary
3270
+ signal_summary=$(jq -r '.summary // "no summary"' "$SIGNAL_FILE" 2>/dev/null)
3271
+
3272
+ log " Worker signal: status=$signal_status summary=\"$signal_summary\""
3273
+
3274
+ # Read us_id early for EXEC logging (also used later in verify branch)
3275
+ local signal_us_id_early=""
3276
+ signal_us_id_early=$(jq -r '.us_id // empty' "$SIGNAL_FILE" 2>/dev/null)
3277
+ log_debug "[FLOW] iter=$ITERATION phase=worker_signal status=$signal_status us_id=${signal_us_id_early:-none} summary=\"$signal_summary\""
3278
+
3279
+ case "$signal_status" in
3280
+ continue)
3281
+ # --- governance.md s7 step 6: continue -> go to step 8 ---
3282
+ log " Worker requests continue. Moving to next iteration."
3283
+ update_status "worker" "continue"
3284
+ ;;
3285
+ verify_partial)
3286
+ # US-019 R7 P1-G: Worker explicitly verified a subset of ACs and deferred the rest.
3287
+ # Verifier evaluates only verified_acs. Malformed (empty verified_acs) downgrades to blocked.
3288
+ local vp_count
3289
+ vp_count=$(jq -r '.verified_acs // [] | length' "$SIGNAL_FILE" 2>/dev/null || echo 0)
3290
+ if [[ "$vp_count" -eq 0 ]]; then
3291
+ log " Worker signal verify_partial but verified_acs is empty — downgrading to blocked (verify_partial_malformed)."
3292
+ local vp_us_id
3293
+ vp_us_id=$(jq -r '.us_id // empty' "$SIGNAL_FILE" 2>/dev/null)
3294
+ write_blocked_sentinel "verify_partial_malformed: empty verified_acs" "${vp_us_id:-${CURRENT_US:-ALL}}" "mission_abort"
3295
+ update_status "blocked" "verify_partial_malformed"
3296
+ break
3297
+ fi
3298
+ log " Worker signal verify_partial (verified_acs count=$vp_count). Routing to verify path."
3299
+ signal_status="verify"
3300
+ ;&
3301
+ verify)
3302
+ # --- governance.md s7 step 7: Execute Verifier ---
3303
+ # Read us_id from signal for per-US scoping
3304
+ local signal_us_id=""
3305
+ signal_us_id=$(jq -r '.us_id // empty' "$SIGNAL_FILE" 2>/dev/null)
3306
+ log " Worker claims done (us_id=${signal_us_id:-all}). Dispatching Verifier..."
3307
+
3308
+ # AC1: capture verifier start timestamp
3309
+ ITER_VERIFIER_START=$(date +%s)
3310
+
3311
+ update_status "verifier" "running"
3312
+
3313
+ # --- Sequential final verify: per-US scoped checks instead of one big ALL verify ---
3314
+ if [[ "$signal_us_id" == "ALL" && "$VERIFY_MODE" == "per-us" && -n "$US_LIST" ]]; then
3315
+ log " Final ALL verify: using sequential per-US strategy (timeout prevention)"
3316
+ local seq_rc=0
3317
+ run_sequential_final_verify "$ITERATION" || seq_rc=$?
3318
+ if (( seq_rc == 0 )); then
3319
+ write_complete_sentinel "Sequential final verify passed (all US verified individually)"
3320
+ update_status "complete" "pass"
3321
+ write_campaign_jsonl "$ITERATION" "ALL" "pass"
3322
+ return 0
3323
+ else
3324
+ # Sequential verify failed — fall through to fix loop with failed US
3325
+ log " Sequential final verify failed at ${FAILED_US:-unknown}. Entering fix loop."
3326
+ signal_us_id="${FAILED_US:-ALL}"
3327
+ # Synthesize a fail verdict for the fix loop
3328
+ echo "{\"verdict\":\"fail\",\"summary\":\"Sequential final verify failed at ${FAILED_US:-unknown}\",\"issues\":[{\"severity\":\"critical\",\"criterion\":\"${FAILED_US:-ALL}\",\"description\":\"Failed during sequential final verification\"}]}" | atomic_write "$VERDICT_FILE"
3329
+ fi
3330
+ fi
3331
+
3332
+ # --- Consensus scope check (US-005: _should_use_consensus handles CONSENSUS_MODE) ---
3333
+ local use_consensus=0
3334
+ _should_use_consensus "$signal_us_id" && use_consensus=1
3335
+
3336
+ # --- Consensus vs single verification ---
3337
+ if (( use_consensus )); then
3338
+ # US-004: Run consensus verification (claude + codex sequentially)
3339
+ local consensus_rc=0
3340
+ run_consensus_verification "$ITERATION" || consensus_rc=$?
3341
+
3342
+ if (( consensus_rc == 2 )); then
3343
+ # Consensus disagreement — treat as fail, fix loop will handle
3344
+ log " Consensus disagreement, treating as fail."
3345
+ elif (( consensus_rc != 0 )); then
3346
+ # Consensus verification failed entirely
3347
+ log_error "Consensus verification failed"
3348
+ write_blocked_sentinel "Consensus verification failed after max rounds" "" "repeat_axis"
3349
+ update_status "blocked" "consensus_failed"
3350
+ return 1
3351
+ fi
3352
+ else
3353
+ # Standard single-engine verification
3354
+ write_verifier_trigger "$ITERATION"
3355
+ local verifier_prompt="$LOGS_DIR/iter-$(printf '%03d' $ITERATION).verifier-prompt.md"
3356
+
3357
+ # Step 7a: Clean previous Verifier session (with dead pane detection)
3358
+ local verifier_cmd
3359
+ verifier_cmd=$(tmux display-message -p -t "$VERIFIER_PANE" '#{pane_current_command}' 2>/dev/null)
3360
+ if [[ -z "$verifier_cmd" ]]; then
3361
+ log " Verifier pane $VERIFIER_PANE is gone — replacing..."
3362
+ log_debug "[GOV] iter=$ITERATION pane_dead=true pane_id=$VERIFIER_PANE action=replace_pane"
3363
+ replace_worker_pane "$VERIFIER_PANE" "verifier"
3364
+ VERIFIER_PANE=$(jq -r '.panes.verifier' "$SESSION_CONFIG")
3365
+ log " New verifier pane: $VERIFIER_PANE"
3366
+ elif [[ "$verifier_cmd" == "zsh" || "$verifier_cmd" == "bash" ]]; then
3367
+ log " Verifier pane $VERIFIER_PANE has bare shell ($verifier_cmd) — resetting..."
3368
+ log_debug "[GOV] iter=$ITERATION pane_dead=true pane_id=$VERIFIER_PANE cmd=$verifier_cmd action=reset_shell"
3369
+ tmux send-keys -t "$VERIFIER_PANE" C-c C-u 2>/dev/null
3370
+ sleep 0.2
3371
+ tmux send-keys -t "$VERIFIER_PANE" "clear" C-m 2>/dev/null
3372
+ sleep 0.3
3373
+ elif [[ "$verifier_cmd" == "node" || "$verifier_cmd" == "claude" || "$verifier_cmd" == "codex" ]]; then
3374
+ tmux send-keys -t "$VERIFIER_PANE" C-c 2>/dev/null
3375
+ sleep 0.5
3376
+ tmux send-keys -t "$VERIFIER_PANE" "/exit" C-m 2>/dev/null
3377
+ sleep 2
3378
+ fi
3379
+ wait_for_pane_ready "$VERIFIER_PANE" 10 2>/dev/null || true
3380
+
3381
+ local verifier_launch
3382
+ if [[ "$VERIFIER_ENGINE" = "codex" ]]; then
3383
+ verifier_launch="${CODEX_BIN:-codex} -m $VERIFIER_CODEX_MODEL -c model_reasoning_effort=\"$VERIFIER_CODEX_REASONING\" --disable plugins --dangerously-bypass-approvals-and-sandbox"
3384
+ else
3385
+ verifier_launch="$(build_claude_cmd tui "$VERIFIER_MODEL" "" "" "$VERIFIER_EFFORT")"
3386
+ fi
3387
+ log_debug "[FLOW] iter=$ITERATION phase=verifier engine=$VERIFIER_ENGINE model=$VERIFIER_MODEL scope=${signal_us_id:-all} dispatched=true"
3388
+
3389
+ if [[ "$VERIFIER_ENGINE" = "codex" ]]; then
3390
+ launch_verifier_codex "$VERIFIER_PANE" "$verifier_prompt" "$ITERATION" "$verifier_launch"
3391
+ else
3392
+ if ! launch_verifier_claude "$VERIFIER_PANE" "$verifier_prompt" "$ITERATION" "$verifier_launch"; then
3393
+ update_status "verifier" "start_failed"
3394
+ continue
3395
+ fi
3396
+ fi
3397
+
3398
+ # Poll for verify-verdict.json
3399
+ log " Polling for verify-verdict.json..."
3400
+ if ! poll_for_signal "$VERDICT_FILE" "$VERIFIER_HEARTBEAT" "$VERIFIER_PANE" "$verifier_launch" "Verifier"; then
3401
+ local verifier_poll_rc=$?
3402
+ if (( verifier_poll_rc == 2 )); then
3403
+ return 1
3404
+ fi
3405
+ log_error "Verifier poll failed"
3406
+ # Verifier is dead/stuck — BLOCK and let user decide
3407
+ write_blocked_sentinel "Verifier process dead/stuck (poll failed). Pane preserved for inspection." "" "infra_failure"
3408
+ update_status "blocked" "verifier_dead"
3409
+ return 1
3410
+ fi
3411
+ # Bug #7 Fix-Q/R: reap verifier pane immediately so codex cannot
3412
+ # rewrite verify-verdict.json post-detect (mtime drift fix).
3413
+ _kill_pane_process "$VERIFIER_PANE" "verifier"
3414
+ _lock_sentinel "$VERDICT_FILE"
3415
+ # PR-0b-narrow: stamp leader handshake ack on the verdict (audit-only).
3416
+ _stamp_ack_field "$VERDICT_FILE"
3417
+ fi
3418
+
3419
+ # AC1: capture verifier end timestamp
3420
+ ITER_VERIFIER_END=$(date +%s)
3421
+
3422
+ # --- governance.md s7 step 7: Read verdict via jq ---
3423
+ local verdict
3424
+ verdict=$(jq -r '.verdict' "$VERDICT_FILE" 2>/dev/null)
3425
+ local recommended
3426
+ recommended=$(jq -r '.recommended_state_transition' "$VERDICT_FILE" 2>/dev/null)
3427
+ local verdict_summary
3428
+ verdict_summary=$(jq -r '.summary // "no summary"' "$VERDICT_FILE" 2>/dev/null)
3429
+
3430
+ log " Verifier: verdict=$verdict recommended=$recommended"
3431
+ log " Verifier summary: \"$verdict_summary\""
3432
+ local _issues_count=$(jq '.issues | length' "$VERDICT_FILE" 2>/dev/null || echo 0)
3433
+ log_debug "[GOV] iter=$ITERATION phase=verdict engine=$VERIFIER_ENGINE verdict=$verdict recommended=$recommended us_id=${signal_us_id:-all} issues=$_issues_count"
3434
+
3435
+ case "$verdict" in
3436
+ pass)
3437
+ CONSECUTIVE_FAILURES=0
3438
+ CONSENSUS_ROUND=0
3439
+ _SAME_US_FAIL_COUNT=0
3440
+ _LAST_FAILED_US=""
3441
+ if (( _MODEL_UPGRADED )); then
3442
+ log " Worker model restored: ${WORKER_MODEL} → ${_ORIGINAL_WORKER_MODEL} (pass verdict)"
3443
+ log_debug "[DECIDE] iter=$ITERATION phase=model_select model_restore=true from=${WORKER_MODEL} to=${_ORIGINAL_WORKER_MODEL}"
3444
+ WORKER_MODEL="$_ORIGINAL_WORKER_MODEL"
3445
+ if [[ "$WORKER_ENGINE" = "codex" ]]; then
3446
+ WORKER_CODEX_MODEL="$WORKER_MODEL"
3447
+ WORKER_CODEX_REASONING="$_ORIGINAL_WORKER_CODEX_REASONING"
3448
+ fi
3449
+ _MODEL_UPGRADED=0
3450
+ fi
3451
+
3452
+ # --- Verified US tracking (both per-us and batch modes) ---
3453
+ if [[ -n "$signal_us_id" && "$signal_us_id" != "ALL" ]]; then
3454
+ # Add this US to verified list
3455
+ if [[ -n "$VERIFIED_US" ]]; then
3456
+ VERIFIED_US="${VERIFIED_US},${signal_us_id}"
3457
+ else
3458
+ VERIFIED_US="$signal_us_id"
3459
+ fi
3460
+ log " US $signal_us_id verified. Verified so far: $VERIFIED_US"
3461
+ log_debug "[FLOW] iter=$ITERATION verified_us_update=$signal_us_id verified_us_total=$VERIFIED_US"
3462
+ update_status "verifier" "pass_us"
3463
+ # Worker will do next US on next iteration
3464
+ elif [[ "$recommended" == "complete" || "$signal_us_id" == "ALL" ]]; then
3465
+ # Final full verify passed or complete recommended
3466
+ write_complete_sentinel "$verdict_summary"
3467
+ update_status "complete" "pass"
3468
+ write_campaign_jsonl "$ITERATION" "${signal_us_id:-ALL}" "pass"
3469
+ return 0
3470
+ else
3471
+ log " Verifier passed but did not recommend complete. Continuing."
3472
+ update_status "verifier" "pass_continue"
3473
+ fi
3474
+ ;;
3475
+ fail)
3476
+ # --- governance.md s7½: Fix Loop (adapted for tmux lean mode) ---
3477
+
3478
+ # Parse per_us_results from verdict to track partial progress (batch + per-us)
3479
+ local _prev_verified="$VERIFIED_US"
3480
+ if jq -e '.per_us_results' "$VERDICT_FILE" &>/dev/null; then
3481
+ local _newly_passed
3482
+ _newly_passed=$(jq -r '.per_us_results | to_entries[] | select(.value == "pass") | .key' "$VERDICT_FILE" 2>/dev/null)
3483
+ for _pus in $(echo "$_newly_passed"); do
3484
+ if ! echo ",$VERIFIED_US," | grep -q ",$_pus,"; then
3485
+ if [[ -n "$VERIFIED_US" ]]; then
3486
+ VERIFIED_US="${VERIFIED_US},${_pus}"
3487
+ else
3488
+ VERIFIED_US="$_pus"
3489
+ fi
3490
+ log " Partial progress: $_pus passed (overall FAIL). Verified so far: $VERIFIED_US"
3491
+ fi
3492
+ done
3493
+ log_debug "[FLOW] iter=$ITERATION partial_progress prev=$_prev_verified now=$VERIFIED_US"
3494
+ fi
3495
+
3496
+ # Partial progress resets consecutive failures (progress was made)
3497
+ if [[ "$VERIFIED_US" != "$_prev_verified" ]]; then
3498
+ CONSECUTIVE_FAILURES=0
3499
+ log " Progress detected — consecutive_failures reset to 0"
3500
+ log_debug "[GOV] iter=$ITERATION consecutive_failures_reset=partial_progress"
3501
+ fi
3502
+
3503
+ (( CONSECUTIVE_FAILURES++ ))
3504
+ record_us_failure "${signal_us_id:-unknown}"
3505
+ check_model_upgrade "${signal_us_id:-unknown}"
3506
+
3507
+ # Mid-CB warning: alert at halfway point (governance §8 early warning)
3508
+ if (( CONSECUTIVE_FAILURES == EFFECTIVE_CB_THRESHOLD / 2 )); then
3509
+ log " [WARN] Mid-CB: $CONSECUTIVE_FAILURES/${EFFECTIVE_CB_THRESHOLD} consecutive failures — consider reviewing AC quality"
3510
+ log_debug "[GOV] iter=$ITERATION mid_cb_warning=true consecutive_failures=$CONSECUTIVE_FAILURES threshold=$EFFECTIVE_CB_THRESHOLD"
3511
+ fi
3512
+ local verdict_summary_fail
3513
+ verdict_summary_fail=$(jq -r '.summary // "no summary"' "$VERDICT_FILE" 2>/dev/null)
3514
+ log " Verifier FAILED (consecutive: $CONSECUTIVE_FAILURES). Building fix contract..."
3515
+
3516
+ # Extract issues from verdict for next Worker's fix contract
3517
+ local fix_contract="$LOGS_DIR/iter-$(printf '%03d' $ITERATION).fix-contract.md"
3518
+ {
3519
+ echo "# Fix Contract (from Verifier iteration $ITERATION)"
3520
+ echo ""
3521
+ if [[ -n "$VERIFIED_US" ]]; then
3522
+ echo "## Verified US (do NOT re-implement these)"
3523
+ echo "$VERIFIED_US" | tr ',' '\n' | sed 's/^/- /'
3524
+ echo ""
3525
+ echo "**Focus ONLY on unverified user stories. The above are already verified.**"
3526
+ echo ""
3527
+ fi
3528
+ echo "## Summary"
3529
+ echo "$verdict_summary_fail"
3530
+ echo ""
3531
+ echo "## Issues (from verify-verdict.json)"
3532
+ jq -r '.issues[]? | "- [\(.severity // "unknown")] \(.criterion // "?"): \(.description // "no description")\(if .fix_hint then " (hint: \(.fix_hint))" else "" end)"' "$VERDICT_FILE" 2>/dev/null || echo "- (no structured issues available)"
3533
+ echo ""
3534
+ echo "## Next Iteration Contract"
3535
+ jq -r '.next_iteration_contract // "Fix the issues listed above."' "$VERDICT_FILE" 2>/dev/null
3536
+ } | atomic_write "$fix_contract"
3537
+ log " Fix contract: $fix_contract"
3538
+ log_debug "[DECIDE] iter=$ITERATION phase=fix_loop trigger=$verdict consecutive_failures=$CONSECUTIVE_FAILURES fix_contract=$fix_contract"
3539
+
3540
+ # Circuit breaker: consecutive failures (with architecture escalation when at model ceiling)
3541
+ if (( CONSECUTIVE_FAILURES >= EFFECTIVE_CB_THRESHOLD )); then
3542
+ # For codex: use full model:reasoning string (WORKER_MODEL loses reasoning suffix after upgrade)
3543
+ _ceiling_model_str="$([[ "$WORKER_ENGINE" = "codex" ]] && echo "${WORKER_CODEX_MODEL}:${WORKER_CODEX_REASONING}" || echo "$WORKER_MODEL")"
3544
+ if (( _MODEL_UPGRADED )) && [[ -z "$(get_next_model "$_ceiling_model_str")" ]]; then
3545
+ log_debug "[GOV] iter=$ITERATION circuit_breaker=consecutive_failures detail=\"architecture escalation: Worker at ceiling (${WORKER_MODEL}), ${EFFECTIVE_CB_THRESHOLD} consecutive failures\""
3546
+ log_error "Circuit breaker: architecture escalation — Worker upgraded to ceiling (${WORKER_MODEL}), ${EFFECTIVE_CB_THRESHOLD} consecutive failures"
3547
+ write_blocked_sentinel "architecture escalation: Worker upgraded to ceiling model (${WORKER_MODEL}), ${EFFECTIVE_CB_THRESHOLD} consecutive verification failures" "" "repeat_axis"
3548
+ else
3549
+ log_debug "[GOV] iter=$ITERATION circuit_breaker=consecutive_failures detail=\"${EFFECTIVE_CB_THRESHOLD} consecutive verification failures\""
3550
+ log_error "Circuit breaker: ${EFFECTIVE_CB_THRESHOLD} consecutive verification failures"
3551
+ write_blocked_sentinel "${EFFECTIVE_CB_THRESHOLD} consecutive verification failures" "" "repeat_axis"
3552
+ fi
3553
+ update_status "blocked" "consecutive_failures"
3554
+ return 1
3555
+ fi
3556
+
3557
+ update_status "verifier" "fail"
3558
+ ;;
3559
+ request_info)
3560
+ # --- governance.md s7 step 7: request_info (degraded in tmux mode) ---
3561
+ local verdict_summary_ri
3562
+ verdict_summary_ri=$(jq -r '.summary // "no summary"' "$VERDICT_FILE" 2>/dev/null)
3563
+ log " Verifier requests info (degraded in tmux lean mode)."
3564
+ log " Questions: \"$verdict_summary_ri\""
3565
+ log " Treating as soft fail — Worker will see verdict in next iteration."
3566
+ update_status "verifier" "request_info"
3567
+ ;;
3568
+ blocked)
3569
+ local _verdict_cat
3570
+ _verdict_cat=$(_classify_cross_us_or_metric "$verdict_summary")
3571
+ write_blocked_sentinel "Verifier verdict: blocked - $verdict_summary" "" "$_verdict_cat"
3572
+ update_status "blocked" "verifier_blocked"
3573
+ return 1
3574
+ ;;
3575
+ *)
3576
+ log_error "Unknown verdict: $verdict"
3577
+ update_status "verifier" "unknown_verdict"
3578
+ ;;
3579
+ esac
3580
+ ;;
3581
+ blocked)
3582
+ # --- governance.md s7 step 6: blocked -> write sentinel ---
3583
+ local _signal_cat
3584
+ _signal_cat=$(_classify_cross_us_or_metric "$signal_summary")
3585
+ write_blocked_sentinel "Worker reported blocked: $signal_summary" "" "$_signal_cat"
3586
+ update_status "blocked" "worker_blocked"
3587
+ return 1
3588
+ ;;
3589
+ *)
3590
+ log_error "Unknown signal status: $signal_status"
3591
+ update_status "worker" "unknown_status"
3592
+ ;;
3593
+ esac
3594
+
3595
+ # --- step 7d: Archive iteration artifacts before cleanup ---
3596
+ archive_iter_artifacts "$ITERATION"
3597
+
3598
+ # --- AC5: Write per-iteration cost estimate ---
3599
+ write_cost_log "$ITERATION"
3600
+ write_campaign_jsonl "$ITERATION" "${signal_us_id:-unknown}" "${signal_status:-unknown}"
3601
+
3602
+ # --- governance.md s7 step 8: Write result log ---
3603
+ write_result_log "$ITERATION" "$signal_status"
3604
+
3605
+ # --- governance.md s7 step 8: Circuit breaker - stale context check ---
3606
+ if ! check_stale_context; then
3607
+ log_debug "[GOV] iter=$ITERATION circuit_breaker=stale_context detail=\"context unchanged for 3 consecutive iterations\""
3608
+ write_blocked_sentinel "Context unchanged for 3 consecutive iterations (stale)" "" "context_limit"
3609
+ update_status "blocked" "stale_context"
3610
+ return 1
3611
+ fi
3612
+
3613
+ # --- governance.md s7 step 8: Update status ---
3614
+ update_status "idle" "${signal_status:-unknown}"
3615
+ done
3616
+
3617
+ # Max iterations reached
3618
+ log "Max iterations ($MAX_ITER) reached."
3619
+ update_status "timeout" "max_iter"
3620
+ return 1
3621
+ }
3622
+
3623
+ # =============================================================================
3624
+ # Entry Point
3625
+ # =============================================================================
3626
+
3627
# --- CLI: parse --worker-model / --verifier-model and related flags ---
# These flags override env-var defaults (WORKER_ENGINE, WORKER_MODEL, etc.)
# Format: "model:reasoning" → codex engine; "model-name" → claude engine
#
# parse_model_flag's stdout is consumed below as space-separated words:
#   "<engine> <model> [<reasoning-or-effort>]"
# For the codex engine the last word becomes <ROLE>_CODEX_REASONING and the
# model is mirrored into <ROLE>_CODEX_MODEL; for any other engine a third
# word, when present, becomes <ROLE>_EFFORT. If parse_model_flag returns
# non-zero the script exits with status 1.
#
# --consensus, --consensus-model and --final-consensus-model take an optional
# value with a built-in default. The value is consumed only when the next
# argument is not itself a "--flag", so a following flag is never swallowed
# as the value; in that case the default is used and the following flag is
# handled on the next pass of the loop.
#
# Arguments that match no case arm are skipped without error.
_cli_i=1
while (( _cli_i <= $# )); do
  # One-argument look-ahead; empty when the current argument is the last one.
  _cli_peek="${@[$(( _cli_i + 1 ))]:-}"
  case "${@[$_cli_i]}" in
    --worker-model)
      (( _cli_i++ ))
      _cli_parsed=$(parse_model_flag "${@[$_cli_i]:-}" "worker") || exit 1
      WORKER_ENGINE="${_cli_parsed%% *}"      # first word
      _cli_rest="${_cli_parsed#* }"           # everything after the first word
      WORKER_MODEL="${_cli_rest%% *}"         # second word
      if [[ "$WORKER_ENGINE" = "codex" ]]; then
        WORKER_CODEX_MODEL="$WORKER_MODEL"
        WORKER_CODEX_REASONING="${_cli_rest##* }"   # last word
      elif [[ "$_cli_rest" == *" "* ]]; then
        # Non-codex engine with a third word: treat it as the effort level.
        WORKER_EFFORT="${_cli_rest##* }"
      fi
      ;;
    --verifier-model)
      (( _cli_i++ ))
      _cli_parsed=$(parse_model_flag "${@[$_cli_i]:-}" "verifier") || exit 1
      VERIFIER_ENGINE="${_cli_parsed%% *}"
      _cli_rest="${_cli_parsed#* }"
      VERIFIER_MODEL="${_cli_rest%% *}"
      if [[ "$VERIFIER_ENGINE" = "codex" ]]; then
        VERIFIER_CODEX_MODEL="$VERIFIER_MODEL"
        VERIFIER_CODEX_REASONING="${_cli_rest##* }"
      elif [[ "$_cli_rest" == *" "* ]]; then
        VERIFIER_EFFORT="${_cli_rest##* }"
      fi
      ;;
    --lock-worker-model)
      LOCK_WORKER_MODEL=1
      ;;
    --autonomous)
      AUTONOMOUS_MODE=1
      ;;
    --lane-strict)
      # P1-E opt-in: lane mtime audit escalates to BLOCKED instead of WARN.
      # See governance §7¾.
      LANE_MODE="strict"
      ;;
    --test-density-strict)
      # US-018 R6 P1-F opt-in: AC with < 3 tests fails init (exit 1) instead of WARN.
      # See governance §7f.
      TEST_DENSITY_MODE="strict"
      ;;
    --final-verifier-model)
      (( _cli_i++ ))
      _cli_parsed=$(parse_model_flag "${@[$_cli_i]:-}" "final-verifier") || exit 1
      FINAL_VERIFIER_ENGINE="${_cli_parsed%% *}"
      _cli_rest="${_cli_parsed#* }"
      FINAL_VERIFIER_MODEL="${_cli_rest%% *}"
      if [[ "$FINAL_VERIFIER_ENGINE" = "codex" ]]; then
        FINAL_VERIFIER_CODEX_MODEL="$FINAL_VERIFIER_MODEL"
        FINAL_VERIFIER_CODEX_REASONING="${_cli_rest##* }"
      elif [[ "$_cli_rest" == *" "* ]]; then
        FINAL_VERIFIER_EFFORT="${_cli_rest##* }"
      fi
      ;;
    --consensus)
      if [[ "$_cli_peek" == --* ]]; then
        # Next token is another flag: keep the default, do not consume it.
        CONSENSUS_MODE="off"
      else
        (( _cli_i++ ))
        CONSENSUS_MODE="${_cli_peek:-off}"
      fi
      ;;
    --consensus-model)
      if [[ "$_cli_peek" == --* ]]; then
        CONSENSUS_MODEL="gpt-5.5:medium"
      else
        (( _cli_i++ ))
        CONSENSUS_MODEL="${_cli_peek:-gpt-5.5:medium}"
      fi
      ;;
    --final-consensus-model)
      if [[ "$_cli_peek" == --* ]]; then
        FINAL_CONSENSUS_MODEL="gpt-5.5:high"
      else
        (( _cli_i++ ))
        FINAL_CONSENSUS_MODEL="${_cli_peek:-gpt-5.5:high}"
      fi
      ;;
    --final-consensus)
      # Legacy: map to new --consensus final-only
      CONSENSUS_MODE="final-only"
      ;;
    --verify-consensus)
      # Legacy: map to new --consensus all
      CONSENSUS_MODE="all"
      ;;
  esac
  (( _cli_i++ ))
done
# Drop the parser's scratch variables so they do not leak into the rest of the script.
unset _cli_i _cli_parsed _cli_rest _cli_peek
3712
+
3713
# Require tmux — tmux mode only works inside an active tmux session.
# An empty or unset $TMUX is taken to mean "not inside tmux": print recovery
# instructions and abort with exit status 1.
# The message is a diagnostic, so it is written to stderr rather than stdout.
if [[ -z "${TMUX:-}" ]]; then
  {
    echo "ERROR: tmux mode requires running inside a tmux session."
    echo ""
    echo " Start tmux first, then retry:"
    echo " tmux"
    echo " LOOP_NAME=$SLUG $0"
    echo ""
    echo " Or use Agent() mode instead (no tmux needed):"
    echo " /rlp-desk run $SLUG"
  } >&2
  exit 1
fi
3725
+