@ai-dev-methodologies/rlp-desk 0.18.0 → 0.18.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -11,6 +11,40 @@ For pre-v0.15.4 versions, refer to `git log` and individual GitHub release notes
11
11
  - Post-v0.15.6: remove `RLP_LIFECYCLE_METRICS` flag entirely (per plan v3 ADR follow-ups).
12
12
  - Phase D.1 (handoff documents) + Phase D.2 (per-stage agent role specialization) — both deferred per `docs/plans/v0.15.4-release-runbook.md` §7.6.
13
13
 
14
+ ## [0.18.2] — 2026-06-25
15
+
16
+ Five leader-resilience fixes (per-us, consensus, and config-validation paths), each
17
+ validated by a real-LLM dogfood. Consensus is opt-in; default per-us campaigns benefit
18
+ from the final-verify and config-validation fixes.
19
+
20
+ ### Fixed
21
+ - Final verification is resilient to verifier non-determinism: a user story that already
22
+ passed its per-US check is re-verified (up to a small bound) on a fail verdict, so a
23
+ flaky false-fail must reproduce before it charges a fix-loop failure. A single
24
+ non-deterministic false-fail can no longer block a complete, correct campaign.
25
+ - The `claude` rate-limit banner ("API Error: … temporarily limiting requests … · Rate
26
+ limited") is now recognized as a transient API condition and routed to the bounded
27
+ backoff, instead of being misclassified as a frozen-pane deadlock.
28
+ - Numeric configuration knobs (e.g. `--max-iter`, `--cb-threshold`, timeouts) are now
29
+ validated: a non-integer or out-of-range value falls back to its default with a warning,
30
+ instead of silently mis-evaluating (a bad `--max-iter` previously could make a campaign
31
+ run zero iterations).
32
+ - Leader auto-commit recovery no longer blocks a campaign whose work the worker already
33
+ committed (a commit-timing race previously surfaced as "nothing to commit" → blocked).
34
+ - `--consensus final-only` now actually runs at the final verification in the default
35
+ per-us verify mode (it was previously bypassed by the sequential final-verify path, so
36
+ the recommended consensus configuration was a silent no-op in the default mode).
37
+
38
+ ## [0.18.1] — 2026-06-25
39
+
40
+ ### Fixed
41
+ - End-of-campaign resilience: when the last user story passes its per-US verify, the
42
+ leader now runs the final verification directly instead of dispatching one more
43
+ worker iteration solely to hand off to the final verify. That extra round-trip was a
44
+ fragile dependency on a healthy worker at the very end of a campaign (a worker stall
45
+ or API rate-limit at that moment could block a campaign whose work was already
46
+ complete and verified). Per-US campaigns now finalize without it.
47
+
14
48
  ## [0.18.0] — 2026-06-24
15
49
 
16
50
  **Leader hardening: campaigns that complete.** The `--mode tmux` leader's decision
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ai-dev-methodologies/rlp-desk",
3
- "version": "0.18.0",
3
+ "version": "0.18.2",
4
4
  "description": "Fresh-context iterative loops for Claude Code — autonomous task completion with independent verification",
5
5
  "scripts": {
6
6
  "postinstall": "node scripts/postinstall.js",
@@ -3,6 +3,31 @@ set -uo pipefail
3
3
  # NOTE: We use set -u (undefined var check) and pipefail, but NOT set -e
4
4
  # because the main loop uses explicit error checks throughout.
5
5
 
6
+ # D-19: validate an env-overridable INTEGER knob. A non-integer value (operator
7
+ # typo, or a bad CLI arg like `--max-iter abc` threaded into the env) otherwise
8
+ # mis-evaluates under `set -u` inside (( )) arithmetic — e.g. a non-integer
9
+ # MAX_ITER makes the main-loop bound error so the campaign silently runs ZERO
10
+ # iterations; a non-integer CB_THRESHOLD breaks the circuit breaker. The `<->`
11
+ # integer glob is checked FIRST (and short-circuits) so the arithmetic never runs
12
+ # on a non-integer. A malformed / below-min / above-max value → the default.
13
+ _validate_int_knob() {
14
+ local _name="$1" _default="$2" _min="${3:-0}" _max="${4:-0}"
15
+ local _val="${(P)_name}"
16
+ local _bad=0
17
+ if ! [[ "$_val" == <-> ]]; then
18
+ _bad=1
19
+ elif (( _val < _min )); then
20
+ _bad=1
21
+ elif (( _max > 0 && _val > _max )); then
22
+ _bad=1
23
+ fi
24
+ if (( _bad )); then
25
+ local _range="min=$_min"; (( _max > 0 )) && _range="$_range, max=$_max"
26
+ print -r -- "WARNING: $_name='$_val' is not a valid integer ($_range) — using default $_default" >&2
27
+ eval "$_name=$_default"
28
+ fi
29
+ }
30
+
6
31
  # =============================================================================
7
32
  # Ralph Desk Tmux Runner
8
33
  #
@@ -56,6 +81,14 @@ HEARTBEAT_STALE_THRESHOLD="${HEARTBEAT_STALE_THRESHOLD:-120}"
56
81
  MAX_RESTARTS="${MAX_RESTARTS:-3}"
57
82
  IDLE_NUDGE_THRESHOLD="${IDLE_NUDGE_THRESHOLD:-30}"
58
83
  MAX_NUDGES="${MAX_NUDGES:-3}"
84
+ # D-19: validate the numeric knobs above (set -u + (( )) arithmetic safety).
85
+ _validate_int_knob MAX_ITER 20 1
86
+ _validate_int_knob POLL_INTERVAL 5 1
87
+ _validate_int_knob ITER_TIMEOUT 600 1
88
+ _validate_int_knob HEARTBEAT_STALE_THRESHOLD 120 1
89
+ _validate_int_knob MAX_RESTARTS 3 0
90
+ _validate_int_knob IDLE_NUDGE_THRESHOLD 30 1
91
+ _validate_int_knob MAX_NUDGES 3 0
59
92
  WITH_SELF_VERIFICATION="${WITH_SELF_VERIFICATION:-0}"
60
93
  WITH_SELF_VERIFICATION_REQUESTED="$WITH_SELF_VERIFICATION" # preserves original user intent for traceability (governance §1f)
61
94
  SV_SKIPPED_REASON="" # set when SV is disabled despite user request
@@ -88,6 +121,7 @@ TEST_DENSITY_MODE="${TEST_DENSITY_MODE:-warn}"
88
121
  # .sisyphus/mission-abort.json and exits non-zero so contract defects don't
89
122
  # silently loop. infra_failure category and the very first iteration are exempt.
90
123
  BLOCK_CB_THRESHOLD="${BLOCK_CB_THRESHOLD:-3}"
124
+ _validate_int_knob BLOCK_CB_THRESHOLD 3 1 # D-19
91
125
  CONSECUTIVE_BLOCKS=0
92
126
  LAST_BLOCK_REASON=""
93
127
 
@@ -229,6 +263,18 @@ FINAL_VERIFIER_ENGINE="${FINAL_VERIFIER_ENGINE:-claude}"
229
263
  WORKER_EFFORT="${WORKER_EFFORT:-}"
230
264
  VERIFIER_EFFORT="${VERIFIER_EFFORT:-}"
231
265
  FINAL_VERIFIER_EFFORT="${FINAL_VERIFIER_EFFORT:-}"
266
+ # D-18: max final-verify attempts for a US that ALREADY passed per-US. A verifier
267
+ # fail verdict on already-per-US-passed work must REPRODUCE across all attempts
268
+ # (first pass wins) before it charges a fix-loop failure — guards against verifier
269
+ # non-determinism defeating a complete, correct campaign. A genuinely-regressed US
270
+ # (or one never per-US-passed) still fails on the first attempt.
271
+ FINAL_VERIFY_MAX_ATTEMPTS="${FINAL_VERIFY_MAX_ATTEMPTS:-3}"
272
+ # D-18/D-19: a non-integer value ("abc") would mis-evaluate under set -u in the
273
+ # (( )) retry-loop arithmetic — skipping the loop and silently FALSE-FAILING a US
274
+ # — and an unbounded value would be ruinously expensive. Validate to an integer
275
+ # in 1..10 via the shared _validate_int_knob helper (D-19 generalized this
276
+ # per-knob fix into one validator used by every numeric knob).
277
+ _validate_int_knob FINAL_VERIFY_MAX_ATTEMPTS 3 1 10
232
278
 
233
279
  # Auto-detect engine from model format for env var path (CLI path uses parse_model_flag)
234
280
  _auto_detect_engine WORKER_MODEL WORKER_ENGINE WORKER_CODEX_MODEL WORKER_CODEX_REASONING WORKER_EFFORT
@@ -260,6 +306,7 @@ elif [[ "${FINAL_CONSENSUS:-0}" = "1" ]]; then
260
306
  fi
261
307
  CONSENSUS_SCOPE="${CONSENSUS_SCOPE:-${CONSENSUS_MODE}}"
262
308
  CB_THRESHOLD="${CB_THRESHOLD:-6}" # consecutive failures before BLOCKED (default: 6)
309
+ _validate_int_knob CB_THRESHOLD 6 1 # D-19: must be valid before the (( *2 )) below
263
310
  # Effective CB threshold: doubled when consensus mode active
264
311
  if [[ "$CONSENSUS_MODE" != "off" ]]; then
265
312
  EFFECTIVE_CB_THRESHOLD=$(( CB_THRESHOLD * 2 ))
@@ -268,6 +315,8 @@ else
268
315
  fi
269
316
  _API_MAX_RETRIES="${_API_MAX_RETRIES:-5}"
270
317
  _API_RETRY_INTERVAL_S="${_API_RETRY_INTERVAL_S:-30}"
318
+ _validate_int_knob _API_MAX_RETRIES 5 1 # D-19
319
+ _validate_int_knob _API_RETRY_INTERVAL_S 30 1 # D-19
271
320
 
272
321
  # --- Derived Paths ---
273
322
  DESK="$ROOT/${RLP_DESK_RUNTIME_DIR:-.rlp-desk}"
@@ -350,6 +399,9 @@ BASELINE_COMMIT="" # git HEAD at campaign start (captured before loop)
350
399
  CAMPAIGN_REPORT_GENERATED=0 # guard against double-generation in cleanup trap
351
400
  SV_REPORT_GENERATED=0 # guard against double-generation in generate_sv_report
352
401
  VERIFIED_US="" # comma-separated list of verified US IDs (per-us mode)
402
+ _FINALIZE_PENDING=0 # D-16: armed when the last per-US pass completes coverage;
403
+ # the next loop top synthesizes an ALL verify signal and
404
+ # skips the (fragile) worker round-trip to emit it.
353
405
  CONSENSUS_ROUND=0 # current consensus round for current US
354
406
  US_LIST="" # comma-separated US IDs from PRD (per-us mode)
355
407
  LOCKFILE_ACQUIRED=0
@@ -844,7 +896,25 @@ _bug8_check_synth_allowed() {
844
896
  log " Bug #8 F-8 recovery: done-claim + Worker's uncommitted tracked changes — auto-committing $us_id work (files: $_bug8_first5)."
845
897
  log_debug "[GOV] iter=$iter bug8=recover_autocommit us_id=$us_id files='$_bug8_first5'"
846
898
  local -a _bug8_add=("${(@f)_bug8_worker_files}")
847
- if git -C "$ROOT" add -- "${_bug8_add[@]}" && git -C "$ROOT" commit -q -m "chore(leader-recovery): commit Worker's uncommitted $us_id changes (Bug #8 F-8)"; then
899
+ # D-20 (codex LOW): fail-safe on an empty file list. The upstream
900
+ # `[[ -z "$_bug8_worker_files" ]]` guard already makes this unreachable, but
901
+ # never let an empty array turn `git diff --quiet HEAD --` into a whole-tree
902
+ # check (which could falsely read "already committed" → false PASS). BLOCK.
903
+ if (( ${#_bug8_add} == 0 )); then
904
+ log_error " Bug #8: empty worker-file list at auto-commit (unexpected) — refusing synthesis."
905
+ write_blocked_sentinel "worker_incomplete_uncommitted: empty file list at auto-commit" "$us_id" "metric_failure"
906
+ return 1
907
+ fi
908
+ if git -C "$ROOT" diff --quiet HEAD -- "${_bug8_add[@]}" 2>/dev/null; then
909
+ # D-20: the Worker committed these files itself in the window between the
910
+ # dirty-detection above and now (a reap/commit race) — the working tree is
911
+ # already clean vs HEAD for them, i.e. the work IS committed. The old code
912
+ # ran `git add … && git commit`, which exited non-zero ("nothing to commit")
913
+ # and BLOCKED a correct, fully-committed campaign. Treat "already committed"
914
+ # as success: proceed to synthesis (the Verifier still gates correctness).
915
+ log " Bug #8 F-8 (D-20): Worker files already committed (commit race) — nothing to auto-commit; proceeding."
916
+ log_debug "[GOV] iter=$iter bug8=autocommit_noop_already_committed us_id=$us_id files='$_bug8_first5'"
917
+ elif git -C "$ROOT" add -- "${_bug8_add[@]}" && git -C "$ROOT" commit -q -m "chore(leader-recovery): commit Worker's uncommitted $us_id changes (Bug #8 F-8)"; then
848
918
  log " Leader-recovery auto-commit OK (Worker files only) — Verifier will gate correctness."
849
919
  else
850
920
  log_error " Bug #8: leader-recovery auto-commit failed. Refusing synthesis. files: $_bug8_first5"
@@ -1431,6 +1501,8 @@ typeset -gA PANE_PROMPT_STUCK_SINCE
1431
1501
  typeset -gA PANE_DISMISS_FAILED_COUNT
1432
1502
  PROMPT_STALL_TIMEOUT="${PROMPT_STALL_TIMEOUT:-300}" # 5 min default
1433
1503
  PROMPT_DISMISS_FAIL_LIMIT="${PROMPT_DISMISS_FAIL_LIMIT:-20}" # ~100s of fruitless dismiss attempts
1504
+ _validate_int_knob PROMPT_STALL_TIMEOUT 300 1 # D-19
1505
+ _validate_int_knob PROMPT_DISMISS_FAIL_LIMIT 20 1 # D-19
1434
1506
 
1435
1507
  # v5.7 §4.17: generic no-progress timeout (codex Critic HIGH — closes the gap
1436
1508
  # where an undetected prompt or alive-but-frozen Worker bypasses Layer 4).
@@ -1438,6 +1510,7 @@ PROMPT_DISMISS_FAIL_LIMIT="${PROMPT_DISMISS_FAIL_LIMIT:-20}" # ~100s of fruitle
1438
1510
  # seconds AND signal file still missing, write BLOCKED `infra_failure` reason
1439
1511
  # `worker_no_progress` so silent infinite-wait is impossible.
1440
1512
  PROGRESS_NO_CHANGE_TIMEOUT="${PROGRESS_NO_CHANGE_TIMEOUT:-600}" # 10 min default
1513
+ _validate_int_knob PROGRESS_NO_CHANGE_TIMEOUT 600 1 # D-19
1441
1514
  typeset -gA PANE_LAST_CHANGE_TS # epoch when content last changed
1442
1515
  typeset -gA PANE_LAST_CONTENT_FOR_PROGRESS # captured content for diff
1443
1516
 
@@ -1446,6 +1519,7 @@ typeset -gA PANE_LAST_CONTENT_FOR_PROGRESS # captured content for diff
1446
1519
  # CODEX_IDLE_GRACE_S (default 120s) before BLOCK. Per-pane bookkeeping to
1447
1520
  # avoid granting it repeatedly. Bug Report #3 (BOS 2026-05-04).
1448
1521
  CODEX_IDLE_GRACE_S="${CODEX_IDLE_GRACE_S:-120}"
1522
+ _validate_int_knob CODEX_IDLE_GRACE_S 120 1 # D-19
1449
1523
  typeset -gA PANE_CODEX_IDLE_GRACED
1450
1524
  # v0.14.2: per-verifier-pane trace flag — log the verdict-lookup outcome
1451
1525
  # exactly once per byte-stasis transition. Bug Report #4 (BOS 2026-05-05).
@@ -2517,9 +2591,22 @@ poll_for_signal() {
2517
2591
  if [[ -n "$pane_output_for_retry" ]] &&
2518
2592
  ( echo "$pane_output_for_retry" | grep -qiE '(^|[^[:digit:]])500([^[:digit:]]|$)' \
2519
2593
  || echo "$pane_output_for_retry" | grep -qiE '(^|[^[:digit:]])529([^[:digit:]]|$)' \
2594
+ || echo "$pane_output_for_retry" | grep -qiE '(^|[^[:digit:]])429([^[:digit:]]|$)' \
2520
2595
  || echo "$pane_output_for_retry" | grep -qi 'overloaded' \
2521
2596
  || echo "$pane_output_for_retry" | grep -qi 'too many requests' \
2522
- || echo "$pane_output_for_retry" | grep -qi 'service unavailable' ); then
2597
+ || echo "$pane_output_for_retry" | grep -qi 'service unavailable' \
2598
+ || echo "$pane_output_for_retry" | grep -qiE 'api error.*temporarily limiting requests' ); then
2599
+ # D-17a: the last pattern catches the claude TUI rate-limit banner
2600
+ # ("API Error: Server is temporarily limiting requests (not your usage
2601
+ # limit) · Rate limited") that previously fell through to the 600s
2602
+ # frozen-pane BLOCK with a misleading "deadlock" reason. It requires BOTH
2603
+ # the "API Error" banner prefix AND the distinctive multi-word phrase
2604
+ # "temporarily limiting requests" on the SAME line (codex MEDIUM): a Worker
2605
+ # implementing a rate-limiter feature, quoting the phrase, or merely
2606
+ # discussing API rate-limit handling does NOT false-trigger backoff — only
2607
+ # the actual error banner does. Routes to the bounded API backoff below
2608
+ # (5×30s) → recovers a transient limit, else BLOCKs as infra (recoverable),
2609
+ # not as a misleading frozen-pane deadlock.
2523
2610
  is_api_text_retry=1
2524
2611
  fi
2525
2612
 
@@ -2818,6 +2905,105 @@ run_single_verifier() {
2818
2905
  # --- Sequential final verify: run per-US scoped verifiers instead of one big ALL verify ---
2819
2906
  # Returns 0 if all US pass + integration check pass, 1 if any US fails, 2 if integration fails.
2820
2907
  # Sets FAILED_US global on failure.
2908
# D-16: succeed (status 0) only when every ID in the comma-separated US_LIST
# is also present, as a whole entry, in the comma-separated VERIFIED_US.
# Used to arm leader-driven finalize after the last per-US pass.
#
# Globals: US_LIST (read), VERIFIED_US (read)
# Returns: 0 when all listed stories are verified; 1 when US_LIST is empty or
#          at least one listed story is missing from VERIFIED_US.
_all_us_verified() {
  [[ -n "$US_LIST" ]] || return 1
  # Wrap the verified list in commas so each ID can be matched as ",ID,"
  # without a prefix (US-001 vs US-0010) producing a false hit.
  local _us _verified=",$VERIFIED_US,"
  # printf '%s' rather than echo: zsh's echo interprets backslash escapes.
  for _us in $(printf '%s' "$US_LIST" | tr ',' ' '); do
    # Quoted pattern segment => literal comparison. An ID containing regex
    # metacharacters (e.g. the dot in "US-1.1") can only match itself.
    [[ "$_verified" == *",$_us,"* ]] || return 1
  done
  return 0
}
2918
+
2919
# D-18 helper: run ONE final-verify pass for a single user story.
#
# Arguments: $1 - user-story ID to verify
#            $2 - iteration number (used for the trigger and prompt file name)
# Globals:   reads  SIGNAL_FILE, VERDICT_FILE, LOGS_DIR, SESSION_CONFIG,
#                   VERIFIER_HEARTBEAT, CODEX_BIN, FINAL_VERIFIER_ENGINE,
#                   FINAL_VERIFIER_MODEL, FINAL_VERIFIER_EFFORT,
#                   FINAL_VERIFIER_CODEX_MODEL, FINAL_VERIFIER_CODEX_REASONING
#            writes VERIFIER_PANE (after a pane replacement), the signal file,
#                   and removes/locks/stamps the verdict file
# Returns:   0 = pass verdict
#            1 = fail verdict (anything other than "pass", including an
#                unreadable verdict)
#            2 = infra-terminal (launch/poll hard fail — sentinel handling is
#                already done by the poll)
# Extracted from run_sequential_final_verify so the caller can re-verify a
# per-US-passed US on a flake without duplicating the dispatch/poll logic.
# Does NOT set FAILED_US (the caller owns that).
_final_verify_one_us() {
  local us="$1" iter="$2"

  # Scope the verifier to this US by overwriting the signal file.
  # The prior signal content is deliberately not captured: nothing in this
  # function restores it.
  # NOTE(review): if a caller needs the pre-override signal back, it must
  # save and restore it itself — confirm against run_sequential_final_verify.
  # printf (not echo) so the ID is emitted verbatim; zsh's echo interprets
  # backslash escapes.
  printf '{"status":"verify","us_id":"%s","summary":"sequential final verify"}\n' "$us" | atomic_write "$SIGNAL_FILE"

  # Write scoped verifier trigger
  write_verifier_trigger "$iter"
  local verifier_prompt="$LOGS_DIR/iter-$(printf '%03d' "$iter").verifier-prompt.md"

  # Clean verifier pane: if an agent TUI is still the foreground command,
  # interrupt it and ask it to exit before reusing the pane.
  local verifier_cmd
  verifier_cmd=$(tmux display-message -p -t "$VERIFIER_PANE" '#{pane_current_command}' 2>/dev/null)
  if [[ "$verifier_cmd" == "node" || "$verifier_cmd" == "claude" || "$verifier_cmd" == "codex" ]]; then
    tmux send-keys -t "$VERIFIER_PANE" C-c 2>/dev/null; sleep 0.5
    tmux send-keys -t "$VERIFIER_PANE" "/exit" C-m 2>/dev/null; sleep 2
  fi
  # Best effort: a pane that never reports ready is still dispatched to.
  wait_for_pane_ready "$VERIFIER_PANE" 10 2>/dev/null || true

  # Launch verifier. D-1: the FINAL (ALL) verify uses FINAL_VERIFIER_* (the
  # "final strict" knob — a configured stronger model, e.g. opus, for the final
  # gate), NOT the lighter per-US VERIFIER_*. This is the configured-final-model
  # distinction, distinct from the removed per-iteration verifier auto-upgrade.
  local verifier_launch
  if [[ "$FINAL_VERIFIER_ENGINE" = "codex" ]]; then
    verifier_launch="${CODEX_BIN:-codex} -m $FINAL_VERIFIER_CODEX_MODEL -c model_reasoning_effort=\"$FINAL_VERIFIER_CODEX_REASONING\" -c mcp_servers='{}' --disable plugins --dangerously-bypass-approvals-and-sandbox"
    # NOTE(review): the codex launch status is not checked here (the claude
    # branch below is) — presumably a failed launch surfaces through the poll;
    # confirm.
    launch_verifier_codex "$VERIFIER_PANE" "$verifier_prompt" "$iter" "$verifier_launch"
  else
    verifier_launch="$(build_claude_cmd tui "$FINAL_VERIFIER_MODEL" "" "" "$FINAL_VERIFIER_EFFORT")"
    launch_verifier_claude "$VERIFIER_PANE" "$verifier_prompt" "$iter" "$verifier_launch" || {
      log_error "Failed to launch final verifier for $us"
      return 2
    }
  fi

  # Poll for verdict. D-4: distinguish rc==2 (hard-fail, sentinel already
  # written → terminal) from rc==1 (transient pane race/timeout) and give ONE
  # replace-pane + re-dispatch retry before failing the US — the F-10 retry
  # parity the per-US main verifier site has but this final-verify path lacked
  # (a single transient poll miss falsely failed a US at the most expensive
  # end-of-campaign moment, charging a bogus consecutive failure).
  rm -f "$VERDICT_FILE"   # drop any stale verdict so the poll waits for a fresh one
  local poll_rc=0
  poll_for_signal "$VERDICT_FILE" "$VERIFIER_HEARTBEAT" "$VERIFIER_PANE" "$verifier_launch" "Verifier-final" || poll_rc=$?
  if (( poll_rc == 2 )); then
    log_error "Verifier hard-fail (rc=2, sentinel written) for $us in final verify"
    return 2
  fi
  if (( poll_rc == 1 )); then
    log "  Verifier-final transient poll fail for $us — replacing pane + retrying once (D-4)"
    replace_worker_pane "$VERIFIER_PANE" "verifier"
    # The replacement records the new pane id in the session config; re-read it.
    VERIFIER_PANE=$(jq -r '.panes.verifier' "$SESSION_CONFIG")
    if [[ "$FINAL_VERIFIER_ENGINE" = "codex" ]]; then
      launch_verifier_codex "$VERIFIER_PANE" "$verifier_prompt" "$iter" "$verifier_launch"
    else
      launch_verifier_claude "$VERIFIER_PANE" "$verifier_prompt" "$iter" "$verifier_launch" || return 2
    fi
    rm -f "$VERDICT_FILE"; poll_rc=0
    poll_for_signal "$VERDICT_FILE" "$VERIFIER_HEARTBEAT" "$VERIFIER_PANE" "$verifier_launch" "Verifier-final" || poll_rc=$?
    if (( poll_rc != 0 )); then
      # Any failure on the single retry is terminal for this pass.
      log_error "Verifier poll failed for $us after replace+retry (rc=$poll_rc)"
      return 2
    fi
  fi

  # Bug #7 Fix-Q/R: reap verifier pane between per-US final verifications so
  # the previous codex/claude TUI cannot continue running while the next per-
  # US verifier dispatch reuses the same pane.
  _kill_pane_process "$VERIFIER_PANE" "verifier-final"
  _lock_sentinel "$VERDICT_FILE"
  # PR-0b-narrow: stamp leader handshake ack on the verdict (audit-only).
  _stamp_ack_field "$VERDICT_FILE"

  # Read verdict: only the literal string "pass" counts as success; a missing
  # or unparsable verdict leaves $verdict non-"pass" and is reported as fail.
  local verdict
  verdict=$(jq -r '.verdict' "$VERDICT_FILE" 2>/dev/null)
  [[ "$verdict" == "pass" ]] && return 0
  return 1
}
3006
+
2821
3007
  run_sequential_final_verify() {
2822
3008
  local iter="$1"
2823
3009
  FAILED_US=""
@@ -2828,91 +3014,38 @@ run_sequential_final_verify() {
2828
3014
  for us in $(echo "$US_LIST" | tr ',' ' '); do
2829
3015
  log " Final verify: checking $us..."
2830
3016
 
2831
- # Temporarily override signal file to scope verifier to this US
2832
- local orig_signal
2833
- orig_signal=$(cat "$SIGNAL_FILE" 2>/dev/null)
2834
- echo "{\"status\":\"verify\",\"us_id\":\"$us\",\"summary\":\"sequential final verify\"}" | atomic_write "$SIGNAL_FILE"
2835
-
2836
- # Write scoped verifier trigger
2837
- write_verifier_trigger "$iter"
2838
- local verifier_prompt="$LOGS_DIR/iter-$(printf '%03d' $iter).verifier-prompt.md"
2839
-
2840
- # Clean verifier pane
2841
- local verifier_cmd
2842
- verifier_cmd=$(tmux display-message -p -t "$VERIFIER_PANE" '#{pane_current_command}' 2>/dev/null)
2843
- if [[ "$verifier_cmd" == "node" || "$verifier_cmd" == "claude" || "$verifier_cmd" == "codex" ]]; then
2844
- tmux send-keys -t "$VERIFIER_PANE" C-c 2>/dev/null; sleep 0.5
2845
- tmux send-keys -t "$VERIFIER_PANE" "/exit" C-m 2>/dev/null; sleep 2
2846
- fi
2847
- wait_for_pane_ready "$VERIFIER_PANE" 10 2>/dev/null || true
2848
-
2849
- # Launch verifier. D-1: the FINAL (ALL) verify uses FINAL_VERIFIER_* (the
2850
- # "final 엄격" knob — a configured stronger model, e.g. opus, for the final
2851
- # gate), NOT the lighter per-US VERIFIER_*. This is the configured-final-model
2852
- # distinction, distinct from the removed per-iteration verifier auto-upgrade.
2853
- local verifier_launch
2854
- if [[ "$FINAL_VERIFIER_ENGINE" = "codex" ]]; then
2855
- verifier_launch="${CODEX_BIN:-codex} -m $FINAL_VERIFIER_CODEX_MODEL -c model_reasoning_effort=\"$FINAL_VERIFIER_CODEX_REASONING\" -c mcp_servers='{}' --disable plugins --dangerously-bypass-approvals-and-sandbox"
2856
- launch_verifier_codex "$VERIFIER_PANE" "$verifier_prompt" "$iter" "$verifier_launch"
2857
- else
2858
- verifier_launch="$(build_claude_cmd tui "$FINAL_VERIFIER_MODEL" "" "" "$FINAL_VERIFIER_EFFORT")"
2859
- launch_verifier_claude "$VERIFIER_PANE" "$verifier_prompt" "$iter" "$verifier_launch" || {
2860
- log_error "Failed to launch final verifier for $us"
2861
- FAILED_US="$us"
2862
- return 1
2863
- }
2864
- fi
2865
-
2866
- # Poll for verdict. D-4: distinguish rc==2 (hard-fail, sentinel already
2867
- # written → terminal) from rc==1 (transient pane race/timeout) and give ONE
2868
- # replace-pane + re-dispatch retry before failing the US — the F-10 retry
2869
- # parity the per-US main verifier site has but this final-verify path lacked
2870
- # (a single transient poll miss falsely failed a US at the most expensive
2871
- # end-of-campaign moment, charging a bogus consecutive failure).
2872
- rm -f "$VERDICT_FILE"
2873
- local poll_rc=0
2874
- poll_for_signal "$VERDICT_FILE" "$VERIFIER_HEARTBEAT" "$VERIFIER_PANE" "$verifier_launch" "Verifier-final" || poll_rc=$?
2875
- if (( poll_rc == 2 )); then
2876
- log_error "Verifier hard-fail (rc=2, sentinel written) for $us in final verify"
2877
- FAILED_US="$us"
2878
- return 1
2879
- fi
2880
- if (( poll_rc == 1 )); then
2881
- log " Verifier-final transient poll fail for $us — replacing pane + retrying once (D-4)"
2882
- replace_worker_pane "$VERIFIER_PANE" "verifier"
2883
- VERIFIER_PANE=$(jq -r '.panes.verifier' "$SESSION_CONFIG")
2884
- if [[ "$FINAL_VERIFIER_ENGINE" = "codex" ]]; then
2885
- launch_verifier_codex "$VERIFIER_PANE" "$verifier_prompt" "$iter" "$verifier_launch"
2886
- else
2887
- launch_verifier_claude "$VERIFIER_PANE" "$verifier_prompt" "$iter" "$verifier_launch" || { FAILED_US="$us"; return 1; }
2888
- fi
2889
- rm -f "$VERDICT_FILE"; poll_rc=0
2890
- poll_for_signal "$VERDICT_FILE" "$VERIFIER_HEARTBEAT" "$VERIFIER_PANE" "$verifier_launch" "Verifier-final" || poll_rc=$?
2891
- if (( poll_rc != 0 )); then
2892
- log_error "Verifier poll failed for $us after replace+retry (rc=$poll_rc)"
3017
+ # D-18: a US that already passed per-US gets up to FINAL_VERIFY_MAX_ATTEMPTS
3018
+ # final-verify attempts on a FAIL verdict (first pass wins). A verifier
3019
+ # false-fail (non-determinism) on already-correct, per-US-passed work must
3020
+ # REPRODUCE across all attempts before it charges a fix-loop failure — else a
3021
+ # single flake defeats a complete, correct campaign (D-16 dogfood: codex
3022
+ # false-failed pytest-36/36 work → fix-loop churn → stale BLOCK). A US that
3023
+ # never passed per-US (or a genuine regression) fails on the first attempt.
3024
+ local _fv_max=1
3025
+ # FINAL_VERIFY_MAX_ATTEMPTS is validated to 1..10 at declaration, so no clamp here.
3026
+ if echo ",$VERIFIED_US," | grep -q ",$us,"; then _fv_max=$FINAL_VERIFY_MAX_ATTEMPTS; fi
3027
+ local _fv_attempt=0 _fv_rc=1
3028
+ while (( _fv_attempt < _fv_max )); do
3029
+ (( _fv_attempt++ ))
3030
+ _final_verify_one_us "$us" "$iter"; _fv_rc=$?
3031
+ if (( _fv_rc == 2 )); then # infra-terminal (launch/poll hard fail) — no retry
2893
3032
  FAILED_US="$us"
3033
+ log " Sequential final verify FAILED at $us (infra)"
3034
+ log_debug "[FLOW] iter=$iter phase=sequential_final_verify failed_us=$us reason=infra attempts=$_fv_attempt"
2894
3035
  return 1
2895
3036
  fi
2896
- fi
2897
-
2898
- # Bug #7 Fix-Q/R: reap verifier pane between per-US final verifications so
2899
- # the previous codex/claude TUI cannot continue running while the next per-
2900
- # US verifier dispatch reuses the same pane.
2901
- _kill_pane_process "$VERIFIER_PANE" "verifier-final"
2902
- _lock_sentinel "$VERDICT_FILE"
2903
- # PR-0b-narrow: stamp leader handshake ack on the verdict (audit-only).
2904
- _stamp_ack_field "$VERDICT_FILE"
2905
-
2906
- # Check verdict
2907
- local verdict
2908
- verdict=$(jq -r '.verdict' "$VERDICT_FILE" 2>/dev/null)
2909
- if [[ "$verdict" != "pass" ]]; then
3037
+ (( _fv_rc == 0 )) && break # pass verdict
3038
+ log " Sequential final verify: $us verdict=fail (attempt $_fv_attempt/$_fv_max)"
3039
+ log_debug "[FLOW] iter=$iter phase=sequential_final_verify us=$us attempt=$_fv_attempt/$_fv_max verdict=fail"
3040
+ (( _fv_attempt < _fv_max )) && log " D-18: re-verifying $us a per-US-passed US's fail must reproduce to count (verifier flake guard)."
3041
+ done
3042
+ if (( _fv_rc != 0 )); then
2910
3043
  FAILED_US="$us"
2911
- log " Sequential final verify FAILED at $us"
2912
- log_debug "[FLOW] iter=$iter phase=sequential_final_verify failed_us=$us verdict=$verdict"
3044
+ log " Sequential final verify FAILED at $us (failed all $_fv_attempt/$_fv_max attempt(s))"
3045
+ log_debug "[FLOW] iter=$iter phase=sequential_final_verify failed_us=$us verdict=fail attempts=$_fv_attempt max=$_fv_max"
2913
3046
  return 1
2914
3047
  fi
2915
- log " Sequential final verify: $us PASSED"
3048
+ log " Sequential final verify: $us PASSED$([[ $_fv_attempt -gt 1 ]] && echo " (after $_fv_attempt attempts — earlier verdict was a verifier flake)")"
2916
3049
 
2917
3050
  # Archive per-US final verdict
2918
3051
  cp "$VERDICT_FILE" "$LOGS_DIR/iter-$(printf '%03d' $iter).final-verdict-${us}.json" 2>/dev/null
@@ -3487,6 +3620,28 @@ main() {
3487
3620
  fi
3488
3621
  fi
3489
3622
 
3623
+ # D-16: leader-driven finalize. The previous iteration's last per-US pass
3624
+ # completed coverage and armed _FINALIZE_PENDING instead of dispatching a
3625
+ # worker round-trip to emit an ALL signal. Synthesize that ALL verify signal
3626
+ # ourselves and skip the worker; the existing verify path (signal_us_id=ALL →
3627
+ # run_sequential_final_verify) handles completion AND the fix-loop on failure.
3628
+ # Operator recovery (PR-A) takes precedence — only finalize if it did not claim
3629
+ # this iteration. A crash before this point loses the flag and safely falls
3630
+ # back to the worker round-trip (the pre-D-16 path).
3631
+ if (( _FINALIZE_PENDING )) && [[ "$SKIP_NEXT_WORKER" -eq 0 ]]; then
3632
+ _FINALIZE_PENDING=0
3633
+ log " Leader finalize (D-16): all US verified ($VERIFIED_US) — synthesizing ALL verify signal, skipping worker round-trip."
3634
+ log_debug "[FLOW] iter=$ITERATION d16_finalize=true verified_us=$VERIFIED_US"
3635
+ printf '{"iteration": %d, "status": "verify", "us_id": "ALL", "summary": "leader finalize (D-16: all per-US verified)", "timestamp": "%s"}\n' \
3636
+ "$ITERATION" "$(date -u +%Y-%m-%dT%H:%M:%SZ)" | atomic_write "$SIGNAL_FILE"
3637
+ update_status "verify" "running"
3638
+ SKIP_NEXT_WORKER=1
3639
+ else
3640
+ # Any normally-dispatched iteration clears a stale arm (defensive; the flag
3641
+ # is consumed above on the immediately-following iteration in practice).
3642
+ _FINALIZE_PENDING=0
3643
+ fi
3644
+
3490
3645
  if (( ! SKIP_NEXT_WORKER )); then
3491
3646
  # --- governance.md s7 step 8 (cleanup): Clean previous iteration signals ---
3492
3647
  # Bug #7 Fix-R cleanup: unlock 0o444 sentinels written by the previous
@@ -3724,7 +3879,15 @@ main() {
3724
3879
  update_status "verifier" "running"
3725
3880
 
3726
3881
  # --- Sequential final verify: per-US scoped checks instead of one big ALL verify ---
3727
- if [[ "$signal_us_id" == "ALL" && "$VERIFY_MODE" == "per-us" && -n "$US_LIST" ]]; then
3882
+ # D-21: do NOT take the sequential single-verifier path when consensus applies to
3883
+ # the final verify — it would BYPASS consensus entirely (run_sequential_final_verify
3884
+ # returns 0 before the consensus check below), so `--consensus final-only` was a
3885
+ # silent no-op in the DEFAULT per-us mode (the recommended consensus config). When
3886
+ # consensus is on, fall through to run_consensus_verification "ALL" (which uses the
3887
+ # stricter FINAL_VERIFIER_MODEL + FINAL_CONSENSUS_MODEL and the designed
3888
+ # claude+codex final path). The per-US timeout-prevention split is the off-consensus
3889
+ # optimization only.
3890
+ if [[ "$signal_us_id" == "ALL" && "$VERIFY_MODE" == "per-us" && -n "$US_LIST" ]] && ! _should_use_consensus "$signal_us_id"; then
3728
3891
  log " Final ALL verify: using sequential per-US strategy (timeout prevention)"
3729
3892
  local seq_rc=0
3730
3893
  run_sequential_final_verify "$ITERATION" || seq_rc=$?
@@ -3967,7 +4130,18 @@ main() {
3967
4130
  _append_verified_ledger "$signal_us_id" # F-14: durable source-of-truth
3968
4131
  fi
3969
4132
  update_status "verifier" "pass_us"
3970
- # Worker will do next US on next iteration
4133
+ # D-16: if this pass completed coverage (every US in US_LIST is now
4134
+ # verified), arm leader-driven finalize so the NEXT loop top runs the
4135
+ # sequential final verify DIRECTLY — instead of a worker round-trip
4136
+ # whose only job is to emit an ALL signal (a fragile extra LLM
4137
+ # iteration, observed hanging on an API rate-limit in SV CRITICAL).
4138
+ if [[ "$VERIFY_MODE" == "per-us" && -n "$US_LIST" ]] && _all_us_verified; then
4139
+ _FINALIZE_PENDING=1
4140
+ log " Coverage complete ($VERIFIED_US) — arming leader finalize (D-16, no worker round-trip)."
4141
+ log_debug "[FLOW] iter=$ITERATION d16_arm_finalize=true verified_us=$VERIFIED_US"
4142
+ else
4143
+ : # more US remain → Worker will do next US on next iteration
4144
+ fi
3971
4145
  fi
3972
4146
  elif [[ "$recommended" == (complete|completed|done) || "$signal_us_id" == "ALL" ]]; then
3973
4147
  # Final full verify passed or complete recommended