@ai-dev-methodologies/rlp-desk 0.14.6 → 0.15.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/docs/plans/bug-report-overhaul-backlog.md +49 -0
- package/docs/plans/bug-report-overhaul-v0.md +238 -0
- package/docs/plans/bug-report-overhaul-v1.md +319 -0
- package/docs/plans/native-agent-revert.md +184 -0
- package/docs/plans/polished-gliding-toucan.md +234 -0
- package/docs/plans/strategic-review/rlp-desk-strategic-review.md +125 -0
- package/docs/rlp-desk/signal-protocol.md +93 -0
- package/install.sh +2 -0
- package/package.json +1 -1
- package/scripts/postinstall.js +2 -0
- package/src/commands/rlp-desk.md +56 -46
- package/src/node/run.mjs +45 -7
- package/src/node/runner/campaign-main-loop.mjs +372 -15
- package/src/node/shared/fs.mjs +83 -0
- package/src/node/tmux/pane-manager.mjs +39 -0
- package/src/scripts/lib_ralph_desk.zsh +152 -0
- package/src/scripts/run_ralph_desk.zsh +218 -59
|
@@ -244,6 +244,158 @@ atomic_write() {
|
|
|
244
244
|
mv "$tmp" "$target"
|
|
245
245
|
}
|
|
246
246
|
|
|
247
|
+
# =============================================================================
|
|
248
|
+
# Bug #7 Fix-Q/R: Post-sentinel pane reaper + sentinel write-lock
|
|
249
|
+
# =============================================================================
|
|
250
|
+
# Without explicit teardown the claude/codex TUI returns to its idle prompt and
|
|
251
|
+
# self-reviews for ~2min after writing iter-signal.json or verify-verdict.json.
|
|
252
|
+
# Observed: verdict mtime drift 1m43s post-detect; iter-N verifier overlapped
|
|
253
|
+
# iter-N+1 worker for 2min. _kill_pane_process closes the race; _lock_sentinel
|
|
254
|
+
# is defense-in-depth that freezes the file mtime. Mirror of run_ralph_desk.zsh
|
|
255
|
+
# verifier-cleanup pattern at L2384-2397 (Ctrl+C + /exit + wait_for_pane_ready).
|
|
256
|
+
# Both helpers are fail-open: pane may already be dead, FS may ignore chmod.
|
|
257
|
+
# Interrupt whatever is running in a tmux pane.
# Args:
#   $1  tmux pane id; an empty id makes the call a no-op
#   $2  role label, used only in the debug log line (default: producer)
# Returns: 0
_kill_pane_process() {
  local pane_id="$1"
  local role="${2:-producer}"
  local pause

  if [[ -z "$pane_id" ]]; then
    return 0
  fi

  # log_debug is optional here: only call it when the function is defined.
  if typeset -f log_debug >/dev/null 2>&1; then
    log_debug "[bug7] kill_pane_process pane=$pane_id role=$role"
  fi

  # Two Ctrl+C presses: pause 0.5s after the first and 1s after the second.
  for pause in 0.5 1; do
    tmux send-keys -t "$pane_id" C-c 2>/dev/null
    sleep "$pause"
  done

  # wait_for_pane_ready is optional too; its failure is deliberately ignored.
  if typeset -f wait_for_pane_ready >/dev/null 2>&1; then
    wait_for_pane_ready "$pane_id" 5 2>/dev/null || true
  fi

  return 0
}
|
|
273
|
+
|
|
274
|
+
# Make a sentinel file read-only (mode 0444).
# Args:    $1  path to the sentinel; empty or not a regular file => no-op
# Returns: 0 (a chmod failure is ignored)
_lock_sentinel() {
  local sentinel="$1"

  if [[ -z "$sentinel" || ! -f "$sentinel" ]]; then
    return 0
  fi

  chmod 0444 "$sentinel" 2>/dev/null || :
  return 0
}
|
|
280
|
+
|
|
281
|
+
# Restore a sentinel file to mode 0644 (owner-writable).
# Args:    $1  path to the sentinel; empty or not a regular file => no-op
# Returns: 0 (a chmod failure is ignored)
_unlock_sentinel() {
  local sentinel="$1"

  if [[ -z "$sentinel" || ! -f "$sentinel" ]]; then
    return 0
  fi

  chmod 0644 "$sentinel" 2>/dev/null || :
  return 0
}
|
|
287
|
+
|
|
288
|
+
# PR-A (Bug #10) — validate operator-written manual recovery artifacts.
|
|
289
|
+
# Returns 0 when all 5 checks pass; 1 otherwise. Sets RECOVERY_FAIL_REASON
|
|
290
|
+
# (global) on failure for caller logging. Mirrors the Node-side helper
|
|
291
|
+
# `_validateOperatorRecoveryArtifacts` in `src/node/runner/campaign-main-loop.mjs`.
|
|
292
|
+
#
|
|
293
|
+
# Args:
|
|
294
|
+
# $1 iter-signal.json path
|
|
295
|
+
# $2 done-claim.json path
|
|
296
|
+
# $3 status.json path
|
|
297
|
+
# $4 iter-NNN.worker-prompt.md path (may not exist for iter-1 fresh start)
|
|
298
|
+
# Print the modification time of $1 as epoch seconds, or 0 when it cannot be
# read. GNU `stat -c %Y` is tried BEFORE BSD `stat -f %m`: GNU stat parses
# `-f %m FILE` as "file-system status of the operands %m and FILE", exits
# non-zero because %m does not exist, yet still prints the file-system report
# for FILE on stdout. With the BSD form first, that report would be captured
# together with the fallback's output and the result would not be a number.
_recovery_file_mtime() {
  local target="$1" mtime
  mtime=$(stat -c %Y "$target" 2>/dev/null) \
    || mtime=$(stat -f %m "$target" 2>/dev/null) \
    || mtime=0
  # Anything that is not a plain non-negative integer counts as unreadable.
  case "$mtime" in
    ''|*[!0-9]*) mtime=0 ;;
  esac
  printf '%s\n' "$mtime"
}

# Validate operator-written recovery artifacts (five checks, first failure
# wins). Returns 0 when every check passes, 1 otherwise; on failure the global
# RECOVERY_FAIL_REASON holds a one-line explanation (it is reset to "" on entry).
#   $1 iter-signal.json   $2 done-claim.json   $3 status.json
#   $4 worker-prompt file (Check 5 is skipped when it does not exist)
_validate_operator_recovery_artifacts() {
  local sig_file="$1" done_file="$2" status_file="$3" prompt_file="$4"
  RECOVERY_FAIL_REASON=""

  # Check 1: both artifacts exist + parse as JSON (status.json likewise).
  if [[ ! -f "$sig_file" ]]; then
    RECOVERY_FAIL_REASON="iter-signal.json missing"; return 1
  fi
  if [[ ! -f "$done_file" ]]; then
    RECOVERY_FAIL_REASON="done-claim.json missing"; return 1
  fi
  # Without jq nothing below can be verified, so refuse rather than guess.
  if ! command -v jq >/dev/null 2>&1; then
    RECOVERY_FAIL_REASON="jq unavailable; cannot validate"; return 1
  fi
  if ! jq -e . "$sig_file" >/dev/null 2>&1; then
    RECOVERY_FAIL_REASON="iter-signal.json parse error"; return 1
  fi
  if ! jq -e . "$done_file" >/dev/null 2>&1; then
    RECOVERY_FAIL_REASON="done-claim.json parse error"; return 1
  fi
  if [[ ! -f "$status_file" ]] || ! jq -e . "$status_file" >/dev/null 2>&1; then
    RECOVERY_FAIL_REASON="status.json missing or invalid"; return 1
  fi

  # Check 2: us_id in both artifacts must equal status.current_us.
  local current_us sig_us done_us
  current_us=$(jq -r '.current_us // ""' "$status_file" 2>/dev/null)
  sig_us=$(jq -r '.us_id // ""' "$sig_file" 2>/dev/null)
  done_us=$(jq -r '.us_id // ""' "$done_file" 2>/dev/null)
  if [[ "$sig_us" != "$current_us" ]]; then
    RECOVERY_FAIL_REASON="iter-signal.us_id ($sig_us) != status.current_us ($current_us)"; return 1
  fi
  if [[ "$done_us" != "$current_us" ]]; then
    RECOVERY_FAIL_REASON="done-claim.us_id ($done_us) != status.current_us ($current_us)"; return 1
  fi

  # Check 3: iteration in both artifacts must equal status.iteration
  # (a missing field reads as 0 on every side).
  local current_iter sig_iter done_iter
  current_iter=$(jq -r '.iteration // 0' "$status_file" 2>/dev/null)
  sig_iter=$(jq -r '.iteration // 0' "$sig_file" 2>/dev/null)
  done_iter=$(jq -r '.iteration // 0' "$done_file" 2>/dev/null)
  if [[ "$sig_iter" != "$current_iter" ]]; then
    RECOVERY_FAIL_REASON="iter-signal.iteration ($sig_iter) != status.iteration ($current_iter)"; return 1
  fi
  if [[ "$done_iter" != "$current_iter" ]]; then
    RECOVERY_FAIL_REASON="done-claim.iteration ($done_iter) != status.iteration ($current_iter)"; return 1
  fi

  # Check 4: iter_signal_quality must equal 'specific'.
  local sig_quality
  sig_quality=$(jq -r '.iter_signal_quality // ""' "$sig_file" 2>/dev/null)
  if [[ "$sig_quality" != "specific" ]]; then
    RECOVERY_FAIL_REASON="iter-signal.iter_signal_quality ($sig_quality) != 'specific'"; return 1
  fi

  # Check 5: artifact mtimes must be strictly newer than the worker-prompt
  # mtime. Vacuously passes when the prompt file does not exist.
  if [[ -f "$prompt_file" ]]; then
    local prompt_mtime sig_mtime done_mtime
    prompt_mtime=$(_recovery_file_mtime "$prompt_file")
    sig_mtime=$(_recovery_file_mtime "$sig_file")
    done_mtime=$(_recovery_file_mtime "$done_file")
    if (( sig_mtime <= prompt_mtime )); then
      RECOVERY_FAIL_REASON="iter-signal.json mtime ($sig_mtime) not strictly newer than worker-prompt mtime ($prompt_mtime)"; return 1
    fi
    if (( done_mtime <= prompt_mtime )); then
      RECOVERY_FAIL_REASON="done-claim.json mtime ($done_mtime) not strictly newer than worker-prompt mtime ($prompt_mtime)"; return 1
    fi
  fi

  return 0
}
|
|
371
|
+
|
|
372
|
+
# PR-0b-narrow (Plan v6) — stamp leader handshake ack onto the sentinel.
|
|
373
|
+
# Mirror of src/node/shared/fs.mjs::stampAckField. Best-effort, audit-only:
|
|
374
|
+
# any failure is silently swallowed. Sequence:
|
|
375
|
+
# 1. chmod 0644 (so jq + mv can write)
|
|
376
|
+
# 2. jq merge .leader_ack
|
|
377
|
+
# 3. atomic rename via tmp file
|
|
378
|
+
# 4. chmod 0444 (re-lock)
|
|
379
|
+
# Tolerant of jq absence (graceful degrade — no stamp, no error).
|
|
380
|
+
# Merge a `leader_ack` object into the JSON sentinel at $1 and leave the file
# read-only (0444). Best-effort: every failure is swallowed.
# Args:    $1  sentinel path; empty or not a regular file => no-op
# Returns: 0
_stamp_ack_field() {
  local file="$1"

  if [[ -z "$file" || ! -f "$file" ]]; then
    return 0
  fi
  # No jq: skip the stamp silently.
  if ! command -v jq >/dev/null 2>&1; then
    return 0
  fi

  local scratch="${file}.ack.tmp"
  local merge_filter='. + {leader_ack: {acked_by: "leader", acked_at: $ts, ack_pane_state: "shell"}}'
  local stamp
  stamp=$(date -u +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || echo "")

  # Unlock, write the merged document beside the target, rename it into
  # place, then lock again. The scratch file is removed on any failure.
  chmod 0644 "$file" 2>/dev/null || true
  if ! jq --arg ts "$stamp" "$merge_filter" "$file" > "$scratch" 2>/dev/null; then
    rm -f "$scratch" 2>/dev/null
  else
    mv "$scratch" "$file" 2>/dev/null || rm -f "$scratch" 2>/dev/null
  fi
  chmod 0444 "$file" 2>/dev/null || true

  return 0
}
|
|
398
|
+
|
|
247
399
|
# =============================================================================
|
|
248
400
|
# Scaffold Validation
|
|
249
401
|
# =============================================================================
|
|
@@ -635,27 +635,82 @@ launch_verifier_claude() {
|
|
|
635
635
|
# On exit: check done-claim, auto-generate iter-signal.
|
|
636
636
|
# Args: $1=iteration $2=signal_file
|
|
637
637
|
# Returns: 0 (signal generated), 1 (error)
|
|
638
|
+
# Bug #8 PR-B (codex critic P1.2 fix): shared 4-way gate used by both
|
|
639
|
+
# handle_worker_exit_codex and the inline-polling A4 path. Returns:
|
|
640
|
+
# 0 = synthesize allowed (caller writes signal_file + emits audit)
|
|
641
|
+
# 1 = BLOCKED (this function already wrote sentinel + emitted audit)
|
|
642
|
+
# Args: $1=iter $2=us_id $3=audit_clean_code (e.g. codex_exit_with_done_claim
|
|
643
|
+
# or inline_polling_a4_clean)
|
|
644
|
+
_bug8_check_synth_allowed() {
  # Decide whether the caller may synthesize a verify signal.
  #   $1  iteration number (used in log/audit lines)
  #   $2  us_id for the sentinel/audit (default: $CURRENT_US, else "ALL")
  #   $3  audit code for the allowed case; the caller emits it itself after
  #       writing the signal file, so it is not read here
  # Reads globals DONE_CLAIM_FILE and ROOT.
  # Returns 0 when every gate passes. Returns 1 after calling
  # write_blocked_sentinel and _emit_a4_fallback_audit for the failing gate.
  local iter="$1"
  local us_id="${2:-${CURRENT_US:-ALL}}"

  # Gate 1: done-claim must exist.
  if [[ ! -f "$DONE_CLAIM_FILE" ]]; then
    log_error "  Bug #8: no done-claim. Refusing to synthesize verify signal."
    log_debug "[GOV] iter=$iter bug8=block_codex_exit_no_done_claim"
    write_blocked_sentinel \
      "Codex worker exited without writing done-claim (refusing to synthesize verify signal)" \
      "$us_id" \
      "infra_failure"
    _emit_a4_fallback_audit "$us_id" "$iter" "blocked_codex_exit_no_done_claim"
    return 1
  fi

  # Gate 2: git must be verifiable at $ROOT. The toplevel must equal $ROOT
  # (compared as physical paths via `pwd -P`), AND `git status` itself must
  # succeed. A failing `git status` prints nothing on stdout, which Gate 3
  # would otherwise mistake for a clean tree.
  local _bug8_top _bug8_top_canon _bug8_root_canon
  local _bug8_dirty="" _bug8_git_ok=1
  _bug8_top=$(git -C "$ROOT" rev-parse --show-toplevel 2>/dev/null)
  _bug8_top_canon=$(cd "$_bug8_top" 2>/dev/null && pwd -P 2>/dev/null)
  _bug8_root_canon=$(cd "$ROOT" 2>/dev/null && pwd -P 2>/dev/null)
  if [[ -z "$_bug8_top" || "$_bug8_top_canon" != "$_bug8_root_canon" ]]; then
    _bug8_git_ok=0
  elif ! _bug8_dirty=$(git -C "$ROOT" status --porcelain 2>/dev/null); then
    _bug8_git_ok=0
  fi
  if (( ! _bug8_git_ok )); then
    log_error "  Bug #8: git unverifiable at \$ROOT=$ROOT (toplevel='$_bug8_top'). Refusing synthesis."
    log_debug "[GOV] iter=$iter bug8=block_git_unverifiable root=$ROOT toplevel=$_bug8_top"
    write_blocked_sentinel \
      "git status unverifiable at $ROOT (toplevel='$_bug8_top'); refusing to synthesize verify signal" \
      "$us_id" \
      "infra_failure"
    _emit_a4_fallback_audit "$us_id" "$iter" "blocked_git_unverifiable"
    return 1
  fi

  # Gate 3: tree must be clean (no porcelain output).
  if [[ -n "$_bug8_dirty" ]]; then
    # Report at most the first five porcelain entries, joined with '|'.
    local _bug8_first5
    _bug8_first5=$(printf '%s\n' "$_bug8_dirty" | head -n 5 | tr '\n' '|' | sed 's/|$//')
    log_error "  Bug #8: done-claim present but tree dirty. Refusing synthesis. dirty: $_bug8_first5"
    log_debug "[GOV] iter=$iter bug8=block_dirty_tree us_id=$us_id dirty='$_bug8_first5'"
    write_blocked_sentinel \
      "worker_incomplete_uncommitted: done-claim present but tree dirty ($_bug8_first5)" \
      "$us_id" \
      "metric_failure"
    _emit_a4_fallback_audit "$us_id" "$iter" "blocked_dirty_tree"
    return 1
  fi

  # All gates passed — synthesize allowed.
  return 0
}
|
|
697
|
+
|
|
638
698
|
handle_worker_exit_codex() {
|
|
639
699
|
local iter="$1"
|
|
640
700
|
local signal_file="$2"
|
|
641
701
|
|
|
642
|
-
log " Codex worker process exited. Checking for done-claim..."
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
log " Codex worker completed with done-claim (us_id=$dc_us_id). Auto-generating signal."
|
|
647
|
-
echo '{"iteration":'"$iter"',"status":"verify","us_id":"'"$dc_us_id"'","summary":"auto-generated after codex exit","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'"}' > "$signal_file"
|
|
648
|
-
_emit_a4_fallback_audit "$dc_us_id" "$iter" "codex_exit_with_done_claim"
|
|
649
|
-
else
|
|
650
|
-
log " WARNING: Codex worker exited without done-claim. Generating verify signal for current US."
|
|
651
|
-
local current_us
|
|
652
|
-
current_us=$(jq -r '.us_id // "US-001"' "$DESK/memos/${SLUG}-iter-signal.json" 2>/dev/null || echo "US-001")
|
|
653
|
-
local mem_us
|
|
654
|
-
mem_us=$(sed -n 's/.*Next.*US-\([0-9]*\).*/US-\1/p' "$DESK/memos/${SLUG}-memory.md" 2>/dev/null | head -1)
|
|
655
|
-
[[ -n "$mem_us" ]] && current_us="$mem_us"
|
|
656
|
-
echo '{"iteration":'"$iter"',"status":"verify","us_id":"'"$current_us"'","summary":"auto-generated after codex exit (no done-claim)","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'"}' > "$signal_file"
|
|
657
|
-
_emit_a4_fallback_audit "$current_us" "$iter" "codex_exit_no_done_claim"
|
|
702
|
+
log " Codex worker process exited. Checking for done-claim + clean tree..."
|
|
703
|
+
|
|
704
|
+
if ! _bug8_check_synth_allowed "$iter" "${CURRENT_US:-ALL}" "codex_exit_with_done_claim"; then
|
|
705
|
+
return 1
|
|
658
706
|
fi
|
|
707
|
+
|
|
708
|
+
# All 3 gates passed: done-claim present, git OK, tree clean → synthesize.
|
|
709
|
+
local dc_us_id
|
|
710
|
+
dc_us_id=$(jq -r '.us_id // "unknown"' "$DONE_CLAIM_FILE" 2>/dev/null)
|
|
711
|
+
log " Codex worker completed with done-claim (us_id=$dc_us_id) and clean tree. Auto-generating signal."
|
|
712
|
+
echo '{"iteration":'"$iter"',"status":"verify","us_id":"'"$dc_us_id"'","summary":"auto-generated after codex exit (clean tree)","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'"}' > "$signal_file"
|
|
713
|
+
_emit_a4_fallback_audit "$dc_us_id" "$iter" "codex_exit_with_done_claim_clean"
|
|
659
714
|
return 0
|
|
660
715
|
}
|
|
661
716
|
|
|
@@ -2176,8 +2231,22 @@ poll_for_signal() {
|
|
|
2176
2231
|
|
|
2177
2232
|
# Check if signal file appeared
|
|
2178
2233
|
if [[ -f "$signal_file" ]]; then
|
|
2179
|
-
|
|
2180
|
-
|
|
2234
|
+
# Bug #7-extra (BOS 2026-05-06): file existence is NOT enough. Worker
|
|
2235
|
+
# (claude opus) writes via Claude Code's Write tool, which is not
|
|
2236
|
+
# guaranteed atomic — the file can appear with empty / partial JSON
|
|
2237
|
+
# before the write completes. Verifier was being dispatched against a
|
|
2238
|
+
# half-written iter-signal.json. Validate that the file holds a single
|
|
2239
|
+
# parseable, non-null JSON value (`jq -e .`) before accepting; any
|
|
2240
|
+
# failure simply continues polling (next tick re-reads). Note: `jq
|
|
2241
|
+
# empty` was rejected because it accepts an EMPTY file as "zero
|
|
2242
|
+
# documents" — the exact race window we need to reject.
|
|
2243
|
+
if jq -e . "$signal_file" >/dev/null 2>&1; then
|
|
2244
|
+
log " Signal file detected: $signal_file"
|
|
2245
|
+
return 0 # success
|
|
2246
|
+
fi
|
|
2247
|
+
# Empty / truncated / mid-write JSON. Stay in the polling loop and let
|
|
2248
|
+
# the next tick re-read once the writer has finished.
|
|
2249
|
+
log_debug "[bug7-extra] $role signal file present but JSON not yet valid — continue polling"
|
|
2181
2250
|
fi
|
|
2182
2251
|
|
|
2183
2252
|
# A4 fallback: done-claim exists but no signal → Worker forgot iter-signal
|
|
@@ -2216,11 +2285,24 @@ poll_for_signal() {
|
|
|
2216
2285
|
local dc_us_id
|
|
2217
2286
|
dc_us_id=$(jq -r '.us_id // "unknown"' "$DONE_CLAIM_FILE" 2>/dev/null)
|
|
2218
2287
|
if [[ -n "$dc_us_id" && "$dc_us_id" != "null" ]]; then
|
|
2219
|
-
|
|
2220
|
-
|
|
2221
|
-
|
|
2222
|
-
|
|
2223
|
-
|
|
2288
|
+
# Bug #8 PR-B: defer to shared 4-way gate (codex critic P1.2).
|
|
2289
|
+
# _bug8_check_synth_allowed handles done-claim/git/dirty-tree gates
|
|
2290
|
+
# uniformly across handle_worker_exit_codex AND this inline path so
|
|
2291
|
+
# both codex-exit and inline-polling A4 enforce the same contract.
|
|
2292
|
+
if _bug8_check_synth_allowed "$ITERATION" "$dc_us_id" "inline_polling_a4_clean"; then
|
|
2293
|
+
log " WARNING: done-claim exists for $dc_us_id but no iter-signal. Tree clean — auto-generating signal (A4 fallback)."
|
|
2294
|
+
log_debug "[GOV] iter=$ITERATION done_claim_without_signal=true us_id=$dc_us_id action=auto_generate_signal"
|
|
2295
|
+
echo '{"iteration":'"$ITERATION"',"status":"verify","us_id":"'"$dc_us_id"'","summary":"auto-generated by A4 fallback (done-claim + clean tree)","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'"}' > "$signal_file"
|
|
2296
|
+
_emit_a4_fallback_audit "$dc_us_id" "$ITERATION" "inline_polling_a4_clean"
|
|
2297
|
+
return 0
|
|
2298
|
+
else
|
|
2299
|
+
# Bug #8 PR-B (codex critic round-2 P2): hard-stop rc=2 so the
|
|
2300
|
+
# main worker loop (its `worker_poll_rc == 2` branch) treats this BLOCKED as terminal,
|
|
2301
|
+
# matching the handle_worker_exit_codex blocked path. rc=1 is
|
|
2302
|
+
# ambiguous — caller may interpret it as a recoverable poll
|
|
2303
|
+
# failure and re-loop while the BLOCKED sentinel is on disk.
|
|
2304
|
+
return 2
|
|
2305
|
+
fi
|
|
2224
2306
|
fi
|
|
2225
2307
|
fi
|
|
2226
2308
|
fi
|
|
@@ -2271,8 +2353,16 @@ poll_for_signal() {
|
|
|
2271
2353
|
fi
|
|
2272
2354
|
# Dispatch to engine-specific exit handler
|
|
2273
2355
|
if [[ "$WORKER_ENGINE" = "codex" && "$role" != *erifier* ]]; then
|
|
2274
|
-
handle_worker_exit_codex
|
|
2275
|
-
|
|
2356
|
+
# Bug #8 PR-B: handle_worker_exit_codex now returns 1 when it has
|
|
2357
|
+
# written a BLOCKED sentinel (no done-claim, dirty tree, git
|
|
2358
|
+
# unverifiable). Propagate the return so main loop stops, instead
|
|
2359
|
+
# of swallowing it with `return 0` and continuing as if the poll
|
|
2360
|
+
# had succeeded.
|
|
2361
|
+
if handle_worker_exit_codex "$ITERATION" "$signal_file"; then
|
|
2362
|
+
return 0
|
|
2363
|
+
else
|
|
2364
|
+
return 2
|
|
2365
|
+
fi
|
|
2276
2366
|
fi
|
|
2277
2367
|
# Claude path (or verifier of any engine)
|
|
2278
2368
|
if handle_worker_exit_claude "$pane_id" "$ITERATION" "$trigger_file"; then
|
|
@@ -2467,8 +2557,16 @@ run_single_verifier() {
|
|
|
2467
2557
|
fi
|
|
2468
2558
|
fi
|
|
2469
2559
|
|
|
2560
|
+
# Bug #7 Fix-Q/R: reap verifier pane the moment we accept the verdict so
|
|
2561
|
+
# codex/claude cannot keep self-reviewing and rewrite verify-verdict.json.
|
|
2562
|
+
# Lock applied AFTER cp so the archived snapshot is also frozen at intent.
|
|
2563
|
+
_kill_pane_process "$VERIFIER_PANE" "verifier-${suffix}"
|
|
2564
|
+
|
|
2470
2565
|
# Copy verdict to destination
|
|
2471
2566
|
cp "$VERDICT_FILE" "$verdict_dest"
|
|
2567
|
+
_lock_sentinel "$VERDICT_FILE"
|
|
2568
|
+
# PR-0b-narrow: stamp leader handshake ack on the verdict (audit-only).
|
|
2569
|
+
_stamp_ack_field "$VERDICT_FILE"
|
|
2472
2570
|
log " Verifier$suffix verdict saved to $verdict_dest"
|
|
2473
2571
|
return 0
|
|
2474
2572
|
}
|
|
@@ -2528,6 +2626,14 @@ run_sequential_final_verify() {
|
|
|
2528
2626
|
return 1
|
|
2529
2627
|
fi
|
|
2530
2628
|
|
|
2629
|
+
# Bug #7 Fix-Q/R: reap verifier pane between per-US final verifications so
|
|
2630
|
+
# the previous codex/claude TUI cannot continue running while the next per-
|
|
2631
|
+
# US verifier dispatch reuses the same pane.
|
|
2632
|
+
_kill_pane_process "$VERIFIER_PANE" "verifier-final"
|
|
2633
|
+
_lock_sentinel "$VERDICT_FILE"
|
|
2634
|
+
# PR-0b-narrow: stamp leader handshake ack on the verdict (audit-only).
|
|
2635
|
+
_stamp_ack_field "$VERDICT_FILE"
|
|
2636
|
+
|
|
2531
2637
|
# Check verdict
|
|
2532
2638
|
local verdict
|
|
2533
2639
|
verdict=$(jq -r '.verdict' "$VERDICT_FILE" 2>/dev/null)
|
|
@@ -2939,20 +3045,50 @@ main() {
|
|
|
2939
3045
|
return 1
|
|
2940
3046
|
fi
|
|
2941
3047
|
|
|
2942
|
-
#
|
|
2943
|
-
|
|
2944
|
-
|
|
3048
|
+
# PR-A (Bug #10): operator-recovery hygiene check.
|
|
3049
|
+
# When the operator hand-rolls a `phase=verify` recovery (jq-patches
|
|
3050
|
+
# status.json, writes manual iter-signal.json + done-claim.json, deletes
|
|
3051
|
+
# the blocked sentinel), the leader MUST honor that work instead of
|
|
3052
|
+
# deleting the artifacts and resetting to phase=worker. Mirrors the
|
|
3053
|
+
# Node-side guard in src/node/runner/campaign-main-loop.mjs.
|
|
3054
|
+
local SKIP_NEXT_WORKER=0
|
|
3055
|
+
local LAST_PHASE=""
|
|
3056
|
+
if [[ -f "$STATUS_FILE" ]] && command -v jq >/dev/null 2>&1; then
|
|
3057
|
+
LAST_PHASE=$(jq -r '.phase // ""' "$STATUS_FILE" 2>/dev/null)
|
|
3058
|
+
fi
|
|
3059
|
+
if [[ "$LAST_PHASE" == "verify" ]]; then
|
|
3060
|
+
local _iter_prompt="$LOGS_DIR/iter-$(printf '%03d' $ITERATION).worker-prompt.md"
|
|
3061
|
+
if _validate_operator_recovery_artifacts \
|
|
3062
|
+
"$SIGNAL_FILE" "$DONE_CLAIM_FILE" "$STATUS_FILE" "$_iter_prompt"; then
|
|
3063
|
+
log "[recovery] Resuming verify phase — operator manual recovery detected (iter=$ITERATION)"
|
|
3064
|
+
log_debug "[recovery] iter=$ITERATION skip_worker=true reason=manual_recovery_validated"
|
|
3065
|
+
SKIP_NEXT_WORKER=1
|
|
3066
|
+
else
|
|
3067
|
+
log "[recovery] phase=verify ignored: ${RECOVERY_FAIL_REASON}"
|
|
3068
|
+
log_debug "[recovery] iter=$ITERATION skip_worker=false reason=\"${RECOVERY_FAIL_REASON}\""
|
|
3069
|
+
fi
|
|
3070
|
+
fi
|
|
2945
3071
|
|
|
2946
|
-
|
|
2947
|
-
|
|
2948
|
-
|
|
2949
|
-
#
|
|
2950
|
-
|
|
2951
|
-
|
|
2952
|
-
|
|
2953
|
-
|
|
2954
|
-
|
|
2955
|
-
|
|
3072
|
+
if (( ! SKIP_NEXT_WORKER )); then
|
|
3073
|
+
# --- governance.md s7 step 8 (cleanup): Clean previous iteration signals ---
|
|
3074
|
+
# Bug #7 Fix-R cleanup: unlock 0o444 sentinels written by the previous
|
|
3075
|
+
# iteration's reaper before rm so cleanup does not log permission noise.
|
|
3076
|
+
_unlock_sentinel "$SIGNAL_FILE"
|
|
3077
|
+
_unlock_sentinel "$VERDICT_FILE"
|
|
3078
|
+
rm -f "$SIGNAL_FILE" "$DONE_CLAIM_FILE" "$VERDICT_FILE" 2>/dev/null
|
|
3079
|
+
rm -f "$WORKER_HEARTBEAT" "$VERIFIER_HEARTBEAT" 2>/dev/null
|
|
3080
|
+
|
|
3081
|
+
# --- Clean previous claude session in panes (one-shot lifecycle) ---
|
|
3082
|
+
# Only needed from iteration 2 onwards (iteration 1 has fresh panes)
|
|
3083
|
+
if (( ITERATION > 1 )); then
|
|
3084
|
+
# Send C-c first (in case claude is mid-task), then /exit
|
|
3085
|
+
tmux send-keys -t "$WORKER_PANE" C-c 2>/dev/null
|
|
3086
|
+
sleep 1
|
|
3087
|
+
tmux send-keys -t "$WORKER_PANE" "/exit" C-m 2>/dev/null
|
|
3088
|
+
sleep 2
|
|
3089
|
+
# Wait for shell prompt before proceeding
|
|
3090
|
+
wait_for_pane_ready "$WORKER_PANE" 10 2>/dev/null || true
|
|
3091
|
+
fi
|
|
2956
3092
|
fi
|
|
2957
3093
|
|
|
2958
3094
|
# Reset per-iteration state
|
|
@@ -2964,33 +3100,44 @@ main() {
|
|
|
2964
3100
|
# --- US-004: detect PRD changes for live update + re-split ---
|
|
2965
3101
|
check_prd_update
|
|
2966
3102
|
|
|
2967
|
-
#
|
|
2968
|
-
|
|
2969
|
-
local worker_prompt="$LOGS_DIR/iter-$(printf '%03d' $ITERATION).worker-prompt.md"
|
|
2970
|
-
|
|
2971
|
-
# AC1: capture worker start timestamp
|
|
3103
|
+
# AC1: capture worker start timestamp (still set for downstream telemetry
|
|
3104
|
+
# even when the worker dispatch is skipped — recovery still consumes time).
|
|
2972
3105
|
ITER_WORKER_START=$(date +%s)
|
|
2973
3106
|
|
|
2974
|
-
|
|
3107
|
+
local worker_launch=""
|
|
3108
|
+
if (( ! SKIP_NEXT_WORKER )); then
|
|
3109
|
+
# --- governance.md s7 step 4: Build worker prompt + trigger ---
|
|
3110
|
+
write_worker_trigger "$ITERATION"
|
|
3111
|
+
local worker_prompt="$LOGS_DIR/iter-$(printf '%03d' $ITERATION).worker-prompt.md"
|
|
2975
3112
|
|
|
2976
|
-
|
|
2977
|
-
log_debug "[FLOW] iter=$ITERATION phase=worker engine=$WORKER_ENGINE model=$WORKER_MODEL dispatched=true"
|
|
3113
|
+
update_status "worker" "running"
|
|
2978
3114
|
|
|
2979
|
-
|
|
2980
|
-
|
|
2981
|
-
|
|
2982
|
-
if
|
|
2983
|
-
|
|
2984
|
-
|
|
2985
|
-
|
|
3115
|
+
# --- governance.md s7 step 5: Execute Worker (dispatched to engine-specific function) ---
|
|
3116
|
+
log_debug "[FLOW] iter=$ITERATION phase=worker engine=$WORKER_ENGINE model=$WORKER_MODEL dispatched=true"
|
|
3117
|
+
|
|
3118
|
+
if [[ "$WORKER_ENGINE" = "codex" ]]; then
|
|
3119
|
+
worker_launch="${CODEX_BIN:-codex} -m $WORKER_CODEX_MODEL -c model_reasoning_effort=\"$WORKER_CODEX_REASONING\" --disable plugins --dangerously-bypass-approvals-and-sandbox"
|
|
3120
|
+
if ! launch_worker_codex "$WORKER_PANE" "$worker_prompt" "$ITERATION" "$worker_launch"; then
|
|
3121
|
+
write_blocked_sentinel "Worker codex failed to start in pane" "" "infra_failure"
|
|
3122
|
+
update_status "blocked" "worker_start_failed"
|
|
3123
|
+
return 1
|
|
3124
|
+
fi
|
|
3125
|
+
else
|
|
3126
|
+
worker_launch="$(build_claude_cmd tui "$WORKER_MODEL" "" "" "$WORKER_EFFORT")"
|
|
3127
|
+
if ! launch_worker_claude "$WORKER_PANE" "$worker_prompt" "$ITERATION" "$worker_launch"; then
|
|
3128
|
+
write_blocked_sentinel "Worker claude failed to start in pane" "" "infra_failure"
|
|
3129
|
+
update_status "blocked" "worker_start_failed"
|
|
3130
|
+
return 1
|
|
3131
|
+
fi
|
|
2986
3132
|
fi
|
|
2987
3133
|
else
|
|
2988
|
-
|
|
2989
|
-
|
|
2990
|
-
|
|
2991
|
-
|
|
2992
|
-
|
|
2993
|
-
|
|
3134
|
+
# PR-A (Bug #10): one-shot recovery path. The operator's iter-signal.json
|
|
3135
|
+
# is already on disk; polling below picks it up immediately and the loop
|
|
3136
|
+
# transitions cleanly into the verifier phase. Persist phase=verify so a
|
|
3137
|
+
# subsequent crash-and-relaunch sees the same contract. SKIP_NEXT_WORKER
|
|
3138
|
+
# is local to this iteration so iter-N+1 dispatches the worker normally.
|
|
3139
|
+
update_status "verify" "running"
|
|
3140
|
+
log "[recovery] Skipping worker dispatch for iter=$ITERATION (one-shot, honoring operator manual recovery)"
|
|
2994
3141
|
fi
|
|
2995
3142
|
|
|
2996
3143
|
# --- governance.md s7 step 5+6: Poll for Worker completion ---
|
|
@@ -3003,6 +3150,12 @@ main() {
|
|
|
3003
3150
|
if poll_for_signal "$SIGNAL_FILE" "$WORKER_HEARTBEAT" "$WORKER_PANE" "$worker_launch" "Worker"; then
|
|
3004
3151
|
worker_poll_done=1
|
|
3005
3152
|
log_debug "[FLOW] iter=$ITERATION poll_signal_received=true"
|
|
3153
|
+
# Bug #7 Fix-Q/R: reap worker pane immediately so claude/codex cannot
|
|
3154
|
+
# self-review and rewrite iter-signal.json (1m43s drift observed).
|
|
3155
|
+
_kill_pane_process "$WORKER_PANE" "worker"
|
|
3156
|
+
_lock_sentinel "$SIGNAL_FILE"
|
|
3157
|
+
# PR-0b-narrow: stamp leader handshake ack on the iter-signal (audit-only).
|
|
3158
|
+
_stamp_ack_field "$SIGNAL_FILE"
|
|
3006
3159
|
else
|
|
3007
3160
|
worker_poll_rc=$?
|
|
3008
3161
|
if (( worker_poll_rc == 2 )); then
|
|
@@ -3210,6 +3363,12 @@ main() {
|
|
|
3210
3363
|
update_status "blocked" "verifier_dead"
|
|
3211
3364
|
return 1
|
|
3212
3365
|
fi
|
|
3366
|
+
# Bug #7 Fix-Q/R: reap verifier pane immediately so codex cannot
|
|
3367
|
+
# rewrite verify-verdict.json post-detect (mtime drift fix).
|
|
3368
|
+
_kill_pane_process "$VERIFIER_PANE" "verifier"
|
|
3369
|
+
_lock_sentinel "$VERDICT_FILE"
|
|
3370
|
+
# PR-0b-narrow: stamp leader handshake ack on the verdict (audit-only).
|
|
3371
|
+
_stamp_ack_field "$VERDICT_FILE"
|
|
3213
3372
|
fi
|
|
3214
3373
|
|
|
3215
3374
|
# AC1: capture verifier end timestamp
|