@ai-dev-methodologies/rlp-desk 0.15.0 → 0.15.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -388,6 +388,110 @@ async function readCurrentState(paths, slug, options) {
388
388
  };
389
389
  }
390
390
 
391
// PR-A (Bug #10): validate operator-written recovery artifacts. When the
// operator hand-rolls a `phase=verify` recovery (jq-patches status.json,
// writes iter-signal.json + done-claim.json by hand, deletes the blocked
// sentinel), the leader must NOT silently overwrite that work on relaunch.
// All five checks must pass for the leader to honor the recovery.
//
// On any failure the caller falls through to the default behavior (worker
// dispatch) — defensive by design.
/**
 * Run the five operator-recovery checks against the on-disk artifacts.
 *
 * Checks, in order (the first failure short-circuits):
 *   1. iter-signal.json and done-claim.json both load to a truthy value.
 *   2. `us_id` in both artifacts strictly equals `state.current_us`.
 *   3. `iteration` in both artifacts strictly equals `state.iteration`.
 *   4. `iter_signal_quality` in iter-signal.json is exactly 'specific'.
 *   5. Both artifacts have an mtime strictly newer than this iteration's
 *      `iter-NNN.worker-prompt.md`, when that prompt file exists.
 *
 * @param {object} args
 * @param {object} args.paths - Reads `signalFile`, `doneClaimFile` and
 *   `campaignLogDir`.
 * @param {object} args.state - Reads `current_us` and `iteration`.
 * @returns {Promise<{ ok: boolean, reason: string }>} `ok` is true only when
 *   every check passed; `reason` names the first failing check otherwise.
 */
async function _validateOperatorRecoveryArtifacts({ paths, state }) {
  // 1. iter-signal.json + done-claim.json must both exist and parse.
  // NOTE(review): relies on readJsonIfExists resolving to a falsy value for a
  // missing file and throwing on malformed JSON — confirm against its definition.
  let signal;
  try {
    signal = await readJsonIfExists(paths.signalFile);
  } catch (err) {
    return { ok: false, reason: `iter-signal.json parse error: ${err?.message ?? err}` };
  }
  if (!signal) return { ok: false, reason: 'iter-signal.json missing' };

  let doneClaim;
  try {
    doneClaim = await readJsonIfExists(paths.doneClaimFile);
  } catch (err) {
    return { ok: false, reason: `done-claim.json parse error: ${err?.message ?? err}` };
  }
  if (!doneClaim) return { ok: false, reason: 'done-claim.json missing' };

  // 2. us_id must match status.current_us in BOTH artifacts.
  if (signal.us_id !== state.current_us) {
    return {
      ok: false,
      reason: `iter-signal.us_id (${signal.us_id}) != status.current_us (${state.current_us})`,
    };
  }
  if (doneClaim.us_id !== state.current_us) {
    return {
      ok: false,
      reason: `done-claim.us_id (${doneClaim.us_id}) != status.current_us (${state.current_us})`,
    };
  }

  // 3. iteration must match status.iteration in BOTH artifacts. The comparison
  // is strict, so a hand-written string "3" does not match the number 3.
  if (signal.iteration !== state.iteration) {
    return {
      ok: false,
      reason: `iter-signal.iteration (${signal.iteration}) != status.iteration (${state.iteration})`,
    };
  }
  if (doneClaim.iteration !== state.iteration) {
    return {
      ok: false,
      reason: `done-claim.iteration (${doneClaim.iteration}) != status.iteration (${state.iteration})`,
    };
  }

  // 4. iter_signal_quality must be 'specific' (not generic / vague).
  if (signal.iter_signal_quality !== 'specific') {
    return {
      ok: false,
      reason: `iter-signal.iter_signal_quality (${signal.iter_signal_quality}) != 'specific'`,
    };
  }

  // 5. Both artifact mtimes must be NEWER than the most recent
  // iter-NNN.worker-prompt.md mtime — guards against operator running
  // `phase=verify` against stale artifacts from a much earlier iteration.
  const promptFile = path.join(
    paths.campaignLogDir,
    `iter-${String(state.iteration).padStart(3, '0')}.worker-prompt.md`,
  );
  let promptMtime = 0;
  try {
    promptMtime = (await fs.stat(promptFile)).mtimeMs;
  } catch (err) {
    // Only a genuinely absent prompt makes this check pass vacuously (the
    // operator is recovering from a state that never dispatched). Any other
    // stat error means freshness cannot be verified, so fail closed instead
    // of silently honoring the recovery.
    if (err?.code !== 'ENOENT') {
      return { ok: false, reason: `worker-prompt stat failed: ${err?.message ?? err}` };
    }
  }
  if (promptMtime > 0) {
    let signalMtime = 0;
    let doneClaimMtime = 0;
    try {
      signalMtime = (await fs.stat(paths.signalFile)).mtimeMs;
      doneClaimMtime = (await fs.stat(paths.doneClaimFile)).mtimeMs;
    } catch (err) {
      return { ok: false, reason: `mtime stat failed: ${err?.message ?? err}` };
    }
    if (signalMtime <= promptMtime) {
      return {
        ok: false,
        reason: `iter-signal.json mtime (${signalMtime}) is not strictly newer than worker-prompt mtime (${promptMtime})`,
      };
    }
    if (doneClaimMtime <= promptMtime) {
      return {
        ok: false,
        reason: `done-claim.json mtime (${doneClaimMtime}) is not strictly newer than worker-prompt mtime (${promptMtime})`,
      };
    }
  }

  return { ok: true, reason: 'all five checks passed' };
}
494
+
391
495
  async function appendIterationAnalytics(paths, state, usId, verdict, options) {
392
496
  await appendCampaignAnalytics(paths.analyticsFile, {
393
497
  iter: state.iteration,
@@ -1288,6 +1392,28 @@ async function _runCampaignBody(slug, options, paths, rootDir) {
1288
1392
 
1289
1393
  let fixContractPath = null;
1290
1394
 
1395
+ // PR-A (Bug #10): operator-recovery hygiene. If the operator hand-rolled a
1396
+ // `phase=verify` recovery (jq-patches status.json, writes manual artifacts,
1397
+ // deletes the blocked sentinel), the leader MUST honor that work instead of
1398
+ // resetting to phase=worker on relaunch. The validator runs five checks
1399
+ // (see _validateOperatorRecoveryArtifacts); on full pass, _skipNextWorkerDispatch
1400
+ // is set as a one-shot flag consumed at the worker dispatch call site below.
1401
+ // On any failure the leader logs the reason and falls through to default
1402
+ // behavior.
1403
+ if (state.phase === 'verify' && state.iteration > 0) {
1404
+ const validation = await _validateOperatorRecoveryArtifacts({ paths, state });
1405
+ if (validation.ok) {
1406
+ console.error(
1407
+ `[recovery] Resuming verify phase — operator manual recovery detected (us=${state.current_us} iter=${state.iteration}): ${validation.reason}`,
1408
+ );
1409
+ state._skipNextWorkerDispatch = true;
1410
+ } else {
1411
+ console.error(
1412
+ `[recovery] phase=verify ignored, falling through to worker dispatch: ${validation.reason}`,
1413
+ );
1414
+ }
1415
+ }
1416
+
1291
1417
  // P1-E Lane Enforcement: snapshot lane mtimes before each iteration,
1292
1418
  // compare at the top of the next iteration. Drift on read-only artifacts
1293
1419
  // (PRD, test-spec, context) emits a lane_violation_warning event + audit
@@ -1572,18 +1698,36 @@ async function _runCampaignBody(slug, options, paths, rootDir) {
1572
1698
  }
1573
1699
  }
1574
1700
 
1575
- state.phase = 'worker';
1576
- await writeStatus(paths, state, options.onStatusChange, options.now);
1577
- await dispatchWorker({
1578
- iteration: state.iteration,
1579
- paths,
1580
- slug,
1581
- usList,
1582
- state,
1583
- sendKeys,
1584
- workerPaneId: state.worker_pane_id,
1585
- fixContractPath,
1586
- });
1701
+ // PR-A (Bug #10): one-shot guard. When the operator's `phase=verify`
1702
+ // recovery was honored at campaign entry, skip both the phase reset and
1703
+ // the worker dispatch — the operator already wrote a valid iter-signal.json
1704
+ // and done-claim.json, so pollForSignal below will pick them up immediately
1705
+ // and the loop continues into the verifier phase. The flag is cleared
1706
+ // after consumption so subsequent iterations dispatch the worker normally.
1707
+ if (state._skipNextWorkerDispatch) {
1708
+ state._skipNextWorkerDispatch = false;
1709
+ console.error(
1710
+ `[recovery] Skipping worker dispatch for iter=${state.iteration} (honoring operator manual recovery)`,
1711
+ );
1712
+ // Persist phase=verify so a subsequent crash-and-relaunch sees the same
1713
+ // contract. writeStatus is intentionally called BEFORE pollForSignal so
1714
+ // the on-disk state matches what we are about to do.
1715
+ state.phase = 'verify';
1716
+ await writeStatus(paths, state, options.onStatusChange, options.now);
1717
+ } else {
1718
+ state.phase = 'worker';
1719
+ await writeStatus(paths, state, options.onStatusChange, options.now);
1720
+ await dispatchWorker({
1721
+ iteration: state.iteration,
1722
+ paths,
1723
+ slug,
1724
+ usList,
1725
+ state,
1726
+ sendKeys,
1727
+ workerPaneId: state.worker_pane_id,
1728
+ fixContractPath,
1729
+ });
1730
+ }
1587
1731
 
1588
1732
  let signal;
1589
1733
  try {
@@ -285,6 +285,90 @@ _unlock_sentinel() {
285
285
  return 0
286
286
  }
287
287
 
288
# PR-A (Bug #10) — print the mtime of $1 as integer epoch seconds on stdout.
# Returns 1 (printing nothing) when neither stat dialect yields an integer.
#
# The GNU form (`stat -c %Y`) is probed FIRST on purpose. Under GNU coreutils
# `-f` means --file-system, so `stat -f %m FILE` prints a file-system status
# block for FILE on stdout and exits non-zero (it treats `%m` as a missing
# file). Probing the BSD form first therefore pollutes the captured value on
# Linux. BSD stat rejects `-c` without writing to stdout, so this order is
# safe on both platforms.
_recovery_artifact_mtime() {
  local target="$1" mtime=""
  mtime=$(stat -c %Y "$target" 2>/dev/null) \
    || mtime=$(stat -f %m "$target" 2>/dev/null) \
    || return 1
  # Guard the arithmetic comparisons in the caller against non-numeric output.
  [[ "$mtime" =~ ^[0-9]+$ ]] || return 1
  printf '%s\n' "$mtime"
}

# PR-A (Bug #10) — validate operator-written manual recovery artifacts.
# Returns 0 when all 5 checks pass; 1 otherwise. Sets RECOVERY_FAIL_REASON
# (global) on failure for caller logging. Mirrors the Node-side helper
# `_validateOperatorRecoveryArtifacts` in `src/node/runner/campaign-main-loop.mjs`.
#
# Args:
#   $1  iter-signal.json path
#   $2  done-claim.json path
#   $3  status.json path
#   $4  iter-NNN.worker-prompt.md path (may not exist for iter-1 fresh start)
_validate_operator_recovery_artifacts() {
  local sig_file="$1" done_file="$2" status_file="$3" prompt_file="$4"
  RECOVERY_FAIL_REASON=""

  # Check 1: both artifacts exist + parse as JSON
  if [[ ! -f "$sig_file" ]]; then
    RECOVERY_FAIL_REASON="iter-signal.json missing"; return 1
  fi
  if [[ ! -f "$done_file" ]]; then
    RECOVERY_FAIL_REASON="done-claim.json missing"; return 1
  fi
  if ! command -v jq >/dev/null 2>&1; then
    RECOVERY_FAIL_REASON="jq unavailable; cannot validate"; return 1
  fi
  # `jq -e .` also exits non-zero for a top-level null/false document, so such
  # files are reported as parse errors as well.
  if ! jq -e . "$sig_file" >/dev/null 2>&1; then
    RECOVERY_FAIL_REASON="iter-signal.json parse error"; return 1
  fi
  if ! jq -e . "$done_file" >/dev/null 2>&1; then
    RECOVERY_FAIL_REASON="done-claim.json parse error"; return 1
  fi
  if [[ ! -f "$status_file" ]] || ! jq -e . "$status_file" >/dev/null 2>&1; then
    RECOVERY_FAIL_REASON="status.json missing or invalid"; return 1
  fi

  # Check 2: us_id match in both artifacts
  local current_us sig_us done_us
  current_us=$(jq -r '.current_us // ""' "$status_file" 2>/dev/null)
  sig_us=$(jq -r '.us_id // ""' "$sig_file" 2>/dev/null)
  done_us=$(jq -r '.us_id // ""' "$done_file" 2>/dev/null)
  if [[ "$sig_us" != "$current_us" ]]; then
    RECOVERY_FAIL_REASON="iter-signal.us_id ($sig_us) != status.current_us ($current_us)"; return 1
  fi
  if [[ "$done_us" != "$current_us" ]]; then
    RECOVERY_FAIL_REASON="done-claim.us_id ($done_us) != status.current_us ($current_us)"; return 1
  fi

  # Check 3: iteration match in both artifacts
  local current_iter sig_iter done_iter
  current_iter=$(jq -r '.iteration // 0' "$status_file" 2>/dev/null)
  sig_iter=$(jq -r '.iteration // 0' "$sig_file" 2>/dev/null)
  done_iter=$(jq -r '.iteration // 0' "$done_file" 2>/dev/null)
  if [[ "$sig_iter" != "$current_iter" ]]; then
    RECOVERY_FAIL_REASON="iter-signal.iteration ($sig_iter) != status.iteration ($current_iter)"; return 1
  fi
  if [[ "$done_iter" != "$current_iter" ]]; then
    RECOVERY_FAIL_REASON="done-claim.iteration ($done_iter) != status.iteration ($current_iter)"; return 1
  fi

  # Check 4: iter_signal_quality must equal 'specific'
  local sig_quality
  sig_quality=$(jq -r '.iter_signal_quality // ""' "$sig_file" 2>/dev/null)
  if [[ "$sig_quality" != "specific" ]]; then
    RECOVERY_FAIL_REASON="iter-signal.iter_signal_quality ($sig_quality) != 'specific'"; return 1
  fi

  # Check 5: artifact mtimes must be strictly newer than worker-prompt mtime.
  # Vacuously passes when the prompt file does not exist (fresh iter-1 start
  # before any leader-written prompt). When the prompt exists but an mtime
  # cannot be read, fail closed rather than comparing against a made-up 0.
  if [[ -f "$prompt_file" ]]; then
    local prompt_mtime sig_mtime done_mtime
    if ! prompt_mtime=$(_recovery_artifact_mtime "$prompt_file"); then
      RECOVERY_FAIL_REASON="mtime stat failed: worker-prompt"; return 1
    fi
    if ! sig_mtime=$(_recovery_artifact_mtime "$sig_file"); then
      RECOVERY_FAIL_REASON="mtime stat failed: iter-signal.json"; return 1
    fi
    if ! done_mtime=$(_recovery_artifact_mtime "$done_file"); then
      RECOVERY_FAIL_REASON="mtime stat failed: done-claim.json"; return 1
    fi
    if (( sig_mtime <= prompt_mtime )); then
      RECOVERY_FAIL_REASON="iter-signal.json mtime ($sig_mtime) not strictly newer than worker-prompt mtime ($prompt_mtime)"; return 1
    fi
    if (( done_mtime <= prompt_mtime )); then
      RECOVERY_FAIL_REASON="done-claim.json mtime ($done_mtime) not strictly newer than worker-prompt mtime ($prompt_mtime)"; return 1
    fi
  fi

  return 0
}
371
+
288
372
  # PR-0b-narrow (Plan v6) — stamp leader handshake ack onto the sentinel.
289
373
  # Mirror of src/node/shared/fs.mjs::stampAckField. Best-effort, audit-only:
290
374
  # any failure is silently swallowed. Sequence:
@@ -3045,24 +3045,50 @@ main() {
3045
3045
  return 1
3046
3046
  fi
3047
3047
 
3048
- # --- governance.md s7 step 8 (cleanup): Clean previous iteration signals ---
3049
- # Bug #7 Fix-R cleanup: unlock 0o444 sentinels written by the previous
3050
- # iteration's reaper before rm so cleanup does not log permission noise.
3051
- _unlock_sentinel "$SIGNAL_FILE"
3052
- _unlock_sentinel "$VERDICT_FILE"
3053
- rm -f "$SIGNAL_FILE" "$DONE_CLAIM_FILE" "$VERDICT_FILE" 2>/dev/null
3054
- rm -f "$WORKER_HEARTBEAT" "$VERIFIER_HEARTBEAT" 2>/dev/null
3055
-
3056
- # --- Clean previous claude session in panes (one-shot lifecycle) ---
3057
- # Only needed from iteration 2 onwards (iteration 1 has fresh panes)
3058
- if (( ITERATION > 1 )); then
3059
- # Send C-c first (in case claude is mid-task), then /exit
3060
- tmux send-keys -t "$WORKER_PANE" C-c 2>/dev/null
3061
- sleep 1
3062
- tmux send-keys -t "$WORKER_PANE" "/exit" C-m 2>/dev/null
3063
- sleep 2
3064
- # Wait for shell prompt before proceeding
3065
- wait_for_pane_ready "$WORKER_PANE" 10 2>/dev/null || true
3048
+ # PR-A (Bug #10): operator-recovery hygiene check.
3049
+ # When the operator hand-rolls a `phase=verify` recovery (jq-patches
3050
+ # status.json, writes manual iter-signal.json + done-claim.json, deletes
3051
+ # the blocked sentinel), the leader MUST honor that work instead of
3052
+ # deleting the artifacts and resetting to phase=worker. Mirrors the
3053
+ # Node-side guard in src/node/runner/campaign-main-loop.mjs.
3054
+ local SKIP_NEXT_WORKER=0
3055
+ local LAST_PHASE=""
3056
+ if [[ -f "$STATUS_FILE" ]] && command -v jq >/dev/null 2>&1; then
3057
+ LAST_PHASE=$(jq -r '.phase // ""' "$STATUS_FILE" 2>/dev/null)
3058
+ fi
3059
+ if [[ "$LAST_PHASE" == "verify" ]]; then
3060
+ local _iter_prompt="$LOGS_DIR/iter-$(printf '%03d' $ITERATION).worker-prompt.md"
3061
+ if _validate_operator_recovery_artifacts \
3062
+ "$SIGNAL_FILE" "$DONE_CLAIM_FILE" "$STATUS_FILE" "$_iter_prompt"; then
3063
+ log "[recovery] Resuming verify phase — operator manual recovery detected (iter=$ITERATION)"
3064
+ log_debug "[recovery] iter=$ITERATION skip_worker=true reason=manual_recovery_validated"
3065
+ SKIP_NEXT_WORKER=1
3066
+ else
3067
+ log "[recovery] phase=verify ignored: ${RECOVERY_FAIL_REASON}"
3068
+ log_debug "[recovery] iter=$ITERATION skip_worker=false reason=\"${RECOVERY_FAIL_REASON}\""
3069
+ fi
3070
+ fi
3071
+
3072
+ if (( ! SKIP_NEXT_WORKER )); then
3073
+ # --- governance.md s7 step 8 (cleanup): Clean previous iteration signals ---
3074
+ # Bug #7 Fix-R cleanup: unlock 0o444 sentinels written by the previous
3075
+ # iteration's reaper before rm so cleanup does not log permission noise.
3076
+ _unlock_sentinel "$SIGNAL_FILE"
3077
+ _unlock_sentinel "$VERDICT_FILE"
3078
+ rm -f "$SIGNAL_FILE" "$DONE_CLAIM_FILE" "$VERDICT_FILE" 2>/dev/null
3079
+ rm -f "$WORKER_HEARTBEAT" "$VERIFIER_HEARTBEAT" 2>/dev/null
3080
+
3081
+ # --- Clean previous claude session in panes (one-shot lifecycle) ---
3082
+ # Only needed from iteration 2 onwards (iteration 1 has fresh panes)
3083
+ if (( ITERATION > 1 )); then
3084
+ # Send C-c first (in case claude is mid-task), then /exit
3085
+ tmux send-keys -t "$WORKER_PANE" C-c 2>/dev/null
3086
+ sleep 1
3087
+ tmux send-keys -t "$WORKER_PANE" "/exit" C-m 2>/dev/null
3088
+ sleep 2
3089
+ # Wait for shell prompt before proceeding
3090
+ wait_for_pane_ready "$WORKER_PANE" 10 2>/dev/null || true
3091
+ fi
3066
3092
  fi
3067
3093
 
3068
3094
  # Reset per-iteration state
@@ -3074,33 +3100,44 @@ main() {
3074
3100
  # --- US-004: detect PRD changes for live update + re-split ---
3075
3101
  check_prd_update
3076
3102
 
3077
- # --- governance.md s7 step 4: Build worker prompt + trigger ---
3078
- write_worker_trigger "$ITERATION"
3079
- local worker_prompt="$LOGS_DIR/iter-$(printf '%03d' $ITERATION).worker-prompt.md"
3080
-
3081
- # AC1: capture worker start timestamp
3103
+ # AC1: capture worker start timestamp (still set for downstream telemetry
3104
+ # even when the worker dispatch is skipped — recovery still consumes time).
3082
3105
  ITER_WORKER_START=$(date +%s)
3083
3106
 
3084
- update_status "worker" "running"
3107
+ local worker_launch=""
3108
+ if (( ! SKIP_NEXT_WORKER )); then
3109
+ # --- governance.md s7 step 4: Build worker prompt + trigger ---
3110
+ write_worker_trigger "$ITERATION"
3111
+ local worker_prompt="$LOGS_DIR/iter-$(printf '%03d' $ITERATION).worker-prompt.md"
3085
3112
 
3086
- # --- governance.md s7 step 5: Execute Worker (dispatched to engine-specific function) ---
3087
- log_debug "[FLOW] iter=$ITERATION phase=worker engine=$WORKER_ENGINE model=$WORKER_MODEL dispatched=true"
3113
+ update_status "worker" "running"
3088
3114
 
3089
- local worker_launch
3090
- if [[ "$WORKER_ENGINE" = "codex" ]]; then
3091
- worker_launch="${CODEX_BIN:-codex} -m $WORKER_CODEX_MODEL -c model_reasoning_effort=\"$WORKER_CODEX_REASONING\" --disable plugins --dangerously-bypass-approvals-and-sandbox"
3092
- if ! launch_worker_codex "$WORKER_PANE" "$worker_prompt" "$ITERATION" "$worker_launch"; then
3093
- write_blocked_sentinel "Worker codex failed to start in pane" "" "infra_failure"
3094
- update_status "blocked" "worker_start_failed"
3095
- return 1
3115
+ # --- governance.md s7 step 5: Execute Worker (dispatched to engine-specific function) ---
3116
+ log_debug "[FLOW] iter=$ITERATION phase=worker engine=$WORKER_ENGINE model=$WORKER_MODEL dispatched=true"
3117
+
3118
+ if [[ "$WORKER_ENGINE" = "codex" ]]; then
3119
+ worker_launch="${CODEX_BIN:-codex} -m $WORKER_CODEX_MODEL -c model_reasoning_effort=\"$WORKER_CODEX_REASONING\" --disable plugins --dangerously-bypass-approvals-and-sandbox"
3120
+ if ! launch_worker_codex "$WORKER_PANE" "$worker_prompt" "$ITERATION" "$worker_launch"; then
3121
+ write_blocked_sentinel "Worker codex failed to start in pane" "" "infra_failure"
3122
+ update_status "blocked" "worker_start_failed"
3123
+ return 1
3124
+ fi
3125
+ else
3126
+ worker_launch="$(build_claude_cmd tui "$WORKER_MODEL" "" "" "$WORKER_EFFORT")"
3127
+ if ! launch_worker_claude "$WORKER_PANE" "$worker_prompt" "$ITERATION" "$worker_launch"; then
3128
+ write_blocked_sentinel "Worker claude failed to start in pane" "" "infra_failure"
3129
+ update_status "blocked" "worker_start_failed"
3130
+ return 1
3131
+ fi
3096
3132
  fi
3097
3133
  else
3098
- worker_launch="$(build_claude_cmd tui "$WORKER_MODEL" "" "" "$WORKER_EFFORT")"
3099
- if ! launch_worker_claude "$WORKER_PANE" "$worker_prompt" "$ITERATION" "$worker_launch"; then
3100
- write_blocked_sentinel "Worker claude failed to start in pane" "" "infra_failure"
3101
- update_status "blocked" "worker_start_failed"
3102
- return 1
3103
- fi
3134
+ # PR-A (Bug #10): one-shot recovery path. The operator's iter-signal.json
3135
+ # is already on disk; polling below picks it up immediately and the loop
3136
+ # transitions cleanly into the verifier phase. Persist phase=verify so a
3137
+ # subsequent crash-and-relaunch sees the same contract. SKIP_NEXT_WORKER
3138
+ # is local to this iteration so iter-N+1 dispatches the worker normally.
3139
+ update_status "verify" "running"
3140
+ log "[recovery] Skipping worker dispatch for iter=$ITERATION (one-shot, honoring operator manual recovery)"
3104
3141
  fi
3105
3142
 
3106
3143
  # --- governance.md s7 step 5+6: Poll for Worker completion ---