npm - @ai-dev-methodologies/rlp-desk - Versions diffs - 0.15.0 → 0.15.1 - Mend

@ai-dev-methodologies/rlp-desk 0.15.0 → 0.15.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

package/docs/plans/bug-report-overhaul-backlog.md +49 -0
package/docs/plans/bug-report-overhaul-v0.md +238 -0
package/docs/plans/bug-report-overhaul-v1.md +319 -0
package/docs/plans/native-agent-revert.md +184 -0
package/docs/plans/strategic-review/rlp-desk-strategic-review.md +125 -0
package/package.json +1 -1
package/src/commands/rlp-desk.md +56 -46
package/src/node/run.mjs +45 -7
package/src/node/runner/campaign-main-loop.mjs +156 -12
package/src/scripts/lib_ralph_desk.zsh +84 -0
package/src/scripts/run_ralph_desk.zsh +76 -39

package/src/node/runner/campaign-main-loop.mjs CHANGED Viewed

@@ -388,6 +388,110 @@ async function readCurrentState(paths, slug, options) {
   };
 }
+// PR-A (Bug #10): validate operator-written recovery artifacts. When the
+// operator hand-rolls a `phase=verify` recovery (jq-patches status.json,
+// writes iter-signal.json + done-claim.json by hand, deletes the blocked
+// sentinel), the leader must NOT silently overwrite that work on relaunch.
+// All five checks must pass for the leader to honor the recovery.
+//
+// Returns { ok: boolean, reason: string }. On any failure the caller falls
+// through to the default behavior (worker dispatch) — defensive by design.
+async function _validateOperatorRecoveryArtifacts({ paths, state }) {
+  // 1. iter-signal.json + done-claim.json must both exist and parse.
+  let signal;
+  let doneClaim;
+  try {
+    signal = await readJsonIfExists(paths.signalFile);
+  } catch (err) {
+    return { ok: false, reason: `iter-signal.json parse error: ${err?.message ?? err}` };
+  }
+  if (!signal) return { ok: false, reason: 'iter-signal.json missing' };
+  try {
+    doneClaim = await readJsonIfExists(paths.doneClaimFile);
+  } catch (err) {
+    return { ok: false, reason: `done-claim.json parse error: ${err?.message ?? err}` };
+  }
+  if (!doneClaim) return { ok: false, reason: 'done-claim.json missing' };
+  // 2. us_id must match status.current_us in BOTH artifacts.
+  if (signal.us_id !== state.current_us) {
+    return {
+      ok: false,
+      reason: `iter-signal.us_id (${signal.us_id}) != status.current_us (${state.current_us})`,
+    };
+  }
+  if (doneClaim.us_id !== state.current_us) {
+    return {
+      ok: false,
+      reason: `done-claim.us_id (${doneClaim.us_id}) != status.current_us (${state.current_us})`,
+    };
+  }
+  // 3. iteration must match status.iteration in BOTH artifacts.
+  if (signal.iteration !== state.iteration) {
+    return {
+      ok: false,
+      reason: `iter-signal.iteration (${signal.iteration}) != status.iteration (${state.iteration})`,
+    };
+  }
+  if (doneClaim.iteration !== state.iteration) {
+    return {
+      ok: false,
+      reason: `done-claim.iteration (${doneClaim.iteration}) != status.iteration (${state.iteration})`,
+    };
+  }
+  // 4. iter_signal_quality must be 'specific' (not generic / vague).
+  if (signal.iter_signal_quality !== 'specific') {
+    return {
+      ok: false,
+      reason: `iter-signal.iter_signal_quality (${signal.iter_signal_quality}) != 'specific'`,
+    };
+  }
+  // 5. Both artifact mtimes must be NEWER than the most recent
+  //    iter-NNN.worker-prompt.md mtime — guards against operator running
+  //    `phase=verify` against stale artifacts from a much earlier iteration.
+  const promptFile = path.join(
+    paths.campaignLogDir,
+    `iter-${String(state.iteration).padStart(3, '0')}.worker-prompt.md`,
+  );
+  let promptMtime = 0;
+  try {
+    const promptStat = await fs.stat(promptFile);
+    promptMtime = promptStat.mtimeMs;
+  } catch {
+    // No worker-prompt.md for this iteration → check vacuously passes
+    // (operator is recovering from a state that never even dispatched yet).
+    promptMtime = 0;
+  }
+  if (promptMtime > 0) {
+    let signalMtime = 0;
+    let doneClaimMtime = 0;
+    try {
+      signalMtime = (await fs.stat(paths.signalFile)).mtimeMs;
+      doneClaimMtime = (await fs.stat(paths.doneClaimFile)).mtimeMs;
+    } catch (err) {
+      return { ok: false, reason: `mtime stat failed: ${err?.message ?? err}` };
+    }
+    if (signalMtime <= promptMtime) {
+      return {
+        ok: false,
+        reason: `iter-signal.json mtime (${signalMtime}) is not strictly newer than worker-prompt mtime (${promptMtime})`,
+      };
+    }
+    if (doneClaimMtime <= promptMtime) {
+      return {
+        ok: false,
+        reason: `done-claim.json mtime (${doneClaimMtime}) is not strictly newer than worker-prompt mtime (${promptMtime})`,
+      };
+    }
+  }
+  return { ok: true, reason: 'all five checks passed' };
+}
 async function appendIterationAnalytics(paths, state, usId, verdict, options) {
   await appendCampaignAnalytics(paths.analyticsFile, {
     iter: state.iteration,
@@ -1288,6 +1392,28 @@ async function _runCampaignBody(slug, options, paths, rootDir) {
   let fixContractPath = null;
+  // PR-A (Bug #10): operator-recovery hygiene. If the operator hand-rolled a
+  // `phase=verify` recovery (jq-patches status.json, writes manual artifacts,
+  // deletes the blocked sentinel), the leader MUST honor that work instead of
+  // resetting to phase=worker on relaunch. The validator runs five checks
+  // (see _validateOperatorRecoveryArtifacts); on full pass, _skipNextWorkerDispatch
+  // is set as a one-shot flag consumed at the worker dispatch call site below.
+  // On any failure the leader logs the reason and falls through to default
+  // behavior.
+  if (state.phase === 'verify' && state.iteration > 0) {
+    const validation = await _validateOperatorRecoveryArtifacts({ paths, state });
+    if (validation.ok) {
+      console.error(
+        `[recovery] Resuming verify phase — operator manual recovery detected (us=${state.current_us} iter=${state.iteration}): ${validation.reason}`,
+      );
+      state._skipNextWorkerDispatch = true;
+    } else {
+      console.error(
+        `[recovery] phase=verify ignored, falling through to worker dispatch: ${validation.reason}`,
+      );
+    }
+  }
   // P1-E Lane Enforcement: snapshot lane mtimes before each iteration,
   // compare at the top of the next iteration. Drift on read-only artifacts
   // (PRD, test-spec, context) emits a lane_violation_warning event + audit
@@ -1572,18 +1698,36 @@ async function _runCampaignBody(slug, options, paths, rootDir) {
       }
     }
-    state.phase = 'worker';
-    await writeStatus(paths, state, options.onStatusChange, options.now);
-    await dispatchWorker({
-      iteration: state.iteration,
-      paths,
-      slug,
-      usList,
-      state,
-      sendKeys,
-      workerPaneId: state.worker_pane_id,
-      fixContractPath,
-    });
+    // PR-A (Bug #10): one-shot guard. When the operator's `phase=verify`
+    // recovery was honored at campaign entry, skip both the phase reset and
+    // the worker dispatch — the operator already wrote a valid iter-signal.json
+    // and done-claim.json, so pollForSignal below will pick them up immediately
+    // and the loop continues into the verifier phase. The flag is cleared
+    // after consumption so subsequent iterations dispatch the worker normally.
+    if (state._skipNextWorkerDispatch) {
+      state._skipNextWorkerDispatch = false;
+      console.error(
+        `[recovery] Skipping worker dispatch for iter=${state.iteration} (honoring operator manual recovery)`,
+      );
+      // Persist phase=verify so a subsequent crash-and-relaunch sees the same
+      // contract. writeStatus is intentionally called BEFORE pollForSignal so
+      // the on-disk state matches what we are about to do.
+      state.phase = 'verify';
+      await writeStatus(paths, state, options.onStatusChange, options.now);
+    } else {
+      state.phase = 'worker';
+      await writeStatus(paths, state, options.onStatusChange, options.now);
+      await dispatchWorker({
+        iteration: state.iteration,
+        paths,
+        slug,
+        usList,
+        state,
+        sendKeys,
+        workerPaneId: state.worker_pane_id,
+        fixContractPath,
+      });
+    }
     let signal;
     try {

package/src/scripts/lib_ralph_desk.zsh CHANGED Viewed

@@ -285,6 +285,90 @@ _unlock_sentinel() {
   return 0
 }
+# PR-A (Bug #10) helper — print the mtime of $1 as epoch seconds.
+# Returns 1 (printing nothing) when the mtime cannot be read or is not a
+# plain non-negative integer.
+#
+# The GNU form (`stat -c %Y`) is tried FIRST on purpose: BSD/macOS stat
+# rejects `-c` without writing to stdout, so the BSD fallback stays clean.
+# The reverse order is unsafe — GNU `stat -f %m FILE` treats `-f` as
+# "file-system status", prints file-system info for FILE on stdout and exits
+# non-zero, which pollutes a `$(a || b)` capture with non-numeric text.
+_recovery_artifact_mtime() {
+  local target="$1" mtime=""
+  mtime=$(stat -c %Y "$target" 2>/dev/null) \
+    || mtime=$(stat -f %m "$target" 2>/dev/null) \
+    || return 1
+  [[ -n "$mtime" && "$mtime" != *[^0-9]* ]] || return 1
+  print -r -- "$mtime"
+}
+# PR-A (Bug #10) — validate operator-written manual recovery artifacts.
+# Returns 0 when all 5 checks pass; 1 otherwise. Sets RECOVERY_FAIL_REASON
+# (global) on failure for caller logging; it is reset to "" on entry.
+# Mirrors the Node-side helper
+# `_validateOperatorRecoveryArtifacts` in `src/node/runner/campaign-main-loop.mjs`.
+#
+# Args:
+#   $1  iter-signal.json path
+#   $2  done-claim.json path
+#   $3  status.json path
+#   $4  iter-NNN.worker-prompt.md path (may not exist for iter-1 fresh start)
+_validate_operator_recovery_artifacts() {
+  local sig_file="$1" done_file="$2" status_file="$3" prompt_file="$4"
+  RECOVERY_FAIL_REASON=""
+  # Check 1: both artifacts exist + parse as JSON
+  if [[ ! -f "$sig_file" ]]; then
+    RECOVERY_FAIL_REASON="iter-signal.json missing"; return 1
+  fi
+  if [[ ! -f "$done_file" ]]; then
+    RECOVERY_FAIL_REASON="done-claim.json missing"; return 1
+  fi
+  if ! command -v jq >/dev/null 2>&1; then
+    RECOVERY_FAIL_REASON="jq unavailable; cannot validate"; return 1
+  fi
+  if ! jq -e . "$sig_file" >/dev/null 2>&1; then
+    RECOVERY_FAIL_REASON="iter-signal.json parse error"; return 1
+  fi
+  if ! jq -e . "$done_file" >/dev/null 2>&1; then
+    RECOVERY_FAIL_REASON="done-claim.json parse error"; return 1
+  fi
+  if [[ ! -f "$status_file" ]] || ! jq -e . "$status_file" >/dev/null 2>&1; then
+    RECOVERY_FAIL_REASON="status.json missing or invalid"; return 1
+  fi
+  # Check 2: us_id match in both artifacts
+  local current_us sig_us done_us
+  current_us=$(jq -r '.current_us // ""' "$status_file" 2>/dev/null)
+  sig_us=$(jq -r '.us_id // ""' "$sig_file" 2>/dev/null)
+  done_us=$(jq -r '.us_id // ""' "$done_file" 2>/dev/null)
+  if [[ "$sig_us" != "$current_us" ]]; then
+    RECOVERY_FAIL_REASON="iter-signal.us_id ($sig_us) != status.current_us ($current_us)"; return 1
+  fi
+  if [[ "$done_us" != "$current_us" ]]; then
+    RECOVERY_FAIL_REASON="done-claim.us_id ($done_us) != status.current_us ($current_us)"; return 1
+  fi
+  # Check 3: iteration match in both artifacts (compared as strings)
+  local current_iter sig_iter done_iter
+  current_iter=$(jq -r '.iteration // 0' "$status_file" 2>/dev/null)
+  sig_iter=$(jq -r '.iteration // 0' "$sig_file" 2>/dev/null)
+  done_iter=$(jq -r '.iteration // 0' "$done_file" 2>/dev/null)
+  if [[ "$sig_iter" != "$current_iter" ]]; then
+    RECOVERY_FAIL_REASON="iter-signal.iteration ($sig_iter) != status.iteration ($current_iter)"; return 1
+  fi
+  if [[ "$done_iter" != "$current_iter" ]]; then
+    RECOVERY_FAIL_REASON="done-claim.iteration ($done_iter) != status.iteration ($current_iter)"; return 1
+  fi
+  # Check 4: iter_signal_quality must equal 'specific'
+  local sig_quality
+  sig_quality=$(jq -r '.iter_signal_quality // ""' "$sig_file" 2>/dev/null)
+  if [[ "$sig_quality" != "specific" ]]; then
+    RECOVERY_FAIL_REASON="iter-signal.iter_signal_quality ($sig_quality) != 'specific'"; return 1
+  fi
+  # Check 5: artifact mtimes must be strictly newer than worker-prompt mtime.
+  # Vacuously passes when the prompt file does not exist (fresh iter-1 start
+  # before any leader-written prompt). mtimes are whole epoch seconds, so an
+  # artifact written within the same second as the prompt is rejected.
+  # A stat failure on any of the three files fails the check: without a
+  # trustworthy number, freshness cannot be established.
+  if [[ -f "$prompt_file" ]]; then
+    local prompt_mtime sig_mtime done_mtime
+    if ! prompt_mtime=$(_recovery_artifact_mtime "$prompt_file"); then
+      RECOVERY_FAIL_REASON="mtime stat failed: worker-prompt"; return 1
+    fi
+    if ! sig_mtime=$(_recovery_artifact_mtime "$sig_file"); then
+      RECOVERY_FAIL_REASON="mtime stat failed: iter-signal.json"; return 1
+    fi
+    if ! done_mtime=$(_recovery_artifact_mtime "$done_file"); then
+      RECOVERY_FAIL_REASON="mtime stat failed: done-claim.json"; return 1
+    fi
+    if (( sig_mtime <= prompt_mtime )); then
+      RECOVERY_FAIL_REASON="iter-signal.json mtime ($sig_mtime) not strictly newer than worker-prompt mtime ($prompt_mtime)"; return 1
+    fi
+    if (( done_mtime <= prompt_mtime )); then
+      RECOVERY_FAIL_REASON="done-claim.json mtime ($done_mtime) not strictly newer than worker-prompt mtime ($prompt_mtime)"; return 1
+    fi
+  fi
+  return 0
+}
 # PR-0b-narrow (Plan v6) — stamp leader handshake ack onto the sentinel.
 # Mirror of src/node/shared/fs.mjs::stampAckField. Best-effort, audit-only:
 # any failure is silently swallowed. Sequence:

package/src/scripts/run_ralph_desk.zsh CHANGED Viewed

@@ -3045,24 +3045,50 @@ main() {
       return 1
     fi
-    # --- governance.md s7 step 8 (cleanup): Clean previous iteration signals ---
-    # Bug #7 Fix-R cleanup: unlock 0o444 sentinels written by the previous
-    # iteration's reaper before rm so cleanup does not log permission noise.
-    _unlock_sentinel "$SIGNAL_FILE"
-    _unlock_sentinel "$VERDICT_FILE"
-    rm -f "$SIGNAL_FILE" "$DONE_CLAIM_FILE" "$VERDICT_FILE" 2>/dev/null
-    rm -f "$WORKER_HEARTBEAT" "$VERIFIER_HEARTBEAT" 2>/dev/null
-    # --- Clean previous claude session in panes (one-shot lifecycle) ---
-    # Only needed from iteration 2 onwards (iteration 1 has fresh panes)
-    if (( ITERATION > 1 )); then
-      # Send C-c first (in case claude is mid-task), then /exit
-      tmux send-keys -t "$WORKER_PANE" C-c 2>/dev/null
-      sleep 1
-      tmux send-keys -t "$WORKER_PANE" "/exit" C-m 2>/dev/null
-      sleep 2
-      # Wait for shell prompt before proceeding
-      wait_for_pane_ready "$WORKER_PANE" 10 2>/dev/null || true
+    # PR-A (Bug #10): operator-recovery hygiene check.
+    # When the operator hand-rolls a `phase=verify` recovery (jq-patches
+    # status.json, writes manual iter-signal.json + done-claim.json, deletes
+    # the blocked sentinel), the leader MUST honor that work instead of
+    # deleting the artifacts and resetting to phase=worker. Mirrors the
+    # Node-side guard in src/node/runner/campaign-main-loop.mjs.
+    local SKIP_NEXT_WORKER=0
+    local LAST_PHASE=""
+    if [[ -f "$STATUS_FILE" ]] && command -v jq >/dev/null 2>&1; then
+      LAST_PHASE=$(jq -r '.phase // ""' "$STATUS_FILE" 2>/dev/null)
+    fi
+    if [[ "$LAST_PHASE" == "verify" ]]; then
+      local _iter_prompt="$LOGS_DIR/iter-$(printf '%03d' $ITERATION).worker-prompt.md"
+      if _validate_operator_recovery_artifacts \
+           "$SIGNAL_FILE" "$DONE_CLAIM_FILE" "$STATUS_FILE" "$_iter_prompt"; then
+        log "[recovery] Resuming verify phase — operator manual recovery detected (iter=$ITERATION)"
+        log_debug "[recovery] iter=$ITERATION skip_worker=true reason=manual_recovery_validated"
+        SKIP_NEXT_WORKER=1
+      else
+        log "[recovery] phase=verify ignored: ${RECOVERY_FAIL_REASON}"
+        log_debug "[recovery] iter=$ITERATION skip_worker=false reason=\"${RECOVERY_FAIL_REASON}\""
+      fi
+    fi
+    if (( ! SKIP_NEXT_WORKER )); then
+      # --- governance.md s7 step 8 (cleanup): Clean previous iteration signals ---
+      # Bug #7 Fix-R cleanup: unlock 0o444 sentinels written by the previous
+      # iteration's reaper before rm so cleanup does not log permission noise.
+      _unlock_sentinel "$SIGNAL_FILE"
+      _unlock_sentinel "$VERDICT_FILE"
+      rm -f "$SIGNAL_FILE" "$DONE_CLAIM_FILE" "$VERDICT_FILE" 2>/dev/null
+      rm -f "$WORKER_HEARTBEAT" "$VERIFIER_HEARTBEAT" 2>/dev/null
+      # --- Clean previous claude session in panes (one-shot lifecycle) ---
+      # Only needed from iteration 2 onwards (iteration 1 has fresh panes)
+      if (( ITERATION > 1 )); then
+        # Send C-c first (in case claude is mid-task), then /exit
+        tmux send-keys -t "$WORKER_PANE" C-c 2>/dev/null
+        sleep 1
+        tmux send-keys -t "$WORKER_PANE" "/exit" C-m 2>/dev/null
+        sleep 2
+        # Wait for shell prompt before proceeding
+        wait_for_pane_ready "$WORKER_PANE" 10 2>/dev/null || true
+      fi
     fi
     # Reset per-iteration state
@@ -3074,33 +3100,44 @@ main() {
     # --- US-004: detect PRD changes for live update + re-split ---
     check_prd_update
-    # --- governance.md s7 step 4: Build worker prompt + trigger ---
-    write_worker_trigger "$ITERATION"
-    local worker_prompt="$LOGS_DIR/iter-$(printf '%03d' $ITERATION).worker-prompt.md"
-    # AC1: capture worker start timestamp
+    # AC1: capture worker start timestamp (still set for downstream telemetry
+    # even when the worker dispatch is skipped — recovery still consumes time).
     ITER_WORKER_START=$(date +%s)
-    update_status "worker" "running"
+    local worker_launch=""
+    if (( ! SKIP_NEXT_WORKER )); then
+      # --- governance.md s7 step 4: Build worker prompt + trigger ---
+      write_worker_trigger "$ITERATION"
+      local worker_prompt="$LOGS_DIR/iter-$(printf '%03d' $ITERATION).worker-prompt.md"
-    # --- governance.md s7 step 5: Execute Worker (dispatched to engine-specific function) ---
-    log_debug "[FLOW] iter=$ITERATION phase=worker engine=$WORKER_ENGINE model=$WORKER_MODEL dispatched=true"
+      update_status "worker" "running"
-    local worker_launch
-    if [[ "$WORKER_ENGINE" = "codex" ]]; then
-      worker_launch="${CODEX_BIN:-codex} -m $WORKER_CODEX_MODEL -c model_reasoning_effort=\"$WORKER_CODEX_REASONING\" --disable plugins --dangerously-bypass-approvals-and-sandbox"
-      if ! launch_worker_codex "$WORKER_PANE" "$worker_prompt" "$ITERATION" "$worker_launch"; then
-        write_blocked_sentinel "Worker codex failed to start in pane" "" "infra_failure"
-        update_status "blocked" "worker_start_failed"
-        return 1
+      # --- governance.md s7 step 5: Execute Worker (dispatched to engine-specific function) ---
+      log_debug "[FLOW] iter=$ITERATION phase=worker engine=$WORKER_ENGINE model=$WORKER_MODEL dispatched=true"
+      if [[ "$WORKER_ENGINE" = "codex" ]]; then
+        worker_launch="${CODEX_BIN:-codex} -m $WORKER_CODEX_MODEL -c model_reasoning_effort=\"$WORKER_CODEX_REASONING\" --disable plugins --dangerously-bypass-approvals-and-sandbox"
+        if ! launch_worker_codex "$WORKER_PANE" "$worker_prompt" "$ITERATION" "$worker_launch"; then
+          write_blocked_sentinel "Worker codex failed to start in pane" "" "infra_failure"
+          update_status "blocked" "worker_start_failed"
+          return 1
+        fi
+      else
+        worker_launch="$(build_claude_cmd tui "$WORKER_MODEL" "" "" "$WORKER_EFFORT")"
+        if ! launch_worker_claude "$WORKER_PANE" "$worker_prompt" "$ITERATION" "$worker_launch"; then
+          write_blocked_sentinel "Worker claude failed to start in pane" "" "infra_failure"
+          update_status "blocked" "worker_start_failed"
+          return 1
+        fi
       fi
     else
-      worker_launch="$(build_claude_cmd tui "$WORKER_MODEL" "" "" "$WORKER_EFFORT")"
-      if ! launch_worker_claude "$WORKER_PANE" "$worker_prompt" "$ITERATION" "$worker_launch"; then
-        write_blocked_sentinel "Worker claude failed to start in pane" "" "infra_failure"
-        update_status "blocked" "worker_start_failed"
-        return 1
-      fi
+      # PR-A (Bug #10): one-shot recovery path. The operator's iter-signal.json
+      # is already on disk; polling below picks it up immediately and the loop
+      # transitions cleanly into the verifier phase. Persist phase=verify so a
+      # subsequent crash-and-relaunch sees the same contract. SKIP_NEXT_WORKER
+      # is local to this iteration so iter-N+1 dispatches the worker normally.
+      update_status "verify" "running"
+      log "[recovery] Skipping worker dispatch for iter=$ITERATION (one-shot, honoring operator manual recovery)"
     fi
     # --- governance.md s7 step 5+6: Poll for Worker completion ---