@ai-dev-methodologies/rlp-desk 0.15.0 → 0.15.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/docs/plans/bug-report-overhaul-backlog.md +49 -0
- package/docs/plans/bug-report-overhaul-v0.md +238 -0
- package/docs/plans/bug-report-overhaul-v1.md +319 -0
- package/docs/plans/native-agent-revert.md +184 -0
- package/docs/plans/strategic-review/rlp-desk-strategic-review.md +125 -0
- package/package.json +1 -1
- package/src/commands/rlp-desk.md +56 -46
- package/src/node/run.mjs +45 -7
- package/src/node/runner/campaign-main-loop.mjs +156 -12
- package/src/scripts/lib_ralph_desk.zsh +84 -0
- package/src/scripts/run_ralph_desk.zsh +76 -39
|
@@ -388,6 +388,110 @@ async function readCurrentState(paths, slug, options) {
|
|
|
388
388
|
};
|
|
389
389
|
}
|
|
390
390
|
|
|
391
|
+
// PR-A (Bug #10): validate operator-written recovery artifacts. When the
// operator hand-rolls a `phase=verify` recovery (jq-patches status.json,
// writes iter-signal.json + done-claim.json by hand, deletes the blocked
// sentinel), the leader must NOT silently overwrite that work on relaunch.
// All five checks must pass for the leader to honor the recovery:
//   1. iter-signal.json and done-claim.json both exist and parse.
//   2. us_id in BOTH artifacts equals state.current_us.
//   3. iteration in BOTH artifacts equals state.iteration.
//   4. iter-signal.json carries iter_signal_quality === 'specific'.
//   5. Both artifact mtimes are strictly newer than this iteration's
//      iter-NNN.worker-prompt.md mtime (skipped when that file is absent).
//
// Params (one destructured object):
//   paths — reads signalFile, doneClaimFile and campaignLogDir.
//   state — reads current_us and iteration.
//
// Returns { ok: boolean, reason: string }. On any failure the caller falls
// through to the default behavior (worker dispatch) — defensive by design.
async function _validateOperatorRecoveryArtifacts({ paths, state }) {
  const fail = (reason) => ({ ok: false, reason });

  // Order matters: iter-signal is always checked (and reported) before
  // done-claim within each step.
  const artifacts = [
    { label: 'iter-signal', file: paths.signalFile, json: null, mtimeMs: 0 },
    { label: 'done-claim', file: paths.doneClaimFile, json: null, mtimeMs: 0 },
  ];

  // 1. iter-signal.json + done-claim.json must both exist and parse.
  for (const artifact of artifacts) {
    try {
      artifact.json = await readJsonIfExists(artifact.file);
    } catch (err) {
      return fail(`${artifact.label}.json parse error: ${err?.message ?? err}`);
    }
    if (!artifact.json) return fail(`${artifact.label}.json missing`);
  }

  // 2 + 3. us_id must match status.current_us, then iteration must match
  // status.iteration, in BOTH artifacts. The comparison is strict, so a
  // hand-written string such as "3" does not match a numeric 3.
  const identityFields = [
    ['us_id', 'current_us'],
    ['iteration', 'iteration'],
  ];
  for (const [artifactField, statusField] of identityFields) {
    for (const { label, json } of artifacts) {
      if (json[artifactField] !== state[statusField]) {
        return fail(
          `${label}.${artifactField} (${json[artifactField]}) != status.${statusField} (${state[statusField]})`,
        );
      }
    }
  }

  // 4. iter_signal_quality must be 'specific' (not generic / vague).
  const signal = artifacts[0].json;
  if (signal.iter_signal_quality !== 'specific') {
    return fail(`iter-signal.iter_signal_quality (${signal.iter_signal_quality}) != 'specific'`);
  }

  // 5. Both artifact mtimes must be NEWER than the most recent
  // iter-NNN.worker-prompt.md mtime — guards against operator running
  // `phase=verify` against stale artifacts from a much earlier iteration.
  const promptFile = path.join(
    paths.campaignLogDir,
    `iter-${String(state.iteration).padStart(3, '0')}.worker-prompt.md`,
  );
  let promptMtime = 0;
  try {
    promptMtime = (await fs.stat(promptFile)).mtimeMs;
  } catch (err) {
    // Only a genuinely absent worker-prompt.md lets the check pass vacuously
    // (operator is recovering from a state that never even dispatched yet).
    // Any other stat error (EACCES, ENOTDIR, ...) means freshness cannot be
    // proven, so fail closed instead of silently skipping the guard.
    if (err?.code !== 'ENOENT') {
      return fail(`worker-prompt stat failed: ${err?.message ?? err}`);
    }
  }

  if (promptMtime > 0) {
    try {
      const stats = await Promise.all(artifacts.map(({ file }) => fs.stat(file)));
      stats.forEach((stat, index) => {
        artifacts[index].mtimeMs = stat.mtimeMs;
      });
    } catch (err) {
      return fail(`mtime stat failed: ${err?.message ?? err}`);
    }
    for (const { label, mtimeMs } of artifacts) {
      if (mtimeMs <= promptMtime) {
        return fail(
          `${label}.json mtime (${mtimeMs}) is not strictly newer than worker-prompt mtime (${promptMtime})`,
        );
      }
    }
  }

  return { ok: true, reason: 'all five checks passed' };
}
|
|
494
|
+
|
|
391
495
|
async function appendIterationAnalytics(paths, state, usId, verdict, options) {
|
|
392
496
|
await appendCampaignAnalytics(paths.analyticsFile, {
|
|
393
497
|
iter: state.iteration,
|
|
@@ -1288,6 +1392,28 @@ async function _runCampaignBody(slug, options, paths, rootDir) {
|
|
|
1288
1392
|
|
|
1289
1393
|
let fixContractPath = null;
|
|
1290
1394
|
|
|
1395
|
+
// PR-A (Bug #10): operator-recovery hygiene. If the operator hand-rolled a
|
|
1396
|
+
// `phase=verify` recovery (jq-patches status.json, writes manual artifacts,
|
|
1397
|
+
// deletes the blocked sentinel), the leader MUST honor that work instead of
|
|
1398
|
+
// resetting to phase=worker on relaunch. The validator runs five checks
|
|
1399
|
+
// (see _validateOperatorRecoveryArtifacts); on full pass, _skipNextWorkerDispatch
|
|
1400
|
+
// is set as a one-shot flag consumed at the worker dispatch call site below.
|
|
1401
|
+
// On any failure the leader logs the reason and falls through to default
|
|
1402
|
+
// behavior.
|
|
1403
|
+
if (state.phase === 'verify' && state.iteration > 0) {
|
|
1404
|
+
const validation = await _validateOperatorRecoveryArtifacts({ paths, state });
|
|
1405
|
+
if (validation.ok) {
|
|
1406
|
+
console.error(
|
|
1407
|
+
`[recovery] Resuming verify phase — operator manual recovery detected (us=${state.current_us} iter=${state.iteration}): ${validation.reason}`,
|
|
1408
|
+
);
|
|
1409
|
+
state._skipNextWorkerDispatch = true;
|
|
1410
|
+
} else {
|
|
1411
|
+
console.error(
|
|
1412
|
+
`[recovery] phase=verify ignored, falling through to worker dispatch: ${validation.reason}`,
|
|
1413
|
+
);
|
|
1414
|
+
}
|
|
1415
|
+
}
|
|
1416
|
+
|
|
1291
1417
|
// P1-E Lane Enforcement: snapshot lane mtimes before each iteration,
|
|
1292
1418
|
// compare at the top of the next iteration. Drift on read-only artifacts
|
|
1293
1419
|
// (PRD, test-spec, context) emits a lane_violation_warning event + audit
|
|
@@ -1572,18 +1698,36 @@ async function _runCampaignBody(slug, options, paths, rootDir) {
|
|
|
1572
1698
|
}
|
|
1573
1699
|
}
|
|
1574
1700
|
|
|
1575
|
-
|
|
1576
|
-
|
|
1577
|
-
|
|
1578
|
-
|
|
1579
|
-
|
|
1580
|
-
|
|
1581
|
-
|
|
1582
|
-
state
|
|
1583
|
-
|
|
1584
|
-
|
|
1585
|
-
|
|
1586
|
-
|
|
1701
|
+
// PR-A (Bug #10): one-shot guard. When the operator's `phase=verify`
|
|
1702
|
+
// recovery was honored at campaign entry, skip both the phase reset and
|
|
1703
|
+
// the worker dispatch — the operator already wrote a valid iter-signal.json
|
|
1704
|
+
// and done-claim.json, so pollForSignal below will pick them up immediately
|
|
1705
|
+
// and the loop continues into the verifier phase. The flag is cleared
|
|
1706
|
+
// after consumption so subsequent iterations dispatch the worker normally.
|
|
1707
|
+
if (state._skipNextWorkerDispatch) {
|
|
1708
|
+
state._skipNextWorkerDispatch = false;
|
|
1709
|
+
console.error(
|
|
1710
|
+
`[recovery] Skipping worker dispatch for iter=${state.iteration} (honoring operator manual recovery)`,
|
|
1711
|
+
);
|
|
1712
|
+
// Persist phase=verify so a subsequent crash-and-relaunch sees the same
|
|
1713
|
+
// contract. writeStatus is intentionally called BEFORE pollForSignal so
|
|
1714
|
+
// the on-disk state matches what we are about to do.
|
|
1715
|
+
state.phase = 'verify';
|
|
1716
|
+
await writeStatus(paths, state, options.onStatusChange, options.now);
|
|
1717
|
+
} else {
|
|
1718
|
+
state.phase = 'worker';
|
|
1719
|
+
await writeStatus(paths, state, options.onStatusChange, options.now);
|
|
1720
|
+
await dispatchWorker({
|
|
1721
|
+
iteration: state.iteration,
|
|
1722
|
+
paths,
|
|
1723
|
+
slug,
|
|
1724
|
+
usList,
|
|
1725
|
+
state,
|
|
1726
|
+
sendKeys,
|
|
1727
|
+
workerPaneId: state.worker_pane_id,
|
|
1728
|
+
fixContractPath,
|
|
1729
|
+
});
|
|
1730
|
+
}
|
|
1587
1731
|
|
|
1588
1732
|
let signal;
|
|
1589
1733
|
try {
|
|
@@ -285,6 +285,90 @@ _unlock_sentinel() {
|
|
|
285
285
|
return 0
|
|
286
286
|
}
|
|
287
287
|
|
|
288
|
+
# PR-A (Bug #10) — validate operator-written manual recovery artifacts.
|
|
289
|
+
# Returns 0 when all 5 checks pass; 1 otherwise. Sets RECOVERY_FAIL_REASON
|
|
290
|
+
# (global) on failure for caller logging. Mirrors the Node-side helper
|
|
291
|
+
# `_validateOperatorRecoveryArtifacts` in `src/node/runner/campaign-main-loop.mjs`.
|
|
292
|
+
#
|
|
293
|
+
# Args:
|
|
294
|
+
# $1 iter-signal.json path
|
|
295
|
+
# $2 done-claim.json path
|
|
296
|
+
# $3 status.json path
|
|
297
|
+
# $4 iter-NNN.worker-prompt.md path (may not exist for iter-1 fresh start)
|
|
298
|
+
_validate_operator_recovery_artifacts() {
|
|
299
|
+
local sig_file="$1" done_file="$2" status_file="$3" prompt_file="$4"
|
|
300
|
+
RECOVERY_FAIL_REASON=""
|
|
301
|
+
|
|
302
|
+
# Check 1: both artifacts exist + parse as JSON
|
|
303
|
+
if [[ ! -f "$sig_file" ]]; then
|
|
304
|
+
RECOVERY_FAIL_REASON="iter-signal.json missing"; return 1
|
|
305
|
+
fi
|
|
306
|
+
if [[ ! -f "$done_file" ]]; then
|
|
307
|
+
RECOVERY_FAIL_REASON="done-claim.json missing"; return 1
|
|
308
|
+
fi
|
|
309
|
+
if ! command -v jq >/dev/null 2>&1; then
|
|
310
|
+
RECOVERY_FAIL_REASON="jq unavailable; cannot validate"; return 1
|
|
311
|
+
fi
|
|
312
|
+
if ! jq -e . "$sig_file" >/dev/null 2>&1; then
|
|
313
|
+
RECOVERY_FAIL_REASON="iter-signal.json parse error"; return 1
|
|
314
|
+
fi
|
|
315
|
+
if ! jq -e . "$done_file" >/dev/null 2>&1; then
|
|
316
|
+
RECOVERY_FAIL_REASON="done-claim.json parse error"; return 1
|
|
317
|
+
fi
|
|
318
|
+
if [[ ! -f "$status_file" ]] || ! jq -e . "$status_file" >/dev/null 2>&1; then
|
|
319
|
+
RECOVERY_FAIL_REASON="status.json missing or invalid"; return 1
|
|
320
|
+
fi
|
|
321
|
+
|
|
322
|
+
# Check 2: us_id match in both artifacts
|
|
323
|
+
local current_us sig_us done_us
|
|
324
|
+
current_us=$(jq -r '.current_us // ""' "$status_file" 2>/dev/null)
|
|
325
|
+
sig_us=$(jq -r '.us_id // ""' "$sig_file" 2>/dev/null)
|
|
326
|
+
done_us=$(jq -r '.us_id // ""' "$done_file" 2>/dev/null)
|
|
327
|
+
if [[ "$sig_us" != "$current_us" ]]; then
|
|
328
|
+
RECOVERY_FAIL_REASON="iter-signal.us_id ($sig_us) != status.current_us ($current_us)"; return 1
|
|
329
|
+
fi
|
|
330
|
+
if [[ "$done_us" != "$current_us" ]]; then
|
|
331
|
+
RECOVERY_FAIL_REASON="done-claim.us_id ($done_us) != status.current_us ($current_us)"; return 1
|
|
332
|
+
fi
|
|
333
|
+
|
|
334
|
+
# Check 3: iteration match in both artifacts
|
|
335
|
+
local current_iter sig_iter done_iter
|
|
336
|
+
current_iter=$(jq -r '.iteration // 0' "$status_file" 2>/dev/null)
|
|
337
|
+
sig_iter=$(jq -r '.iteration // 0' "$sig_file" 2>/dev/null)
|
|
338
|
+
done_iter=$(jq -r '.iteration // 0' "$done_file" 2>/dev/null)
|
|
339
|
+
if [[ "$sig_iter" != "$current_iter" ]]; then
|
|
340
|
+
RECOVERY_FAIL_REASON="iter-signal.iteration ($sig_iter) != status.iteration ($current_iter)"; return 1
|
|
341
|
+
fi
|
|
342
|
+
if [[ "$done_iter" != "$current_iter" ]]; then
|
|
343
|
+
RECOVERY_FAIL_REASON="done-claim.iteration ($done_iter) != status.iteration ($current_iter)"; return 1
|
|
344
|
+
fi
|
|
345
|
+
|
|
346
|
+
# Check 4: iter_signal_quality must equal 'specific'
|
|
347
|
+
local sig_quality
|
|
348
|
+
sig_quality=$(jq -r '.iter_signal_quality // ""' "$sig_file" 2>/dev/null)
|
|
349
|
+
if [[ "$sig_quality" != "specific" ]]; then
|
|
350
|
+
RECOVERY_FAIL_REASON="iter-signal.iter_signal_quality ($sig_quality) != 'specific'"; return 1
|
|
351
|
+
fi
|
|
352
|
+
|
|
353
|
+
# Check 5: artifact mtimes must be strictly newer than worker-prompt mtime.
|
|
354
|
+
# Vacuously passes when the prompt file does not exist (fresh iter-1 start
|
|
355
|
+
# before any leader-written prompt).
|
|
356
|
+
if [[ -f "$prompt_file" ]]; then
|
|
357
|
+
local prompt_mtime sig_mtime done_mtime
|
|
358
|
+
prompt_mtime=$(stat -f %m "$prompt_file" 2>/dev/null || stat -c %Y "$prompt_file" 2>/dev/null || print 0)
|
|
359
|
+
sig_mtime=$(stat -f %m "$sig_file" 2>/dev/null || stat -c %Y "$sig_file" 2>/dev/null || print 0)
|
|
360
|
+
done_mtime=$(stat -f %m "$done_file" 2>/dev/null || stat -c %Y "$done_file" 2>/dev/null || print 0)
|
|
361
|
+
if (( sig_mtime <= prompt_mtime )); then
|
|
362
|
+
RECOVERY_FAIL_REASON="iter-signal.json mtime ($sig_mtime) not strictly newer than worker-prompt mtime ($prompt_mtime)"; return 1
|
|
363
|
+
fi
|
|
364
|
+
if (( done_mtime <= prompt_mtime )); then
|
|
365
|
+
RECOVERY_FAIL_REASON="done-claim.json mtime ($done_mtime) not strictly newer than worker-prompt mtime ($prompt_mtime)"; return 1
|
|
366
|
+
fi
|
|
367
|
+
fi
|
|
368
|
+
|
|
369
|
+
return 0
|
|
370
|
+
}
|
|
371
|
+
|
|
288
372
|
# PR-0b-narrow (Plan v6) — stamp leader handshake ack onto the sentinel.
|
|
289
373
|
# Mirror of src/node/shared/fs.mjs::stampAckField. Best-effort, audit-only:
|
|
290
374
|
# any failure is silently swallowed. Sequence:
|
|
@@ -3045,24 +3045,50 @@ main() {
|
|
|
3045
3045
|
return 1
|
|
3046
3046
|
fi
|
|
3047
3047
|
|
|
3048
|
-
#
|
|
3049
|
-
#
|
|
3050
|
-
#
|
|
3051
|
-
|
|
3052
|
-
|
|
3053
|
-
|
|
3054
|
-
|
|
3055
|
-
|
|
3056
|
-
|
|
3057
|
-
|
|
3058
|
-
|
|
3059
|
-
|
|
3060
|
-
|
|
3061
|
-
|
|
3062
|
-
|
|
3063
|
-
|
|
3064
|
-
|
|
3065
|
-
|
|
3048
|
+
# PR-A (Bug #10): operator-recovery hygiene check.
|
|
3049
|
+
# When the operator hand-rolls a `phase=verify` recovery (jq-patches
|
|
3050
|
+
# status.json, writes manual iter-signal.json + done-claim.json, deletes
|
|
3051
|
+
# the blocked sentinel), the leader MUST honor that work instead of
|
|
3052
|
+
# deleting the artifacts and resetting to phase=worker. Mirrors the
|
|
3053
|
+
# Node-side guard in src/node/runner/campaign-main-loop.mjs.
|
|
3054
|
+
local SKIP_NEXT_WORKER=0
|
|
3055
|
+
local LAST_PHASE=""
|
|
3056
|
+
if [[ -f "$STATUS_FILE" ]] && command -v jq >/dev/null 2>&1; then
|
|
3057
|
+
LAST_PHASE=$(jq -r '.phase // ""' "$STATUS_FILE" 2>/dev/null)
|
|
3058
|
+
fi
|
|
3059
|
+
if [[ "$LAST_PHASE" == "verify" ]]; then
|
|
3060
|
+
local _iter_prompt="$LOGS_DIR/iter-$(printf '%03d' $ITERATION).worker-prompt.md"
|
|
3061
|
+
if _validate_operator_recovery_artifacts \
|
|
3062
|
+
"$SIGNAL_FILE" "$DONE_CLAIM_FILE" "$STATUS_FILE" "$_iter_prompt"; then
|
|
3063
|
+
log "[recovery] Resuming verify phase — operator manual recovery detected (iter=$ITERATION)"
|
|
3064
|
+
log_debug "[recovery] iter=$ITERATION skip_worker=true reason=manual_recovery_validated"
|
|
3065
|
+
SKIP_NEXT_WORKER=1
|
|
3066
|
+
else
|
|
3067
|
+
log "[recovery] phase=verify ignored: ${RECOVERY_FAIL_REASON}"
|
|
3068
|
+
log_debug "[recovery] iter=$ITERATION skip_worker=false reason=\"${RECOVERY_FAIL_REASON}\""
|
|
3069
|
+
fi
|
|
3070
|
+
fi
|
|
3071
|
+
|
|
3072
|
+
if (( ! SKIP_NEXT_WORKER )); then
|
|
3073
|
+
# --- governance.md s7 step 8 (cleanup): Clean previous iteration signals ---
|
|
3074
|
+
# Bug #7 Fix-R cleanup: unlock 0o444 sentinels written by the previous
|
|
3075
|
+
# iteration's reaper before rm so cleanup does not log permission noise.
|
|
3076
|
+
_unlock_sentinel "$SIGNAL_FILE"
|
|
3077
|
+
_unlock_sentinel "$VERDICT_FILE"
|
|
3078
|
+
rm -f "$SIGNAL_FILE" "$DONE_CLAIM_FILE" "$VERDICT_FILE" 2>/dev/null
|
|
3079
|
+
rm -f "$WORKER_HEARTBEAT" "$VERIFIER_HEARTBEAT" 2>/dev/null
|
|
3080
|
+
|
|
3081
|
+
# --- Clean previous claude session in panes (one-shot lifecycle) ---
|
|
3082
|
+
# Only needed from iteration 2 onwards (iteration 1 has fresh panes)
|
|
3083
|
+
if (( ITERATION > 1 )); then
|
|
3084
|
+
# Send C-c first (in case claude is mid-task), then /exit
|
|
3085
|
+
tmux send-keys -t "$WORKER_PANE" C-c 2>/dev/null
|
|
3086
|
+
sleep 1
|
|
3087
|
+
tmux send-keys -t "$WORKER_PANE" "/exit" C-m 2>/dev/null
|
|
3088
|
+
sleep 2
|
|
3089
|
+
# Wait for shell prompt before proceeding
|
|
3090
|
+
wait_for_pane_ready "$WORKER_PANE" 10 2>/dev/null || true
|
|
3091
|
+
fi
|
|
3066
3092
|
fi
|
|
3067
3093
|
|
|
3068
3094
|
# Reset per-iteration state
|
|
@@ -3074,33 +3100,44 @@ main() {
|
|
|
3074
3100
|
# --- US-004: detect PRD changes for live update + re-split ---
|
|
3075
3101
|
check_prd_update
|
|
3076
3102
|
|
|
3077
|
-
#
|
|
3078
|
-
|
|
3079
|
-
local worker_prompt="$LOGS_DIR/iter-$(printf '%03d' $ITERATION).worker-prompt.md"
|
|
3080
|
-
|
|
3081
|
-
# AC1: capture worker start timestamp
|
|
3103
|
+
# AC1: capture worker start timestamp (still set for downstream telemetry
|
|
3104
|
+
# even when the worker dispatch is skipped — recovery still consumes time).
|
|
3082
3105
|
ITER_WORKER_START=$(date +%s)
|
|
3083
3106
|
|
|
3084
|
-
|
|
3107
|
+
local worker_launch=""
|
|
3108
|
+
if (( ! SKIP_NEXT_WORKER )); then
|
|
3109
|
+
# --- governance.md s7 step 4: Build worker prompt + trigger ---
|
|
3110
|
+
write_worker_trigger "$ITERATION"
|
|
3111
|
+
local worker_prompt="$LOGS_DIR/iter-$(printf '%03d' $ITERATION).worker-prompt.md"
|
|
3085
3112
|
|
|
3086
|
-
|
|
3087
|
-
log_debug "[FLOW] iter=$ITERATION phase=worker engine=$WORKER_ENGINE model=$WORKER_MODEL dispatched=true"
|
|
3113
|
+
update_status "worker" "running"
|
|
3088
3114
|
|
|
3089
|
-
|
|
3090
|
-
|
|
3091
|
-
|
|
3092
|
-
if
|
|
3093
|
-
|
|
3094
|
-
|
|
3095
|
-
|
|
3115
|
+
# --- governance.md s7 step 5: Execute Worker (dispatched to engine-specific function) ---
|
|
3116
|
+
log_debug "[FLOW] iter=$ITERATION phase=worker engine=$WORKER_ENGINE model=$WORKER_MODEL dispatched=true"
|
|
3117
|
+
|
|
3118
|
+
if [[ "$WORKER_ENGINE" = "codex" ]]; then
|
|
3119
|
+
worker_launch="${CODEX_BIN:-codex} -m $WORKER_CODEX_MODEL -c model_reasoning_effort=\"$WORKER_CODEX_REASONING\" --disable plugins --dangerously-bypass-approvals-and-sandbox"
|
|
3120
|
+
if ! launch_worker_codex "$WORKER_PANE" "$worker_prompt" "$ITERATION" "$worker_launch"; then
|
|
3121
|
+
write_blocked_sentinel "Worker codex failed to start in pane" "" "infra_failure"
|
|
3122
|
+
update_status "blocked" "worker_start_failed"
|
|
3123
|
+
return 1
|
|
3124
|
+
fi
|
|
3125
|
+
else
|
|
3126
|
+
worker_launch="$(build_claude_cmd tui "$WORKER_MODEL" "" "" "$WORKER_EFFORT")"
|
|
3127
|
+
if ! launch_worker_claude "$WORKER_PANE" "$worker_prompt" "$ITERATION" "$worker_launch"; then
|
|
3128
|
+
write_blocked_sentinel "Worker claude failed to start in pane" "" "infra_failure"
|
|
3129
|
+
update_status "blocked" "worker_start_failed"
|
|
3130
|
+
return 1
|
|
3131
|
+
fi
|
|
3096
3132
|
fi
|
|
3097
3133
|
else
|
|
3098
|
-
|
|
3099
|
-
|
|
3100
|
-
|
|
3101
|
-
|
|
3102
|
-
|
|
3103
|
-
|
|
3134
|
+
# PR-A (Bug #10): one-shot recovery path. The operator's iter-signal.json
|
|
3135
|
+
# is already on disk; polling below picks it up immediately and the loop
|
|
3136
|
+
# transitions cleanly into the verifier phase. Persist phase=verify so a
|
|
3137
|
+
# subsequent crash-and-relaunch sees the same contract. SKIP_NEXT_WORKER
|
|
3138
|
+
# is local to this iteration so iter-N+1 dispatches the worker normally.
|
|
3139
|
+
update_status "verify" "running"
|
|
3140
|
+
log "[recovery] Skipping worker dispatch for iter=$ITERATION (one-shot, honoring operator manual recovery)"
|
|
3104
3141
|
fi
|
|
3105
3142
|
|
|
3106
3143
|
# --- governance.md s7 step 5+6: Poll for Worker completion ---
|