npm - synergyspec-selfevolving - Versions diffs - 2.1.0 → 2.1.2 - Mend

synergyspec-selfevolving 2.1.0 → 2.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28)

package/dist/commands/learn.js +29 -3
package/dist/commands/self-evolution-episode.js +37 -1
package/dist/core/fitness/health/local-source.d.ts +11 -0
package/dist/core/fitness/health/local-source.js +53 -1
package/dist/core/project-config.d.ts +5 -0
package/dist/core/project-config.js +23 -1
package/dist/core/self-evolution/critic-agent.d.ts +16 -1
package/dist/core/self-evolution/critic-agent.js +87 -17
package/dist/core/self-evolution/episode-orchestrator.d.ts +28 -0
package/dist/core/self-evolution/episode-orchestrator.js +369 -220
package/dist/core/self-evolution/episode-store.d.ts +41 -2
package/dist/core/self-evolution/episode-store.js +33 -9
package/dist/core/self-evolution/evolving-agent.d.ts +51 -2
package/dist/core/self-evolution/evolving-agent.js +45 -4
package/dist/core/self-evolution/host-harness.d.ts +43 -0
package/dist/core/self-evolution/host-harness.js +192 -0
package/dist/core/self-evolution/reward-agent.d.ts +68 -0
package/dist/core/self-evolution/reward-agent.js +92 -23
package/dist/core/self-evolution/reward-aggregator.d.ts +26 -7
package/dist/core/self-evolution/reward-aggregator.js +78 -20
package/dist/core/self-evolution/verdict.d.ts +3 -2
package/dist/core/self-evolution/verdict.js +4 -1
package/dist/dashboard/react-client.js +2 -1
package/dist/ui/ascii-patterns.d.ts +7 -8
package/dist/ui/ascii-patterns.js +54 -120
package/dist/ui/welcome-screen.d.ts +8 -0
package/dist/ui/welcome-screen.js +2 -2
package/package.json +1 -1

package/dist/core/self-evolution/episode-orchestrator.js CHANGED

@@ -27,6 +27,18 @@ import { reconcilePrediction, summarizeCalibration, } from './policy/prediction-
 export async function captureMainArm(opts) {
     const sample = opts.report.fitnessSample;
     const facts = sample?.trajectoryFacts;
+    // ④ Observable degrade: a verified:false arm — whether because NO
+    // observed-trajectory facts were captured, OR facts exist but the runner's
+    // pass/fail was not derivable (`facts.verified !== true`) — is surfaced on
+    // stderr so it is never SILENT. A wedged/missing/unextractable runner is the
+    // most common loop-stall cause, and a silent false reads identically to a real
+    // miss. The arm's recorded `verified` collapses to false in BOTH cases (see the
+    // `objective.verified` below), so EVERY verified:false arm warns exactly once;
+    // a genuinely verified arm (`facts.verified === true`) stays quiet.
+    if (!facts || facts.verified !== true) {
+        // eslint-disable-next-line no-console
+        console.warn(`[episode-orchestrator] observed grading unavailable for change "${opts.changeName}" — recording verified:false (observed run not verified)`);
+    }
     // Honesty: prefer the OBSERVED pass rate (a real runner ran), else the
     // authored test-report summary; null when neither parsed (never fabricated) —
     // the exact precedence learn.ts uses to compute the loss.
@@ -80,14 +92,30 @@ export async function captureMainArm(opts) {
  * Whether the episode SKIPS the rollback decision + 演进智能体 EVOLVING AGENT:
  * the judge 弃权 abstained (no nameable gap), found no gaps, OR returned the
  * ⑤ `insufficient-signal` verdict (within the A/A noise floor, or a blocked
- * tamper). All three mean "do not evolve on this episode".
+ * tamper). These mean "do not evolve on this episode".
+ *
+ * EXCEPTION (cold-start bootstrap): `insufficient-signal` is honored ONLY as a
+ * genuine can't-tell — a within-noise-floor result (the baseline ran) or a blocked
+ * tamper. On a baseline-skipped episode (policyVersions.baseline === null) there is
+ * no comparison to be uncertain about, so a stray `insufficient-signal` emitted
+ * alongside real gaps (and no tamper) must NOT block: the first v0→v1 evolution has
+ * to be reachable from absolute signal, or a fresh target stays at v0 forever. This
+ * is defense-in-depth behind {@link deriveSingleSampleVerdict}, which already drops
+ * a volunteered verdict to `undefined` on a baseline skip.
  */
 function shouldSkipEvolution(diagnosis) {
     if (diagnosis === null)
         return true;
-    return (diagnosis.abstained ||
-        diagnosis.gaps.length === 0 ||
-        diagnosis.verdict === 'insufficient-signal');
+    if (diagnosis.abstained || diagnosis.gaps.length === 0)
+        return true;
+    if (diagnosis.verdict === 'insufficient-signal') {
+        const baselineSkipped = diagnosis.policyVersions?.baseline === null;
+        const tamper = diagnosis.integrity?.testTamperSuspected ?? false;
+        if (baselineSkipped && !tamper)
+            return false;
+        return true;
+    }
+    return false;
 }
 /**
  * Count the consecutive trailing rolled-back episodes in the 版本账本 ledger.
@@ -152,6 +180,17 @@ function deriveEpisodeId(changeName, now) {
         .replace(/-{2,}/g, '-')
         .replace(/^-+|-+$/g, '');
 }
+/**
+ * Build the `terminalError` note for a thrown step. `err.message` is used for
+ * an Error, `String(err)` otherwise. A message matching /timed out/i (e.g. the
+ * host-agent spawn timeout's `headless agent timed out after Nms`) is PREFIXED
+ * with a `timeout:` marker, so a timed-out episode is distinguishable from a
+ * genuine crash in episode.json; any other message is returned unchanged. Pure.
+ */
+function terminalErrorLabel(err) {
+    const msg = err instanceof Error ? err.message : String(err);
+    return /timed out/i.test(msg) ? `timeout: ${msg}` : msg;
+}
 /**
  * Run ONE episode through the loop in the strict, durably-persisted order
  * documented at the top of this module. See {@link RunEpisodeResult}.
@@ -251,149 +290,180 @@ async function runEpisodeAfterCreate(opts) {
         objective: opts.mainArm.objective,
     });
     await advanceEpisodeStage({ repoRoot, episodeId, stage: 'main-arm-captured' });
-    // ── d: CRITIC AGENT（基线智能体 baseline agent）or skip ───────────────────────
-    const shouldCritic = await shouldRunCriticAgent({ repoRoot, targetId });
-    if (shouldCritic.run && shouldCritic.baselineVersion !== null) {
-        // runCriticAgent advances the episode to 'baseline-arm-captured'.
-        await runCriticAgent({
-            repoRoot,
-            targetId,
-            changeName: opts.changeName,
-            episodeId,
-            baselineVersion: shouldCritic.baselineVersion,
-            ...(opts.critic?.baselineMode ? { baselineMode: opts.critic.baselineMode } : {}),
-            spawn: opts.spawn,
-        });
-    }
-    else {
-        baselineSkipped = true;
-        await advanceEpisodeStage({
-            repoRoot,
-            episodeId,
-            stage: 'baseline-skipped',
-            patch: { baselineSkippedReason: shouldCritic.reason },
-        });
-    }
-    // ── e: 奖励智能体 REWARD AGENT — score + diagnosis.json + advance 'scored' ────
-    // ④ Compute the tamper hint first (unless tamperCheck is 'off'); the ensemble
-    // injects it into the judge prompt + the diagnosis integrity, and BLOCK mode
-    // forces an insufficient-signal verdict (no extra spawns at the default
-    // samples=1, flag-only).
-    const tamperMode = opts.reward?.tamperCheck ?? 'flag';
-    const integrityHint = tamperMode === 'off' ? null : await detectTestTamper({ changeDirPath: opts.changeDirPath });
-    const reward = await runRewardAgentEnsemble({
-        repoRoot,
-        episodeId,
-        spawn: opts.spawn,
-        ...(opts.reward ? { reward: opts.reward } : {}),
-        integrityHint,
-    });
-    const diagnosis = reward.diagnosis;
-    advantage = diagnosis.advantage;
-    // 预测校准: settle the PRIOR 'evolve' step's checkable prediction against this
-    // episode's measured arm delta (verifiable per-metric main−baseline, NOT the
-    // judge's reward) and log the residual. Best-effort — a calibration miss must
-    // never fail the episode.
+    // Steps d–g spawn the three agents (any of which may THROW — a wedged/crashed
+    // host CLI, a never-validating reward output, an evolving-agent invocation
+    // error). A thrown agent step must record a DURABLE terminal 'errored' stage
+    // (with the error text) so the episode is never orphaned at 'kept'/'scored'
+    // (indistinguishable from a still-running episode — ses_1330/1331). The lock
+    // release stays in runEpisode's finally; the re-throw below reaches it.
     try {
-        await reconcilePrediction({ repoRoot, targetId, episodeId });
-    }
-    catch {
-        // best-effort: the prediction-reconcile ledger is advisory only
-    }
-    // ── f: DECISION (every step durably persisted before the next) ───────────────
-    if (shouldSkipEvolution(diagnosis)) {
-        // 弃权 abstained / no nameable gap / insufficient-signal → no rollback
-        // decision; SKIP evolution. A BLOCKED tamper records a reject-buffer entry so
-        // the gamed attempt is visible to future episodes' 演进智能体.
-        if (tamperMode === 'block' &&
-            diagnosis.integrity?.testTamperSuspected &&
-            diagnosis.verdict === 'insufficient-signal') {
-            const head = await currentPolicyVersion(repoRoot, targetId);
-            await appendRejectBufferEntry(repoRoot, {
-                schemaVersion: 1,
-                at: new Date().toISOString(),
-                episodeId,
-                targetId,
-                // No version moved (the main arm's tampered tests are not a policy edit);
-                // record at the current head so the entry is informational, not a rollback.
-                fromVersion: head ?? 0,
-                toVersion: head ?? 0,
-                advantage: diagnosis.advantage,
-                rewardMain: diagnosis.rewardMain,
-                rewardBaseline: diagnosis.rewardBaseline,
-                textualGradientTried: diagnosis.textualGradient ?? '',
-                editSummary: buildRejectEditSummary(diagnosis),
-                reason: 'tamper-suspected',
-            });
-        }
-        decision = 'abstained';
-        await advanceEpisodeStage({ repoRoot, episodeId, stage: 'abstained' });
-    }
-    else {
-        const badAdvantage = advantage !== null && advantage < threshold;
-        const ep = await readEpisode(repoRoot, episodeId);
-        const headBeforeRollback = await currentPolicyVersion(repoRoot, targetId);
-        // Resolve the rollback target: the policy the CRITIC AGENT reran
-        // (`policyVersionBaseline`) when it is a valid EARLIER version, else the
-        // version immediately before the head (the prior good policy the bad edit
-        // advanced past). `rollbackPolicyVersion` requires `toVersion < head`.
-        const rollbackTarget = resolveRollbackTarget(ep.policyVersionBaseline, headBeforeRollback);
-        if (badAdvantage && rollbackTarget !== null && headBeforeRollback !== null) {
-            // (i) ROLLBACK first — durable on disk. `advantage` is non-null inside the
-            // badAdvantage branch; `?? undefined` satisfies the optional `number` param.
-            await rollbackPolicyVersion({
+        // ── d: CRITIC AGENT（基线智能体 baseline agent）or skip ───────────────────────
+        const shouldCritic = await shouldRunCriticAgent({ repoRoot, targetId });
+        if (shouldCritic.run && shouldCritic.baselineVersion !== null) {
+            // runCriticAgent advances the episode to 'baseline-arm-captured'.
+            await runCriticAgent({
                 repoRoot,
                 targetId,
+                changeName: opts.changeName,
                 episodeId,
-                toVersion: rollbackTarget,
-                advantage: advantage ?? undefined,
+                baselineVersion: shouldCritic.baselineVersion,
+                ...(opts.critic?.baselineMode ? { baselineMode: opts.critic.baselineMode } : {}),
+                spawn: opts.spawn,
+                ...(opts.agentTimeoutMs !== undefined ? { timeoutMs: opts.agentTimeoutMs } : {}),
+                ...(opts.harness ? { harness: opts.harness } : {}),
             });
-            // (ii) THEN append the 否决缓冲 reject-buffer entry — durable on disk —
-            // BEFORE the 演进智能体 EVOLVING AGENT is even called, so the entry written
-            // THIS episode is in its fresh-from-disk prompt.
-            const rejectEntry = {
-                schemaVersion: 1,
-                at: new Date().toISOString(),
-                episodeId,
-                targetId,
-                // fromVersion = the version we rolled back TO (the prior good policy);
-                // toVersion = the (now rolled-back) version the rejected edit reached.
-                fromVersion: rollbackTarget,
-                toVersion: headBeforeRollback,
-                advantage,
-                rewardMain: diagnosis.rewardMain,
-                rewardBaseline: diagnosis.rewardBaseline,
-                textualGradientTried: diagnosis.textualGradient ?? '',
-                editSummary: buildRejectEditSummary(diagnosis),
-                reason: 'bad-advantage',
-            };
-            await appendRejectBufferEntry(repoRoot, rejectEntry);
-            decision = 'rolled-back';
-            await advanceEpisodeStage({ repoRoot, episodeId, stage: 'rolled-back' });
         }
         else {
-            // Good advantage, OR no earlier version to roll back to (e.g. head is v0):
-            // keep the current head.
-            decision = 'kept';
-            await advanceEpisodeStage({ repoRoot, episodeId, stage: 'kept' });
+            baselineSkipped = true;
+            await advanceEpisodeStage({
+                repoRoot,
+                episodeId,
+                stage: 'baseline-skipped',
+                patch: { baselineSkippedReason: shouldCritic.reason },
+            });
         }
-        // ── g: ONLY AFTER (f) persisted 'rolled-back'/'kept' ───────────────────────
-        // runEvolvingAgent reads the reject-buffer FRESH from disk (the entry just
-        // written THIS episode is in its prompt). Never parallelized with (f).
-        // 步长: after a rollback, shrink the edit budget (smaller step after a step
-        // that lost ground). 预测校准: pass the proposer's recent prediction record.
-        const scheduledBudget = decision === 'rolled-back'
-            ? scheduledEditBudget(await readPolicyLedger(repoRoot, targetId), editBudget)
-            : editBudget;
-        const calibrationNote = await summarizeCalibration(repoRoot, targetId);
-        evolution = await runEvolvingAgent({
+        // ── e: 奖励智能体 REWARD AGENT — score + diagnosis.json + advance 'scored' ────
+        // ④ Compute the tamper hint first (unless tamperCheck is 'off'); the ensemble
+        // injects it into the judge prompt + the diagnosis integrity, and BLOCK mode
+        // forces an insufficient-signal verdict (no extra spawns at the default
+        // samples=1, flag-only).
+        const tamperMode = opts.reward?.tamperCheck ?? 'flag';
+        const integrityHint = tamperMode === 'off' ? null : await detectTestTamper({ changeDirPath: opts.changeDirPath });
+        const reward = await runRewardAgentEnsemble({
             repoRoot,
             episodeId,
-            targetId,
-            editBudget: scheduledBudget,
-            ...(calibrationNote ? { calibrationNote } : {}),
             spawn: opts.spawn,
+            ...(opts.reward ? { reward: opts.reward } : {}),
+            integrityHint,
+            ...(opts.agentTimeoutMs !== undefined ? { timeoutMs: opts.agentTimeoutMs } : {}),
+            ...(opts.harness ? { harness: opts.harness } : {}),
         });
+        const diagnosis = reward.diagnosis;
+        advantage = diagnosis.advantage;
+        // 预测校准: settle the PRIOR 'evolve' step's checkable prediction against this
+        // episode's measured arm delta (verifiable per-metric main−baseline, NOT the
+        // judge's reward) and log the residual. Best-effort — a calibration miss must
+        // never fail the episode.
+        try {
+            await reconcilePrediction({ repoRoot, targetId, episodeId });
+        }
+        catch {
+            // best-effort: the prediction-reconcile ledger is advisory only
+        }
+        // ── f: DECISION (every step durably persisted before the next) ───────────────
+        if (shouldSkipEvolution(diagnosis)) {
+            // 弃权 abstained / no nameable gap / insufficient-signal → no rollback
+            // decision; SKIP evolution. A BLOCKED tamper records a reject-buffer entry so
+            // the gamed attempt is visible to future episodes' 演进智能体.
+            if (tamperMode === 'block' &&
+                diagnosis.integrity?.testTamperSuspected &&
+                diagnosis.verdict === 'insufficient-signal') {
+                const head = await currentPolicyVersion(repoRoot, targetId);
+                await appendRejectBufferEntry(repoRoot, {
+                    schemaVersion: 1,
+                    at: new Date().toISOString(),
+                    episodeId,
+                    targetId,
+                    // No version moved (the main arm's tampered tests are not a policy edit);
+                    // record at the current head so the entry is informational, not a rollback.
+                    fromVersion: head ?? 0,
+                    toVersion: head ?? 0,
+                    advantage: diagnosis.advantage,
+                    rewardMain: diagnosis.rewardMain,
+                    rewardBaseline: diagnosis.rewardBaseline,
+                    textualGradientTried: diagnosis.textualGradient ?? '',
+                    editSummary: buildRejectEditSummary(diagnosis),
+                    reason: 'tamper-suspected',
+                });
+            }
+            decision = 'abstained';
+            await advanceEpisodeStage({ repoRoot, episodeId, stage: 'abstained' });
+        }
+        else {
+            const badAdvantage = advantage !== null && advantage < threshold;
+            const ep = await readEpisode(repoRoot, episodeId);
+            const headBeforeRollback = await currentPolicyVersion(repoRoot, targetId);
+            // Resolve the rollback target: the policy the CRITIC AGENT reran
+            // (`policyVersionBaseline`) when it is a valid EARLIER version, else the
+            // version immediately before the head (the prior good policy the bad edit
+            // advanced past). `rollbackPolicyVersion` requires `toVersion < head`.
+            const rollbackTarget = resolveRollbackTarget(ep.policyVersionBaseline, headBeforeRollback);
+            if (badAdvantage && rollbackTarget !== null && headBeforeRollback !== null) {
+                // (i) ROLLBACK first — durable on disk. `advantage` is non-null inside the
+                // badAdvantage branch; `?? undefined` satisfies the optional `number` param.
+                await rollbackPolicyVersion({
+                    repoRoot,
+                    targetId,
+                    episodeId,
+                    toVersion: rollbackTarget,
+                    advantage: advantage ?? undefined,
+                });
+                // (ii) THEN append the 否决缓冲 reject-buffer entry — durable on disk —
+                // BEFORE the 演进智能体 EVOLVING AGENT is even called, so the entry written
+                // THIS episode is in its fresh-from-disk prompt.
+                const rejectEntry = {
+                    schemaVersion: 1,
+                    at: new Date().toISOString(),
+                    episodeId,
+                    targetId,
+                    // fromVersion = the version we rolled back TO (the prior good policy);
+                    // toVersion = the (now rolled-back) version the rejected edit reached.
+                    fromVersion: rollbackTarget,
+                    toVersion: headBeforeRollback,
+                    advantage,
+                    rewardMain: diagnosis.rewardMain,
+                    rewardBaseline: diagnosis.rewardBaseline,
+                    textualGradientTried: diagnosis.textualGradient ?? '',
+                    editSummary: buildRejectEditSummary(diagnosis),
+                    reason: 'bad-advantage',
+                };
+                await appendRejectBufferEntry(repoRoot, rejectEntry);
+                decision = 'rolled-back';
+                await advanceEpisodeStage({ repoRoot, episodeId, stage: 'rolled-back' });
+            }
+            else {
+                // Good advantage, OR no earlier version to roll back to (e.g. head is v0):
+                // keep the current head.
+                decision = 'kept';
+                await advanceEpisodeStage({ repoRoot, episodeId, stage: 'kept' });
+            }
+            // ── g: ONLY AFTER (f) persisted 'rolled-back'/'kept' ───────────────────────
+            // runEvolvingAgent reads the reject-buffer FRESH from disk (the entry just
+            // written THIS episode is in its prompt). Never parallelized with (f).
+            // 步长: after a rollback, shrink the edit budget (smaller step after a step
+            // that lost ground). 预测校准: pass the proposer's recent prediction record.
+            const scheduledBudget = decision === 'rolled-back'
+                ? scheduledEditBudget(await readPolicyLedger(repoRoot, targetId), editBudget)
+                : editBudget;
+            const calibrationNote = await summarizeCalibration(repoRoot, targetId);
+            evolution = await runEvolvingAgent({
+                repoRoot,
+                episodeId,
+                targetId,
+                editBudget: scheduledBudget,
+                ...(calibrationNote ? { calibrationNote } : {}),
+                spawn: opts.spawn,
+                ...(opts.agentTimeoutMs !== undefined ? { timeoutMs: opts.agentTimeoutMs } : {}),
+                ...(opts.harness ? { harness: opts.harness } : {}),
+            });
+        }
+    }
+    catch (err) {
+        // A thrown agent step (d–g) records a DURABLE terminal 'errored' stage so the
+        // episode is never orphaned at 'kept'/'scored' (which is indistinguishable
+        // from a still-running episode — the ses_1330/1331 wsgidav orphan). The
+        // 'errored' stage is reachable from EVERY non-terminal stage; the patch
+        // carries the error text (prefixed `timeout:` when the throw was a host-agent
+        // timeout, so a timed-out episode is distinguishable from a hard crash on
+        // disk). Best-effort: a failed record write must not mask the original throw,
+        // which still propagates to runEpisode's finally (lock release).
+        await advanceEpisodeStage({
+            repoRoot,
+            episodeId,
+            stage: 'errored',
+            patch: { terminalError: terminalErrorLabel(err) },
+        }).catch(() => { });
+        throw err;
     }
     // ── h (stage half): advance 'closed' (best-effort) ───────────────────────────
     await closeEpisodeBestEffort(repoRoot, episodeId);
@@ -401,11 +471,25 @@ async function runEpisodeAfterCreate(opts) {
     return { episodeId, baselineSkipped, advantage, decision, evolution, newPolicyVersion };
 }
 /**
- * Advance the episode to 'closed' from whatever terminal-ish stage it reached
- * (evolved | evolution-refused | abstained), best-effort: a stage that cannot
- * legally reach 'closed' (e.g. the evolving agent was not-spawned, leaving the
- * episode at 'kept'/'rolled-back') is left as-is rather than throwing, so the
- * close never masks the real episode outcome.
+ * Advance the episode to 'closed' from whatever terminal-ish stage it reached,
+ * best-effort.
+ *
+ * Closable stages:
+ *   - evolved | evolution-refused | abstained — the 演进智能体 EVOLVING AGENT
+ *     reached a definite outcome (or the judge 弃权 abstained), the normal close.
+ *   - kept | rolled-back — the 演进智能体 returned not-spawned (its diagnosis
+ *     abstained-after-gap-check, no gaps, or the target resolved to no editable
+ *     local files), so the episode never advanced past the decision. By the time
+ *     this runs (AFTER runEvolvingAgent returned), a stage still at 'kept'/
+ *     'rolled-back' can ONLY mean not-spawned — a success advances 'evolved', a
+ *     refusal advances 'evolution-refused', and a throw is caught upstream and
+ *     records 'errored' + rethrows so this close is never reached. So a leftover
+ *     kept/rolled-back at close time IS the finished-nothing-to-evolve case and
+ *     must close, not rest forever at a non-terminal stage (the exact ambiguity
+ *     the 'errored' stage was meant to remove).
+ *
+ * Any other (genuinely non-closable) stage is left as-is rather than throwing, so
+ * the close never masks the real episode outcome.
  */
 async function closeEpisodeBestEffort(repoRoot, episodeId) {
     const ep = await readEpisode(repoRoot, episodeId);
@@ -413,6 +497,9 @@ async function closeEpisodeBestEffort(repoRoot, episodeId) {
         'evolved',
         'evolution-refused',
         'abstained',
+        // not-spawned 演进智能体 leaves the episode here — close the finished episode.
+        'kept',
+        'rolled-back',
     ]);
     if (closable.has(ep.stage)) {
         await advanceEpisodeStage({ repoRoot, episodeId, stage: 'closed' });
@@ -501,6 +588,16 @@ async function ensureRejectBufferEntry(repoRoot, opts) {
  *   - 'scored'                    → run the decision (f) then the 演进智能体 (g).
  *   - 'rolled-back' / 'kept'      → run the 演进智能体 EVOLVING AGENT (g) then close.
  *   - 'evolved'/'evolution-refused'/'abstained' → close.
+ *   - 'errored'                   → RE-DRIVE from the last GOOD pre-error stage
+ *                                   (an episode may have errored on a TRANSIENT
+ *                                   cause — a one-off git/analyzer/agent timeout).
+ *                                   The pre-error stage is the last `stageHistory`
+ *                                   entry that is NOT 'errored'; when it is one of
+ *                                   {'scored','rolled-back','kept'} (the
+ *                                   resume-entry stages) we advance errored → that
+ *                                   stage and fall through to the normal dispatch.
+ *                                   Otherwise the pre-error stage is not
+ *                                   auto-resumable and the episode is reported as-is.
  *   - earlier stages              → not auto-resumable here (the arms / reward
  *                                   agent need their own re-entry); reported as-is.
  *
@@ -516,78 +613,135 @@ export async function resumeEpisode(opts) {
     const resumedFrom = ep.stage;
     const targetId = ep.targetId;
     let evolution = null;
-    if (ep.stage === 'scored') {
-        // Re-run the decision (f) from the on-disk diagnosis, then (g).
-        const diagnosis = await readDiagnosisForResume(repoRoot, episodeId);
-        if (shouldSkipEvolution(diagnosis)) {
-            await advanceEpisodeStage({ repoRoot, episodeId, stage: 'abstained' });
+    // The effective stage we dispatch on. Normally the episode's current stage;
+    // for an 'errored' episode we attempt to RE-DRIVE from the last good pre-error
+    // stage (a transient git/analyzer/agent failure should be retryable via an
+    // operator resume). 'errored' stays terminal for every OTHER caller — only this
+    // resume path may re-drive it, via the errored → {scored,rolled-back,kept}
+    // transitions the stage machine allows ONLY for operator recovery.
+    let stage = ep.stage;
+    if (ep.stage === 'errored') {
+        const preError = [...ep.stageHistory]
+            .reverse()
+            .find((h) => h.stage !== 'errored')?.stage;
+        if (preError === 'scored' ||
+            preError === 'rolled-back' ||
+            preError === 'kept') {
+            // Re-open the errored episode at its last auto-resumable stage, then fall
+            // through to the normal dispatch for that stage.
+            await advanceEpisodeStage({ repoRoot, episodeId, stage: preError });
+            stage = preError;
         }
-        else if (diagnosis) {
-            // (shouldSkipEvolution returns true for null, so diagnosis is non-null here)
-            const advantage = diagnosis.advantage;
-            const badAdvantage = advantage !== null && advantage < threshold;
-            // Crash-resume dedup: a 'rollback' ledger entry whose episodeId is THIS
-            // episode means runEpisode already applied the rollback before the host
-            // crashed (the rollback head is monotonic — re-calling rollbackPolicyVersion
-            // would stack a SECOND, duplicate rollback version). When present, reuse its
-            // recorded version axis and SKIP the re-rollback; only ensure the
-            // reject-buffer entry + the 'rolled-back' stage advance complete.
-            const ledger = await readPolicyLedger(repoRoot, targetId);
-            const priorRollback = ledger.find((e) => e.action === 'rollback' && e.episodeId === episodeId);
-            if (priorRollback) {
-                // The prior rollback already advanced to `priorRollback.version`, rolling
-                // FORWARD to the content of the version immediately before the rejected
-                // edit's head. Reconstruct the reject-buffer axis from that entry:
-                //   toVersion   = the (rolled-back) version the rejected edit reached
-                //               = priorRollback.version - 1 (the head before the rollback)
-                //   fromVersion = the prior good policy restored (one before that head)
-                const toVersion = priorRollback.version - 1;
-                const fromVersion = resolveRollbackTarget(ep.policyVersionBaseline, toVersion);
-                await ensureRejectBufferEntry(repoRoot, {
-                    episodeId,
-                    targetId,
-                    fromVersion: fromVersion ?? toVersion,
-                    toVersion,
-                    advantage,
-                    diagnosis,
-                });
-                await advanceEpisodeStage({ repoRoot, episodeId, stage: 'rolled-back' });
+        // Otherwise the pre-error stage is not auto-resumable (e.g. a reward throw at
+        // 'baseline-skipped'); leave the episode at 'errored' and report it as-is.
+    }
+    // The decision (f) + 演进智能体 EVOLVING AGENT (g) re-runs below can THROW — a
+    // wedged/crashed host CLI (CanonicalProposerInvocationError), a timeout, or an
+    // observed-GREEN gate throw. UNCAUGHT, that leaves the episode DURABLY stuck at
+    // a non-terminal stage ('scored'/'rolled-back'/'kept' — the orphan state fix ❷
+    // eliminates for runEpisode). Record the SAME terminal 'errored' stage here
+    // (the transition map already allows scored/rolled-back/kept → 'errored'), then
+    // re-throw. Resume holds NO in-flight lock, so this is a durable-stage fix, not
+    // a leak fix. Best-effort write: a failed record must not mask the original throw.
+    try {
+        if (stage === 'scored') {
+            // Re-run the decision (f) from the on-disk diagnosis, then (g).
+            const diagnosis = await readDiagnosisForResume(repoRoot, episodeId);
+            if (shouldSkipEvolution(diagnosis)) {
+                await advanceEpisodeStage({ repoRoot, episodeId, stage: 'abstained' });
             }
-            else {
-                const headBeforeRollback = await currentPolicyVersion(repoRoot, targetId);
-                const rollbackTarget = resolveRollbackTarget(ep.policyVersionBaseline, headBeforeRollback);
-                if (badAdvantage && rollbackTarget !== null && headBeforeRollback !== null) {
-                    await rollbackPolicyVersion({
-                        repoRoot,
-                        targetId,
-                        episodeId,
-                        toVersion: rollbackTarget,
-                        advantage: advantage ?? undefined,
-                    });
+            else if (diagnosis) {
+                // (shouldSkipEvolution returns true for null, so diagnosis is non-null here)
+                const advantage = diagnosis.advantage;
+                const badAdvantage = advantage !== null && advantage < threshold;
+                // Crash-resume dedup: a 'rollback' ledger entry whose episodeId is THIS
+                // episode means runEpisode already applied the rollback before the host
+                // crashed (the rollback head is monotonic — re-calling rollbackPolicyVersion
+                // would stack a SECOND, duplicate rollback version). When present, reuse its
+                // recorded version axis and SKIP the re-rollback; only ensure the
+                // reject-buffer entry + the 'rolled-back' stage advance complete.
+                const ledger = await readPolicyLedger(repoRoot, targetId);
+                const priorRollback = ledger.find((e) => e.action === 'rollback' && e.episodeId === episodeId);
+                if (priorRollback) {
+                    // The prior rollback already advanced to `priorRollback.version`, rolling
+                    // FORWARD to the content of the version immediately before the rejected
+                    // edit's head. Reconstruct the reject-buffer axis from that entry:
+                    //   toVersion   = the (rolled-back) version the rejected edit reached
+                    //               = priorRollback.version - 1 (the head before the rollback)
+                    //   fromVersion = the prior good policy restored (one before that head)
+                    const toVersion = priorRollback.version - 1;
+                    const fromVersion = resolveRollbackTarget(ep.policyVersionBaseline, toVersion);
                     await ensureRejectBufferEntry(repoRoot, {
                         episodeId,
                         targetId,
-                        fromVersion: rollbackTarget,
-                        toVersion: headBeforeRollback,
+                        fromVersion: fromVersion ?? toVersion,
+                        toVersion,
                         advantage,
                         diagnosis,
                     });
                     await advanceEpisodeStage({ repoRoot, episodeId, stage: 'rolled-back' });
                 }
                 else {
-                    await advanceEpisodeStage({ repoRoot, episodeId, stage: 'kept' });
+                    const headBeforeRollback = await currentPolicyVersion(repoRoot, targetId);
+                    const rollbackTarget = resolveRollbackTarget(ep.policyVersionBaseline, headBeforeRollback);
+                    if (badAdvantage && rollbackTarget !== null && headBeforeRollback !== null) {
+                        await rollbackPolicyVersion({
+                            repoRoot,
+                            targetId,
+                            episodeId,
+                            toVersion: rollbackTarget,
+                            advantage: advantage ?? undefined,
+                        });
+                        await ensureRejectBufferEntry(repoRoot, {
+                            episodeId,
+                            targetId,
+                            fromVersion: rollbackTarget,
+                            toVersion: headBeforeRollback,
+                            advantage,
+                            diagnosis,
+                        });
+                        await advanceEpisodeStage({ repoRoot, episodeId, stage: 'rolled-back' });
+                    }
+                    else {
+                        await advanceEpisodeStage({ repoRoot, episodeId, stage: 'kept' });
+                    }
+                }
+                // 预测校准 (idempotent if the original run already settled it) + 步长 schedule
+                // + calibration note, mirroring runEpisode's (g) step.
+                try {
+                    await reconcilePrediction({ repoRoot, targetId, episodeId });
                 }
+                catch {
+                    // best-effort: advisory only
+                }
+                const afterDecision = await readEpisode(repoRoot, episodeId);
+                const scheduledBudget = afterDecision.stage === 'rolled-back'
+                    ? scheduledEditBudget(await readPolicyLedger(repoRoot, targetId), editBudget)
+                    : editBudget;
+                const calibrationNote = await summarizeCalibration(repoRoot, targetId);
+                evolution = await runEvolvingAgent({
+                    repoRoot,
+                    episodeId,
+                    targetId,
+                    editBudget: scheduledBudget,
+                    ...(calibrationNote ? { calibrationNote } : {}),
+                    spawn: opts.spawn,
+                    ...(opts.agentTimeoutMs !== undefined ? { timeoutMs: opts.agentTimeoutMs } : {}),
+                    ...(opts.harness ? { harness: opts.harness } : {}),
+                });
             }
-            // 预测校准 (idempotent if the original run already settled it) + 步长 schedule
-            // + calibration note, mirroring runEpisode's (g) step.
+            await closeEpisodeBestEffort(repoRoot, episodeId);
+        }
+        else if (stage === 'rolled-back' || stage === 'kept') {
+            // The decision already ran (and the original episode settled the prediction);
+            // re-settle idempotently for the crash window, then schedule + calibrate.
             try {
                 await reconcilePrediction({ repoRoot, targetId, episodeId });
             }
             catch {
                 // best-effort: advisory only
             }
-            const afterDecision = await readEpisode(repoRoot, episodeId);
-            const scheduledBudget = afterDecision.stage === 'rolled-back'
+            const scheduledBudget = stage === 'rolled-back'
                 ? scheduledEditBudget(await readPolicyLedger(repoRoot, targetId), editBudget)
                 : editBudget;
             const calibrationNote = await summarizeCalibration(repoRoot, targetId);
@@ -598,39 +752,34 @@ export async function resumeEpisode(opts) {
                 editBudget: scheduledBudget,
                 ...(calibrationNote ? { calibrationNote } : {}),
                 spawn: opts.spawn,
+                ...(opts.agentTimeoutMs !== undefined ? { timeoutMs: opts.agentTimeoutMs } : {}),
+                ...(opts.harness ? { harness: opts.harness } : {}),
             });
+            await closeEpisodeBestEffort(repoRoot, episodeId);
         }
-        await closeEpisodeBestEffort(repoRoot, episodeId);
-    }
-    else if (ep.stage === 'rolled-back' || ep.stage === 'kept') {
-        // The decision already ran (and the original episode settled the prediction);
-        // re-settle idempotently for the crash window, then schedule + calibrate.
-        try {
-            await reconcilePrediction({ repoRoot, targetId, episodeId });
-        }
-        catch {
-            // best-effort: advisory only
+        else if (stage === 'evolved' ||
+            stage === 'evolution-refused' ||
+            stage === 'abstained') {
+            await closeEpisodeBestEffort(repoRoot, episodeId);
         }
-        const scheduledBudget = ep.stage === 'rolled-back'
-            ? scheduledEditBudget(await readPolicyLedger(repoRoot, targetId), editBudget)
-            : editBudget;
-        const calibrationNote = await summarizeCalibration(repoRoot, targetId);
-        evolution = await runEvolvingAgent({
+        // earlier stages (and a non-auto-resumable 'errored'): not auto-resumable here
+        // — reported as-is.
+    }
+    catch (err) {
+        // A thrown decision/evolving step records a DURABLE terminal 'errored' stage so
+        // the resumed episode is never left stuck at 'scored'/'rolled-back'/'kept'
+        // (indistinguishable from a still-running episode). Mirrors
+        // runEpisodeAfterCreate's catch — including the `timeout:` marker so a timed-out
+        // resume is distinguishable from a hard crash. Best-effort: a failed record must
+        // not mask the original throw, which still propagates to the caller.
+        await advanceEpisodeStage({
             repoRoot,
             episodeId,
-            targetId,
-            editBudget: scheduledBudget,
-            ...(calibrationNote ? { calibrationNote } : {}),
-            spawn: opts.spawn,
-        });
-        await closeEpisodeBestEffort(repoRoot, episodeId);
-    }
-    else if (ep.stage === 'evolved' ||
-        ep.stage === 'evolution-refused' ||
-        ep.stage === 'abstained') {
-        await closeEpisodeBestEffort(repoRoot, episodeId);
+            stage: 'errored',
+            patch: { terminalError: terminalErrorLabel(err) },
+        }).catch(() => { });
+        throw err;
     }
-    // earlier stages: not auto-resumable here — reported as-is.
     const after = await readEpisode(repoRoot, episodeId);
     return { episodeId, resumedFrom, stage: after.stage, evolution };
 }