synergyspec-selfevolving 2.1.1 → 2.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -27,6 +27,18 @@ import { reconcilePrediction, summarizeCalibration, } from './policy/prediction-
27
27
  export async function captureMainArm(opts) {
28
28
  const sample = opts.report.fitnessSample;
29
29
  const facts = sample?.trajectoryFacts;
30
+ // ④ Observable degrade: a verified:false arm — whether because NO
31
+ // observed-trajectory facts were captured, OR facts exist but the runner's
32
+ // pass/fail was not derivable (`facts.verified !== true`) — is surfaced on
33
+ // stderr so it is never SILENT. A wedged/missing/unextractable runner is the
34
+ // most common loop-stall cause, and a silent false reads identically to a real
35
+ // miss. The arm's recorded `verified` collapses to false in BOTH cases (see the
36
+ // `objective.verified` below), so EVERY verified:false arm warns exactly once;
37
+ // a genuinely verified arm (`facts.verified === true`) stays quiet.
38
+ if (!facts || facts.verified !== true) {
39
+ // eslint-disable-next-line no-console
40
+ console.warn(`[episode-orchestrator] observed grading unavailable for change "${opts.changeName}" — recording verified:false (observed run not verified)`);
41
+ }
30
42
  // Honesty: prefer the OBSERVED pass rate (a real runner ran), else the
31
43
  // authored test-report summary; null when neither parsed (never fabricated) —
32
44
  // the exact precedence learn.ts uses to compute the loss.
@@ -168,6 +180,17 @@ function deriveEpisodeId(changeName, now) {
168
180
  .replace(/-{2,}/g, '-')
169
181
  .replace(/^-+|-+$/g, '');
170
182
  }
183
+ /**
184
+ * Build the `terminalError` note for a thrown step. A timeout reads identically
185
+ * to a hard crash on disk otherwise, so a message that names a host-agent timeout
186
+ * (the spawn timeout puts `headless agent timed out after Nms` into stderr → the
187
+ * error message) is PREFIXED with a `timeout:` marker. A timed-out episode is
188
+ * then distinguishable from a genuine crash in episode.json. Pure.
189
+ */
190
+ function terminalErrorLabel(err) {
191
+ const msg = err instanceof Error ? err.message : String(err);
192
+ return /timed out/i.test(msg) ? `timeout: ${msg}` : msg;
193
+ }
171
194
  /**
172
195
  * Run ONE episode through the loop in the strict, durably-persisted order
173
196
  * documented at the top of this module. See {@link RunEpisodeResult}.
@@ -267,149 +290,180 @@ async function runEpisodeAfterCreate(opts) {
267
290
  objective: opts.mainArm.objective,
268
291
  });
269
292
  await advanceEpisodeStage({ repoRoot, episodeId, stage: 'main-arm-captured' });
270
- // ── d: CRITIC AGENT(基线智能体 baseline agent)or skip ───────────────────────
271
- const shouldCritic = await shouldRunCriticAgent({ repoRoot, targetId });
272
- if (shouldCritic.run && shouldCritic.baselineVersion !== null) {
273
- // runCriticAgent advances the episode to 'baseline-arm-captured'.
274
- await runCriticAgent({
275
- repoRoot,
276
- targetId,
277
- changeName: opts.changeName,
278
- episodeId,
279
- baselineVersion: shouldCritic.baselineVersion,
280
- ...(opts.critic?.baselineMode ? { baselineMode: opts.critic.baselineMode } : {}),
281
- spawn: opts.spawn,
282
- });
283
- }
284
- else {
285
- baselineSkipped = true;
286
- await advanceEpisodeStage({
287
- repoRoot,
288
- episodeId,
289
- stage: 'baseline-skipped',
290
- patch: { baselineSkippedReason: shouldCritic.reason },
291
- });
292
- }
293
- // ── e: 奖励智能体 REWARD AGENT — score + diagnosis.json + advance 'scored' ────
294
- // ④ Compute the tamper hint first (unless tamperCheck is 'off'); the ensemble
295
- // injects it into the judge prompt + the diagnosis integrity, and BLOCK mode
296
- // forces an insufficient-signal verdict (no extra spawns at the default
297
- // samples=1, flag-only).
298
- const tamperMode = opts.reward?.tamperCheck ?? 'flag';
299
- const integrityHint = tamperMode === 'off' ? null : await detectTestTamper({ changeDirPath: opts.changeDirPath });
300
- const reward = await runRewardAgentEnsemble({
301
- repoRoot,
302
- episodeId,
303
- spawn: opts.spawn,
304
- ...(opts.reward ? { reward: opts.reward } : {}),
305
- integrityHint,
306
- });
307
- const diagnosis = reward.diagnosis;
308
- advantage = diagnosis.advantage;
309
- // 预测校准: settle the PRIOR 'evolve' step's checkable prediction against this
310
- // episode's measured arm delta (verifiable per-metric main−baseline, NOT the
311
- // judge's reward) and log the residual. Best-effort — a calibration miss must
312
- // never fail the episode.
293
+ // Steps d–g spawn the three agents (any of which may THROW — a wedged/crashed
294
+ // host CLI, a never-validating reward output, an evolving-agent invocation
295
+ // error). A thrown agent step must record a DURABLE terminal 'errored' stage
296
+ // (with the error text) so the episode is never orphaned at 'kept'/'scored'
297
+ // (indistinguishable from a still-running episode — ses_1330/1331). The lock
298
+ // release stays in runEpisode's finally; the re-throw below reaches it.
313
299
  try {
314
- await reconcilePrediction({ repoRoot, targetId, episodeId });
315
- }
316
- catch {
317
- // best-effort: the prediction-reconcile ledger is advisory only
318
- }
319
- // ── f: DECISION (every step durably persisted before the next) ───────────────
320
- if (shouldSkipEvolution(diagnosis)) {
321
- // 弃权 abstained / no nameable gap / insufficient-signal → no rollback
322
- // decision; SKIP evolution. A BLOCKED tamper records a reject-buffer entry so
323
- // the gamed attempt is visible to future episodes' 演进智能体.
324
- if (tamperMode === 'block' &&
325
- diagnosis.integrity?.testTamperSuspected &&
326
- diagnosis.verdict === 'insufficient-signal') {
327
- const head = await currentPolicyVersion(repoRoot, targetId);
328
- await appendRejectBufferEntry(repoRoot, {
329
- schemaVersion: 1,
330
- at: new Date().toISOString(),
331
- episodeId,
332
- targetId,
333
- // No version moved (the main arm's tampered tests are not a policy edit);
334
- // record at the current head so the entry is informational, not a rollback.
335
- fromVersion: head ?? 0,
336
- toVersion: head ?? 0,
337
- advantage: diagnosis.advantage,
338
- rewardMain: diagnosis.rewardMain,
339
- rewardBaseline: diagnosis.rewardBaseline,
340
- textualGradientTried: diagnosis.textualGradient ?? '',
341
- editSummary: buildRejectEditSummary(diagnosis),
342
- reason: 'tamper-suspected',
343
- });
344
- }
345
- decision = 'abstained';
346
- await advanceEpisodeStage({ repoRoot, episodeId, stage: 'abstained' });
347
- }
348
- else {
349
- const badAdvantage = advantage !== null && advantage < threshold;
350
- const ep = await readEpisode(repoRoot, episodeId);
351
- const headBeforeRollback = await currentPolicyVersion(repoRoot, targetId);
352
- // Resolve the rollback target: the policy the CRITIC AGENT reran
353
- // (`policyVersionBaseline`) when it is a valid EARLIER version, else the
354
- // version immediately before the head (the prior good policy the bad edit
355
- // advanced past). `rollbackPolicyVersion` requires `toVersion < head`.
356
- const rollbackTarget = resolveRollbackTarget(ep.policyVersionBaseline, headBeforeRollback);
357
- if (badAdvantage && rollbackTarget !== null && headBeforeRollback !== null) {
358
- // (i) ROLLBACK first — durable on disk. `advantage` is non-null inside the
359
- // badAdvantage branch; `?? undefined` satisfies the optional `number` param.
360
- await rollbackPolicyVersion({
300
+ // ── d: CRITIC AGENT(基线智能体 baseline agent)or skip ───────────────────────
301
+ const shouldCritic = await shouldRunCriticAgent({ repoRoot, targetId });
302
+ if (shouldCritic.run && shouldCritic.baselineVersion !== null) {
303
+ // runCriticAgent advances the episode to 'baseline-arm-captured'.
304
+ await runCriticAgent({
361
305
  repoRoot,
362
306
  targetId,
307
+ changeName: opts.changeName,
363
308
  episodeId,
364
- toVersion: rollbackTarget,
365
- advantage: advantage ?? undefined,
309
+ baselineVersion: shouldCritic.baselineVersion,
310
+ ...(opts.critic?.baselineMode ? { baselineMode: opts.critic.baselineMode } : {}),
311
+ spawn: opts.spawn,
312
+ ...(opts.agentTimeoutMs !== undefined ? { timeoutMs: opts.agentTimeoutMs } : {}),
313
+ ...(opts.harness ? { harness: opts.harness } : {}),
366
314
  });
367
- // (ii) THEN append the 否决缓冲 reject-buffer entry — durable on disk —
368
- // BEFORE the 演进智能体 EVOLVING AGENT is even called, so the entry written
369
- // THIS episode is in its fresh-from-disk prompt.
370
- const rejectEntry = {
371
- schemaVersion: 1,
372
- at: new Date().toISOString(),
373
- episodeId,
374
- targetId,
375
- // fromVersion = the version we rolled back TO (the prior good policy);
376
- // toVersion = the (now rolled-back) version the rejected edit reached.
377
- fromVersion: rollbackTarget,
378
- toVersion: headBeforeRollback,
379
- advantage,
380
- rewardMain: diagnosis.rewardMain,
381
- rewardBaseline: diagnosis.rewardBaseline,
382
- textualGradientTried: diagnosis.textualGradient ?? '',
383
- editSummary: buildRejectEditSummary(diagnosis),
384
- reason: 'bad-advantage',
385
- };
386
- await appendRejectBufferEntry(repoRoot, rejectEntry);
387
- decision = 'rolled-back';
388
- await advanceEpisodeStage({ repoRoot, episodeId, stage: 'rolled-back' });
389
315
  }
390
316
  else {
391
- // Good advantage, OR no earlier version to roll back to (e.g. head is v0):
392
- // keep the current head.
393
- decision = 'kept';
394
- await advanceEpisodeStage({ repoRoot, episodeId, stage: 'kept' });
317
+ baselineSkipped = true;
318
+ await advanceEpisodeStage({
319
+ repoRoot,
320
+ episodeId,
321
+ stage: 'baseline-skipped',
322
+ patch: { baselineSkippedReason: shouldCritic.reason },
323
+ });
395
324
  }
396
- // ── g: ONLY AFTER (f) persisted 'rolled-back'/'kept' ───────────────────────
397
- // runEvolvingAgent reads the reject-buffer FRESH from disk (the entry just
398
- // written THIS episode is in its prompt). Never parallelized with (f).
399
- // 步长: after a rollback, shrink the edit budget (smaller step after a step
400
- // that lost ground). 预测校准: pass the proposer's recent prediction record.
401
- const scheduledBudget = decision === 'rolled-back'
402
- ? scheduledEditBudget(await readPolicyLedger(repoRoot, targetId), editBudget)
403
- : editBudget;
404
- const calibrationNote = await summarizeCalibration(repoRoot, targetId);
405
- evolution = await runEvolvingAgent({
325
+ // ── e: 奖励智能体 REWARD AGENT — score + diagnosis.json + advance 'scored' ────
326
+ // Compute the tamper hint first (unless tamperCheck is 'off'); the ensemble
327
+ // injects it into the judge prompt + the diagnosis integrity, and BLOCK mode
328
+ // forces an insufficient-signal verdict (no extra spawns at the default
329
+ // samples=1, flag-only).
330
+ const tamperMode = opts.reward?.tamperCheck ?? 'flag';
331
+ const integrityHint = tamperMode === 'off' ? null : await detectTestTamper({ changeDirPath: opts.changeDirPath });
332
+ const reward = await runRewardAgentEnsemble({
406
333
  repoRoot,
407
334
  episodeId,
408
- targetId,
409
- editBudget: scheduledBudget,
410
- ...(calibrationNote ? { calibrationNote } : {}),
411
335
  spawn: opts.spawn,
336
+ ...(opts.reward ? { reward: opts.reward } : {}),
337
+ integrityHint,
338
+ ...(opts.agentTimeoutMs !== undefined ? { timeoutMs: opts.agentTimeoutMs } : {}),
339
+ ...(opts.harness ? { harness: opts.harness } : {}),
412
340
  });
341
+ const diagnosis = reward.diagnosis;
342
+ advantage = diagnosis.advantage;
343
+ // 预测校准: settle the PRIOR 'evolve' step's checkable prediction against this
344
+ // episode's measured arm delta (verifiable per-metric main−baseline, NOT the
345
+ // judge's reward) and log the residual. Best-effort — a calibration miss must
346
+ // never fail the episode.
347
+ try {
348
+ await reconcilePrediction({ repoRoot, targetId, episodeId });
349
+ }
350
+ catch {
351
+ // best-effort: the prediction-reconcile ledger is advisory only
352
+ }
353
+ // ── f: DECISION (every step durably persisted before the next) ───────────────
354
+ if (shouldSkipEvolution(diagnosis)) {
355
+ // 弃权 abstained / no nameable gap / insufficient-signal → no rollback
356
+ // decision; SKIP evolution. A BLOCKED tamper records a reject-buffer entry so
357
+ // the gamed attempt is visible to future episodes' 演进智能体.
358
+ if (tamperMode === 'block' &&
359
+ diagnosis.integrity?.testTamperSuspected &&
360
+ diagnosis.verdict === 'insufficient-signal') {
361
+ const head = await currentPolicyVersion(repoRoot, targetId);
362
+ await appendRejectBufferEntry(repoRoot, {
363
+ schemaVersion: 1,
364
+ at: new Date().toISOString(),
365
+ episodeId,
366
+ targetId,
367
+ // No version moved (the main arm's tampered tests are not a policy edit);
368
+ // record at the current head so the entry is informational, not a rollback.
369
+ fromVersion: head ?? 0,
370
+ toVersion: head ?? 0,
371
+ advantage: diagnosis.advantage,
372
+ rewardMain: diagnosis.rewardMain,
373
+ rewardBaseline: diagnosis.rewardBaseline,
374
+ textualGradientTried: diagnosis.textualGradient ?? '',
375
+ editSummary: buildRejectEditSummary(diagnosis),
376
+ reason: 'tamper-suspected',
377
+ });
378
+ }
379
+ decision = 'abstained';
380
+ await advanceEpisodeStage({ repoRoot, episodeId, stage: 'abstained' });
381
+ }
382
+ else {
383
+ const badAdvantage = advantage !== null && advantage < threshold;
384
+ const ep = await readEpisode(repoRoot, episodeId);
385
+ const headBeforeRollback = await currentPolicyVersion(repoRoot, targetId);
386
+ // Resolve the rollback target: the policy the CRITIC AGENT reran
387
+ // (`policyVersionBaseline`) when it is a valid EARLIER version, else the
388
+ // version immediately before the head (the prior good policy the bad edit
389
+ // advanced past). `rollbackPolicyVersion` requires `toVersion < head`.
390
+ const rollbackTarget = resolveRollbackTarget(ep.policyVersionBaseline, headBeforeRollback);
391
+ if (badAdvantage && rollbackTarget !== null && headBeforeRollback !== null) {
392
+ // (i) ROLLBACK first — durable on disk. `advantage` is non-null inside the
393
+ // badAdvantage branch; `?? undefined` satisfies the optional `number` param.
394
+ await rollbackPolicyVersion({
395
+ repoRoot,
396
+ targetId,
397
+ episodeId,
398
+ toVersion: rollbackTarget,
399
+ advantage: advantage ?? undefined,
400
+ });
401
+ // (ii) THEN append the 否决缓冲 reject-buffer entry — durable on disk —
402
+ // BEFORE the 演进智能体 EVOLVING AGENT is even called, so the entry written
403
+ // THIS episode is in its fresh-from-disk prompt.
404
+ const rejectEntry = {
405
+ schemaVersion: 1,
406
+ at: new Date().toISOString(),
407
+ episodeId,
408
+ targetId,
409
+ // fromVersion = the version we rolled back TO (the prior good policy);
410
+ // toVersion = the (now rolled-back) version the rejected edit reached.
411
+ fromVersion: rollbackTarget,
412
+ toVersion: headBeforeRollback,
413
+ advantage,
414
+ rewardMain: diagnosis.rewardMain,
415
+ rewardBaseline: diagnosis.rewardBaseline,
416
+ textualGradientTried: diagnosis.textualGradient ?? '',
417
+ editSummary: buildRejectEditSummary(diagnosis),
418
+ reason: 'bad-advantage',
419
+ };
420
+ await appendRejectBufferEntry(repoRoot, rejectEntry);
421
+ decision = 'rolled-back';
422
+ await advanceEpisodeStage({ repoRoot, episodeId, stage: 'rolled-back' });
423
+ }
424
+ else {
425
+ // Good advantage, OR no earlier version to roll back to (e.g. head is v0):
426
+ // keep the current head.
427
+ decision = 'kept';
428
+ await advanceEpisodeStage({ repoRoot, episodeId, stage: 'kept' });
429
+ }
430
+ // ── g: ONLY AFTER (f) persisted 'rolled-back'/'kept' ───────────────────────
431
+ // runEvolvingAgent reads the reject-buffer FRESH from disk (the entry just
432
+ // written THIS episode is in its prompt). Never parallelized with (f).
433
+ // 步长: after a rollback, shrink the edit budget (smaller step after a step
434
+ // that lost ground). 预测校准: pass the proposer's recent prediction record.
435
+ const scheduledBudget = decision === 'rolled-back'
436
+ ? scheduledEditBudget(await readPolicyLedger(repoRoot, targetId), editBudget)
437
+ : editBudget;
438
+ const calibrationNote = await summarizeCalibration(repoRoot, targetId);
439
+ evolution = await runEvolvingAgent({
440
+ repoRoot,
441
+ episodeId,
442
+ targetId,
443
+ editBudget: scheduledBudget,
444
+ ...(calibrationNote ? { calibrationNote } : {}),
445
+ spawn: opts.spawn,
446
+ ...(opts.agentTimeoutMs !== undefined ? { timeoutMs: opts.agentTimeoutMs } : {}),
447
+ ...(opts.harness ? { harness: opts.harness } : {}),
448
+ });
449
+ }
450
+ }
451
+ catch (err) {
452
+ // A thrown agent step (d–g) records a DURABLE terminal 'errored' stage so the
453
+ // episode is never orphaned at 'kept'/'scored' (which is indistinguishable
454
+ // from a still-running episode — the ses_1330/1331 wsgidav orphan). The
455
+ // 'errored' stage is reachable from EVERY non-terminal stage; the patch
456
+ // carries the error text (prefixed `timeout:` when the throw was a host-agent
457
+ // timeout, so a timed-out episode is distinguishable from a hard crash on
458
+ // disk). Best-effort: a failed record write must not mask the original throw,
459
+ // which still propagates to runEpisode's finally (lock release).
460
+ await advanceEpisodeStage({
461
+ repoRoot,
462
+ episodeId,
463
+ stage: 'errored',
464
+ patch: { terminalError: terminalErrorLabel(err) },
465
+ }).catch(() => { });
466
+ throw err;
413
467
  }
414
468
  // ── h (stage half): advance 'closed' (best-effort) ───────────────────────────
415
469
  await closeEpisodeBestEffort(repoRoot, episodeId);
@@ -417,11 +471,25 @@ async function runEpisodeAfterCreate(opts) {
417
471
  return { episodeId, baselineSkipped, advantage, decision, evolution, newPolicyVersion };
418
472
  }
419
473
  /**
420
- * Advance the episode to 'closed' from whatever terminal-ish stage it reached
421
- * (evolved | evolution-refused | abstained), best-effort: a stage that cannot
422
- * legally reach 'closed' (e.g. the evolving agent was not-spawned, leaving the
423
- * episode at 'kept'/'rolled-back') is left as-is rather than throwing, so the
424
- * close never masks the real episode outcome.
474
+ * Advance the episode to 'closed' from whatever terminal-ish stage it reached,
475
+ * best-effort.
476
+ *
477
+ * Closable stages:
478
+ * - evolved | evolution-refused | abstained — the 演进智能体 EVOLVING AGENT
479
+ * reached a definite outcome (or the judge 弃权 abstained), the normal close.
480
+ * - kept | rolled-back — the 演进智能体 returned not-spawned (its diagnosis
481
+ * abstained-after-gap-check, no gaps, or the target resolved to no editable
482
+ * local files), so the episode never advanced past the decision. By the time
483
+ * this runs (AFTER runEvolvingAgent returned), a stage still at 'kept'/
484
+ * 'rolled-back' can ONLY mean not-spawned — a success advances 'evolved', a
485
+ * refusal advances 'evolution-refused', and a throw is caught upstream and
486
+ * records 'errored' + rethrows so this close is never reached. So a leftover
487
+ * kept/rolled-back at close time IS the finished-nothing-to-evolve case and
488
+ * must close, not rest forever at a non-terminal stage (the exact ambiguity
489
+ * the 'errored' stage was meant to remove).
490
+ *
491
+ * Any other (genuinely non-closable) stage is left as-is rather than throwing, so
492
+ * the close never masks the real episode outcome.
425
493
  */
426
494
  async function closeEpisodeBestEffort(repoRoot, episodeId) {
427
495
  const ep = await readEpisode(repoRoot, episodeId);
@@ -429,6 +497,9 @@ async function closeEpisodeBestEffort(repoRoot, episodeId) {
429
497
  'evolved',
430
498
  'evolution-refused',
431
499
  'abstained',
500
+ // not-spawned 演进智能体 leaves the episode here — close the finished episode.
501
+ 'kept',
502
+ 'rolled-back',
432
503
  ]);
433
504
  if (closable.has(ep.stage)) {
434
505
  await advanceEpisodeStage({ repoRoot, episodeId, stage: 'closed' });
@@ -517,6 +588,16 @@ async function ensureRejectBufferEntry(repoRoot, opts) {
517
588
  * - 'scored' → run the decision (f) then the 演进智能体 (g).
518
589
  * - 'rolled-back' / 'kept' → run the 演进智能体 EVOLVING AGENT (g) then close.
519
590
  * - 'evolved'/'evolution-refused'/'abstained' → close.
591
+ * - 'errored' → RE-DRIVE from the last GOOD pre-error stage
592
+ * (an episode may have errored on a TRANSIENT
593
+ * cause — a one-off git/analyzer/agent timeout).
594
+ * The pre-error stage is the last `stageHistory`
595
+ * entry that is NOT 'errored'; when it is one of
596
+ * {'scored','rolled-back','kept'} (the
597
+ * resume-entry stages) we advance errored → that
598
+ * stage and fall through to the normal dispatch.
599
+ * Otherwise the pre-error stage is not
600
+ * auto-resumable and the episode is reported as-is.
520
601
  * - earlier stages → not auto-resumable here (the arms / reward
521
602
  * agent need their own re-entry); reported as-is.
522
603
  *
@@ -532,78 +613,135 @@ export async function resumeEpisode(opts) {
532
613
  const resumedFrom = ep.stage;
533
614
  const targetId = ep.targetId;
534
615
  let evolution = null;
535
- if (ep.stage === 'scored') {
536
- // Re-run the decision (f) from the on-disk diagnosis, then (g).
537
- const diagnosis = await readDiagnosisForResume(repoRoot, episodeId);
538
- if (shouldSkipEvolution(diagnosis)) {
539
- await advanceEpisodeStage({ repoRoot, episodeId, stage: 'abstained' });
616
+ // The effective stage we dispatch on. Normally the episode's current stage;
617
+ // for an 'errored' episode we attempt to RE-DRIVE from the last good pre-error
618
+ // stage (a transient git/analyzer/agent failure should be retryable via an
619
+ // operator resume). 'errored' stays terminal for every OTHER caller — only this
620
+ // resume path may re-drive it, via the errored → {scored,rolled-back,kept}
621
+ // transitions the stage machine allows ONLY for operator recovery.
622
+ let stage = ep.stage;
623
+ if (ep.stage === 'errored') {
624
+ const preError = [...ep.stageHistory]
625
+ .reverse()
626
+ .find((h) => h.stage !== 'errored')?.stage;
627
+ if (preError === 'scored' ||
628
+ preError === 'rolled-back' ||
629
+ preError === 'kept') {
630
+ // Re-open the errored episode at its last auto-resumable stage, then fall
631
+ // through to the normal dispatch for that stage.
632
+ await advanceEpisodeStage({ repoRoot, episodeId, stage: preError });
633
+ stage = preError;
540
634
  }
541
- else if (diagnosis) {
542
- // (shouldSkipEvolution returns true for null, so diagnosis is non-null here)
543
- const advantage = diagnosis.advantage;
544
- const badAdvantage = advantage !== null && advantage < threshold;
545
- // Crash-resume dedup: a 'rollback' ledger entry whose episodeId is THIS
546
- // episode means runEpisode already applied the rollback before the host
547
- // crashed (the rollback head is monotonic — re-calling rollbackPolicyVersion
548
- // would stack a SECOND, duplicate rollback version). When present, reuse its
549
- // recorded version axis and SKIP the re-rollback; only ensure the
550
- // reject-buffer entry + the 'rolled-back' stage advance complete.
551
- const ledger = await readPolicyLedger(repoRoot, targetId);
552
- const priorRollback = ledger.find((e) => e.action === 'rollback' && e.episodeId === episodeId);
553
- if (priorRollback) {
554
- // The prior rollback already advanced to `priorRollback.version`, rolling
555
- // FORWARD to the content of the version immediately before the rejected
556
- // edit's head. Reconstruct the reject-buffer axis from that entry:
557
- // toVersion = the (rolled-back) version the rejected edit reached
558
- // = priorRollback.version - 1 (the head before the rollback)
559
- // fromVersion = the prior good policy restored (one before that head)
560
- const toVersion = priorRollback.version - 1;
561
- const fromVersion = resolveRollbackTarget(ep.policyVersionBaseline, toVersion);
562
- await ensureRejectBufferEntry(repoRoot, {
563
- episodeId,
564
- targetId,
565
- fromVersion: fromVersion ?? toVersion,
566
- toVersion,
567
- advantage,
568
- diagnosis,
569
- });
570
- await advanceEpisodeStage({ repoRoot, episodeId, stage: 'rolled-back' });
635
+ // Otherwise the pre-error stage is not auto-resumable (e.g. a reward throw at
636
+ // 'baseline-skipped'); leave the episode at 'errored' and report it as-is.
637
+ }
638
+ // The decision (f) + 演进智能体 EVOLVING AGENT (g) re-runs below can THROW — a
639
+ // wedged/crashed host CLI (CanonicalProposerInvocationError), a timeout, or an
640
+ // observed-GREEN gate throw. UNCAUGHT, that leaves the episode DURABLY stuck at
641
+ // a non-terminal stage ('scored'/'rolled-back'/'kept' — the orphan state the fix
642
+ // eliminates for runEpisode). Record the SAME terminal 'errored' stage here
643
+ // (the transition map already allows scored/rolled-back/kept → 'errored'), then
644
+ // re-throw. Resume holds NO in-flight lock, so this is a durable-stage fix, not
645
+ // a leak fix. Best-effort write: a failed record must not mask the original throw.
646
+ try {
647
+ if (stage === 'scored') {
648
+ // Re-run the decision (f) from the on-disk diagnosis, then (g).
649
+ const diagnosis = await readDiagnosisForResume(repoRoot, episodeId);
650
+ if (shouldSkipEvolution(diagnosis)) {
651
+ await advanceEpisodeStage({ repoRoot, episodeId, stage: 'abstained' });
571
652
  }
572
- else {
573
- const headBeforeRollback = await currentPolicyVersion(repoRoot, targetId);
574
- const rollbackTarget = resolveRollbackTarget(ep.policyVersionBaseline, headBeforeRollback);
575
- if (badAdvantage && rollbackTarget !== null && headBeforeRollback !== null) {
576
- await rollbackPolicyVersion({
577
- repoRoot,
578
- targetId,
579
- episodeId,
580
- toVersion: rollbackTarget,
581
- advantage: advantage ?? undefined,
582
- });
653
+ else if (diagnosis) {
654
+ // (shouldSkipEvolution returns true for null, so diagnosis is non-null here)
655
+ const advantage = diagnosis.advantage;
656
+ const badAdvantage = advantage !== null && advantage < threshold;
657
+ // Crash-resume dedup: a 'rollback' ledger entry whose episodeId is THIS
658
+ // episode means runEpisode already applied the rollback before the host
659
+ // crashed (the rollback head is monotonic — re-calling rollbackPolicyVersion
660
+ // would stack a SECOND, duplicate rollback version). When present, reuse its
661
+ // recorded version axis and SKIP the re-rollback; only ensure the
662
+ // reject-buffer entry + the 'rolled-back' stage advance complete.
663
+ const ledger = await readPolicyLedger(repoRoot, targetId);
664
+ const priorRollback = ledger.find((e) => e.action === 'rollback' && e.episodeId === episodeId);
665
+ if (priorRollback) {
666
+ // The prior rollback already advanced to `priorRollback.version`, rolling
667
+ // FORWARD to the content of the version immediately before the rejected
668
+ // edit's head. Reconstruct the reject-buffer axis from that entry:
669
+ // toVersion = the (rolled-back) version the rejected edit reached
670
+ // = priorRollback.version - 1 (the head before the rollback)
671
+ // fromVersion = the prior good policy restored (one before that head)
672
+ const toVersion = priorRollback.version - 1;
673
+ const fromVersion = resolveRollbackTarget(ep.policyVersionBaseline, toVersion);
583
674
  await ensureRejectBufferEntry(repoRoot, {
584
675
  episodeId,
585
676
  targetId,
586
- fromVersion: rollbackTarget,
587
- toVersion: headBeforeRollback,
677
+ fromVersion: fromVersion ?? toVersion,
678
+ toVersion,
588
679
  advantage,
589
680
  diagnosis,
590
681
  });
591
682
  await advanceEpisodeStage({ repoRoot, episodeId, stage: 'rolled-back' });
592
683
  }
593
684
  else {
594
- await advanceEpisodeStage({ repoRoot, episodeId, stage: 'kept' });
685
+ const headBeforeRollback = await currentPolicyVersion(repoRoot, targetId);
686
+ const rollbackTarget = resolveRollbackTarget(ep.policyVersionBaseline, headBeforeRollback);
687
+ if (badAdvantage && rollbackTarget !== null && headBeforeRollback !== null) {
688
+ await rollbackPolicyVersion({
689
+ repoRoot,
690
+ targetId,
691
+ episodeId,
692
+ toVersion: rollbackTarget,
693
+ advantage: advantage ?? undefined,
694
+ });
695
+ await ensureRejectBufferEntry(repoRoot, {
696
+ episodeId,
697
+ targetId,
698
+ fromVersion: rollbackTarget,
699
+ toVersion: headBeforeRollback,
700
+ advantage,
701
+ diagnosis,
702
+ });
703
+ await advanceEpisodeStage({ repoRoot, episodeId, stage: 'rolled-back' });
704
+ }
705
+ else {
706
+ await advanceEpisodeStage({ repoRoot, episodeId, stage: 'kept' });
707
+ }
708
+ }
709
+ // 预测校准 (idempotent if the original run already settled it) + 步长 schedule
710
+ // + calibration note, mirroring runEpisode's (g) step.
711
+ try {
712
+ await reconcilePrediction({ repoRoot, targetId, episodeId });
713
+ }
714
+ catch {
715
+ // best-effort: advisory only
595
716
  }
717
+ const afterDecision = await readEpisode(repoRoot, episodeId);
718
+ const scheduledBudget = afterDecision.stage === 'rolled-back'
719
+ ? scheduledEditBudget(await readPolicyLedger(repoRoot, targetId), editBudget)
720
+ : editBudget;
721
+ const calibrationNote = await summarizeCalibration(repoRoot, targetId);
722
+ evolution = await runEvolvingAgent({
723
+ repoRoot,
724
+ episodeId,
725
+ targetId,
726
+ editBudget: scheduledBudget,
727
+ ...(calibrationNote ? { calibrationNote } : {}),
728
+ spawn: opts.spawn,
729
+ ...(opts.agentTimeoutMs !== undefined ? { timeoutMs: opts.agentTimeoutMs } : {}),
730
+ ...(opts.harness ? { harness: opts.harness } : {}),
731
+ });
596
732
  }
597
- // 预测校准 (idempotent if the original run already settled it) + 步长 schedule
598
- // + calibration note, mirroring runEpisode's (g) step.
733
+ await closeEpisodeBestEffort(repoRoot, episodeId);
734
+ }
735
+ else if (stage === 'rolled-back' || stage === 'kept') {
736
+ // The decision already ran (and the original episode settled the prediction);
737
+ // re-settle idempotently for the crash window, then schedule + calibrate.
599
738
  try {
600
739
  await reconcilePrediction({ repoRoot, targetId, episodeId });
601
740
  }
602
741
  catch {
603
742
  // best-effort: advisory only
604
743
  }
605
- const afterDecision = await readEpisode(repoRoot, episodeId);
606
- const scheduledBudget = afterDecision.stage === 'rolled-back'
744
+ const scheduledBudget = stage === 'rolled-back'
607
745
  ? scheduledEditBudget(await readPolicyLedger(repoRoot, targetId), editBudget)
608
746
  : editBudget;
609
747
  const calibrationNote = await summarizeCalibration(repoRoot, targetId);
@@ -614,39 +752,34 @@ export async function resumeEpisode(opts) {
614
752
  editBudget: scheduledBudget,
615
753
  ...(calibrationNote ? { calibrationNote } : {}),
616
754
  spawn: opts.spawn,
755
+ ...(opts.agentTimeoutMs !== undefined ? { timeoutMs: opts.agentTimeoutMs } : {}),
756
+ ...(opts.harness ? { harness: opts.harness } : {}),
617
757
  });
758
+ await closeEpisodeBestEffort(repoRoot, episodeId);
618
759
  }
619
- await closeEpisodeBestEffort(repoRoot, episodeId);
620
- }
621
- else if (ep.stage === 'rolled-back' || ep.stage === 'kept') {
622
- // The decision already ran (and the original episode settled the prediction);
623
- // re-settle idempotently for the crash window, then schedule + calibrate.
624
- try {
625
- await reconcilePrediction({ repoRoot, targetId, episodeId });
760
+ else if (stage === 'evolved' ||
761
+ stage === 'evolution-refused' ||
762
+ stage === 'abstained') {
763
+ await closeEpisodeBestEffort(repoRoot, episodeId);
626
764
  }
627
- catch {
628
- // best-effort: advisory only
629
- }
630
- const scheduledBudget = ep.stage === 'rolled-back'
631
- ? scheduledEditBudget(await readPolicyLedger(repoRoot, targetId), editBudget)
632
- : editBudget;
633
- const calibrationNote = await summarizeCalibration(repoRoot, targetId);
634
- evolution = await runEvolvingAgent({
765
+ // earlier stages (and a non-auto-resumable 'errored'): not auto-resumable here —
766
+ // reported as-is.
767
+ }
768
+ catch (err) {
769
+ // A thrown decision/evolving step records a DURABLE terminal 'errored' stage so
770
+ // the resumed episode is never left stuck at 'scored'/'rolled-back'/'kept'
771
+ // (indistinguishable from a still-running episode). Mirrors
772
+ // runEpisodeAfterCreate's catch — including the `timeout:` marker so a timed-out
773
+ // resume is distinguishable from a hard crash. Best-effort: a failed record must
774
+ // not mask the original throw, which still propagates to the caller.
775
+ await advanceEpisodeStage({
635
776
  repoRoot,
636
777
  episodeId,
637
- targetId,
638
- editBudget: scheduledBudget,
639
- ...(calibrationNote ? { calibrationNote } : {}),
640
- spawn: opts.spawn,
641
- });
642
- await closeEpisodeBestEffort(repoRoot, episodeId);
643
- }
644
- else if (ep.stage === 'evolved' ||
645
- ep.stage === 'evolution-refused' ||
646
- ep.stage === 'abstained') {
647
- await closeEpisodeBestEffort(repoRoot, episodeId);
778
+ stage: 'errored',
779
+ patch: { terminalError: terminalErrorLabel(err) },
780
+ }).catch(() => { });
781
+ throw err;
648
782
  }
649
- // earlier stages: not auto-resumable here — reported as-is.
650
783
  const after = await readEpisode(repoRoot, episodeId);
651
784
  return { episodeId, resumedFrom, stage: after.stage, evolution };
652
785
  }