synergyspec-selfevolving 2.1.0 → 2.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. package/dist/commands/learn.js +29 -3
  2. package/dist/commands/self-evolution-episode.js +37 -1
  3. package/dist/core/fitness/health/local-source.d.ts +11 -0
  4. package/dist/core/fitness/health/local-source.js +53 -1
  5. package/dist/core/project-config.d.ts +5 -0
  6. package/dist/core/project-config.js +23 -1
  7. package/dist/core/self-evolution/critic-agent.d.ts +16 -1
  8. package/dist/core/self-evolution/critic-agent.js +87 -17
  9. package/dist/core/self-evolution/episode-orchestrator.d.ts +28 -0
  10. package/dist/core/self-evolution/episode-orchestrator.js +369 -220
  11. package/dist/core/self-evolution/episode-store.d.ts +41 -2
  12. package/dist/core/self-evolution/episode-store.js +33 -9
  13. package/dist/core/self-evolution/evolving-agent.d.ts +51 -2
  14. package/dist/core/self-evolution/evolving-agent.js +45 -4
  15. package/dist/core/self-evolution/host-harness.d.ts +43 -0
  16. package/dist/core/self-evolution/host-harness.js +192 -0
  17. package/dist/core/self-evolution/reward-agent.d.ts +68 -0
  18. package/dist/core/self-evolution/reward-agent.js +92 -23
  19. package/dist/core/self-evolution/reward-aggregator.d.ts +26 -7
  20. package/dist/core/self-evolution/reward-aggregator.js +78 -20
  21. package/dist/core/self-evolution/verdict.d.ts +3 -2
  22. package/dist/core/self-evolution/verdict.js +4 -1
  23. package/dist/dashboard/react-client.js +2 -1
  24. package/dist/ui/ascii-patterns.d.ts +7 -8
  25. package/dist/ui/ascii-patterns.js +54 -120
  26. package/dist/ui/welcome-screen.d.ts +8 -0
  27. package/dist/ui/welcome-screen.js +2 -2
  28. package/package.json +1 -1
@@ -27,6 +27,18 @@ import { reconcilePrediction, summarizeCalibration, } from './policy/prediction-
27
27
  export async function captureMainArm(opts) {
28
28
  const sample = opts.report.fitnessSample;
29
29
  const facts = sample?.trajectoryFacts;
30
+ // ④ Observable degrade: a verified:false arm — whether because NO
31
+ // observed-trajectory facts were captured, OR facts exist but the runner's
32
+ // pass/fail was not derivable (`facts.verified !== true`) — is surfaced on
33
+ // stderr so it is never SILENT. A wedged/missing/unextractable runner is the
34
+ // most common loop-stall cause, and a silent false reads identically to a real
35
+ // miss. The arm's recorded `verified` collapses to false in BOTH cases (see the
36
+ // `objective.verified` below), so EVERY verified:false arm warns exactly once;
37
+ // a genuinely verified arm (`facts.verified === true`) stays quiet.
38
+ if (!facts || facts.verified !== true) {
39
+ // eslint-disable-next-line no-console
40
+ console.warn(`[episode-orchestrator] observed grading unavailable for change "${opts.changeName}" — recording verified:false (observed run not verified)`);
41
+ }
30
42
  // Honesty: prefer the OBSERVED pass rate (a real runner ran), else the
31
43
  // authored test-report summary; null when neither parsed (never fabricated) —
32
44
  // the exact precedence learn.ts uses to compute the loss.
@@ -80,14 +92,30 @@ export async function captureMainArm(opts) {
80
92
  * Whether the episode SKIPS the rollback decision + 演进智能体 EVOLVING AGENT:
81
93
  * the judge 弃权 abstained (no nameable gap), found no gaps, OR returned the
82
94
  * ⑤ `insufficient-signal` verdict (within the A/A noise floor, or a blocked
83
- * tamper). All three mean "do not evolve on this episode".
95
+ * tamper). These mean "do not evolve on this episode".
96
+ *
97
+ * EXCEPTION (cold-start bootstrap): `insufficient-signal` is honored ONLY as a
98
+ * genuine can't-tell — a within-noise-floor result (the baseline ran) or a blocked
99
+ * tamper. On a baseline-skipped episode (policyVersions.baseline === null) there is
100
+ * no comparison to be uncertain about, so a stray `insufficient-signal` emitted
101
+ * alongside real gaps (and no tamper) must NOT block: the first v0→v1 evolution has
102
+ * to be reachable from absolute signal, or a fresh target stays at v0 forever. This
103
+ * is defense-in-depth behind {@link deriveSingleSampleVerdict}, which already drops
104
+ * a volunteered verdict to `undefined` on a baseline skip.
84
105
  */
85
106
  function shouldSkipEvolution(diagnosis) {
86
107
  if (diagnosis === null)
87
108
  return true;
88
- return (diagnosis.abstained ||
89
- diagnosis.gaps.length === 0 ||
90
- diagnosis.verdict === 'insufficient-signal');
109
+ if (diagnosis.abstained || diagnosis.gaps.length === 0)
110
+ return true;
111
+ if (diagnosis.verdict === 'insufficient-signal') {
112
+ const baselineSkipped = diagnosis.policyVersions?.baseline === null;
113
+ const tamper = diagnosis.integrity?.testTamperSuspected ?? false;
114
+ if (baselineSkipped && !tamper)
115
+ return false;
116
+ return true;
117
+ }
118
+ return false;
91
119
  }
92
120
  /**
93
121
  * Count the consecutive trailing rolled-back episodes in the 版本账本 ledger.
@@ -152,6 +180,17 @@ function deriveEpisodeId(changeName, now) {
152
180
  .replace(/-{2,}/g, '-')
153
181
  .replace(/^-+|-+$/g, '');
154
182
  }
183
+ /**
184
+ * Build the `terminalError` note for a thrown step. A timeout reads identically
185
+ * to a hard crash on disk otherwise, so a message that names a host-agent timeout
186
+ * (the spawn timeout puts `headless agent timed out after Nms` into stderr → the
187
+ * error message) is PREFIXED with a `timeout:` marker. A timed-out episode is
188
+ * then distinguishable from a genuine crash in episode.json. Pure.
189
+ */
190
+ function terminalErrorLabel(err) {
191
+ const msg = err instanceof Error ? err.message : String(err);
192
+ return /timed out/i.test(msg) ? `timeout: ${msg}` : msg;
193
+ }
155
194
  /**
156
195
  * Run ONE episode through the loop in the strict, durably-persisted order
157
196
  * documented at the top of this module. See {@link RunEpisodeResult}.
@@ -251,149 +290,180 @@ async function runEpisodeAfterCreate(opts) {
251
290
  objective: opts.mainArm.objective,
252
291
  });
253
292
  await advanceEpisodeStage({ repoRoot, episodeId, stage: 'main-arm-captured' });
254
- // ── d: CRITIC AGENT(基线智能体 baseline agent)or skip ───────────────────────
255
- const shouldCritic = await shouldRunCriticAgent({ repoRoot, targetId });
256
- if (shouldCritic.run && shouldCritic.baselineVersion !== null) {
257
- // runCriticAgent advances the episode to 'baseline-arm-captured'.
258
- await runCriticAgent({
259
- repoRoot,
260
- targetId,
261
- changeName: opts.changeName,
262
- episodeId,
263
- baselineVersion: shouldCritic.baselineVersion,
264
- ...(opts.critic?.baselineMode ? { baselineMode: opts.critic.baselineMode } : {}),
265
- spawn: opts.spawn,
266
- });
267
- }
268
- else {
269
- baselineSkipped = true;
270
- await advanceEpisodeStage({
271
- repoRoot,
272
- episodeId,
273
- stage: 'baseline-skipped',
274
- patch: { baselineSkippedReason: shouldCritic.reason },
275
- });
276
- }
277
- // ── e: 奖励智能体 REWARD AGENT — score + diagnosis.json + advance 'scored' ────
278
- // ④ Compute the tamper hint first (unless tamperCheck is 'off'); the ensemble
279
- // injects it into the judge prompt + the diagnosis integrity, and BLOCK mode
280
- // forces an insufficient-signal verdict (no extra spawns at the default
281
- // samples=1, flag-only).
282
- const tamperMode = opts.reward?.tamperCheck ?? 'flag';
283
- const integrityHint = tamperMode === 'off' ? null : await detectTestTamper({ changeDirPath: opts.changeDirPath });
284
- const reward = await runRewardAgentEnsemble({
285
- repoRoot,
286
- episodeId,
287
- spawn: opts.spawn,
288
- ...(opts.reward ? { reward: opts.reward } : {}),
289
- integrityHint,
290
- });
291
- const diagnosis = reward.diagnosis;
292
- advantage = diagnosis.advantage;
293
- // 预测校准: settle the PRIOR 'evolve' step's checkable prediction against this
294
- // episode's measured arm delta (verifiable per-metric main−baseline, NOT the
295
- // judge's reward) and log the residual. Best-effort — a calibration miss must
296
- // never fail the episode.
293
+ // Steps d–g spawn the three agents (any of which may THROW — a wedged/crashed
294
+ // host CLI, a never-validating reward output, an evolving-agent invocation
295
+ // error). A thrown agent step must record a DURABLE terminal 'errored' stage
296
+ // (with the error text) so the episode is never orphaned at 'kept'/'scored'
297
+ // (indistinguishable from a still-running episode — ses_1330/1331). The lock
298
+ // release stays in runEpisode's finally; the re-throw below reaches it.
297
299
  try {
298
- await reconcilePrediction({ repoRoot, targetId, episodeId });
299
- }
300
- catch {
301
- // best-effort: the prediction-reconcile ledger is advisory only
302
- }
303
- // ── f: DECISION (every step durably persisted before the next) ───────────────
304
- if (shouldSkipEvolution(diagnosis)) {
305
- // 弃权 abstained / no nameable gap / insufficient-signal → no rollback
306
- // decision; SKIP evolution. A BLOCKED tamper records a reject-buffer entry so
307
- // the gamed attempt is visible to future episodes' 演进智能体.
308
- if (tamperMode === 'block' &&
309
- diagnosis.integrity?.testTamperSuspected &&
310
- diagnosis.verdict === 'insufficient-signal') {
311
- const head = await currentPolicyVersion(repoRoot, targetId);
312
- await appendRejectBufferEntry(repoRoot, {
313
- schemaVersion: 1,
314
- at: new Date().toISOString(),
315
- episodeId,
316
- targetId,
317
- // No version moved (the main arm's tampered tests are not a policy edit);
318
- // record at the current head so the entry is informational, not a rollback.
319
- fromVersion: head ?? 0,
320
- toVersion: head ?? 0,
321
- advantage: diagnosis.advantage,
322
- rewardMain: diagnosis.rewardMain,
323
- rewardBaseline: diagnosis.rewardBaseline,
324
- textualGradientTried: diagnosis.textualGradient ?? '',
325
- editSummary: buildRejectEditSummary(diagnosis),
326
- reason: 'tamper-suspected',
327
- });
328
- }
329
- decision = 'abstained';
330
- await advanceEpisodeStage({ repoRoot, episodeId, stage: 'abstained' });
331
- }
332
- else {
333
- const badAdvantage = advantage !== null && advantage < threshold;
334
- const ep = await readEpisode(repoRoot, episodeId);
335
- const headBeforeRollback = await currentPolicyVersion(repoRoot, targetId);
336
- // Resolve the rollback target: the policy the CRITIC AGENT reran
337
- // (`policyVersionBaseline`) when it is a valid EARLIER version, else the
338
- // version immediately before the head (the prior good policy the bad edit
339
- // advanced past). `rollbackPolicyVersion` requires `toVersion < head`.
340
- const rollbackTarget = resolveRollbackTarget(ep.policyVersionBaseline, headBeforeRollback);
341
- if (badAdvantage && rollbackTarget !== null && headBeforeRollback !== null) {
342
- // (i) ROLLBACK first — durable on disk. `advantage` is non-null inside the
343
- // badAdvantage branch; `?? undefined` satisfies the optional `number` param.
344
- await rollbackPolicyVersion({
300
+ // ── d: CRITIC AGENT(基线智能体 baseline agent)or skip ───────────────────────
301
+ const shouldCritic = await shouldRunCriticAgent({ repoRoot, targetId });
302
+ if (shouldCritic.run && shouldCritic.baselineVersion !== null) {
303
+ // runCriticAgent advances the episode to 'baseline-arm-captured'.
304
+ await runCriticAgent({
345
305
  repoRoot,
346
306
  targetId,
307
+ changeName: opts.changeName,
347
308
  episodeId,
348
- toVersion: rollbackTarget,
349
- advantage: advantage ?? undefined,
309
+ baselineVersion: shouldCritic.baselineVersion,
310
+ ...(opts.critic?.baselineMode ? { baselineMode: opts.critic.baselineMode } : {}),
311
+ spawn: opts.spawn,
312
+ ...(opts.agentTimeoutMs !== undefined ? { timeoutMs: opts.agentTimeoutMs } : {}),
313
+ ...(opts.harness ? { harness: opts.harness } : {}),
350
314
  });
351
- // (ii) THEN append the 否决缓冲 reject-buffer entry — durable on disk —
352
- // BEFORE the 演进智能体 EVOLVING AGENT is even called, so the entry written
353
- // THIS episode is in its fresh-from-disk prompt.
354
- const rejectEntry = {
355
- schemaVersion: 1,
356
- at: new Date().toISOString(),
357
- episodeId,
358
- targetId,
359
- // fromVersion = the version we rolled back TO (the prior good policy);
360
- // toVersion = the (now rolled-back) version the rejected edit reached.
361
- fromVersion: rollbackTarget,
362
- toVersion: headBeforeRollback,
363
- advantage,
364
- rewardMain: diagnosis.rewardMain,
365
- rewardBaseline: diagnosis.rewardBaseline,
366
- textualGradientTried: diagnosis.textualGradient ?? '',
367
- editSummary: buildRejectEditSummary(diagnosis),
368
- reason: 'bad-advantage',
369
- };
370
- await appendRejectBufferEntry(repoRoot, rejectEntry);
371
- decision = 'rolled-back';
372
- await advanceEpisodeStage({ repoRoot, episodeId, stage: 'rolled-back' });
373
315
  }
374
316
  else {
375
- // Good advantage, OR no earlier version to roll back to (e.g. head is v0):
376
- // keep the current head.
377
- decision = 'kept';
378
- await advanceEpisodeStage({ repoRoot, episodeId, stage: 'kept' });
317
+ baselineSkipped = true;
318
+ await advanceEpisodeStage({
319
+ repoRoot,
320
+ episodeId,
321
+ stage: 'baseline-skipped',
322
+ patch: { baselineSkippedReason: shouldCritic.reason },
323
+ });
379
324
  }
380
- // ── g: ONLY AFTER (f) persisted 'rolled-back'/'kept' ───────────────────────
381
- // runEvolvingAgent reads the reject-buffer FRESH from disk (the entry just
382
- // written THIS episode is in its prompt). Never parallelized with (f).
383
- // 步长: after a rollback, shrink the edit budget (smaller step after a step
384
- // that lost ground). 预测校准: pass the proposer's recent prediction record.
385
- const scheduledBudget = decision === 'rolled-back'
386
- ? scheduledEditBudget(await readPolicyLedger(repoRoot, targetId), editBudget)
387
- : editBudget;
388
- const calibrationNote = await summarizeCalibration(repoRoot, targetId);
389
- evolution = await runEvolvingAgent({
325
+ // ── e: 奖励智能体 REWARD AGENT — score + diagnosis.json + advance 'scored' ────
326
+ // ④ Compute the tamper hint first (unless tamperCheck is 'off'); the ensemble
327
+ // injects it into the judge prompt + the diagnosis integrity, and BLOCK mode
328
+ // forces an insufficient-signal verdict (no extra spawns at the default
329
+ // samples=1, flag-only).
330
+ const tamperMode = opts.reward?.tamperCheck ?? 'flag';
331
+ const integrityHint = tamperMode === 'off' ? null : await detectTestTamper({ changeDirPath: opts.changeDirPath });
332
+ const reward = await runRewardAgentEnsemble({
390
333
  repoRoot,
391
334
  episodeId,
392
- targetId,
393
- editBudget: scheduledBudget,
394
- ...(calibrationNote ? { calibrationNote } : {}),
395
335
  spawn: opts.spawn,
336
+ ...(opts.reward ? { reward: opts.reward } : {}),
337
+ integrityHint,
338
+ ...(opts.agentTimeoutMs !== undefined ? { timeoutMs: opts.agentTimeoutMs } : {}),
339
+ ...(opts.harness ? { harness: opts.harness } : {}),
396
340
  });
341
+ const diagnosis = reward.diagnosis;
342
+ advantage = diagnosis.advantage;
343
+ // 预测校准: settle the PRIOR 'evolve' step's checkable prediction against this
344
+ // episode's measured arm delta (verifiable per-metric main−baseline, NOT the
345
+ // judge's reward) and log the residual. Best-effort — a calibration miss must
346
+ // never fail the episode.
347
+ try {
348
+ await reconcilePrediction({ repoRoot, targetId, episodeId });
349
+ }
350
+ catch {
351
+ // best-effort: the prediction-reconcile ledger is advisory only
352
+ }
353
+ // ── f: DECISION (every step durably persisted before the next) ───────────────
354
+ if (shouldSkipEvolution(diagnosis)) {
355
+ // 弃权 abstained / no nameable gap / insufficient-signal → no rollback
356
+ // decision; SKIP evolution. A BLOCKED tamper records a reject-buffer entry so
357
+ // the gamed attempt is visible to future episodes' 演进智能体.
358
+ if (tamperMode === 'block' &&
359
+ diagnosis.integrity?.testTamperSuspected &&
360
+ diagnosis.verdict === 'insufficient-signal') {
361
+ const head = await currentPolicyVersion(repoRoot, targetId);
362
+ await appendRejectBufferEntry(repoRoot, {
363
+ schemaVersion: 1,
364
+ at: new Date().toISOString(),
365
+ episodeId,
366
+ targetId,
367
+ // No version moved (the main arm's tampered tests are not a policy edit);
368
+ // record at the current head so the entry is informational, not a rollback.
369
+ fromVersion: head ?? 0,
370
+ toVersion: head ?? 0,
371
+ advantage: diagnosis.advantage,
372
+ rewardMain: diagnosis.rewardMain,
373
+ rewardBaseline: diagnosis.rewardBaseline,
374
+ textualGradientTried: diagnosis.textualGradient ?? '',
375
+ editSummary: buildRejectEditSummary(diagnosis),
376
+ reason: 'tamper-suspected',
377
+ });
378
+ }
379
+ decision = 'abstained';
380
+ await advanceEpisodeStage({ repoRoot, episodeId, stage: 'abstained' });
381
+ }
382
+ else {
383
+ const badAdvantage = advantage !== null && advantage < threshold;
384
+ const ep = await readEpisode(repoRoot, episodeId);
385
+ const headBeforeRollback = await currentPolicyVersion(repoRoot, targetId);
386
+ // Resolve the rollback target: the policy the CRITIC AGENT reran
387
+ // (`policyVersionBaseline`) when it is a valid EARLIER version, else the
388
+ // version immediately before the head (the prior good policy the bad edit
389
+ // advanced past). `rollbackPolicyVersion` requires `toVersion < head`.
390
+ const rollbackTarget = resolveRollbackTarget(ep.policyVersionBaseline, headBeforeRollback);
391
+ if (badAdvantage && rollbackTarget !== null && headBeforeRollback !== null) {
392
+ // (i) ROLLBACK first — durable on disk. `advantage` is non-null inside the
393
+ // badAdvantage branch; `?? undefined` satisfies the optional `number` param.
394
+ await rollbackPolicyVersion({
395
+ repoRoot,
396
+ targetId,
397
+ episodeId,
398
+ toVersion: rollbackTarget,
399
+ advantage: advantage ?? undefined,
400
+ });
401
+ // (ii) THEN append the 否决缓冲 reject-buffer entry — durable on disk —
402
+ // BEFORE the 演进智能体 EVOLVING AGENT is even called, so the entry written
403
+ // THIS episode is in its fresh-from-disk prompt.
404
+ const rejectEntry = {
405
+ schemaVersion: 1,
406
+ at: new Date().toISOString(),
407
+ episodeId,
408
+ targetId,
409
+ // fromVersion = the version we rolled back TO (the prior good policy);
410
+ // toVersion = the (now rolled-back) version the rejected edit reached.
411
+ fromVersion: rollbackTarget,
412
+ toVersion: headBeforeRollback,
413
+ advantage,
414
+ rewardMain: diagnosis.rewardMain,
415
+ rewardBaseline: diagnosis.rewardBaseline,
416
+ textualGradientTried: diagnosis.textualGradient ?? '',
417
+ editSummary: buildRejectEditSummary(diagnosis),
418
+ reason: 'bad-advantage',
419
+ };
420
+ await appendRejectBufferEntry(repoRoot, rejectEntry);
421
+ decision = 'rolled-back';
422
+ await advanceEpisodeStage({ repoRoot, episodeId, stage: 'rolled-back' });
423
+ }
424
+ else {
425
+ // Good advantage, OR no earlier version to roll back to (e.g. head is v0):
426
+ // keep the current head.
427
+ decision = 'kept';
428
+ await advanceEpisodeStage({ repoRoot, episodeId, stage: 'kept' });
429
+ }
430
+ // ── g: ONLY AFTER (f) persisted 'rolled-back'/'kept' ───────────────────────
431
+ // runEvolvingAgent reads the reject-buffer FRESH from disk (the entry just
432
+ // written THIS episode is in its prompt). Never parallelized with (f).
433
+ // 步长: after a rollback, shrink the edit budget (smaller step after a step
434
+ // that lost ground). 预测校准: pass the proposer's recent prediction record.
435
+ const scheduledBudget = decision === 'rolled-back'
436
+ ? scheduledEditBudget(await readPolicyLedger(repoRoot, targetId), editBudget)
437
+ : editBudget;
438
+ const calibrationNote = await summarizeCalibration(repoRoot, targetId);
439
+ evolution = await runEvolvingAgent({
440
+ repoRoot,
441
+ episodeId,
442
+ targetId,
443
+ editBudget: scheduledBudget,
444
+ ...(calibrationNote ? { calibrationNote } : {}),
445
+ spawn: opts.spawn,
446
+ ...(opts.agentTimeoutMs !== undefined ? { timeoutMs: opts.agentTimeoutMs } : {}),
447
+ ...(opts.harness ? { harness: opts.harness } : {}),
448
+ });
449
+ }
450
+ }
451
+ catch (err) {
452
+ // A thrown agent step (d–g) records a DURABLE terminal 'errored' stage so the
453
+ // episode is never orphaned at 'kept'/'scored' (which is indistinguishable
454
+ // from a still-running episode — the ses_1330/1331 wsgidav orphan). The
455
+ // 'errored' stage is reachable from EVERY non-terminal stage; the patch
456
+ // carries the error text (prefixed `timeout:` when the throw was a host-agent
457
+ // timeout, so a timed-out episode is distinguishable from a hard crash on
458
+ // disk). Best-effort: a failed record write must not mask the original throw,
459
+ // which still propagates to runEpisode's finally (lock release).
460
+ await advanceEpisodeStage({
461
+ repoRoot,
462
+ episodeId,
463
+ stage: 'errored',
464
+ patch: { terminalError: terminalErrorLabel(err) },
465
+ }).catch(() => { });
466
+ throw err;
397
467
  }
398
468
  // ── h (stage half): advance 'closed' (best-effort) ───────────────────────────
399
469
  await closeEpisodeBestEffort(repoRoot, episodeId);
@@ -401,11 +471,25 @@ async function runEpisodeAfterCreate(opts) {
401
471
  return { episodeId, baselineSkipped, advantage, decision, evolution, newPolicyVersion };
402
472
  }
403
473
  /**
404
- * Advance the episode to 'closed' from whatever terminal-ish stage it reached
405
- * (evolved | evolution-refused | abstained), best-effort: a stage that cannot
406
- * legally reach 'closed' (e.g. the evolving agent was not-spawned, leaving the
407
- * episode at 'kept'/'rolled-back') is left as-is rather than throwing, so the
408
- * close never masks the real episode outcome.
474
+ * Advance the episode to 'closed' from whatever terminal-ish stage it reached,
475
+ * best-effort.
476
+ *
477
+ * Closable stages:
478
+ * - evolved | evolution-refused | abstained — the 演进智能体 EVOLVING AGENT
479
+ * reached a definite outcome (or the judge 弃权 abstained), the normal close.
480
+ * - kept | rolled-back — the 演进智能体 returned not-spawned (its diagnosis
481
+ * abstained-after-gap-check, no gaps, or the target resolved to no editable
482
+ * local files), so the episode never advanced past the decision. By the time
483
+ * this runs (AFTER runEvolvingAgent returned), a stage still at 'kept'/
484
+ * 'rolled-back' can ONLY mean not-spawned — a success advances 'evolved', a
485
+ * refusal advances 'evolution-refused', and a throw is caught upstream and
486
+ * records 'errored' + rethrows so this close is never reached. So a leftover
487
+ * kept/rolled-back at close time IS the finished-nothing-to-evolve case and
488
+ * must close, not rest forever at a non-terminal stage (the exact ambiguity
489
+ * the 'errored' stage was meant to remove).
490
+ *
491
+ * Any other (genuinely non-closable) stage is left as-is rather than throwing, so
492
+ * the close never masks the real episode outcome.
409
493
  */
410
494
  async function closeEpisodeBestEffort(repoRoot, episodeId) {
411
495
  const ep = await readEpisode(repoRoot, episodeId);
@@ -413,6 +497,9 @@ async function closeEpisodeBestEffort(repoRoot, episodeId) {
413
497
  'evolved',
414
498
  'evolution-refused',
415
499
  'abstained',
500
+ // not-spawned 演进智能体 leaves the episode here — close the finished episode.
501
+ 'kept',
502
+ 'rolled-back',
416
503
  ]);
417
504
  if (closable.has(ep.stage)) {
418
505
  await advanceEpisodeStage({ repoRoot, episodeId, stage: 'closed' });
@@ -501,6 +588,16 @@ async function ensureRejectBufferEntry(repoRoot, opts) {
501
588
  * - 'scored' → run the decision (f) then the 演进智能体 (g).
502
589
  * - 'rolled-back' / 'kept' → run the 演进智能体 EVOLVING AGENT (g) then close.
503
590
  * - 'evolved'/'evolution-refused'/'abstained' → close.
591
+ * - 'errored' → RE-DRIVE from the last GOOD pre-error stage
592
+ * (an episode may have errored on a TRANSIENT
593
+ * cause — a one-off git/analyzer/agent timeout).
594
+ * The pre-error stage is the last `stageHistory`
595
+ * entry that is NOT 'errored'; when it is one of
596
+ * {'scored','rolled-back','kept'} (the
597
+ * resume-entry stages) we advance errored → that
598
+ * stage and fall through to the normal dispatch.
599
+ * Otherwise the pre-error stage is not
600
+ * auto-resumable and the episode is reported as-is.
504
601
  * - earlier stages → not auto-resumable here (the arms / reward
505
602
  * agent need their own re-entry); reported as-is.
506
603
  *
@@ -516,78 +613,135 @@ export async function resumeEpisode(opts) {
516
613
  const resumedFrom = ep.stage;
517
614
  const targetId = ep.targetId;
518
615
  let evolution = null;
519
- if (ep.stage === 'scored') {
520
- // Re-run the decision (f) from the on-disk diagnosis, then (g).
521
- const diagnosis = await readDiagnosisForResume(repoRoot, episodeId);
522
- if (shouldSkipEvolution(diagnosis)) {
523
- await advanceEpisodeStage({ repoRoot, episodeId, stage: 'abstained' });
616
+ // The effective stage we dispatch on. Normally the episode's current stage;
617
+ // for an 'errored' episode we attempt to RE-DRIVE from the last good pre-error
618
+ // stage (a transient git/analyzer/agent failure should be retryable via an
619
+ // operator resume). 'errored' stays terminal for every OTHER caller — only this
620
+ // resume path may re-drive it, via the errored → {scored,rolled-back,kept}
621
+ // transitions the stage machine allows ONLY for operator recovery.
622
+ let stage = ep.stage;
623
+ if (ep.stage === 'errored') {
624
+ const preError = [...ep.stageHistory]
625
+ .reverse()
626
+ .find((h) => h.stage !== 'errored')?.stage;
627
+ if (preError === 'scored' ||
628
+ preError === 'rolled-back' ||
629
+ preError === 'kept') {
630
+ // Re-open the errored episode at its last auto-resumable stage, then fall
631
+ // through to the normal dispatch for that stage.
632
+ await advanceEpisodeStage({ repoRoot, episodeId, stage: preError });
633
+ stage = preError;
524
634
  }
525
- else if (diagnosis) {
526
- // (shouldSkipEvolution returns true for null, so diagnosis is non-null here)
527
- const advantage = diagnosis.advantage;
528
- const badAdvantage = advantage !== null && advantage < threshold;
529
- // Crash-resume dedup: a 'rollback' ledger entry whose episodeId is THIS
530
- // episode means runEpisode already applied the rollback before the host
531
- // crashed (the rollback head is monotonic — re-calling rollbackPolicyVersion
532
- // would stack a SECOND, duplicate rollback version). When present, reuse its
533
- // recorded version axis and SKIP the re-rollback; only ensure the
534
- // reject-buffer entry + the 'rolled-back' stage advance complete.
535
- const ledger = await readPolicyLedger(repoRoot, targetId);
536
- const priorRollback = ledger.find((e) => e.action === 'rollback' && e.episodeId === episodeId);
537
- if (priorRollback) {
538
- // The prior rollback already advanced to `priorRollback.version`, rolling
539
- // FORWARD to the content of the version immediately before the rejected
540
- // edit's head. Reconstruct the reject-buffer axis from that entry:
541
- // toVersion = the (rolled-back) version the rejected edit reached
542
- // = priorRollback.version - 1 (the head before the rollback)
543
- // fromVersion = the prior good policy restored (one before that head)
544
- const toVersion = priorRollback.version - 1;
545
- const fromVersion = resolveRollbackTarget(ep.policyVersionBaseline, toVersion);
546
- await ensureRejectBufferEntry(repoRoot, {
547
- episodeId,
548
- targetId,
549
- fromVersion: fromVersion ?? toVersion,
550
- toVersion,
551
- advantage,
552
- diagnosis,
553
- });
554
- await advanceEpisodeStage({ repoRoot, episodeId, stage: 'rolled-back' });
635
+ // Otherwise the pre-error stage is not auto-resumable (e.g. a reward throw at
636
+ // 'baseline-skipped'); leave the episode at 'errored' and report it as-is.
637
+ }
638
+ // The decision (f) + 演进智能体 EVOLVING AGENT (g) re-runs below can THROW — a
639
+ // wedged/crashed host CLI (CanonicalProposerInvocationError), a timeout, or an
640
+ // observed-GREEN gate throw. UNCAUGHT, that leaves the episode DURABLY stuck at
641
+ // a non-terminal stage ('scored'/'rolled-back'/'kept' — the orphan state the fix
642
+ // eliminates for runEpisode). Record the SAME terminal 'errored' stage here
643
+ // (the transition map already allows scored/rolled-back/kept → 'errored'), then
644
+ // re-throw. Resume holds NO in-flight lock, so this is a durable-stage fix, not
645
+ // a leak fix. Best-effort write: a failed record must not mask the original throw.
646
+ try {
647
+ if (stage === 'scored') {
648
+ // Re-run the decision (f) from the on-disk diagnosis, then (g).
649
+ const diagnosis = await readDiagnosisForResume(repoRoot, episodeId);
650
+ if (shouldSkipEvolution(diagnosis)) {
651
+ await advanceEpisodeStage({ repoRoot, episodeId, stage: 'abstained' });
555
652
  }
556
- else {
557
- const headBeforeRollback = await currentPolicyVersion(repoRoot, targetId);
558
- const rollbackTarget = resolveRollbackTarget(ep.policyVersionBaseline, headBeforeRollback);
559
- if (badAdvantage && rollbackTarget !== null && headBeforeRollback !== null) {
560
- await rollbackPolicyVersion({
561
- repoRoot,
562
- targetId,
563
- episodeId,
564
- toVersion: rollbackTarget,
565
- advantage: advantage ?? undefined,
566
- });
653
+ else if (diagnosis) {
654
+ // (shouldSkipEvolution returns true for null, so diagnosis is non-null here)
655
+ const advantage = diagnosis.advantage;
656
+ const badAdvantage = advantage !== null && advantage < threshold;
657
+ // Crash-resume dedup: a 'rollback' ledger entry whose episodeId is THIS
658
+ // episode means runEpisode already applied the rollback before the host
659
+ // crashed (the rollback head is monotonic — re-calling rollbackPolicyVersion
660
+ // would stack a SECOND, duplicate rollback version). When present, reuse its
661
+ // recorded version axis and SKIP the re-rollback; only ensure the
662
+ // reject-buffer entry + the 'rolled-back' stage advance complete.
663
+ const ledger = await readPolicyLedger(repoRoot, targetId);
664
+ const priorRollback = ledger.find((e) => e.action === 'rollback' && e.episodeId === episodeId);
665
+ if (priorRollback) {
666
+ // The prior rollback already advanced to `priorRollback.version`, rolling
667
+ // FORWARD to the content of the version immediately before the rejected
668
+ // edit's head. Reconstruct the reject-buffer axis from that entry:
669
+ // toVersion = the (rolled-back) version the rejected edit reached
670
+ // = priorRollback.version - 1 (the head before the rollback)
671
+ // fromVersion = the prior good policy restored (one before that head)
672
+ const toVersion = priorRollback.version - 1;
673
+ const fromVersion = resolveRollbackTarget(ep.policyVersionBaseline, toVersion);
567
674
  await ensureRejectBufferEntry(repoRoot, {
568
675
  episodeId,
569
676
  targetId,
570
- fromVersion: rollbackTarget,
571
- toVersion: headBeforeRollback,
677
+ fromVersion: fromVersion ?? toVersion,
678
+ toVersion,
572
679
  advantage,
573
680
  diagnosis,
574
681
  });
575
682
  await advanceEpisodeStage({ repoRoot, episodeId, stage: 'rolled-back' });
576
683
  }
577
684
  else {
578
- await advanceEpisodeStage({ repoRoot, episodeId, stage: 'kept' });
685
+ const headBeforeRollback = await currentPolicyVersion(repoRoot, targetId);
686
+ const rollbackTarget = resolveRollbackTarget(ep.policyVersionBaseline, headBeforeRollback);
687
+ if (badAdvantage && rollbackTarget !== null && headBeforeRollback !== null) {
688
+ await rollbackPolicyVersion({
689
+ repoRoot,
690
+ targetId,
691
+ episodeId,
692
+ toVersion: rollbackTarget,
693
+ advantage: advantage ?? undefined,
694
+ });
695
+ await ensureRejectBufferEntry(repoRoot, {
696
+ episodeId,
697
+ targetId,
698
+ fromVersion: rollbackTarget,
699
+ toVersion: headBeforeRollback,
700
+ advantage,
701
+ diagnosis,
702
+ });
703
+ await advanceEpisodeStage({ repoRoot, episodeId, stage: 'rolled-back' });
704
+ }
705
+ else {
706
+ await advanceEpisodeStage({ repoRoot, episodeId, stage: 'kept' });
707
+ }
708
+ }
709
+ // 预测校准 (idempotent if the original run already settled it) + 步长 schedule
710
+ // + calibration note, mirroring runEpisode's (g) step.
711
+ try {
712
+ await reconcilePrediction({ repoRoot, targetId, episodeId });
579
713
  }
714
+ catch {
715
+ // best-effort: advisory only
716
+ }
717
+ const afterDecision = await readEpisode(repoRoot, episodeId);
718
+ const scheduledBudget = afterDecision.stage === 'rolled-back'
719
+ ? scheduledEditBudget(await readPolicyLedger(repoRoot, targetId), editBudget)
720
+ : editBudget;
721
+ const calibrationNote = await summarizeCalibration(repoRoot, targetId);
722
+ evolution = await runEvolvingAgent({
723
+ repoRoot,
724
+ episodeId,
725
+ targetId,
726
+ editBudget: scheduledBudget,
727
+ ...(calibrationNote ? { calibrationNote } : {}),
728
+ spawn: opts.spawn,
729
+ ...(opts.agentTimeoutMs !== undefined ? { timeoutMs: opts.agentTimeoutMs } : {}),
730
+ ...(opts.harness ? { harness: opts.harness } : {}),
731
+ });
580
732
  }
581
- // 预测校准 (idempotent if the original run already settled it) + 步长 schedule
582
- // + calibration note, mirroring runEpisode's (g) step.
733
+ await closeEpisodeBestEffort(repoRoot, episodeId);
734
+ }
735
+ else if (stage === 'rolled-back' || stage === 'kept') {
736
+ // The decision already ran (and the original episode settled the prediction);
737
+ // re-settle idempotently for the crash window, then schedule + calibrate.
583
738
  try {
584
739
  await reconcilePrediction({ repoRoot, targetId, episodeId });
585
740
  }
586
741
  catch {
587
742
  // best-effort: advisory only
588
743
  }
589
- const afterDecision = await readEpisode(repoRoot, episodeId);
590
- const scheduledBudget = afterDecision.stage === 'rolled-back'
744
+ const scheduledBudget = stage === 'rolled-back'
591
745
  ? scheduledEditBudget(await readPolicyLedger(repoRoot, targetId), editBudget)
592
746
  : editBudget;
593
747
  const calibrationNote = await summarizeCalibration(repoRoot, targetId);
@@ -598,39 +752,34 @@ export async function resumeEpisode(opts) {
598
752
  editBudget: scheduledBudget,
599
753
  ...(calibrationNote ? { calibrationNote } : {}),
600
754
  spawn: opts.spawn,
755
+ ...(opts.agentTimeoutMs !== undefined ? { timeoutMs: opts.agentTimeoutMs } : {}),
756
+ ...(opts.harness ? { harness: opts.harness } : {}),
601
757
  });
758
+ await closeEpisodeBestEffort(repoRoot, episodeId);
602
759
  }
603
- await closeEpisodeBestEffort(repoRoot, episodeId);
604
- }
605
- else if (ep.stage === 'rolled-back' || ep.stage === 'kept') {
606
- // The decision already ran (and the original episode settled the prediction);
607
- // re-settle idempotently for the crash window, then schedule + calibrate.
608
- try {
609
- await reconcilePrediction({ repoRoot, targetId, episodeId });
610
- }
611
- catch {
612
- // best-effort: advisory only
760
+ else if (stage === 'evolved' ||
761
+ stage === 'evolution-refused' ||
762
+ stage === 'abstained') {
763
+ await closeEpisodeBestEffort(repoRoot, episodeId);
613
764
  }
614
- const scheduledBudget = ep.stage === 'rolled-back'
615
- ? scheduledEditBudget(await readPolicyLedger(repoRoot, targetId), editBudget)
616
- : editBudget;
617
- const calibrationNote = await summarizeCalibration(repoRoot, targetId);
618
- evolution = await runEvolvingAgent({
765
+ // earlier stages (and a non-auto-resumable 'errored'): not auto-resumable here
766
+ // reported as-is.
767
+ }
768
+ catch (err) {
769
+ // A thrown decision/evolving step records a DURABLE terminal 'errored' stage so
770
+ // the resumed episode is never left stuck at 'scored'/'rolled-back'/'kept'
771
+ // (indistinguishable from a still-running episode). Mirrors
772
+ // runEpisodeAfterCreate's catch — including the `timeout:` marker so a timed-out
773
+ // resume is distinguishable from a hard crash. Best-effort: a failed record must
774
+ // not mask the original throw, which still propagates to the caller.
775
+ await advanceEpisodeStage({
619
776
  repoRoot,
620
777
  episodeId,
621
- targetId,
622
- editBudget: scheduledBudget,
623
- ...(calibrationNote ? { calibrationNote } : {}),
624
- spawn: opts.spawn,
625
- });
626
- await closeEpisodeBestEffort(repoRoot, episodeId);
627
- }
628
- else if (ep.stage === 'evolved' ||
629
- ep.stage === 'evolution-refused' ||
630
- ep.stage === 'abstained') {
631
- await closeEpisodeBestEffort(repoRoot, episodeId);
778
+ stage: 'errored',
779
+ patch: { terminalError: terminalErrorLabel(err) },
780
+ }).catch(() => { });
781
+ throw err;
632
782
  }
633
- // earlier stages: not auto-resumable here — reported as-is.
634
783
  const after = await readEpisode(repoRoot, episodeId);
635
784
  return { episodeId, resumedFrom, stage: after.stage, evolution };
636
785
  }