synergyspec-selfevolving 2.1.2 → 2.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/dist/commands/learn.js +13 -3
  2. package/dist/commands/self-evolution-episode.d.ts +6 -1
  3. package/dist/commands/self-evolution-episode.js +8 -1
  4. package/dist/commands/self-evolution.d.ts +2 -2
  5. package/dist/commands/self-evolution.js +10 -10
  6. package/dist/commands/workflow/status.js +5 -0
  7. package/dist/core/change-readiness.d.ts +1 -1
  8. package/dist/core/change-readiness.js +66 -11
  9. package/dist/core/fitness/test-metrics.d.ts +33 -0
  10. package/dist/core/fitness/test-metrics.js +67 -0
  11. package/dist/core/learn.js +11 -2
  12. package/dist/core/project-config.d.ts +3 -0
  13. package/dist/core/project-config.js +7 -1
  14. package/dist/core/self-evolution/critic-agent.js +13 -5
  15. package/dist/core/self-evolution/edits-contract.d.ts +15 -5
  16. package/dist/core/self-evolution/edits-contract.js +26 -16
  17. package/dist/core/self-evolution/episode-orchestrator.d.ts +16 -9
  18. package/dist/core/self-evolution/episode-orchestrator.js +126 -35
  19. package/dist/core/self-evolution/episode-store.d.ts +34 -11
  20. package/dist/core/self-evolution/episode-store.js +45 -10
  21. package/dist/core/self-evolution/evolving-agent.d.ts +12 -12
  22. package/dist/core/self-evolution/evolving-agent.js +46 -48
  23. package/dist/core/self-evolution/host-harness.d.ts +68 -2
  24. package/dist/core/self-evolution/host-harness.js +208 -21
  25. package/dist/core/self-evolution/policy/policy-store.d.ts +8 -6
  26. package/dist/core/self-evolution/policy/policy-store.js +124 -24
  27. package/dist/core/self-evolution/proposer-slice.d.ts +4 -3
  28. package/dist/core/self-evolution/reward-agent.d.ts +11 -1
  29. package/dist/core/self-evolution/reward-agent.js +53 -20
  30. package/dist/core/self-evolution/reward-aggregator.d.ts +18 -0
  31. package/dist/core/self-evolution/reward-aggregator.js +53 -3
  32. package/dist/core/self-evolution/reward-deepread.d.ts +64 -0
  33. package/dist/core/self-evolution/reward-deepread.js +112 -0
  34. package/dist/core/templates/workflows/learn.js +3 -2
  35. package/dist/core/templates/workflows/self-evolving.js +5 -2
  36. package/dist/core/trajectory/facts.d.ts +69 -2
  37. package/dist/core/trajectory/facts.js +179 -10
  38. package/dist/core/trajectory/skeleton.d.ts +10 -0
  39. package/dist/core/trajectory/skeleton.js +24 -3
  40. package/package.json +4 -3
  41. package/schemas/spec-driven/templates/design.md +2 -1
@@ -14,26 +14,36 @@
14
14
  * canonical file.
15
15
  */
16
16
  import { GATE_DEFINING_FILES } from './candidate-gates.js';
17
- export class CanonicalProposerOutputInvalid extends Error {
17
+ export class EvolvingAgentOutputInvalid extends Error {
18
18
  constructor(message) {
19
- super(`canonical proposer output invalid: ${message}`);
20
- this.name = 'CanonicalProposerOutputInvalid';
19
+ super(`evolving agent output invalid: ${message}`);
20
+ this.name = 'EvolvingAgentOutputInvalid';
21
21
  }
22
22
  }
23
23
  /** The model declined to edit anything (empty edits). Not an error — a no-op. */
24
- export class CanonicalProposerNoOp extends Error {
24
+ export class EvolvingAgentNoOp extends Error {
25
25
  constructor() {
26
- super('canonical proposer returned no edits');
27
- this.name = 'CanonicalProposerNoOp';
26
+ super('evolving agent returned no edits');
27
+ this.name = 'EvolvingAgentNoOp';
28
28
  }
29
29
  }
30
30
  /** The headless agent invocation itself failed (crash / empty output). */
31
- export class CanonicalProposerInvocationError extends Error {
31
+ export class EvolvingAgentInvocationError extends Error {
32
32
  constructor(stderr) {
33
- super(`canonical proposer invocation failed: ${stderr}`);
34
- this.name = 'CanonicalProposerInvocationError';
33
+ super(`evolving agent invocation failed: ${stderr}`);
34
+ this.name = 'EvolvingAgentInvocationError';
35
35
  }
36
36
  }
37
+ /**
38
+ * @deprecated v2.0.0 removed the GA "canonical proposer"; these names are
39
+ * retained only as transitional aliases for any external importer. Use the
40
+ * `EvolvingAgent*` classes — they are the same constructors.
41
+ */
42
+ export const CanonicalProposerOutputInvalid = EvolvingAgentOutputInvalid;
43
+ /** @deprecated alias of {@link EvolvingAgentNoOp}. */
44
+ export const CanonicalProposerNoOp = EvolvingAgentNoOp;
45
+ /** @deprecated alias of {@link EvolvingAgentInvocationError}. */
46
+ export const CanonicalProposerInvocationError = EvolvingAgentInvocationError;
37
47
  /**
38
48
  * Validate already-structured candidate edits against the allowed (target-
39
49
  * scoped) file set and the frozen gate-defining files. Author-agnostic: this is
@@ -44,33 +54,33 @@ export class CanonicalProposerInvocationError extends Error {
44
54
  * the loop-v2 演进智能体 EVOLVING AGENT call this so their safety contract is
45
55
  * byte-identical. relPaths are normalized to POSIX separators.
46
56
  *
47
- * Throws {@link CanonicalProposerNoOp} when `rawEdits` is empty and
48
- * {@link CanonicalProposerOutputInvalid} for any shape / frozen / scope
57
+ * Throws {@link EvolvingAgentNoOp} when `rawEdits` is empty and
58
+ * {@link EvolvingAgentOutputInvalid} for any shape / frozen / scope
49
59
  * violation. Path traversal and absolute paths are rejected transitively: they
50
60
  * can never be a member of `allowedFiles`, so they fail the scope check.
51
61
  */
52
62
  export function validateCandidateEdits(rawEdits, allowedFiles) {
53
63
  if (rawEdits.length === 0) {
54
- throw new CanonicalProposerNoOp();
64
+ throw new EvolvingAgentNoOp();
55
65
  }
56
66
  const allowed = new Set(allowedFiles.map((p) => p.replace(/\\/g, '/')));
57
67
  const frozen = new Set(GATE_DEFINING_FILES.map((p) => p.replace(/\\/g, '/')));
58
68
  const validated = [];
59
69
  for (const e of rawEdits) {
60
70
  if (!e || typeof e !== 'object') {
61
- throw new CanonicalProposerOutputInvalid('edit entry must be an object');
71
+ throw new EvolvingAgentOutputInvalid('edit entry must be an object');
62
72
  }
63
73
  const relPath = e.relPath;
64
74
  const content = e.content;
65
75
  if (typeof relPath !== 'string' || typeof content !== 'string') {
66
- throw new CanonicalProposerOutputInvalid('edit must have string relPath and string content');
76
+ throw new EvolvingAgentOutputInvalid('edit must have string relPath and string content');
67
77
  }
68
78
  const norm = relPath.replace(/\\/g, '/');
69
79
  if (frozen.has(norm)) {
70
- throw new CanonicalProposerOutputInvalid(`edit relPath "${relPath}" is a gate-defining/frozen file and may never be proposed`);
80
+ throw new EvolvingAgentOutputInvalid(`edit relPath "${relPath}" is a gate-defining/frozen file and may never be proposed`);
71
81
  }
72
82
  if (!allowed.has(norm)) {
73
- throw new CanonicalProposerOutputInvalid(`edit relPath "${relPath}" is outside the target's declared files`);
83
+ throw new EvolvingAgentOutputInvalid(`edit relPath "${relPath}" is outside the target's declared files`);
74
84
  }
75
85
  validated.push({ relPath: norm, content });
76
86
  }
@@ -28,17 +28,22 @@
28
28
  * reject-buffer entry — BOTH durably on
29
29
  * disk — THEN advance 'rolled-back'.
30
30
  * - otherwise → advance 'kept'.
31
- * g. 演进智能体 EVOLVING AGENT — ONLY after (f) persisted: runEvolvingAgent
32
- * (optimizer.step) reads the reject-buffer FRESH from disk
33
- * (so THIS episode's just-written entry is
34
- * in its prompt) and either not-spawned /
31
+ * g. 演进智能体 EVOLVING AGENT — ONLY after (f) persisted: require the
32
+ * (optimizer.step) main-arm observed-GREEN evidence, then
33
+ * advance the 'evolving' marker
34
+ * (heartbeat) so a concurrent sibling
35
+ * sees a live holder, then
36
+ * runEvolvingAgent reads the
37
+ * reject-buffer FRESH from disk (so THIS
38
+ * episode's just-written entry is in its
39
+ * prompt) and either not-spawned /
35
40
  * refused / evolved.
36
41
  * h. advance 'closed' + releaseInFlight — ALWAYS, even on error.
37
42
  *
38
43
  * ORDERING GUARANTEE: the rollback + reject-buffer write are SEQUENTIAL awaits
39
- * that BOTH complete (and the stage reads 'rolled-back'/'kept') before
40
- * {@link runEvolvingAgent} is even called. (f) and (g) are never parallelized
41
- * and never share a Promise.all.
44
+ * that BOTH complete (and the stage reads 'rolled-back'/'kept') before the
45
+ * observed-GREEN preflight and {@link runEvolvingAgent}. (f) and (g) are never
46
+ * parallelized and never share a Promise.all.
42
47
  *
43
48
  * This module orchestrates; it never spawns an agent itself — the three agents
44
49
  * own their own {@link runHeadlessAgent} spawns (the `spawn` seam threads to all
@@ -239,14 +244,16 @@ export interface ResumeEpisodeResult {
239
244
  * done step rather than re-advancing a stage already entered:
240
245
  *
241
246
  * - 'scored' → run the decision (f) then the 演进智能体 (g).
242
- * - 'rolled-back' / 'kept' → run the 演进智能体 EVOLVING AGENT (g) then close.
247
+ * - 'rolled-back' / 'kept' / 'evolving' → run the 演进智能体 EVOLVING AGENT (g)
248
+ * then close. ('evolving' means a crash AFTER the
249
+ * marker but before the agent settled an outcome.)
243
250
  * - 'evolved'/'evolution-refused'/'abstained' → close.
244
251
  * - 'errored' → RE-DRIVE from the last GOOD pre-error stage
245
252
  * (an episode may have errored on a TRANSIENT
246
253
  * cause — a one-off git/analyzer/agent timeout).
247
254
  * The pre-error stage is the last `stageHistory`
248
255
  * entry that is NOT 'errored'; when it is one of
249
- * {'scored','rolled-back','kept'} (the
256
+ * {'scored','rolled-back','kept','evolving'} (the
250
257
  * resume-entry stages) we advance errored → that
251
258
  * stage and fall through to the normal dispatch.
252
259
  * Otherwise the pre-error stage is not
@@ -1,3 +1,5 @@
1
+ import { promises as fs } from 'node:fs';
2
+ import * as path from 'node:path';
1
3
  import { toActionSkeleton } from '../trajectory/skeleton.js';
2
4
  import { getTrajectoryForChange } from '../trajectory/registry.js';
3
5
  import { acquireInFlight, releaseInFlight, currentPolicyVersion, readPolicyLedger, initPolicyLineage, rollbackPolicyVersion, } from './policy/policy-store.js';
@@ -6,7 +8,7 @@ import { createEpisode, advanceEpisodeStage, writeArmCapture, readEpisode, episo
6
8
  import { shouldRunCriticAgent, runCriticAgent, } from './critic-agent.js';
7
9
  import { runRewardAgentEnsemble } from './reward-aggregator.js';
8
10
  import { detectTestTamper } from './tamper-check.js';
9
- import { runEvolvingAgent, DEFAULT_EVOLVING_AGENT_EDIT_BUDGET, MIN_EVOLVING_AGENT_EDIT_BUDGET, } from './evolving-agent.js';
11
+ import { runEvolvingAgent, DEFAULT_EVOLVING_AGENT_EDIT_BUDGET, MIN_EVOLVING_AGENT_EDIT_BUDGET, isArmObjectiveGreen, } from './evolving-agent.js';
10
12
  import { reconcilePrediction, summarizeCalibration, } from './policy/prediction-reconcile.js';
11
13
  /**
12
14
  * Build the 主智能体 MAIN AGENT arm `{transcript?, skeleton?, objective}` from an
@@ -183,13 +185,65 @@ function deriveEpisodeId(changeName, now) {
183
185
  /**
184
186
  * Build the `terminalError` note for a thrown step. A timeout reads identically
185
187
  * to a hard crash on disk otherwise, so a message that names a host-agent timeout
186
- * (the spawn timeout puts `headless agent timed out after Nms` into stderr the
187
- * error message) is PREFIXED with a `timeout:` marker. A timed-out episode is
188
- * then distinguishable from a genuine crash in episode.json. Pure.
188
+ * (absolute wall: `headless agent timed out after Nms`; idle wall: `idle timeout`)
189
+ * is PREFIXED with a `timeout:` marker. A timed-out episode is then
190
+ * distinguishable from a genuine crash in episode.json. Pure.
189
191
  */
190
192
  function terminalErrorLabel(err) {
191
193
  const msg = err instanceof Error ? err.message : String(err);
192
- return /timed out/i.test(msg) ? `timeout: ${msg}` : msg;
194
+ return /\b(timed out|idle timeout)\b/i.test(msg) ? `timeout: ${msg}` : msg;
195
+ }
196
+ function observedGreenFailureReason(objective) {
197
+ if (!objective) {
198
+ return 'observed-GREEN gate: main-arm/objective.json is missing or unreadable - cannot confirm a verified green run';
199
+ }
200
+ const evidence = isArmObjectiveGreen(objective);
201
+ return evidence.ok ? null : `observed-GREEN gate failed: ${evidence.reason}`;
202
+ }
203
+ async function readMainArmObjectiveForEpisode(repoRoot, episodeId) {
204
+ const file = path.join(episodeDir(repoRoot, episodeId), 'main-arm', 'objective.json');
205
+ let raw;
206
+ try {
207
+ raw = await fs.readFile(file, 'utf8');
208
+ }
209
+ catch (err) {
210
+ if (err.code === 'ENOENT')
211
+ return null;
212
+ throw err;
213
+ }
214
+ try {
215
+ return JSON.parse(raw);
216
+ }
217
+ catch {
218
+ return null;
219
+ }
220
+ }
221
+ async function runEvolvingStepIfObservedGreen(opts) {
222
+ const objective = Object.prototype.hasOwnProperty.call(opts, 'objective')
223
+ ? (opts.objective ?? null)
224
+ : await readMainArmObjectiveForEpisode(opts.repoRoot, opts.episodeId);
225
+ const gateFailure = observedGreenFailureReason(objective);
226
+ if (gateFailure) {
227
+ return { kind: 'not-spawned', reason: gateFailure };
228
+ }
229
+ if (opts.markEvolving) {
230
+ await advanceEpisodeStage({
231
+ repoRoot: opts.repoRoot,
232
+ episodeId: opts.episodeId,
233
+ stage: 'evolving',
234
+ patch: { evolvingHeartbeatAt: new Date().toISOString() },
235
+ });
236
+ }
237
+ return await runEvolvingAgent({
238
+ repoRoot: opts.repoRoot,
239
+ episodeId: opts.episodeId,
240
+ targetId: opts.targetId,
241
+ editBudget: opts.editBudget,
242
+ ...(opts.calibrationNote ? { calibrationNote: opts.calibrationNote } : {}),
243
+ spawn: opts.spawn,
244
+ ...(opts.agentTimeoutMs !== undefined ? { timeoutMs: opts.agentTimeoutMs } : {}),
245
+ ...(opts.harness ? { harness: opts.harness } : {}),
246
+ });
193
247
  }
194
248
  /**
195
249
  * Run ONE episode through the loop in the strict, durably-persisted order
@@ -431,12 +485,13 @@ async function runEpisodeAfterCreate(opts) {
431
485
  // runEvolvingAgent reads the reject-buffer FRESH from disk (the entry just
432
486
  // written THIS episode is in its prompt). Never parallelized with (f).
433
487
  // 步长: after a rollback, shrink the edit budget (smaller step after a step
434
- // that lost ground). 预测校准: pass the proposer's recent prediction record.
488
+ // that lost ground). 预测校准: pass the 演进智能体 EVOLVING AGENT's recent
489
+ // prediction record.
435
490
  const scheduledBudget = decision === 'rolled-back'
436
491
  ? scheduledEditBudget(await readPolicyLedger(repoRoot, targetId), editBudget)
437
492
  : editBudget;
438
493
  const calibrationNote = await summarizeCalibration(repoRoot, targetId);
439
- evolution = await runEvolvingAgent({
494
+ evolution = await runEvolvingStepIfObservedGreen({
440
495
  repoRoot,
441
496
  episodeId,
442
497
  targetId,
@@ -445,6 +500,8 @@ async function runEpisodeAfterCreate(opts) {
445
500
  spawn: opts.spawn,
446
501
  ...(opts.agentTimeoutMs !== undefined ? { timeoutMs: opts.agentTimeoutMs } : {}),
447
502
  ...(opts.harness ? { harness: opts.harness } : {}),
503
+ markEvolving: true,
504
+ objective: opts.mainArm.objective,
448
505
  });
449
506
  }
450
507
  }
@@ -477,16 +534,19 @@ async function runEpisodeAfterCreate(opts) {
477
534
  * Closable stages:
478
535
  * - evolved | evolution-refused | abstained — the 演进智能体 EVOLVING AGENT
479
536
  * reached a definite outcome (or the judge 弃权 abstained), the normal close.
480
- * - kept | rolled-back — the 演进智能体 returned not-spawned (its diagnosis
537
+ * - evolving — the 演进智能体 returned not-spawned (its diagnosis
481
538
  * abstained-after-gap-check, no gaps, or the target resolved to no editable
482
- * local files), so the episode never advanced past the decision. By the time
483
- * this runs (AFTER runEvolvingAgent returned), a stage still at 'kept'/
484
- * 'rolled-back' can ONLY mean not-spawned — a success advances 'evolved', a
539
+ * local files), so the episode never advanced past the 'evolving' marker.
540
+ * By the time this runs (AFTER runEvolvingAgent returned), a stage still at
541
+ * 'evolving' can ONLY mean not-spawned — a success advances 'evolved', a
485
542
  * refusal advances 'evolution-refused', and a throw is caught upstream and
486
543
  * records 'errored' + rethrows so this close is never reached. So a leftover
487
- * kept/rolled-back at close time IS the finished-nothing-to-evolve case and
488
- * must close, not rest forever at a non-terminal stage (the exact ambiguity
489
- * the 'errored' stage was meant to remove).
544
+ * 'evolving' at close time IS the finished-nothing-to-evolve case and must
545
+ * close, not rest forever at a non-terminal stage (the exact ambiguity the
546
+ * 'errored' stage was meant to remove).
547
+ * - kept | rolled-back — retained for back-compat: an OLD episode record (or a
548
+ * code path that did not advance the 'evolving' marker) that returned
549
+ * not-spawned never advances past the decision; close it the same way.
490
550
  *
491
551
  * Any other (genuinely non-closable) stage is left as-is rather than throwing, so
492
552
  * the close never masks the real episode outcome.
@@ -497,7 +557,10 @@ async function closeEpisodeBestEffort(repoRoot, episodeId) {
497
557
  'evolved',
498
558
  'evolution-refused',
499
559
  'abstained',
500
- // not-spawned 演进智能体 leaves the episode here close the finished episode.
560
+ // not-spawned 演进智能体 leaves the episode at the 'evolving' marker — close
561
+ // the finished episode. 'kept'/'rolled-back' retained for back-compat with
562
+ // an old record / a path that never advanced the marker.
563
+ 'evolving',
501
564
  'kept',
502
565
  'rolled-back',
503
566
  ]);
@@ -586,14 +649,16 @@ async function ensureRejectBufferEntry(repoRoot, opts) {
586
649
  * done step rather than re-advancing a stage already entered:
587
650
  *
588
651
  * - 'scored' → run the decision (f) then the 演进智能体 (g).
589
- * - 'rolled-back' / 'kept' → run the 演进智能体 EVOLVING AGENT (g) then close.
652
+ * - 'rolled-back' / 'kept' / 'evolving' → run the 演进智能体 EVOLVING AGENT (g)
653
+ * then close. ('evolving' means a crash AFTER the
654
+ * marker but before the agent settled an outcome.)
590
655
  * - 'evolved'/'evolution-refused'/'abstained' → close.
591
656
  * - 'errored' → RE-DRIVE from the last GOOD pre-error stage
592
657
  * (an episode may have errored on a TRANSIENT
593
658
  * cause — a one-off git/analyzer/agent timeout).
594
659
  * The pre-error stage is the last `stageHistory`
595
660
  * entry that is NOT 'errored'; when it is one of
596
- * {'scored','rolled-back','kept'} (the
661
+ * {'scored','rolled-back','kept','evolving'} (the
597
662
  * resume-entry stages) we advance errored → that
598
663
  * stage and fall through to the normal dispatch.
599
664
  * Otherwise the pre-error stage is not
@@ -617,7 +682,7 @@ export async function resumeEpisode(opts) {
617
682
  // for an 'errored' episode we attempt to RE-DRIVE from the last good pre-error
618
683
  // stage (a transient git/analyzer/agent failure should be retryable via an
619
684
  // operator resume). 'errored' stays terminal for every OTHER caller — only this
620
- // resume path may re-drive it, via the errored → {scored,rolled-back,kept}
685
+ // resume path may re-drive it, via the errored → {scored,rolled-back,kept,evolving}
621
686
  // transitions the stage machine allows ONLY for operator recovery.
622
687
  let stage = ep.stage;
623
688
  if (ep.stage === 'errored') {
@@ -626,7 +691,8 @@ export async function resumeEpisode(opts) {
626
691
  .find((h) => h.stage !== 'errored')?.stage;
627
692
  if (preError === 'scored' ||
628
693
  preError === 'rolled-back' ||
629
- preError === 'kept') {
694
+ preError === 'kept' ||
695
+ preError === 'evolving') {
630
696
  // Re-open the errored episode at its last auto-resumable stage, then fall
631
697
  // through to the normal dispatch for that stage.
632
698
  await advanceEpisodeStage({ repoRoot, episodeId, stage: preError });
@@ -636,7 +702,7 @@ export async function resumeEpisode(opts) {
636
702
  // 'baseline-skipped'); leave the episode at 'errored' and report it as-is.
637
703
  }
638
704
  // The decision (f) + 演进智能体 EVOLVING AGENT (g) re-runs below can THROW — a
639
- // wedged/crashed host CLI (CanonicalProposerInvocationError), a timeout, or an
705
+ // wedged/crashed host CLI (EvolvingAgentInvocationError), a timeout, or an
640
706
  // observed-GREEN gate throw. UNCAUGHT, that leaves the episode DURABLY stuck at
641
707
  // a non-terminal stage ('scored'/'rolled-back'/'kept'/'evolving' — the orphan state fix ❷
642
708
  // eliminates for runEpisode). Record the SAME terminal 'errored' stage here
@@ -644,7 +710,31 @@ export async function resumeEpisode(opts) {
644
710
  // re-throw. Resume holds NO in-flight lock, so this is a durable-stage fix, not
645
711
  // a leak fix. Best-effort write: a failed record must not mask the original throw.
646
712
  try {
647
- if (stage === 'scored') {
713
+ // TOCTOU guard: resume read the stage at entry (`ep.stage`, above), but it holds NO in-flight
714
+ // lock, so a CONCURRENT runEpisode for the same target can advance THIS episode to
715
+ // a TERMINAL stage between that read and the transitions below. Re-read the episode
716
+ // immediately before dispatching; if it is already finished, the transitions would
717
+ // throw an illegal-transition error (which the catch below would then mis-record as
718
+ // a fresh 'errored'). Short-circuit instead: report the already-finished episode
719
+ // via the normal completion return. (The errored→pre-error re-drive above already
720
+ // turned a re-drivable 'errored' into a non-terminal stage, so a stage that is
721
+ // STILL terminal here is genuinely finished, not auto-resumable.)
722
+ const TERMINAL_STAGES = new Set([
723
+ 'closed',
724
+ 'errored',
725
+ 'evolution-refused',
726
+ 'evolved',
727
+ 'abstained',
728
+ ]);
729
+ const fresh = await readEpisode(repoRoot, episodeId);
730
+ stage = fresh.stage;
731
+ if (TERMINAL_STAGES.has(stage)) {
732
+ // 'evolved'/'evolution-refused'/'abstained' still want their best-effort close;
733
+ // 'closed'/'errored' are no-ops for closeEpisodeBestEffort. No transition is
734
+ // attempted, so the race cannot surface as an illegal-transition throw.
735
+ await closeEpisodeBestEffort(repoRoot, episodeId);
736
+ }
737
+ else if (stage === 'scored') {
648
738
  // Re-run the decision (f) from the on-disk diagnosis, then (g).
649
739
  const diagnosis = await readDiagnosisForResume(repoRoot, episodeId);
650
740
  if (shouldSkipEvolution(diagnosis)) {
@@ -719,7 +809,7 @@ export async function resumeEpisode(opts) {
719
809
  ? scheduledEditBudget(await readPolicyLedger(repoRoot, targetId), editBudget)
720
810
  : editBudget;
721
811
  const calibrationNote = await summarizeCalibration(repoRoot, targetId);
722
- evolution = await runEvolvingAgent({
812
+ evolution = await runEvolvingStepIfObservedGreen({
723
813
  repoRoot,
724
814
  episodeId,
725
815
  targetId,
@@ -728,11 +818,12 @@ export async function resumeEpisode(opts) {
728
818
  spawn: opts.spawn,
729
819
  ...(opts.agentTimeoutMs !== undefined ? { timeoutMs: opts.agentTimeoutMs } : {}),
730
820
  ...(opts.harness ? { harness: opts.harness } : {}),
821
+ markEvolving: true,
731
822
  });
732
823
  }
733
824
  await closeEpisodeBestEffort(repoRoot, episodeId);
734
825
  }
735
- else if (stage === 'rolled-back' || stage === 'kept') {
826
+ else if (stage === 'rolled-back' || stage === 'kept' || stage === 'evolving') {
736
827
  // The decision already ran (and the original episode settled the prediction);
737
828
  // re-settle idempotently for the crash window, then schedule + calibrate.
738
829
  try {
@@ -741,11 +832,16 @@ export async function resumeEpisode(opts) {
741
832
  catch {
742
833
  // best-effort: advisory only
743
834
  }
744
- const scheduledBudget = stage === 'rolled-back'
835
+ // Resuming from 'evolving' means the decision is in history (not the resume
836
+ // stage); read it from stageHistory so the 步长 schedule still shrinks after a
837
+ // rollback. Resuming from 'rolled-back'/'kept' uses the resume stage directly.
838
+ const wasRolledBack = stage === 'rolled-back' ||
839
+ (stage === 'evolving' && ep.stageHistory.some((h) => h.stage === 'rolled-back'));
840
+ const scheduledBudget = wasRolledBack
745
841
  ? scheduledEditBudget(await readPolicyLedger(repoRoot, targetId), editBudget)
746
842
  : editBudget;
747
843
  const calibrationNote = await summarizeCalibration(repoRoot, targetId);
748
- evolution = await runEvolvingAgent({
844
+ evolution = await runEvolvingStepIfObservedGreen({
749
845
  repoRoot,
750
846
  episodeId,
751
847
  targetId,
@@ -754,16 +850,13 @@ export async function resumeEpisode(opts) {
754
850
  spawn: opts.spawn,
755
851
  ...(opts.agentTimeoutMs !== undefined ? { timeoutMs: opts.agentTimeoutMs } : {}),
756
852
  ...(opts.harness ? { harness: opts.harness } : {}),
853
+ markEvolving: stage !== 'evolving',
757
854
  });
758
855
  await closeEpisodeBestEffort(repoRoot, episodeId);
759
856
  }
760
- else if (stage === 'evolved' ||
761
- stage === 'evolution-refused' ||
762
- stage === 'abstained') {
763
- await closeEpisodeBestEffort(repoRoot, episodeId);
764
- }
765
- // earlier stages (and a non-auto-resumable 'errored'): not auto-resumable here
766
- // — reported as-is.
857
+ // Terminal stages (incl. a non-auto-resumable 'errored') are handled by the
858
+ // TOCTOU guard above; earlier stages are not auto-resumable here — reported
859
+ // as-is via the completion return below.
767
860
  }
768
861
  catch (err) {
769
862
  // A thrown decision/evolving step records a DURABLE terminal 'errored' stage so
@@ -790,9 +883,7 @@ export async function resumeEpisode(opts) {
790
883
  * AGENT's reader uses).
791
884
  */
792
885
  async function readDiagnosisForResume(repoRoot, episodeId) {
793
- const { promises: fs } = await import('node:fs');
794
- const pathMod = await import('node:path');
795
- const file = pathMod.join(episodeDir(repoRoot, episodeId), 'diagnosis.json');
886
+ const file = path.join(episodeDir(repoRoot, episodeId), 'diagnosis.json');
796
887
  let raw;
797
888
  try {
798
889
  raw = await fs.readFile(file, 'utf8');
@@ -57,10 +57,11 @@
57
57
  * -> (baseline-arm-captured | baseline-skipped) // CRITIC AGENT(基线智能体 baseline agent)arm
58
58
  * -> scored // 奖励智能体 REWARD AGENT wrote diagnosis.json
59
59
  * -> (rolled-back | kept) // rollback decision on the main arm's edits
60
+ * -> evolving // 演进智能体 EVOLVING AGENT holds the in-flight lock
60
61
  * -> (evolved | evolution-refused | abstained) // 演进智能体 EVOLVING AGENT outcome
61
62
  * -> closed // terminal
62
63
  *
63
- * (rolled-back | kept)
64
+ * (rolled-back | kept | evolving)
64
65
  * -> closed // terminal — see below
65
66
  *
66
67
  * (any non-terminal stage)
@@ -73,12 +74,23 @@
73
74
  * 弃权 abstains when no nameable gap → no rollback decision needed → the
74
75
  * 演进智能体 EVOLVING AGENT is never spawned.
75
76
  *
76
- * `rolled-back`/`kept` may also reach `closed` DIRECTLY: when the 演进智能体
77
- * EVOLVING AGENT was NOT spawned (its diagnosis abstained-after-gap-check, named
78
- * no gaps, or the target resolved to no editable local files) the episode never
79
- * advances past the decision, so the orchestrator's best-effort close terminates
80
- * the finished-nothing-to-evolve episode rather than leaving it resting forever
81
- * at a non-terminal stage.
77
+ * `evolving` is advanced by the orchestrator BEFORE it spawns the 演进智能体
78
+ * EVOLVING AGENT, while that agent holds the in-flight lock. It exists so a
79
+ * concurrent sibling reading the store distinguishes a LIVE-but-slow holder
80
+ * (stage `evolving`) from an episode that merely reached the decision (`kept`/
81
+ * `rolled-back`) — without it the stage stays `kept` for the whole evolving
82
+ * spawn, and a sibling can misread a running holder as stale. The
83
+ * `evolvingHeartbeatAt` field records when the stage was entered. Old episode
84
+ * records that predate this stage never carry it; they resume exactly as before
85
+ * (the `rolled-back`/`kept` → outcome transitions are retained for them).
86
+ *
87
+ * `rolled-back`/`kept`/`evolving` may also reach `closed` DIRECTLY: when the
88
+ * 演进智能体 EVOLVING AGENT was NOT spawned (its diagnosis abstained-after-gap-
89
+ * check, named no gaps, or the target resolved to no editable local files) the
90
+ * episode never advances past the decision (it stays `kept`/`rolled-back`, or —
91
+ * for the not-spawned-after-evolving-marker case — `evolving`), so the
92
+ * orchestrator's best-effort close terminates the finished-nothing-to-evolve
93
+ * episode rather than leaving it resting forever at a non-terminal stage.
82
94
  *
83
95
  * `errored` is a SECOND terminal stage reachable from EVERY non-terminal stage.
84
96
  * A thrown step — an agent spawn that crashes or times out (主智能体 MAIN AGENT /
@@ -92,10 +104,10 @@
92
104
  * `errored` is terminal for every target EXCEPT an operator-driven resume: a
93
105
  * transient cause (a one-off git/analyzer/agent timeout) is retryable, so an
94
106
  * `episode resume` may RE-DRIVE an errored episode back to its last good
95
- * pre-error stage — `errored -> {scored, rolled-back, kept}` (the resume-entry
96
- * stages). No other caller may leave `errored`.
107
+ * pre-error stage — `errored -> {scored, rolled-back, kept, evolving}` (the
108
+ * resume-entry stages). No other caller may leave `errored`.
97
109
  */
98
- export type EpisodeStage = 'created' | 'main-arm-captured' | 'baseline-arm-captured' | 'baseline-skipped' | 'scored' | 'rolled-back' | 'kept' | 'evolved' | 'evolution-refused' | 'abstained' | 'closed' | 'errored';
110
+ export type EpisodeStage = 'created' | 'main-arm-captured' | 'baseline-arm-captured' | 'baseline-skipped' | 'scored' | 'rolled-back' | 'kept' | 'evolving' | 'evolved' | 'evolution-refused' | 'abstained' | 'closed' | 'errored';
99
111
  /**
100
112
  * Iterable list of every legal {@link EpisodeStage} value. Order follows the
101
113
  * documented state machine for readability, not behavior.
@@ -143,6 +155,14 @@ export interface EpisodeRecord {
143
155
  stageHistory: EpisodeStageHistoryEntry[];
144
156
  /** Why the baseline arm was skipped (set with stage `baseline-skipped`). */
145
157
  baselineSkippedReason?: string;
158
+ /**
159
+ * ISO 8601 UTC timestamp the episode entered the `evolving` stage (the moment
160
+ * the 演进智能体 EVOLVING AGENT spawn began holding the in-flight lock). A
161
+ * heartbeat for liveness reads — a concurrent sibling can tell a recently-
162
+ * entered `evolving` holder apart from one that genuinely wedged. Absent on
163
+ * old records (and on every stage before `evolving`).
164
+ */
165
+ evolvingHeartbeatAt?: string;
146
166
  /** advantage = reward(主臂) − reward(基线臂); null when the 奖励智能体 REWARD AGENT 弃权 abstained. */
147
167
  advantage?: number | null;
148
168
  /**
@@ -163,6 +183,8 @@ export interface EpisodeStagePatch {
163
183
  advantage?: number | null;
164
184
  /** Cause note merged alongside the terminal `errored` stage. */
165
185
  terminalError?: string;
186
+ /** Heartbeat timestamp merged alongside the `evolving` stage. */
187
+ evolvingHeartbeatAt?: string;
166
188
  }
167
189
  /**
168
190
  * True iff `(from -> to)` is a legal transition in the episode stage machine.
@@ -234,7 +256,8 @@ export interface AdvanceEpisodeStageOptions {
234
256
  * advancing to a stage not reachable from the current one throws.
235
257
  * - Appends `{stage, at}` to `stageHistory`.
236
258
  * - Merges the allowlisted `patch` fields (`policyVersionBaseline`,
237
- * `baselineSkippedReason`, `advantage`, `terminalError`) in the same write.
259
+ * `baselineSkippedReason`, `advantage`, `terminalError`, `evolvingHeartbeatAt`)
260
+ * in the same write.
238
261
  * - Bumps `updatedAt`.
239
262
  */
240
263
  export declare function advanceEpisodeStage(opts: AdvanceEpisodeStageOptions): Promise<EpisodeRecord>;
@@ -61,6 +61,7 @@ export const EPISODE_STAGES = [
61
61
  'scored',
62
62
  'rolled-back',
63
63
  'kept',
64
+ 'evolving',
64
65
  'evolved',
65
66
  'evolution-refused',
66
67
  'abstained',
@@ -77,8 +78,9 @@ const EPISODE_ID_PATTERN = /^[a-z0-9][a-z0-9-]*$/;
77
78
  // step: agent spawn crash/timeout or un-repairable gate), so a failed episode
78
79
  // is never orphaned mid-flight. `closed` and `errored` are the two terminals;
79
80
  // `errored` is terminal EXCEPT for an operator resume re-drive back to its last
80
- // good pre-error stage (scored/rolled-back/kept). `rolled-back`/`kept` may also
81
- // close directly (the not-spawned 演进智能体 finished-nothing-to-evolve case).
81
+ // good pre-error stage (scored/rolled-back/kept/evolving). `rolled-back`/`kept`/
82
+ // `evolving` may also close directly (the not-spawned 演进智能体
83
+ // finished-nothing-to-evolve case).
82
84
  const LEGAL_STAGE_TRANSITIONS = new Map([
83
85
  ['created', new Set(['main-arm-captured', 'errored'])],
84
86
  [
@@ -91,15 +93,38 @@ const LEGAL_STAGE_TRANSITIONS = new Map([
91
93
  // abstained, so no rollback decision is needed and the 演进智能体
92
94
  // EVOLVING AGENT is never spawned.
93
95
  ['scored', new Set(['rolled-back', 'kept', 'abstained', 'errored'])],
94
- // 'rolled-back'/'kept' may also reach 'closed' DIRECTLY when the 演进智能体
95
- // EVOLVING AGENT was not-spawned (the finished-nothing-to-evolve case), so the
96
- // episode never rests forever at a non-terminal stage.
96
+ // 'rolled-back'/'kept' advance to 'evolving' BEFORE the 演进智能体 EVOLVING
97
+ // AGENT spawn (so a sibling can tell a live holder from a stale lock). They
98
+ // also retain the DIRECT transitions to the evolving outcomes + 'closed' so
99
+ // (a) an OLD episode record resumed from 'rolled-back'/'kept' (no 'evolving'
100
+ // stage) behaves exactly as before, and (b) the not-spawned
101
+ // finished-nothing-to-evolve case can still close directly.
97
102
  [
98
103
  'rolled-back',
99
- new Set(['evolved', 'evolution-refused', 'abstained', 'closed', 'errored']),
104
+ new Set([
105
+ 'evolving',
106
+ 'evolved',
107
+ 'evolution-refused',
108
+ 'abstained',
109
+ 'closed',
110
+ 'errored',
111
+ ]),
100
112
  ],
101
113
  [
102
114
  'kept',
115
+ new Set([
116
+ 'evolving',
117
+ 'evolved',
118
+ 'evolution-refused',
119
+ 'abstained',
120
+ 'closed',
121
+ 'errored',
122
+ ]),
123
+ ],
124
+ // From 'evolving': the 演进智能体 EVOLVING AGENT outcome (or a not-spawned
125
+ // close), or 'errored' when the spawn/gate throws.
126
+ [
127
+ 'evolving',
103
128
  new Set(['evolved', 'evolution-refused', 'abstained', 'closed', 'errored']),
104
129
  ],
105
130
  ['evolved', new Set(['closed'])],
@@ -107,8 +132,9 @@ const LEGAL_STAGE_TRANSITIONS = new Map([
107
132
  ['abstained', new Set(['closed'])],
108
133
  ['closed', new Set()],
109
134
  // 'errored' is terminal EXCEPT for an operator resume re-drive back to the
110
- // last good pre-error stage (scored/rolled-back/kept); no other caller leaves it.
111
- ['errored', new Set(['scored', 'rolled-back', 'kept'])],
135
+ // last good pre-error stage (scored/rolled-back/kept/evolving); no other
136
+ // caller leaves it.
137
+ ['errored', new Set(['scored', 'rolled-back', 'kept', 'evolving'])],
112
138
  ]);
113
139
  /**
114
140
  * True iff `(from -> to)` is a legal transition in the episode stage machine.
@@ -387,6 +413,7 @@ const ALLOWED_PATCH_KEYS = new Set([
387
413
  'baselineSkippedReason',
388
414
  'advantage',
389
415
  'terminalError',
416
+ 'evolvingHeartbeatAt',
390
417
  ]);
391
418
  /** Validate an {@link EpisodeStagePatch} fail-closed; returns the merge slice. */
392
419
  function validateStagePatch(patch, episodeId) {
@@ -394,7 +421,7 @@ function validateStagePatch(patch, episodeId) {
394
421
  for (const key of Object.keys(patch)) {
395
422
  if (!ALLOWED_PATCH_KEYS.has(key)) {
396
423
  throw new Error(`Illegal episode patch field for ${episodeId}: "${key}" ` +
397
- `(allowed: policyVersionBaseline, baselineSkippedReason, advantage, terminalError)`);
424
+ `(allowed: policyVersionBaseline, baselineSkippedReason, advantage, terminalError, evolvingHeartbeatAt)`);
398
425
  }
399
426
  }
400
427
  if ('policyVersionBaseline' in patch) {
@@ -425,6 +452,13 @@ function validateStagePatch(patch, episodeId) {
425
452
  }
426
453
  merge.terminalError = v;
427
454
  }
455
+ if ('evolvingHeartbeatAt' in patch) {
456
+ const v = patch.evolvingHeartbeatAt;
457
+ if (typeof v !== 'string' || v.length === 0) {
458
+ throw new Error(`Invalid patch for ${episodeId}: evolvingHeartbeatAt must be a non-empty string`);
459
+ }
460
+ merge.evolvingHeartbeatAt = v;
461
+ }
428
462
  return merge;
429
463
  }
430
464
  /**
@@ -436,7 +470,8 @@ function validateStagePatch(patch, episodeId) {
436
470
  * advancing to a stage not reachable from the current one throws.
437
471
  * - Appends `{stage, at}` to `stageHistory`.
438
472
  * - Merges the allowlisted `patch` fields (`policyVersionBaseline`,
439
- * `baselineSkippedReason`, `advantage`, `terminalError`) in the same write.
473
+ * `baselineSkippedReason`, `advantage`, `terminalError`, `evolvingHeartbeatAt`)
474
+ * in the same write.
440
475
  * - Bumps `updatedAt`.
441
476
  */
442
477
  export async function advanceEpisodeStage(opts) {