synergyspec-selfevolving 2.1.2 → 2.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/learn.js +13 -3
- package/dist/commands/self-evolution-episode.d.ts +6 -1
- package/dist/commands/self-evolution-episode.js +8 -1
- package/dist/commands/self-evolution.d.ts +2 -2
- package/dist/commands/self-evolution.js +10 -10
- package/dist/commands/workflow/status.js +5 -0
- package/dist/core/change-readiness.d.ts +1 -1
- package/dist/core/change-readiness.js +13 -5
- package/dist/core/fitness/test-metrics.d.ts +33 -0
- package/dist/core/fitness/test-metrics.js +67 -0
- package/dist/core/learn.js +11 -2
- package/dist/core/project-config.d.ts +3 -0
- package/dist/core/project-config.js +7 -1
- package/dist/core/self-evolution/critic-agent.js +13 -5
- package/dist/core/self-evolution/edits-contract.d.ts +15 -5
- package/dist/core/self-evolution/edits-contract.js +26 -16
- package/dist/core/self-evolution/episode-orchestrator.d.ts +11 -6
- package/dist/core/self-evolution/episode-orchestrator.js +88 -24
- package/dist/core/self-evolution/episode-store.d.ts +34 -11
- package/dist/core/self-evolution/episode-store.js +45 -10
- package/dist/core/self-evolution/evolving-agent.d.ts +4 -4
- package/dist/core/self-evolution/evolving-agent.js +26 -26
- package/dist/core/self-evolution/host-harness.d.ts +68 -2
- package/dist/core/self-evolution/host-harness.js +208 -21
- package/dist/core/self-evolution/policy/policy-store.d.ts +8 -6
- package/dist/core/self-evolution/policy/policy-store.js +124 -24
- package/dist/core/self-evolution/proposer-slice.d.ts +4 -3
- package/dist/core/self-evolution/reward-agent.d.ts +11 -1
- package/dist/core/self-evolution/reward-agent.js +53 -20
- package/dist/core/self-evolution/reward-aggregator.d.ts +18 -0
- package/dist/core/self-evolution/reward-aggregator.js +53 -3
- package/dist/core/self-evolution/reward-deepread.d.ts +64 -0
- package/dist/core/self-evolution/reward-deepread.js +112 -0
- package/dist/core/templates/workflows/learn.js +2 -1
- package/dist/core/templates/workflows/self-evolving.js +5 -2
- package/dist/core/trajectory/facts.d.ts +69 -2
- package/dist/core/trajectory/facts.js +179 -10
- package/dist/core/trajectory/skeleton.d.ts +10 -0
- package/dist/core/trajectory/skeleton.js +24 -3
- package/package.json +1 -1
|
@@ -431,11 +431,21 @@ async function runEpisodeAfterCreate(opts) {
|
|
|
431
431
|
// runEvolvingAgent reads the reject-buffer FRESH from disk (the entry just
|
|
432
432
|
// written THIS episode is in its prompt). Never parallelized with (f).
|
|
433
433
|
// 步长: after a rollback, shrink the edit budget (smaller step after a step
|
|
434
|
-
// that lost ground). 预测校准: pass the
|
|
434
|
+
// that lost ground). 预测校准: pass the 演进智能体 EVOLVING AGENT's recent
|
|
435
|
+
// prediction record.
|
|
435
436
|
const scheduledBudget = decision === 'rolled-back'
|
|
436
437
|
? scheduledEditBudget(await readPolicyLedger(repoRoot, targetId), editBudget)
|
|
437
438
|
: editBudget;
|
|
438
439
|
const calibrationNote = await summarizeCalibration(repoRoot, targetId);
|
|
440
|
+
// Advance 'evolving' (with a heartbeat) BEFORE the spawn so a concurrent
|
|
441
|
+
// sibling reading the store sees a LIVE-but-slow holder, not a stale lock at
|
|
442
|
+
// 'kept'/'rolled-back'. runEvolvingAgent advances the terminal outcome.
|
|
443
|
+
await advanceEpisodeStage({
|
|
444
|
+
repoRoot,
|
|
445
|
+
episodeId,
|
|
446
|
+
stage: 'evolving',
|
|
447
|
+
patch: { evolvingHeartbeatAt: new Date().toISOString() },
|
|
448
|
+
});
|
|
439
449
|
evolution = await runEvolvingAgent({
|
|
440
450
|
repoRoot,
|
|
441
451
|
episodeId,
|
|
@@ -477,16 +487,19 @@ async function runEpisodeAfterCreate(opts) {
|
|
|
477
487
|
* Closable stages:
|
|
478
488
|
* - evolved | evolution-refused | abstained — the 演进智能体 EVOLVING AGENT
|
|
479
489
|
* reached a definite outcome (or the judge 弃权 abstained), the normal close.
|
|
480
|
-
* -
|
|
490
|
+
* - evolving — the 演进智能体 returned not-spawned (its diagnosis
|
|
481
491
|
* abstained-after-gap-check, no gaps, or the target resolved to no editable
|
|
482
|
-
* local files), so the episode never advanced past the
|
|
483
|
-
* this runs (AFTER runEvolvingAgent returned), a stage still at
|
|
484
|
-
* '
|
|
492
|
+
* local files), so the episode never advanced past the 'evolving' marker.
|
|
493
|
+
* By the time this runs (AFTER runEvolvingAgent returned), a stage still at
|
|
494
|
+
* 'evolving' can ONLY mean not-spawned — a success advances 'evolved', a
|
|
485
495
|
* refusal advances 'evolution-refused', and a throw is caught upstream and
|
|
486
496
|
* records 'errored' + rethrows so this close is never reached. So a leftover
|
|
487
|
-
*
|
|
488
|
-
*
|
|
489
|
-
*
|
|
497
|
+
* 'evolving' at close time IS the finished-nothing-to-evolve case and must
|
|
498
|
+
* close, not rest forever at a non-terminal stage (the exact ambiguity the
|
|
499
|
+
* 'errored' stage was meant to remove).
|
|
500
|
+
* - kept | rolled-back — retained for back-compat: an OLD episode record (or a
|
|
501
|
+
* code path that did not advance the 'evolving' marker) that returned
|
|
502
|
+
* not-spawned never advances past the decision; close it the same way.
|
|
490
503
|
*
|
|
491
504
|
* Any other (genuinely non-closable) stage is left as-is rather than throwing, so
|
|
492
505
|
* the close never masks the real episode outcome.
|
|
@@ -497,7 +510,10 @@ async function closeEpisodeBestEffort(repoRoot, episodeId) {
|
|
|
497
510
|
'evolved',
|
|
498
511
|
'evolution-refused',
|
|
499
512
|
'abstained',
|
|
500
|
-
// not-spawned 演进智能体 leaves the episode
|
|
513
|
+
// not-spawned 演进智能体 leaves the episode at the 'evolving' marker — close
|
|
514
|
+
// the finished episode. 'kept'/'rolled-back' retained for back-compat with
|
|
515
|
+
// an old record / a path that never advanced the marker.
|
|
516
|
+
'evolving',
|
|
501
517
|
'kept',
|
|
502
518
|
'rolled-back',
|
|
503
519
|
]);
|
|
@@ -586,14 +602,16 @@ async function ensureRejectBufferEntry(repoRoot, opts) {
|
|
|
586
602
|
* done step rather than re-advancing a stage already entered:
|
|
587
603
|
*
|
|
588
604
|
* - 'scored' → run the decision (f) then the 演进智能体 (g).
|
|
589
|
-
* - 'rolled-back' / 'kept'
|
|
605
|
+
* - 'rolled-back' / 'kept' / 'evolving' → run the 演进智能体 EVOLVING AGENT (g)
|
|
606
|
+
* then close. ('evolving' means a crash AFTER the
|
|
607
|
+
* marker but before the agent settled an outcome.)
|
|
590
608
|
* - 'evolved'/'evolution-refused'/'abstained' → close.
|
|
591
609
|
* - 'errored' → RE-DRIVE from the last GOOD pre-error stage
|
|
592
610
|
* (an episode may have errored on a TRANSIENT
|
|
593
611
|
* cause — a one-off git/analyzer/agent timeout).
|
|
594
612
|
* The pre-error stage is the last `stageHistory`
|
|
595
613
|
* entry that is NOT 'errored'; when it is one of
|
|
596
|
-
* {'scored','rolled-back','kept'} (the
|
|
614
|
+
* {'scored','rolled-back','kept','evolving'} (the
|
|
597
615
|
* resume-entry stages) we advance errored → that
|
|
598
616
|
* stage and fall through to the normal dispatch.
|
|
599
617
|
* Otherwise the pre-error stage is not
|
|
@@ -617,7 +635,7 @@ export async function resumeEpisode(opts) {
|
|
|
617
635
|
// for an 'errored' episode we attempt to RE-DRIVE from the last good pre-error
|
|
618
636
|
// stage (a transient git/analyzer/agent failure should be retryable via an
|
|
619
637
|
// operator resume). 'errored' stays terminal for every OTHER caller — only this
|
|
620
|
-
// resume path may re-drive it, via the errored → {scored,rolled-back,kept}
|
|
638
|
+
// resume path may re-drive it, via the errored → {scored,rolled-back,kept,evolving}
|
|
621
639
|
// transitions the stage machine allows ONLY for operator recovery.
|
|
622
640
|
let stage = ep.stage;
|
|
623
641
|
if (ep.stage === 'errored') {
|
|
@@ -626,7 +644,8 @@ export async function resumeEpisode(opts) {
|
|
|
626
644
|
.find((h) => h.stage !== 'errored')?.stage;
|
|
627
645
|
if (preError === 'scored' ||
|
|
628
646
|
preError === 'rolled-back' ||
|
|
629
|
-
preError === 'kept'
|
|
647
|
+
preError === 'kept' ||
|
|
648
|
+
preError === 'evolving') {
|
|
630
649
|
// Re-open the errored episode at its last auto-resumable stage, then fall
|
|
631
650
|
// through to the normal dispatch for that stage.
|
|
632
651
|
await advanceEpisodeStage({ repoRoot, episodeId, stage: preError });
|
|
@@ -636,7 +655,7 @@ export async function resumeEpisode(opts) {
|
|
|
636
655
|
// 'baseline-skipped'); leave the episode at 'errored' and report it as-is.
|
|
637
656
|
}
|
|
638
657
|
// The decision (f) + 演进智能体 EVOLVING AGENT (g) re-runs below can THROW — a
|
|
639
|
-
// wedged/crashed host CLI (
|
|
658
|
+
// wedged/crashed host CLI (EvolvingAgentInvocationError), a timeout, or an
|
|
640
659
|
// observed-GREEN gate throw. UNCAUGHT, that leaves the episode DURABLY stuck at
|
|
641
660
|
// a non-terminal stage ('scored'/'rolled-back'/'kept'/'evolving' — the orphan state fix ❷
|
|
642
661
|
// eliminates for runEpisode). Record the SAME terminal 'errored' stage here
|
|
@@ -644,7 +663,31 @@ export async function resumeEpisode(opts) {
|
|
|
644
663
|
// re-throw. Resume holds NO in-flight lock, so this is a durable-stage fix, not
|
|
645
664
|
// a leak fix. Best-effort write: a failed record must not mask the original throw.
|
|
646
665
|
try {
|
|
647
|
-
|
|
666
|
+
// TOCTOU guard: resume read the stage at entry (~L945), but it holds NO in-flight
|
|
667
|
+
// lock, so a CONCURRENT runEpisode for the same target can advance THIS episode to
|
|
668
|
+
// a TERMINAL stage between that read and the transitions below. Re-read the episode
|
|
669
|
+
// immediately before dispatching; if it is already finished, the transitions would
|
|
670
|
+
// throw an illegal-transition error (which the catch below would then mis-record as
|
|
671
|
+
// a fresh 'errored'). Short-circuit instead: report the already-finished episode
|
|
672
|
+
// via the normal completion return. (The errored→pre-error re-drive above already
|
|
673
|
+
// turned a re-drivable 'errored' into a non-terminal stage, so a stage that is
|
|
674
|
+
// STILL terminal here is genuinely finished, not auto-resumable.)
|
|
675
|
+
const TERMINAL_STAGES = new Set([
|
|
676
|
+
'closed',
|
|
677
|
+
'errored',
|
|
678
|
+
'evolution-refused',
|
|
679
|
+
'evolved',
|
|
680
|
+
'abstained',
|
|
681
|
+
]);
|
|
682
|
+
const fresh = await readEpisode(repoRoot, episodeId);
|
|
683
|
+
stage = fresh.stage;
|
|
684
|
+
if (TERMINAL_STAGES.has(stage)) {
|
|
685
|
+
// 'evolved'/'evolution-refused'/'abstained' still want their best-effort close;
|
|
686
|
+
// 'closed'/'errored' are no-ops for closeEpisodeBestEffort. No transition is
|
|
687
|
+
// attempted, so the race cannot surface as an illegal-transition throw.
|
|
688
|
+
await closeEpisodeBestEffort(repoRoot, episodeId);
|
|
689
|
+
}
|
|
690
|
+
else if (stage === 'scored') {
|
|
648
691
|
// Re-run the decision (f) from the on-disk diagnosis, then (g).
|
|
649
692
|
const diagnosis = await readDiagnosisForResume(repoRoot, episodeId);
|
|
650
693
|
if (shouldSkipEvolution(diagnosis)) {
|
|
@@ -719,6 +762,15 @@ export async function resumeEpisode(opts) {
|
|
|
719
762
|
? scheduledEditBudget(await readPolicyLedger(repoRoot, targetId), editBudget)
|
|
720
763
|
: editBudget;
|
|
721
764
|
const calibrationNote = await summarizeCalibration(repoRoot, targetId);
|
|
765
|
+
// Advance the 'evolving' marker (heartbeat) before the spawn, mirroring
|
|
766
|
+
// runEpisode's (g). Idempotent across a crash-resume: 'evolving' is reached
|
|
767
|
+
// from both 'rolled-back' and 'kept'.
|
|
768
|
+
await advanceEpisodeStage({
|
|
769
|
+
repoRoot,
|
|
770
|
+
episodeId,
|
|
771
|
+
stage: 'evolving',
|
|
772
|
+
patch: { evolvingHeartbeatAt: new Date().toISOString() },
|
|
773
|
+
});
|
|
722
774
|
evolution = await runEvolvingAgent({
|
|
723
775
|
repoRoot,
|
|
724
776
|
episodeId,
|
|
@@ -732,7 +784,7 @@ export async function resumeEpisode(opts) {
|
|
|
732
784
|
}
|
|
733
785
|
await closeEpisodeBestEffort(repoRoot, episodeId);
|
|
734
786
|
}
|
|
735
|
-
else if (stage === 'rolled-back' || stage === 'kept') {
|
|
787
|
+
else if (stage === 'rolled-back' || stage === 'kept' || stage === 'evolving') {
|
|
736
788
|
// The decision already ran (and the original episode settled the prediction);
|
|
737
789
|
// re-settle idempotently for the crash window, then schedule + calibrate.
|
|
738
790
|
try {
|
|
@@ -741,10 +793,26 @@ export async function resumeEpisode(opts) {
|
|
|
741
793
|
catch {
|
|
742
794
|
// best-effort: advisory only
|
|
743
795
|
}
|
|
744
|
-
|
|
796
|
+
// Resuming from 'evolving' means the decision is in history (not the resume
|
|
797
|
+
// stage); read it from stageHistory so the 步长 schedule still shrinks after a
|
|
798
|
+
// rollback. Resuming from 'rolled-back'/'kept' uses the resume stage directly.
|
|
799
|
+
const wasRolledBack = stage === 'rolled-back' ||
|
|
800
|
+
(stage === 'evolving' && ep.stageHistory.some((h) => h.stage === 'rolled-back'));
|
|
801
|
+
const scheduledBudget = wasRolledBack
|
|
745
802
|
? scheduledEditBudget(await readPolicyLedger(repoRoot, targetId), editBudget)
|
|
746
803
|
: editBudget;
|
|
747
804
|
const calibrationNote = await summarizeCalibration(repoRoot, targetId);
|
|
805
|
+
// Advance the 'evolving' marker before the spawn when resuming from the
|
|
806
|
+
// decision stage. When already at 'evolving' (a crash mid-spawn re-drive),
|
|
807
|
+
// the marker is present — skip the (now illegal) self-transition.
|
|
808
|
+
if (stage !== 'evolving') {
|
|
809
|
+
await advanceEpisodeStage({
|
|
810
|
+
repoRoot,
|
|
811
|
+
episodeId,
|
|
812
|
+
stage: 'evolving',
|
|
813
|
+
patch: { evolvingHeartbeatAt: new Date().toISOString() },
|
|
814
|
+
});
|
|
815
|
+
}
|
|
748
816
|
evolution = await runEvolvingAgent({
|
|
749
817
|
repoRoot,
|
|
750
818
|
episodeId,
|
|
@@ -757,13 +825,9 @@ export async function resumeEpisode(opts) {
|
|
|
757
825
|
});
|
|
758
826
|
await closeEpisodeBestEffort(repoRoot, episodeId);
|
|
759
827
|
}
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
await closeEpisodeBestEffort(repoRoot, episodeId);
|
|
764
|
-
}
|
|
765
|
-
// earlier stages (and a non-auto-resumable 'errored'): not auto-resumable here
|
|
766
|
-
// — reported as-is.
|
|
828
|
+
// Terminal stages (incl. a non-auto-resumable 'errored') are handled by the
|
|
829
|
+
// TOCTOU guard above; earlier stages are not auto-resumable here — reported
|
|
830
|
+
// as-is via the completion return below.
|
|
767
831
|
}
|
|
768
832
|
catch (err) {
|
|
769
833
|
// A thrown decision/evolving step records a DURABLE terminal 'errored' stage so
|
|
@@ -57,10 +57,11 @@
|
|
|
57
57
|
* -> (baseline-arm-captured | baseline-skipped) // CRITIC AGENT(基线智能体 baseline agent)arm
|
|
58
58
|
* -> scored // 奖励智能体 REWARD AGENT wrote diagnosis.json
|
|
59
59
|
* -> (rolled-back | kept) // rollback decision on the main arm's edits
|
|
60
|
+
* -> evolving // 演进智能体 EVOLVING AGENT holds the in-flight lock
|
|
60
61
|
* -> (evolved | evolution-refused | abstained) // 演进智能体 EVOLVING AGENT outcome
|
|
61
62
|
* -> closed // terminal
|
|
62
63
|
*
|
|
63
|
-
* (rolled-back | kept)
|
|
64
|
+
* (rolled-back | kept | evolving)
|
|
64
65
|
* -> closed // terminal — see below
|
|
65
66
|
*
|
|
66
67
|
* (any non-terminal stage)
|
|
@@ -73,12 +74,23 @@
|
|
|
73
74
|
* 弃权 abstains when no nameable gap → no rollback decision needed → the
|
|
74
75
|
* 演进智能体 EVOLVING AGENT is never spawned.
|
|
75
76
|
*
|
|
76
|
-
* `
|
|
77
|
-
* EVOLVING AGENT
|
|
78
|
-
*
|
|
79
|
-
*
|
|
80
|
-
* the
|
|
81
|
-
*
|
|
77
|
+
* `evolving` is advanced by the orchestrator BEFORE it spawns the 演进智能体
|
|
78
|
+
* EVOLVING AGENT, while that agent holds the in-flight lock. It exists so a
|
|
79
|
+
* concurrent sibling reading the store distinguishes a LIVE-but-slow holder
|
|
80
|
+
* (stage `evolving`) from an episode that merely reached the decision (`kept`/
|
|
81
|
+
* `rolled-back`) — without it the stage stays `kept` for the whole evolving
|
|
82
|
+
* spawn, and a sibling can misread a running holder as stale. The
|
|
83
|
+
* `evolvingHeartbeatAt` field records when the stage was entered. Old episode
|
|
84
|
+
* records that predate this stage never carry it; they resume exactly as before
|
|
85
|
+
* (the `rolled-back`/`kept` → outcome transitions are retained for them).
|
|
86
|
+
*
|
|
87
|
+
* `rolled-back`/`kept`/`evolving` may also reach `closed` DIRECTLY: when the
|
|
88
|
+
* 演进智能体 EVOLVING AGENT was NOT spawned (its diagnosis abstained-after-gap-
|
|
89
|
+
* check, named no gaps, or the target resolved to no editable local files) the
|
|
90
|
+
* episode never advances past the decision (it stays `kept`/`rolled-back`, or —
|
|
91
|
+
* for the not-spawned-after-evolving-marker case — `evolving`), so the
|
|
92
|
+
* orchestrator's best-effort close terminates the finished-nothing-to-evolve
|
|
93
|
+
* episode rather than leaving it resting forever at a non-terminal stage.
|
|
82
94
|
*
|
|
83
95
|
* `errored` is a SECOND terminal stage reachable from EVERY non-terminal stage.
|
|
84
96
|
* A thrown step — an agent spawn that crashes or times out (主智能体 MAIN AGENT /
|
|
@@ -92,10 +104,10 @@
|
|
|
92
104
|
* `errored` is terminal for every target EXCEPT an operator-driven resume: a
|
|
93
105
|
* transient cause (a one-off git/analyzer/agent timeout) is retryable, so an
|
|
94
106
|
* `episode resume` may RE-DRIVE an errored episode back to its last good
|
|
95
|
-
* pre-error stage — `errored -> {scored, rolled-back, kept}` (the
|
|
96
|
-
* stages). No other caller may leave `errored`.
|
|
107
|
+
* pre-error stage — `errored -> {scored, rolled-back, kept, evolving}` (the
|
|
108
|
+
* resume-entry stages). No other caller may leave `errored`.
|
|
97
109
|
*/
|
|
98
|
-
export type EpisodeStage = 'created' | 'main-arm-captured' | 'baseline-arm-captured' | 'baseline-skipped' | 'scored' | 'rolled-back' | 'kept' | 'evolved' | 'evolution-refused' | 'abstained' | 'closed' | 'errored';
|
|
110
|
+
export type EpisodeStage = 'created' | 'main-arm-captured' | 'baseline-arm-captured' | 'baseline-skipped' | 'scored' | 'rolled-back' | 'kept' | 'evolving' | 'evolved' | 'evolution-refused' | 'abstained' | 'closed' | 'errored';
|
|
99
111
|
/**
|
|
100
112
|
* Iterable list of every legal {@link EpisodeStage} value. Order follows the
|
|
101
113
|
* documented state machine for readability, not behavior.
|
|
@@ -143,6 +155,14 @@ export interface EpisodeRecord {
|
|
|
143
155
|
stageHistory: EpisodeStageHistoryEntry[];
|
|
144
156
|
/** Why the baseline arm was skipped (set with stage `baseline-skipped`). */
|
|
145
157
|
baselineSkippedReason?: string;
|
|
158
|
+
/**
|
|
159
|
+
* ISO 8601 UTC timestamp the episode entered the `evolving` stage (the moment
|
|
160
|
+
* the 演进智能体 EVOLVING AGENT spawn began holding the in-flight lock). A
|
|
161
|
+
* heartbeat for liveness reads — a concurrent sibling can tell a recently-
|
|
162
|
+
* entered `evolving` holder apart from one that genuinely wedged. Absent on
|
|
163
|
+
* old records (and on every stage before `evolving`).
|
|
164
|
+
*/
|
|
165
|
+
evolvingHeartbeatAt?: string;
|
|
146
166
|
/** advantage = reward(主臂) − reward(基线臂); null when the 奖励智能体 REWARD AGENT 弃权 abstained. */
|
|
147
167
|
advantage?: number | null;
|
|
148
168
|
/**
|
|
@@ -163,6 +183,8 @@ export interface EpisodeStagePatch {
|
|
|
163
183
|
advantage?: number | null;
|
|
164
184
|
/** Cause note merged alongside the terminal `errored` stage. */
|
|
165
185
|
terminalError?: string;
|
|
186
|
+
/** Heartbeat timestamp merged alongside the `evolving` stage. */
|
|
187
|
+
evolvingHeartbeatAt?: string;
|
|
166
188
|
}
|
|
167
189
|
/**
|
|
168
190
|
* True iff `(from -> to)` is a legal transition in the episode stage machine.
|
|
@@ -234,7 +256,8 @@ export interface AdvanceEpisodeStageOptions {
|
|
|
234
256
|
* advancing to a stage not reachable from the current one throws.
|
|
235
257
|
* - Appends `{stage, at}` to `stageHistory`.
|
|
236
258
|
* - Merges the allowlisted `patch` fields (`policyVersionBaseline`,
|
|
237
|
-
* `baselineSkippedReason`, `advantage`, `terminalError`)
|
|
259
|
+
* `baselineSkippedReason`, `advantage`, `terminalError`, `evolvingHeartbeatAt`)
|
|
260
|
+
* in the same write.
|
|
238
261
|
* - Bumps `updatedAt`.
|
|
239
262
|
*/
|
|
240
263
|
export declare function advanceEpisodeStage(opts: AdvanceEpisodeStageOptions): Promise<EpisodeRecord>;
|
|
@@ -61,6 +61,7 @@ export const EPISODE_STAGES = [
|
|
|
61
61
|
'scored',
|
|
62
62
|
'rolled-back',
|
|
63
63
|
'kept',
|
|
64
|
+
'evolving',
|
|
64
65
|
'evolved',
|
|
65
66
|
'evolution-refused',
|
|
66
67
|
'abstained',
|
|
@@ -77,8 +78,9 @@ const EPISODE_ID_PATTERN = /^[a-z0-9][a-z0-9-]*$/;
|
|
|
77
78
|
// step: agent spawn crash/timeout or un-repairable gate), so a failed episode
|
|
78
79
|
// is never orphaned mid-flight. `closed` and `errored` are the two terminals;
|
|
79
80
|
// `errored` is terminal EXCEPT for an operator resume re-drive back to its last
|
|
80
|
-
// good pre-error stage (scored/rolled-back/kept). `rolled-back`/`kept
|
|
81
|
-
// close directly (the not-spawned 演进智能体
|
|
81
|
+
// good pre-error stage (scored/rolled-back/kept/evolving). `rolled-back`/`kept`/
|
|
82
|
+
// `evolving` may also close directly (the not-spawned 演进智能体
|
|
83
|
+
// finished-nothing-to-evolve case).
|
|
82
84
|
const LEGAL_STAGE_TRANSITIONS = new Map([
|
|
83
85
|
['created', new Set(['main-arm-captured', 'errored'])],
|
|
84
86
|
[
|
|
@@ -91,15 +93,38 @@ const LEGAL_STAGE_TRANSITIONS = new Map([
|
|
|
91
93
|
// abstained, so no rollback decision is needed and the 演进智能体
|
|
92
94
|
// EVOLVING AGENT is never spawned.
|
|
93
95
|
['scored', new Set(['rolled-back', 'kept', 'abstained', 'errored'])],
|
|
94
|
-
// 'rolled-back'/'kept'
|
|
95
|
-
//
|
|
96
|
-
//
|
|
96
|
+
// 'rolled-back'/'kept' advance to 'evolving' BEFORE the 演进智能体 EVOLVING
|
|
97
|
+
// AGENT spawn (so a sibling can tell a live holder from a stale lock). They
|
|
98
|
+
// also retain the DIRECT transitions to the evolving outcomes + 'closed' so
|
|
99
|
+
// (a) an OLD episode record resumed from 'rolled-back'/'kept' (no 'evolving'
|
|
100
|
+
// stage) behaves exactly as before, and (b) the not-spawned
|
|
101
|
+
// finished-nothing-to-evolve case can still close directly.
|
|
97
102
|
[
|
|
98
103
|
'rolled-back',
|
|
99
|
-
new Set([
|
|
104
|
+
new Set([
|
|
105
|
+
'evolving',
|
|
106
|
+
'evolved',
|
|
107
|
+
'evolution-refused',
|
|
108
|
+
'abstained',
|
|
109
|
+
'closed',
|
|
110
|
+
'errored',
|
|
111
|
+
]),
|
|
100
112
|
],
|
|
101
113
|
[
|
|
102
114
|
'kept',
|
|
115
|
+
new Set([
|
|
116
|
+
'evolving',
|
|
117
|
+
'evolved',
|
|
118
|
+
'evolution-refused',
|
|
119
|
+
'abstained',
|
|
120
|
+
'closed',
|
|
121
|
+
'errored',
|
|
122
|
+
]),
|
|
123
|
+
],
|
|
124
|
+
// The 演进智能体 EVOLVING AGENT outcome (or a not-spawned close), or 'errored'
|
|
125
|
+
// on a thrown spawn/gate.
|
|
126
|
+
[
|
|
127
|
+
'evolving',
|
|
103
128
|
new Set(['evolved', 'evolution-refused', 'abstained', 'closed', 'errored']),
|
|
104
129
|
],
|
|
105
130
|
['evolved', new Set(['closed'])],
|
|
@@ -107,8 +132,9 @@ const LEGAL_STAGE_TRANSITIONS = new Map([
|
|
|
107
132
|
['abstained', new Set(['closed'])],
|
|
108
133
|
['closed', new Set()],
|
|
109
134
|
// 'errored' is terminal EXCEPT for an operator resume re-drive back to the
|
|
110
|
-
// last good pre-error stage (scored/rolled-back/kept); no other
|
|
111
|
-
|
|
135
|
+
// last good pre-error stage (scored/rolled-back/kept/evolving); no other
|
|
136
|
+
// caller leaves it.
|
|
137
|
+
['errored', new Set(['scored', 'rolled-back', 'kept', 'evolving'])],
|
|
112
138
|
]);
|
|
113
139
|
/**
|
|
114
140
|
* True iff `(from -> to)` is a legal transition in the episode stage machine.
|
|
@@ -387,6 +413,7 @@ const ALLOWED_PATCH_KEYS = new Set([
|
|
|
387
413
|
'baselineSkippedReason',
|
|
388
414
|
'advantage',
|
|
389
415
|
'terminalError',
|
|
416
|
+
'evolvingHeartbeatAt',
|
|
390
417
|
]);
|
|
391
418
|
/** Validate an {@link EpisodeStagePatch} fail-closed; returns the merge slice. */
|
|
392
419
|
function validateStagePatch(patch, episodeId) {
|
|
@@ -394,7 +421,7 @@ function validateStagePatch(patch, episodeId) {
|
|
|
394
421
|
for (const key of Object.keys(patch)) {
|
|
395
422
|
if (!ALLOWED_PATCH_KEYS.has(key)) {
|
|
396
423
|
throw new Error(`Illegal episode patch field for ${episodeId}: "${key}" ` +
|
|
397
|
-
`(allowed: policyVersionBaseline, baselineSkippedReason, advantage, terminalError)`);
|
|
424
|
+
`(allowed: policyVersionBaseline, baselineSkippedReason, advantage, terminalError, evolvingHeartbeatAt)`);
|
|
398
425
|
}
|
|
399
426
|
}
|
|
400
427
|
if ('policyVersionBaseline' in patch) {
|
|
@@ -425,6 +452,13 @@ function validateStagePatch(patch, episodeId) {
|
|
|
425
452
|
}
|
|
426
453
|
merge.terminalError = v;
|
|
427
454
|
}
|
|
455
|
+
if ('evolvingHeartbeatAt' in patch) {
|
|
456
|
+
const v = patch.evolvingHeartbeatAt;
|
|
457
|
+
if (typeof v !== 'string' || v.length === 0) {
|
|
458
|
+
throw new Error(`Invalid patch for ${episodeId}: evolvingHeartbeatAt must be a non-empty string`);
|
|
459
|
+
}
|
|
460
|
+
merge.evolvingHeartbeatAt = v;
|
|
461
|
+
}
|
|
428
462
|
return merge;
|
|
429
463
|
}
|
|
430
464
|
/**
|
|
@@ -436,7 +470,8 @@ function validateStagePatch(patch, episodeId) {
|
|
|
436
470
|
* advancing to a stage not reachable from the current one throws.
|
|
437
471
|
* - Appends `{stage, at}` to `stageHistory`.
|
|
438
472
|
* - Merges the allowlisted `patch` fields (`policyVersionBaseline`,
|
|
439
|
-
* `baselineSkippedReason`, `advantage`, `terminalError`)
|
|
473
|
+
* `baselineSkippedReason`, `advantage`, `terminalError`, `evolvingHeartbeatAt`)
|
|
474
|
+
* in the same write.
|
|
440
475
|
* - Bumps `updatedAt`.
|
|
441
476
|
*/
|
|
442
477
|
export async function advanceEpisodeStage(opts) {
|
|
@@ -104,7 +104,7 @@ export interface AssembleEvolvingAgentPromptInput {
|
|
|
104
104
|
/** Pre-rendered DO-NOT-PRUNE block (成功保护). Omitted when empty. */
|
|
105
105
|
doNotPrune?: string;
|
|
106
106
|
/**
|
|
107
|
-
* One-line 预测校准 prediction-calibration note: the
|
|
107
|
+
* One-line 预测校准 prediction-calibration note: the evolving agent's recent
|
|
108
108
|
* checkable predictions' hit/miss record, settled by later measurements.
|
|
109
109
|
* Read-only context (it never scores); omitted when there is no settled
|
|
110
110
|
* prediction history, so prompts on early episodes stay byte-identical.
|
|
@@ -136,9 +136,9 @@ export type ParsedEvolvingAgentResponse = EvolvingAgentRefusal | EvolvingAgentEd
|
|
|
136
136
|
/**
|
|
137
137
|
* Parse the model's single `json:patch` block. Accepts EITHER the refusal shape
|
|
138
138
|
* (`{edits: [], refusal: string}`) OR a concrete edit (`{rationale, prediction,
|
|
139
|
-
* edits[]}`). Throws {@link
|
|
139
|
+
* edits[]}`). Throws {@link EvolvingAgentOutputInvalid} on a malformed
|
|
140
140
|
* block, the wrong block count, a missing/invalid prediction, or
|
|
141
|
-
* {@link
|
|
141
|
+
* {@link EvolvingAgentNoOp} on empty edits WITHOUT a refusal reason.
|
|
142
142
|
*
|
|
143
143
|
* Edits are NOT yet scope-validated here (the caller runs the static gate over
|
|
144
144
|
* them); this only enforces the SHAPE of the contract.
|
|
@@ -181,7 +181,7 @@ export interface RunEvolvingAgentOptions {
|
|
|
181
181
|
*/
|
|
182
182
|
exemplarPaths?: string[];
|
|
183
183
|
/**
|
|
184
|
-
* One-line 预测校准 prediction-calibration note surfaced to the
|
|
184
|
+
* One-line 预测校准 prediction-calibration note surfaced to the evolving agent
|
|
185
185
|
* (read-only, advisory). Defaults to absent; the orchestrator computes it from
|
|
186
186
|
* the prediction-reconcile ledger via `summarizeCalibration`.
|
|
187
187
|
*/
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import { promises as fs } from 'node:fs';
|
|
2
2
|
import * as path from 'node:path';
|
|
3
|
-
import { runHeadlessAgent,
|
|
3
|
+
import { runHeadlessAgent, resolveAgentTimeoutMs, } from './host-harness.js';
|
|
4
4
|
import { evaluateToolEvolutionCandidate, } from './tool-evolution.js';
|
|
5
|
-
import { validateCandidateEdits,
|
|
5
|
+
import { validateCandidateEdits, EvolvingAgentNoOp, EvolvingAgentOutputInvalid, EvolvingAgentInvocationError, renderUnifiedDiff, } from './edits-contract.js';
|
|
6
6
|
import { requireCanonicalTarget } from './canonical-targets.js';
|
|
7
7
|
import { resolveTargetLocalFiles } from './local-targets.js';
|
|
8
8
|
import { renderDoNotPruneBlock, readProtections, listExemplarFiles, } from './success-channel.js';
|
|
@@ -203,9 +203,9 @@ const PREDICTION_METRICS = new Set(['loss', 'passRate', 'healthPenalty']);
|
|
|
203
203
|
/**
|
|
204
204
|
* Parse the model's single `json:patch` block. Accepts EITHER the refusal shape
|
|
205
205
|
* (`{edits: [], refusal: string}`) OR a concrete edit (`{rationale, prediction,
|
|
206
|
-
* edits[]}`). Throws {@link
|
|
206
|
+
* edits[]}`). Throws {@link EvolvingAgentOutputInvalid} on a malformed
|
|
207
207
|
* block, the wrong block count, a missing/invalid prediction, or
|
|
208
|
-
* {@link
|
|
208
|
+
* {@link EvolvingAgentNoOp} on empty edits WITHOUT a refusal reason.
|
|
209
209
|
*
|
|
210
210
|
* Edits are NOT yet scope-validated here (the caller runs the static gate over
|
|
211
211
|
* them); this only enforces the SHAPE of the contract.
|
|
@@ -217,25 +217,25 @@ export function parseEvolvingAgentResponse(text) {
|
|
|
217
217
|
while ((m = fenceRe.exec(text)) !== null)
|
|
218
218
|
matches.push(m[1]);
|
|
219
219
|
if (matches.length === 0) {
|
|
220
|
-
throw new
|
|
220
|
+
throw new EvolvingAgentOutputInvalid('no `json:patch` fenced block found in response');
|
|
221
221
|
}
|
|
222
222
|
if (matches.length > 1) {
|
|
223
|
-
throw new
|
|
223
|
+
throw new EvolvingAgentOutputInvalid(`expected exactly 1 \`json:patch\` block, found ${matches.length}`);
|
|
224
224
|
}
|
|
225
225
|
let parsed;
|
|
226
226
|
try {
|
|
227
227
|
parsed = JSON.parse(matches[0].trim());
|
|
228
228
|
}
|
|
229
229
|
catch (err) {
|
|
230
|
-
throw new
|
|
230
|
+
throw new EvolvingAgentOutputInvalid(`failed to parse JSON inside patch block: ${err instanceof Error ? err.message : String(err)}`);
|
|
231
231
|
}
|
|
232
232
|
if (!parsed || typeof parsed !== 'object') {
|
|
233
|
-
throw new
|
|
233
|
+
throw new EvolvingAgentOutputInvalid('patch block must be a JSON object');
|
|
234
234
|
}
|
|
235
235
|
const o = parsed;
|
|
236
236
|
const rawEdits = o.edits;
|
|
237
237
|
if (!Array.isArray(rawEdits)) {
|
|
238
|
-
throw new
|
|
238
|
+
throw new EvolvingAgentOutputInvalid('patch block must contain an `edits` array');
|
|
239
239
|
}
|
|
240
240
|
// Refusal shape: empty edits + a refusal string.
|
|
241
241
|
const refusal = o.refusal;
|
|
@@ -244,7 +244,7 @@ export function parseEvolvingAgentResponse(text) {
|
|
|
244
244
|
return { kind: 'refusal', reason: refusal.trim() };
|
|
245
245
|
}
|
|
246
246
|
// Empty edits with no refusal reason is a malformed no-op, not a refusal.
|
|
247
|
-
throw new
|
|
247
|
+
throw new EvolvingAgentNoOp();
|
|
248
248
|
}
|
|
249
249
|
// Concrete-edit shape: validate prediction + edit shapes.
|
|
250
250
|
const prediction = parsePrediction(o.prediction);
|
|
@@ -253,7 +253,7 @@ export function parseEvolvingAgentResponse(text) {
|
|
|
253
253
|
const relPath = e?.relPath;
|
|
254
254
|
const content = e?.content;
|
|
255
255
|
if (typeof relPath !== 'string' || typeof content !== 'string') {
|
|
256
|
-
throw new
|
|
256
|
+
throw new EvolvingAgentOutputInvalid('edit must have string relPath and string content');
|
|
257
257
|
}
|
|
258
258
|
edits.push({ relPath: relPath.replace(/\\/g, '/'), content });
|
|
259
259
|
}
|
|
@@ -262,17 +262,17 @@ export function parseEvolvingAgentResponse(text) {
|
|
|
262
262
|
}
|
|
263
263
|
function parsePrediction(raw) {
|
|
264
264
|
if (!raw || typeof raw !== 'object') {
|
|
265
|
-
throw new
|
|
265
|
+
throw new EvolvingAgentOutputInvalid('a concrete edit requires a `prediction` object {metric, direction, checkBy}');
|
|
266
266
|
}
|
|
267
267
|
const p = raw;
|
|
268
268
|
if (typeof p.metric !== 'string' || !PREDICTION_METRICS.has(p.metric)) {
|
|
269
|
-
throw new
|
|
269
|
+
throw new EvolvingAgentOutputInvalid("prediction.metric must be 'loss' | 'passRate' | 'healthPenalty'");
|
|
270
270
|
}
|
|
271
271
|
if (p.direction !== 'down' && p.direction !== 'up') {
|
|
272
|
-
throw new
|
|
272
|
+
throw new EvolvingAgentOutputInvalid("prediction.direction must be 'down' | 'up'");
|
|
273
273
|
}
|
|
274
274
|
if (typeof p.checkBy !== 'string' || p.checkBy.trim().length === 0) {
|
|
275
|
-
throw new
|
|
275
|
+
throw new EvolvingAgentOutputInvalid('prediction.checkBy must be a non-empty string');
|
|
276
276
|
}
|
|
277
277
|
return {
|
|
278
278
|
metric: p.metric,
|
|
@@ -403,7 +403,7 @@ export async function runEvolvingAgent(opts) {
|
|
|
403
403
|
const { episodeId, targetId } = opts;
|
|
404
404
|
const editBudget = opts.editBudget ?? DEFAULT_EVOLVING_AGENT_EDIT_BUDGET;
|
|
405
405
|
const maxRepairAttempts = Math.max(0, opts.maxRepairAttempts ?? 2);
|
|
406
|
-
const timeoutMs = opts.timeoutMs ??
|
|
406
|
+
const timeoutMs = opts.timeoutMs ?? resolveAgentTimeoutMs(opts.harness);
|
|
407
407
|
// Fail closed: the episode must exist (and tells us nothing else we need yet).
|
|
408
408
|
const episode = await readEpisode(repoRoot, episodeId);
|
|
409
409
|
void episode;
|
|
@@ -473,8 +473,8 @@ export async function runEvolvingAgent(opts) {
|
|
|
473
473
|
harness: opts.harness,
|
|
474
474
|
});
|
|
475
475
|
if (run.exitCode !== 0 || run.stdout.length === 0) {
|
|
476
|
-
// Agent crash is NOT repaired (
|
|
477
|
-
throw new
|
|
476
|
+
// Agent crash is NOT repaired (the evolving agent's invocation contract).
|
|
477
|
+
throw new EvolvingAgentInvocationError(run.stderr);
|
|
478
478
|
}
|
|
479
479
|
try {
|
|
480
480
|
const candidate = parseEvolvingAgentResponse(run.stdout);
|
|
@@ -483,12 +483,12 @@ export async function runEvolvingAgent(opts) {
|
|
|
483
483
|
break;
|
|
484
484
|
}
|
|
485
485
|
// Static-shape edit: validate scope-to-target + frozen freeze here so a
|
|
486
|
-
// bad path is a REPAIRABLE failure (
|
|
486
|
+
// bad path is a REPAIRABLE failure (the evolving agent's repair contract).
|
|
487
487
|
validateCandidateEdits(candidate.edits, allowedFiles);
|
|
488
488
|
// ≤ L budget (repairable).
|
|
489
489
|
const changed = countChangedLines(candidate.edits, currentFiles);
|
|
490
490
|
if (changed > editBudget) {
|
|
491
|
-
throw new
|
|
491
|
+
throw new EvolvingAgentOutputInvalid(`edit changes ${changed} lines, over the ${editBudget}-line budget (L) — make a smaller, more targeted edit`);
|
|
492
492
|
}
|
|
493
493
|
// 范围⊆诊断 (gate-3, repairable).
|
|
494
494
|
const scope = checkScopeWithinDiagnosis({
|
|
@@ -500,7 +500,7 @@ export async function runEvolvingAgent(opts) {
|
|
|
500
500
|
const where = scope.violations
|
|
501
501
|
.map((v) => `${v.file} §"${v.section}"`)
|
|
502
502
|
.join(', ');
|
|
503
|
-
throw new
|
|
503
|
+
throw new EvolvingAgentOutputInvalid(`edit touches sections outside the diagnosis (范围⊆诊断 violated): ${where} — only edit the diagnosed sections`);
|
|
504
504
|
}
|
|
505
505
|
// static guard (tool-evolution) — RUN INSIDE the repair loop so a
|
|
506
506
|
// content-driven failure (missing rationale / validation evidence / diff)
|
|
@@ -517,14 +517,14 @@ export async function runEvolvingAgent(opts) {
|
|
|
517
517
|
const errs = findings
|
|
518
518
|
.filter((f) => f.severity === 'error')
|
|
519
519
|
.map((f) => `${f.code}: ${f.message}`);
|
|
520
|
-
throw new
|
|
520
|
+
throw new EvolvingAgentOutputInvalid(`static gate failed (score ${toolReport.score.toFixed(2)}): ${errs.join('; ') || 'score below threshold'}`);
|
|
521
521
|
}
|
|
522
522
|
parsed = candidate;
|
|
523
523
|
scopeResult = scope;
|
|
524
524
|
break;
|
|
525
525
|
}
|
|
526
526
|
catch (err) {
|
|
527
|
-
if (err instanceof
|
|
527
|
+
if (err instanceof EvolvingAgentOutputInvalid && attempt < maxRepairAttempts) {
|
|
528
528
|
feedback = gateFeedback(err.message);
|
|
529
529
|
continue;
|
|
530
530
|
}
|
|
@@ -546,7 +546,7 @@ export async function runEvolvingAgent(opts) {
|
|
|
546
546
|
// scopeResult was set alongside the accepted parse; reasserted defensively.
|
|
547
547
|
if (!scopeResult || !scopeResult.pass) {
|
|
548
548
|
// Unreachable on the accept path; fail closed rather than evolve out of scope.
|
|
549
|
-
throw new
|
|
549
|
+
throw new EvolvingAgentOutputInvalid('范围⊆诊断 scope gate did not pass');
|
|
550
550
|
}
|
|
551
551
|
// ── 3. POST-LOOP GATE: observed-GREEN ───────────────────────────────────────
|
|
552
552
|
// static / 范围⊆诊断 / budget / valid-prediction were all enforced inside the
|
|
@@ -556,11 +556,11 @@ export async function runEvolvingAgent(opts) {
|
|
|
556
556
|
// would be a category error.
|
|
557
557
|
const objective = await readMainArmObjective(repoRoot, episodeId);
|
|
558
558
|
if (!objective) {
|
|
559
|
-
throw new
|
|
559
|
+
throw new EvolvingAgentOutputInvalid('observed-GREEN gate: main-arm/objective.json is missing or unreadable — cannot confirm a verified green run');
|
|
560
560
|
}
|
|
561
561
|
const evidence = isArmObjectiveGreen(objective);
|
|
562
562
|
if (!evidence.ok) {
|
|
563
|
-
throw new
|
|
563
|
+
throw new EvolvingAgentOutputInvalid(`observed-GREEN gate failed: ${evidence.reason}`);
|
|
564
564
|
}
|
|
565
565
|
// ── 4. Write back the next policy version. NO candidate dir / sidecar / verdict. ─
|
|
566
566
|
const ledgerEntry = await advancePolicyVersion({
|