synergyspec-selfevolving 2.1.2 → 2.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/learn.js +13 -3
- package/dist/commands/self-evolution-episode.d.ts +6 -1
- package/dist/commands/self-evolution-episode.js +8 -1
- package/dist/commands/self-evolution.d.ts +2 -2
- package/dist/commands/self-evolution.js +10 -10
- package/dist/commands/workflow/status.js +5 -0
- package/dist/core/change-readiness.d.ts +1 -1
- package/dist/core/change-readiness.js +66 -11
- package/dist/core/fitness/test-metrics.d.ts +33 -0
- package/dist/core/fitness/test-metrics.js +67 -0
- package/dist/core/learn.js +11 -2
- package/dist/core/project-config.d.ts +3 -0
- package/dist/core/project-config.js +7 -1
- package/dist/core/self-evolution/critic-agent.js +13 -5
- package/dist/core/self-evolution/edits-contract.d.ts +15 -5
- package/dist/core/self-evolution/edits-contract.js +26 -16
- package/dist/core/self-evolution/episode-orchestrator.d.ts +16 -9
- package/dist/core/self-evolution/episode-orchestrator.js +126 -35
- package/dist/core/self-evolution/episode-store.d.ts +34 -11
- package/dist/core/self-evolution/episode-store.js +45 -10
- package/dist/core/self-evolution/evolving-agent.d.ts +12 -12
- package/dist/core/self-evolution/evolving-agent.js +46 -48
- package/dist/core/self-evolution/host-harness.d.ts +68 -2
- package/dist/core/self-evolution/host-harness.js +208 -21
- package/dist/core/self-evolution/policy/policy-store.d.ts +8 -6
- package/dist/core/self-evolution/policy/policy-store.js +124 -24
- package/dist/core/self-evolution/proposer-slice.d.ts +4 -3
- package/dist/core/self-evolution/reward-agent.d.ts +11 -1
- package/dist/core/self-evolution/reward-agent.js +53 -20
- package/dist/core/self-evolution/reward-aggregator.d.ts +18 -0
- package/dist/core/self-evolution/reward-aggregator.js +53 -3
- package/dist/core/self-evolution/reward-deepread.d.ts +64 -0
- package/dist/core/self-evolution/reward-deepread.js +112 -0
- package/dist/core/templates/workflows/learn.js +3 -2
- package/dist/core/templates/workflows/self-evolving.js +5 -2
- package/dist/core/trajectory/facts.d.ts +69 -2
- package/dist/core/trajectory/facts.js +179 -10
- package/dist/core/trajectory/skeleton.d.ts +10 -0
- package/dist/core/trajectory/skeleton.js +24 -3
- package/package.json +4 -3
- package/schemas/spec-driven/templates/design.md +2 -1
|
@@ -14,26 +14,36 @@
|
|
|
14
14
|
* canonical file.
|
|
15
15
|
*/
|
|
16
16
|
import { GATE_DEFINING_FILES } from './candidate-gates.js';
|
|
17
|
-
export class
|
|
17
|
+
export class EvolvingAgentOutputInvalid extends Error {
|
|
18
18
|
constructor(message) {
|
|
19
|
-
super(`
|
|
20
|
-
this.name = '
|
|
19
|
+
super(`evolving agent output invalid: ${message}`);
|
|
20
|
+
this.name = 'EvolvingAgentOutputInvalid';
|
|
21
21
|
}
|
|
22
22
|
}
|
|
23
23
|
/** The model declined to edit anything (empty edits). Not an error — a no-op. */
|
|
24
|
-
export class
|
|
24
|
+
export class EvolvingAgentNoOp extends Error {
|
|
25
25
|
constructor() {
|
|
26
|
-
super('
|
|
27
|
-
this.name = '
|
|
26
|
+
super('evolving agent returned no edits');
|
|
27
|
+
this.name = 'EvolvingAgentNoOp';
|
|
28
28
|
}
|
|
29
29
|
}
|
|
30
30
|
/** The headless agent invocation itself failed (crash / empty output). */
|
|
31
|
-
export class
|
|
31
|
+
export class EvolvingAgentInvocationError extends Error {
|
|
32
32
|
constructor(stderr) {
|
|
33
|
-
super(`
|
|
34
|
-
this.name = '
|
|
33
|
+
super(`evolving agent invocation failed: ${stderr}`);
|
|
34
|
+
this.name = 'EvolvingAgentInvocationError';
|
|
35
35
|
}
|
|
36
36
|
}
|
|
37
|
+
/**
|
|
38
|
+
* @deprecated v2.0.0 removed the GA "canonical proposer"; these names are
|
|
39
|
+
* retained only as transitional aliases for any external importer. Use the
|
|
40
|
+
* `EvolvingAgent*` classes — they are the same constructors.
|
|
41
|
+
*/
|
|
42
|
+
export const CanonicalProposerOutputInvalid = EvolvingAgentOutputInvalid;
|
|
43
|
+
/** @deprecated alias of {@link EvolvingAgentNoOp}. */
|
|
44
|
+
export const CanonicalProposerNoOp = EvolvingAgentNoOp;
|
|
45
|
+
/** @deprecated alias of {@link EvolvingAgentInvocationError}. */
|
|
46
|
+
export const CanonicalProposerInvocationError = EvolvingAgentInvocationError;
|
|
37
47
|
/**
|
|
38
48
|
* Validate already-structured candidate edits against the allowed (target-
|
|
39
49
|
* scoped) file set and the frozen gate-defining files. Author-agnostic: this is
|
|
@@ -44,33 +54,33 @@ export class CanonicalProposerInvocationError extends Error {
|
|
|
44
54
|
* the loop-v2 演进智能体 EVOLVING AGENT call this so their safety contract is
|
|
45
55
|
* byte-identical. relPaths are normalized to POSIX separators.
|
|
46
56
|
*
|
|
47
|
-
* Throws {@link
|
|
48
|
-
* {@link
|
|
57
|
+
* Throws {@link EvolvingAgentNoOp} when `rawEdits` is empty and
|
|
58
|
+
* {@link EvolvingAgentOutputInvalid} for any shape / frozen / scope
|
|
49
59
|
* violation. Path traversal and absolute paths are rejected transitively: they
|
|
50
60
|
* can never be a member of `allowedFiles`, so they fail the scope check.
|
|
51
61
|
*/
|
|
52
62
|
export function validateCandidateEdits(rawEdits, allowedFiles) {
|
|
53
63
|
if (rawEdits.length === 0) {
|
|
54
|
-
throw new
|
|
64
|
+
throw new EvolvingAgentNoOp();
|
|
55
65
|
}
|
|
56
66
|
const allowed = new Set(allowedFiles.map((p) => p.replace(/\\/g, '/')));
|
|
57
67
|
const frozen = new Set(GATE_DEFINING_FILES.map((p) => p.replace(/\\/g, '/')));
|
|
58
68
|
const validated = [];
|
|
59
69
|
for (const e of rawEdits) {
|
|
60
70
|
if (!e || typeof e !== 'object') {
|
|
61
|
-
throw new
|
|
71
|
+
throw new EvolvingAgentOutputInvalid('edit entry must be an object');
|
|
62
72
|
}
|
|
63
73
|
const relPath = e.relPath;
|
|
64
74
|
const content = e.content;
|
|
65
75
|
if (typeof relPath !== 'string' || typeof content !== 'string') {
|
|
66
|
-
throw new
|
|
76
|
+
throw new EvolvingAgentOutputInvalid('edit must have string relPath and string content');
|
|
67
77
|
}
|
|
68
78
|
const norm = relPath.replace(/\\/g, '/');
|
|
69
79
|
if (frozen.has(norm)) {
|
|
70
|
-
throw new
|
|
80
|
+
throw new EvolvingAgentOutputInvalid(`edit relPath "${relPath}" is a gate-defining/frozen file and may never be proposed`);
|
|
71
81
|
}
|
|
72
82
|
if (!allowed.has(norm)) {
|
|
73
|
-
throw new
|
|
83
|
+
throw new EvolvingAgentOutputInvalid(`edit relPath "${relPath}" is outside the target's declared files`);
|
|
74
84
|
}
|
|
75
85
|
validated.push({ relPath: norm, content });
|
|
76
86
|
}
|
|
@@ -28,17 +28,22 @@
|
|
|
28
28
|
* reject-buffer entry — BOTH durably on
|
|
29
29
|
* disk — THEN advance 'rolled-back'.
|
|
30
30
|
* - otherwise → advance 'kept'.
|
|
31
|
-
* g. 演进智能体 EVOLVING AGENT — ONLY after (f) persisted:
|
|
32
|
-
* (optimizer.step)
|
|
33
|
-
*
|
|
34
|
-
*
|
|
31
|
+
* g. 演进智能体 EVOLVING AGENT — ONLY after (f) persisted: require the
|
|
32
|
+
* (optimizer.step) main-arm observed-GREEN evidence, then
|
|
33
|
+
* advance the 'evolving' marker
|
|
34
|
+
* (heartbeat) so a concurrent sibling
|
|
35
|
+
* sees a live holder, then
|
|
36
|
+
* runEvolvingAgent reads the
|
|
37
|
+
* reject-buffer FRESH from disk (so THIS
|
|
38
|
+
* episode's just-written entry is in its
|
|
39
|
+
* prompt) and either not-spawned /
|
|
35
40
|
* refused / evolved.
|
|
36
41
|
* h. advance 'closed' + releaseInFlight — ALWAYS, even on error.
|
|
37
42
|
*
|
|
38
43
|
* ORDERING GUARANTEE: the rollback + reject-buffer write are SEQUENTIAL awaits
|
|
39
|
-
* that BOTH complete (and the stage reads 'rolled-back'/'kept') before
|
|
40
|
-
* {@link runEvolvingAgent}
|
|
41
|
-
* and never share a Promise.all.
|
|
44
|
+
* that BOTH complete (and the stage reads 'rolled-back'/'kept') before the
|
|
45
|
+
* observed-GREEN preflight and {@link runEvolvingAgent}. (f) and (g) are never
|
|
46
|
+
* parallelized and never share a Promise.all.
|
|
42
47
|
*
|
|
43
48
|
* This module orchestrates; it never spawns an agent itself — the three agents
|
|
44
49
|
* own their own {@link runHeadlessAgent} spawns (the `spawn` seam threads to all
|
|
@@ -239,14 +244,16 @@ export interface ResumeEpisodeResult {
|
|
|
239
244
|
* done step rather than re-advancing a stage already entered:
|
|
240
245
|
*
|
|
241
246
|
* - 'scored' → run the decision (f) then the 演进智能体 (g).
|
|
242
|
-
* - 'rolled-back' / 'kept'
|
|
247
|
+
* - 'rolled-back' / 'kept' / 'evolving' → run the 演进智能体 EVOLVING AGENT (g)
|
|
248
|
+
* then close. ('evolving' means a crash AFTER the
|
|
249
|
+
* marker but before the agent settled an outcome.)
|
|
243
250
|
* - 'evolved'/'evolution-refused'/'abstained' → close.
|
|
244
251
|
* - 'errored' → RE-DRIVE from the last GOOD pre-error stage
|
|
245
252
|
* (an episode may have errored on a TRANSIENT
|
|
246
253
|
* cause — a one-off git/analyzer/agent timeout).
|
|
247
254
|
* The pre-error stage is the last `stageHistory`
|
|
248
255
|
* entry that is NOT 'errored'; when it is one of
|
|
249
|
-
* {'scored','rolled-back','kept'} (the
|
|
256
|
+
* {'scored','rolled-back','kept','evolving'} (the
|
|
250
257
|
* resume-entry stages) we advance errored → that
|
|
251
258
|
* stage and fall through to the normal dispatch.
|
|
252
259
|
* Otherwise the pre-error stage is not
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import { promises as fs } from 'node:fs';
|
|
2
|
+
import * as path from 'node:path';
|
|
1
3
|
import { toActionSkeleton } from '../trajectory/skeleton.js';
|
|
2
4
|
import { getTrajectoryForChange } from '../trajectory/registry.js';
|
|
3
5
|
import { acquireInFlight, releaseInFlight, currentPolicyVersion, readPolicyLedger, initPolicyLineage, rollbackPolicyVersion, } from './policy/policy-store.js';
|
|
@@ -6,7 +8,7 @@ import { createEpisode, advanceEpisodeStage, writeArmCapture, readEpisode, episo
|
|
|
6
8
|
import { shouldRunCriticAgent, runCriticAgent, } from './critic-agent.js';
|
|
7
9
|
import { runRewardAgentEnsemble } from './reward-aggregator.js';
|
|
8
10
|
import { detectTestTamper } from './tamper-check.js';
|
|
9
|
-
import { runEvolvingAgent, DEFAULT_EVOLVING_AGENT_EDIT_BUDGET, MIN_EVOLVING_AGENT_EDIT_BUDGET, } from './evolving-agent.js';
|
|
11
|
+
import { runEvolvingAgent, DEFAULT_EVOLVING_AGENT_EDIT_BUDGET, MIN_EVOLVING_AGENT_EDIT_BUDGET, isArmObjectiveGreen, } from './evolving-agent.js';
|
|
10
12
|
import { reconcilePrediction, summarizeCalibration, } from './policy/prediction-reconcile.js';
|
|
11
13
|
/**
|
|
12
14
|
* Build the 主智能体 MAIN AGENT arm `{transcript?, skeleton?, objective}` from an
|
|
@@ -183,13 +185,65 @@ function deriveEpisodeId(changeName, now) {
|
|
|
183
185
|
/**
|
|
184
186
|
* Build the `terminalError` note for a thrown step. A timeout reads identically
|
|
185
187
|
* to a hard crash on disk otherwise, so a message that names a host-agent timeout
|
|
186
|
-
* (
|
|
187
|
-
*
|
|
188
|
-
*
|
|
188
|
+
* (absolute wall: `headless agent timed out after Nms`; idle wall: `idle timeout`)
|
|
189
|
+
* is PREFIXED with a `timeout:` marker. A timed-out episode is then
|
|
190
|
+
* distinguishable from a genuine crash in episode.json. Pure.
|
|
189
191
|
*/
|
|
190
192
|
function terminalErrorLabel(err) {
|
|
191
193
|
const msg = err instanceof Error ? err.message : String(err);
|
|
192
|
-
return
|
|
194
|
+
return /\b(timed out|idle timeout)\b/i.test(msg) ? `timeout: ${msg}` : msg;
|
|
195
|
+
}
|
|
196
|
+
/**
 * Turn a main-arm objective into an observed-GREEN gate verdict.
 *
 * @param {unknown} objective - the main arm's objective payload (as captured
 *   in memory or parsed from `main-arm/objective.json`); may be absent.
 * @returns {string | null} `null` when `isArmObjectiveGreen` reports `ok`,
 *   otherwise a human-readable reason the gate did not pass.
 */
function observedGreenFailureReason(objective) {
    // Only a plain JSON object can carry green evidence. Anything else
    // (absent, a scalar, an array) counts as "missing or unreadable" and is
    // never handed to isArmObjectiveGreen.
    const isObjectiveObject = objective !== null &&
        typeof objective === 'object' &&
        !Array.isArray(objective);
    if (!isObjectiveObject) {
        return 'observed-GREEN gate: main-arm/objective.json is missing or unreadable - cannot confirm a verified green run';
    }
    const evidence = isArmObjectiveGreen(objective);
    if (evidence.ok) {
        return null;
    }
    // Keep the recorded reason readable even if the evidence carries none.
    return `observed-GREEN gate failed: ${evidence.reason ?? 'no reason reported'}`;
}
|
|
203
|
+
/**
 * Load the main arm's objective for an episode from
 * `<episodeDir>/main-arm/objective.json`.
 *
 * @param {string} repoRoot - repository root passed through to `episodeDir`.
 * @param {string} episodeId - episode whose main-arm objective is read.
 * @returns {Promise<object | null>} the parsed objective, or `null` when the
 *   file does not exist, is not valid JSON, or does not hold a JSON object.
 * @throws any read failure other than ENOENT is rethrown unchanged.
 */
async function readMainArmObjectiveForEpisode(repoRoot, episodeId) {
    const file = path.join(episodeDir(repoRoot, episodeId), 'main-arm', 'objective.json');
    let raw;
    try {
        raw = await fs.readFile(file, 'utf8');
    }
    catch (err) {
        // A missing file is an expected absence; every other failure is real.
        if (err?.code === 'ENOENT') {
            return null;
        }
        throw err;
    }
    let parsed;
    try {
        parsed = JSON.parse(raw);
    }
    catch {
        // Unparseable content is reported the same way as a missing file.
        return null;
    }
    // Valid JSON that is not an object (a scalar, `null`, an array) cannot be
    // an objective either, so it is also reported as unreadable.
    const isObjectiveObject = parsed !== null &&
        typeof parsed === 'object' &&
        !Array.isArray(parsed);
    return isObjectiveObject ? parsed : null;
}
|
|
221
|
+
/**
 * Gate the EVOLVING AGENT step on observed-GREEN evidence, then run it.
 *
 * Steps, in order:
 *   1. Resolve the main-arm objective: the caller-supplied `opts.objective`
 *      when that key is present (a nullish value becomes `null`), otherwise
 *      the episode's `main-arm/objective.json` read from disk.
 *   2. If the gate reports a failure, return `{ kind: 'not-spawned', reason }`
 *      without advancing the episode stage or running the agent.
 *   3. When `opts.markEvolving` is truthy, advance the episode to 'evolving'
 *      and record an `evolvingHeartbeatAt` timestamp in the same write.
 *   4. Delegate to `runEvolvingAgent` and return its result.
 *
 * @param {object} opts
 * @param {string} opts.repoRoot
 * @param {string} opts.episodeId
 * @param {string} opts.targetId
 * @param {*} opts.editBudget - forwarded unchanged to `runEvolvingAgent`.
 * @param {*} [opts.calibrationNote] - forwarded only when truthy.
 * @param {*} opts.spawn - spawn seam forwarded to `runEvolvingAgent`.
 * @param {number} [opts.timeoutMs] - agent timeout; the key the call sites use.
 * @param {number} [opts.agentTimeoutMs] - alternate spelling of the same
 *   timeout, accepted for compatibility; `timeoutMs` wins when both are set.
 * @param {*} [opts.harness] - forwarded only when truthy.
 * @param {boolean} [opts.markEvolving] - advance the 'evolving' marker first.
 * @param {*} [opts.objective] - pre-loaded main-arm objective.
 * @returns {Promise<object>} the not-spawned verdict or the agent's outcome.
 */
async function runEvolvingStepIfObservedGreen(opts) {
    const objective = Object.prototype.hasOwnProperty.call(opts, 'objective')
        ? (opts.objective ?? null)
        : await readMainArmObjectiveForEpisode(opts.repoRoot, opts.episodeId);
    const gateFailure = observedGreenFailureReason(objective);
    if (gateFailure) {
        return { kind: 'not-spawned', reason: gateFailure };
    }
    if (opts.markEvolving) {
        await advanceEpisodeStage({
            repoRoot: opts.repoRoot,
            episodeId: opts.episodeId,
            stage: 'evolving',
            patch: { evolvingHeartbeatAt: new Date().toISOString() },
        });
    }
    // The call sites hand the timeout over as `timeoutMs` (the key
    // runEvolvingAgent itself takes). Reading only `agentTimeoutMs` dropped the
    // configured timeout on every call, so honour both spellings.
    const timeoutMs = opts.timeoutMs ?? opts.agentTimeoutMs;
    return await runEvolvingAgent({
        repoRoot: opts.repoRoot,
        episodeId: opts.episodeId,
        targetId: opts.targetId,
        editBudget: opts.editBudget,
        ...(opts.calibrationNote ? { calibrationNote: opts.calibrationNote } : {}),
        spawn: opts.spawn,
        ...(timeoutMs !== undefined ? { timeoutMs } : {}),
        ...(opts.harness ? { harness: opts.harness } : {}),
    });
}
|
|
194
248
|
/**
|
|
195
249
|
* Run ONE episode through the loop in the strict, durably-persisted order
|
|
@@ -431,12 +485,13 @@ async function runEpisodeAfterCreate(opts) {
|
|
|
431
485
|
// runEvolvingAgent reads the reject-buffer FRESH from disk (the entry just
|
|
432
486
|
// written THIS episode is in its prompt). Never parallelized with (f).
|
|
433
487
|
// 步长: after a rollback, shrink the edit budget (smaller step after a step
|
|
434
|
-
// that lost ground). 预测校准: pass the
|
|
488
|
+
// that lost ground). 预测校准: pass the 演进智能体 EVOLVING AGENT's recent
|
|
489
|
+
// prediction record.
|
|
435
490
|
const scheduledBudget = decision === 'rolled-back'
|
|
436
491
|
? scheduledEditBudget(await readPolicyLedger(repoRoot, targetId), editBudget)
|
|
437
492
|
: editBudget;
|
|
438
493
|
const calibrationNote = await summarizeCalibration(repoRoot, targetId);
|
|
439
|
-
evolution = await
|
|
494
|
+
evolution = await runEvolvingStepIfObservedGreen({
|
|
440
495
|
repoRoot,
|
|
441
496
|
episodeId,
|
|
442
497
|
targetId,
|
|
@@ -445,6 +500,8 @@ async function runEpisodeAfterCreate(opts) {
|
|
|
445
500
|
spawn: opts.spawn,
|
|
446
501
|
...(opts.agentTimeoutMs !== undefined ? { timeoutMs: opts.agentTimeoutMs } : {}),
|
|
447
502
|
...(opts.harness ? { harness: opts.harness } : {}),
|
|
503
|
+
markEvolving: true,
|
|
504
|
+
objective: opts.mainArm.objective,
|
|
448
505
|
});
|
|
449
506
|
}
|
|
450
507
|
}
|
|
@@ -477,16 +534,19 @@ async function runEpisodeAfterCreate(opts) {
|
|
|
477
534
|
* Closable stages:
|
|
478
535
|
* - evolved | evolution-refused | abstained — the 演进智能体 EVOLVING AGENT
|
|
479
536
|
* reached a definite outcome (or the judge 弃权 abstained), the normal close.
|
|
480
|
-
* -
|
|
537
|
+
* - evolving — the 演进智能体 returned not-spawned (its diagnosis
|
|
481
538
|
* abstained-after-gap-check, no gaps, or the target resolved to no editable
|
|
482
|
-
* local files), so the episode never advanced past the
|
|
483
|
-
* this runs (AFTER runEvolvingAgent returned), a stage still at
|
|
484
|
-
* '
|
|
539
|
+
* local files), so the episode never advanced past the 'evolving' marker.
|
|
540
|
+
* By the time this runs (AFTER runEvolvingAgent returned), a stage still at
|
|
541
|
+
* 'evolving' can ONLY mean not-spawned — a success advances 'evolved', a
|
|
485
542
|
* refusal advances 'evolution-refused', and a throw is caught upstream and
|
|
486
543
|
* records 'errored' + rethrows so this close is never reached. So a leftover
|
|
487
|
-
*
|
|
488
|
-
*
|
|
489
|
-
*
|
|
544
|
+
* 'evolving' at close time IS the finished-nothing-to-evolve case and must
|
|
545
|
+
* close, not rest forever at a non-terminal stage (the exact ambiguity the
|
|
546
|
+
* 'errored' stage was meant to remove).
|
|
547
|
+
* - kept | rolled-back — retained for back-compat: an OLD episode record (or a
|
|
548
|
+
* code path that did not advance the 'evolving' marker) that returned
|
|
549
|
+
* not-spawned never advances past the decision; close it the same way.
|
|
490
550
|
*
|
|
491
551
|
* Any other (genuinely non-closable) stage is left as-is rather than throwing, so
|
|
492
552
|
* the close never masks the real episode outcome.
|
|
@@ -497,7 +557,10 @@ async function closeEpisodeBestEffort(repoRoot, episodeId) {
|
|
|
497
557
|
'evolved',
|
|
498
558
|
'evolution-refused',
|
|
499
559
|
'abstained',
|
|
500
|
-
// not-spawned 演进智能体 leaves the episode
|
|
560
|
+
// not-spawned 演进智能体 leaves the episode at the 'evolving' marker — close
|
|
561
|
+
// the finished episode. 'kept'/'rolled-back' retained for back-compat with
|
|
562
|
+
// an old record / a path that never advanced the marker.
|
|
563
|
+
'evolving',
|
|
501
564
|
'kept',
|
|
502
565
|
'rolled-back',
|
|
503
566
|
]);
|
|
@@ -586,14 +649,16 @@ async function ensureRejectBufferEntry(repoRoot, opts) {
|
|
|
586
649
|
* done step rather than re-advancing a stage already entered:
|
|
587
650
|
*
|
|
588
651
|
* - 'scored' → run the decision (f) then the 演进智能体 (g).
|
|
589
|
-
* - 'rolled-back' / 'kept'
|
|
652
|
+
* - 'rolled-back' / 'kept' / 'evolving' → run the 演进智能体 EVOLVING AGENT (g)
|
|
653
|
+
* then close. ('evolving' means a crash AFTER the
|
|
654
|
+
* marker but before the agent settled an outcome.)
|
|
590
655
|
* - 'evolved'/'evolution-refused'/'abstained' → close.
|
|
591
656
|
* - 'errored' → RE-DRIVE from the last GOOD pre-error stage
|
|
592
657
|
* (an episode may have errored on a TRANSIENT
|
|
593
658
|
* cause — a one-off git/analyzer/agent timeout).
|
|
594
659
|
* The pre-error stage is the last `stageHistory`
|
|
595
660
|
* entry that is NOT 'errored'; when it is one of
|
|
596
|
-
* {'scored','rolled-back','kept'} (the
|
|
661
|
+
* {'scored','rolled-back','kept','evolving'} (the
|
|
597
662
|
* resume-entry stages) we advance errored → that
|
|
598
663
|
* stage and fall through to the normal dispatch.
|
|
599
664
|
* Otherwise the pre-error stage is not
|
|
@@ -617,7 +682,7 @@ export async function resumeEpisode(opts) {
|
|
|
617
682
|
// for an 'errored' episode we attempt to RE-DRIVE from the last good pre-error
|
|
618
683
|
// stage (a transient git/analyzer/agent failure should be retryable via an
|
|
619
684
|
// operator resume). 'errored' stays terminal for every OTHER caller — only this
|
|
620
|
-
// resume path may re-drive it, via the errored → {scored,rolled-back,kept}
|
|
685
|
+
// resume path may re-drive it, via the errored → {scored,rolled-back,kept,evolving}
|
|
621
686
|
// transitions the stage machine allows ONLY for operator recovery.
|
|
622
687
|
let stage = ep.stage;
|
|
623
688
|
if (ep.stage === 'errored') {
|
|
@@ -626,7 +691,8 @@ export async function resumeEpisode(opts) {
|
|
|
626
691
|
.find((h) => h.stage !== 'errored')?.stage;
|
|
627
692
|
if (preError === 'scored' ||
|
|
628
693
|
preError === 'rolled-back' ||
|
|
629
|
-
preError === 'kept'
|
|
694
|
+
preError === 'kept' ||
|
|
695
|
+
preError === 'evolving') {
|
|
630
696
|
// Re-open the errored episode at its last auto-resumable stage, then fall
|
|
631
697
|
// through to the normal dispatch for that stage.
|
|
632
698
|
await advanceEpisodeStage({ repoRoot, episodeId, stage: preError });
|
|
@@ -636,7 +702,7 @@ export async function resumeEpisode(opts) {
|
|
|
636
702
|
// 'baseline-skipped'); leave the episode at 'errored' and report it as-is.
|
|
637
703
|
}
|
|
638
704
|
// The decision (f) + 演进智能体 EVOLVING AGENT (g) re-runs below can THROW — a
|
|
639
|
-
// wedged/crashed host CLI (
|
|
705
|
+
// wedged/crashed host CLI (EvolvingAgentInvocationError), a timeout, or an
|
|
640
706
|
// observed-GREEN gate throw. UNCAUGHT, that leaves the episode DURABLY stuck at
|
|
641
707
|
// a non-terminal stage ('scored'/'rolled-back'/'kept' — the orphan state fix ❷
|
|
642
708
|
// eliminates for runEpisode). Record the SAME terminal 'errored' stage here
|
|
@@ -644,7 +710,31 @@ export async function resumeEpisode(opts) {
|
|
|
644
710
|
// re-throw. Resume holds NO in-flight lock, so this is a durable-stage fix, not
|
|
645
711
|
// a leak fix. Best-effort write: a failed record must not mask the original throw.
|
|
646
712
|
try {
|
|
647
|
-
|
|
713
|
+
// TOCTOU guard: resume read the stage at entry (`let stage = ep.stage` above), but it holds NO in-flight
|
|
714
|
+
// lock, so a CONCURRENT runEpisode for the same target can advance THIS episode to
|
|
715
|
+
// a TERMINAL stage between that read and the transitions below. Re-read the episode
|
|
716
|
+
// immediately before dispatching; if it is already finished, the transitions would
|
|
717
|
+
// throw an illegal-transition error (which the catch below would then mis-record as
|
|
718
|
+
// a fresh 'errored'). Short-circuit instead: report the already-finished episode
|
|
719
|
+
// via the normal completion return. (The errored→pre-error re-drive above already
|
|
720
|
+
// turned a re-drivable 'errored' into a non-terminal stage, so a stage that is
|
|
721
|
+
// STILL terminal here is genuinely finished, not auto-resumable.)
|
|
722
|
+
const TERMINAL_STAGES = new Set([
|
|
723
|
+
'closed',
|
|
724
|
+
'errored',
|
|
725
|
+
'evolution-refused',
|
|
726
|
+
'evolved',
|
|
727
|
+
'abstained',
|
|
728
|
+
]);
|
|
729
|
+
const fresh = await readEpisode(repoRoot, episodeId);
|
|
730
|
+
stage = fresh.stage;
|
|
731
|
+
if (TERMINAL_STAGES.has(stage)) {
|
|
732
|
+
// 'evolved'/'evolution-refused'/'abstained' still want their best-effort close;
|
|
733
|
+
// 'closed'/'errored' are no-ops for closeEpisodeBestEffort. No transition is
|
|
734
|
+
// attempted, so the race cannot surface as an illegal-transition throw.
|
|
735
|
+
await closeEpisodeBestEffort(repoRoot, episodeId);
|
|
736
|
+
}
|
|
737
|
+
else if (stage === 'scored') {
|
|
648
738
|
// Re-run the decision (f) from the on-disk diagnosis, then (g).
|
|
649
739
|
const diagnosis = await readDiagnosisForResume(repoRoot, episodeId);
|
|
650
740
|
if (shouldSkipEvolution(diagnosis)) {
|
|
@@ -719,7 +809,7 @@ export async function resumeEpisode(opts) {
|
|
|
719
809
|
? scheduledEditBudget(await readPolicyLedger(repoRoot, targetId), editBudget)
|
|
720
810
|
: editBudget;
|
|
721
811
|
const calibrationNote = await summarizeCalibration(repoRoot, targetId);
|
|
722
|
-
evolution = await
|
|
812
|
+
evolution = await runEvolvingStepIfObservedGreen({
|
|
723
813
|
repoRoot,
|
|
724
814
|
episodeId,
|
|
725
815
|
targetId,
|
|
@@ -728,11 +818,12 @@ export async function resumeEpisode(opts) {
|
|
|
728
818
|
spawn: opts.spawn,
|
|
729
819
|
...(opts.agentTimeoutMs !== undefined ? { timeoutMs: opts.agentTimeoutMs } : {}),
|
|
730
820
|
...(opts.harness ? { harness: opts.harness } : {}),
|
|
821
|
+
markEvolving: true,
|
|
731
822
|
});
|
|
732
823
|
}
|
|
733
824
|
await closeEpisodeBestEffort(repoRoot, episodeId);
|
|
734
825
|
}
|
|
735
|
-
else if (stage === 'rolled-back' || stage === 'kept') {
|
|
826
|
+
else if (stage === 'rolled-back' || stage === 'kept' || stage === 'evolving') {
|
|
736
827
|
// The decision already ran (and the original episode settled the prediction);
|
|
737
828
|
// re-settle idempotently for the crash window, then schedule + calibrate.
|
|
738
829
|
try {
|
|
@@ -741,11 +832,16 @@ export async function resumeEpisode(opts) {
|
|
|
741
832
|
catch {
|
|
742
833
|
// best-effort: advisory only
|
|
743
834
|
}
|
|
744
|
-
|
|
835
|
+
// Resuming from 'evolving' means the decision is in history (not the resume
|
|
836
|
+
// stage); read it from stageHistory so the 步长 schedule still shrinks after a
|
|
837
|
+
// rollback. Resuming from 'rolled-back'/'kept' uses the resume stage directly.
|
|
838
|
+
const wasRolledBack = stage === 'rolled-back' ||
|
|
839
|
+
(stage === 'evolving' && ep.stageHistory.some((h) => h.stage === 'rolled-back'));
|
|
840
|
+
const scheduledBudget = wasRolledBack
|
|
745
841
|
? scheduledEditBudget(await readPolicyLedger(repoRoot, targetId), editBudget)
|
|
746
842
|
: editBudget;
|
|
747
843
|
const calibrationNote = await summarizeCalibration(repoRoot, targetId);
|
|
748
|
-
evolution = await
|
|
844
|
+
evolution = await runEvolvingStepIfObservedGreen({
|
|
749
845
|
repoRoot,
|
|
750
846
|
episodeId,
|
|
751
847
|
targetId,
|
|
@@ -754,16 +850,13 @@ export async function resumeEpisode(opts) {
|
|
|
754
850
|
spawn: opts.spawn,
|
|
755
851
|
...(opts.agentTimeoutMs !== undefined ? { timeoutMs: opts.agentTimeoutMs } : {}),
|
|
756
852
|
...(opts.harness ? { harness: opts.harness } : {}),
|
|
853
|
+
markEvolving: stage !== 'evolving',
|
|
757
854
|
});
|
|
758
855
|
await closeEpisodeBestEffort(repoRoot, episodeId);
|
|
759
856
|
}
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
await closeEpisodeBestEffort(repoRoot, episodeId);
|
|
764
|
-
}
|
|
765
|
-
// earlier stages (and a non-auto-resumable 'errored'): not auto-resumable here
|
|
766
|
-
// — reported as-is.
|
|
857
|
+
// Terminal stages (incl. a non-auto-resumable 'errored') are handled by the
|
|
858
|
+
// TOCTOU guard above; earlier stages are not auto-resumable here — reported
|
|
859
|
+
// as-is via the completion return below.
|
|
767
860
|
}
|
|
768
861
|
catch (err) {
|
|
769
862
|
// A thrown decision/evolving step records a DURABLE terminal 'errored' stage so
|
|
@@ -790,9 +883,7 @@ export async function resumeEpisode(opts) {
|
|
|
790
883
|
* AGENT's reader uses).
|
|
791
884
|
*/
|
|
792
885
|
async function readDiagnosisForResume(repoRoot, episodeId) {
|
|
793
|
-
const
|
|
794
|
-
const pathMod = await import('node:path');
|
|
795
|
-
const file = pathMod.join(episodeDir(repoRoot, episodeId), 'diagnosis.json');
|
|
886
|
+
const file = path.join(episodeDir(repoRoot, episodeId), 'diagnosis.json');
|
|
796
887
|
let raw;
|
|
797
888
|
try {
|
|
798
889
|
raw = await fs.readFile(file, 'utf8');
|
|
@@ -57,10 +57,11 @@
|
|
|
57
57
|
* -> (baseline-arm-captured | baseline-skipped) // CRITIC AGENT(基线智能体 baseline agent)arm
|
|
58
58
|
* -> scored // 奖励智能体 REWARD AGENT wrote diagnosis.json
|
|
59
59
|
* -> (rolled-back | kept) // rollback decision on the main arm's edits
|
|
60
|
+
* -> evolving // 演进智能体 EVOLVING AGENT holds the in-flight lock
|
|
60
61
|
* -> (evolved | evolution-refused | abstained) // 演进智能体 EVOLVING AGENT outcome
|
|
61
62
|
* -> closed // terminal
|
|
62
63
|
*
|
|
63
|
-
* (rolled-back | kept)
|
|
64
|
+
* (rolled-back | kept | evolving)
|
|
64
65
|
* -> closed // terminal — see below
|
|
65
66
|
*
|
|
66
67
|
* (any non-terminal stage)
|
|
@@ -73,12 +74,23 @@
|
|
|
73
74
|
* 弃权 abstains when no nameable gap → no rollback decision needed → the
|
|
74
75
|
* 演进智能体 EVOLVING AGENT is never spawned.
|
|
75
76
|
*
|
|
76
|
-
* `
|
|
77
|
-
* EVOLVING AGENT
|
|
78
|
-
*
|
|
79
|
-
*
|
|
80
|
-
* the
|
|
81
|
-
*
|
|
77
|
+
* `evolving` is advanced by the orchestrator BEFORE it spawns the 演进智能体
|
|
78
|
+
* EVOLVING AGENT, while that agent holds the in-flight lock. It exists so a
|
|
79
|
+
* concurrent sibling reading the store distinguishes a LIVE-but-slow holder
|
|
80
|
+
* (stage `evolving`) from an episode that merely reached the decision (`kept`/
|
|
81
|
+
* `rolled-back`) — without it the stage stays `kept` for the whole evolving
|
|
82
|
+
* spawn, and a sibling can misread a running holder as stale. The
|
|
83
|
+
* `evolvingHeartbeatAt` field records when the stage was entered. Old episode
|
|
84
|
+
* records that predate this stage never carry it; they resume exactly as before
|
|
85
|
+
* (the `rolled-back`/`kept` → outcome transitions are retained for them).
|
|
86
|
+
*
|
|
87
|
+
* `rolled-back`/`kept`/`evolving` may also reach `closed` DIRECTLY: when the
|
|
88
|
+
* 演进智能体 EVOLVING AGENT was NOT spawned (its diagnosis abstained-after-gap-
|
|
89
|
+
* check, named no gaps, or the target resolved to no editable local files) the
|
|
90
|
+
* episode never advances past the decision (it stays `kept`/`rolled-back`, or —
|
|
91
|
+
* for the not-spawned-after-evolving-marker case — `evolving`), so the
|
|
92
|
+
* orchestrator's best-effort close terminates the finished-nothing-to-evolve
|
|
93
|
+
* episode rather than leaving it resting forever at a non-terminal stage.
|
|
82
94
|
*
|
|
83
95
|
* `errored` is a SECOND terminal stage reachable from EVERY non-terminal stage.
|
|
84
96
|
* A thrown step — an agent spawn that crashes or times out (主智能体 MAIN AGENT /
|
|
@@ -92,10 +104,10 @@
|
|
|
92
104
|
* `errored` is terminal for every target EXCEPT an operator-driven resume: a
|
|
93
105
|
* transient cause (a one-off git/analyzer/agent timeout) is retryable, so an
|
|
94
106
|
* `episode resume` may RE-DRIVE an errored episode back to its last good
|
|
95
|
-
* pre-error stage — `errored -> {scored, rolled-back, kept}` (the
|
|
96
|
-
* stages). No other caller may leave `errored`.
|
|
107
|
+
* pre-error stage — `errored -> {scored, rolled-back, kept, evolving}` (the
|
|
108
|
+
* resume-entry stages). No other caller may leave `errored`.
|
|
97
109
|
*/
|
|
98
|
-
export type EpisodeStage = 'created' | 'main-arm-captured' | 'baseline-arm-captured' | 'baseline-skipped' | 'scored' | 'rolled-back' | 'kept' | 'evolved' | 'evolution-refused' | 'abstained' | 'closed' | 'errored';
|
|
110
|
+
export type EpisodeStage = 'created' | 'main-arm-captured' | 'baseline-arm-captured' | 'baseline-skipped' | 'scored' | 'rolled-back' | 'kept' | 'evolving' | 'evolved' | 'evolution-refused' | 'abstained' | 'closed' | 'errored';
|
|
99
111
|
/**
|
|
100
112
|
* Iterable list of every legal {@link EpisodeStage} value. Order follows the
|
|
101
113
|
* documented state machine for readability, not behavior.
|
|
@@ -143,6 +155,14 @@ export interface EpisodeRecord {
|
|
|
143
155
|
stageHistory: EpisodeStageHistoryEntry[];
|
|
144
156
|
/** Why the baseline arm was skipped (set with stage `baseline-skipped`). */
|
|
145
157
|
baselineSkippedReason?: string;
|
|
158
|
+
/**
|
|
159
|
+
* ISO 8601 UTC timestamp the episode entered the `evolving` stage (the moment
|
|
160
|
+
* the 演进智能体 EVOLVING AGENT spawn began holding the in-flight lock). A
|
|
161
|
+
* heartbeat for liveness reads — a concurrent sibling can tell a recently-
|
|
162
|
+
* entered `evolving` holder apart from one that genuinely wedged. Absent on
|
|
163
|
+
* old records (and on every stage before `evolving`).
|
|
164
|
+
*/
|
|
165
|
+
evolvingHeartbeatAt?: string;
|
|
146
166
|
/** advantage = reward(主臂) − reward(基线臂); null when the 奖励智能体 REWARD AGENT 弃权 abstained. */
|
|
147
167
|
advantage?: number | null;
|
|
148
168
|
/**
|
|
@@ -163,6 +183,8 @@ export interface EpisodeStagePatch {
|
|
|
163
183
|
advantage?: number | null;
|
|
164
184
|
/** Cause note merged alongside the terminal `errored` stage. */
|
|
165
185
|
terminalError?: string;
|
|
186
|
+
/** Heartbeat timestamp merged alongside the `evolving` stage. */
|
|
187
|
+
evolvingHeartbeatAt?: string;
|
|
166
188
|
}
|
|
167
189
|
/**
|
|
168
190
|
* True iff `(from -> to)` is a legal transition in the episode stage machine.
|
|
@@ -234,7 +256,8 @@ export interface AdvanceEpisodeStageOptions {
|
|
|
234
256
|
* advancing to a stage not reachable from the current one throws.
|
|
235
257
|
* - Appends `{stage, at}` to `stageHistory`.
|
|
236
258
|
* - Merges the allowlisted `patch` fields (`policyVersionBaseline`,
|
|
237
|
-
* `baselineSkippedReason`, `advantage`, `terminalError`)
|
|
259
|
+
* `baselineSkippedReason`, `advantage`, `terminalError`, `evolvingHeartbeatAt`)
|
|
260
|
+
* in the same write.
|
|
238
261
|
* - Bumps `updatedAt`.
|
|
239
262
|
*/
|
|
240
263
|
export declare function advanceEpisodeStage(opts: AdvanceEpisodeStageOptions): Promise<EpisodeRecord>;
|
|
@@ -61,6 +61,7 @@ export const EPISODE_STAGES = [
|
|
|
61
61
|
'scored',
|
|
62
62
|
'rolled-back',
|
|
63
63
|
'kept',
|
|
64
|
+
'evolving',
|
|
64
65
|
'evolved',
|
|
65
66
|
'evolution-refused',
|
|
66
67
|
'abstained',
|
|
@@ -77,8 +78,9 @@ const EPISODE_ID_PATTERN = /^[a-z0-9][a-z0-9-]*$/;
|
|
|
77
78
|
// step: agent spawn crash/timeout or un-repairable gate), so a failed episode
|
|
78
79
|
// is never orphaned mid-flight. `closed` and `errored` are the two terminals;
|
|
79
80
|
// `errored` is terminal EXCEPT for an operator resume re-drive back to its last
|
|
80
|
-
// good pre-error stage (scored/rolled-back/kept). `rolled-back`/`kept
|
|
81
|
-
// close directly (the not-spawned 演进智能体
|
|
81
|
+
// good pre-error stage (scored/rolled-back/kept/evolving). `rolled-back`/`kept`/
|
|
82
|
+
// `evolving` may also close directly (the not-spawned 演进智能体 EVOLVING AGENT
|
|
83
|
+
// finished-nothing-to-evolve case).
|
|
82
84
|
const LEGAL_STAGE_TRANSITIONS = new Map([
|
|
83
85
|
['created', new Set(['main-arm-captured', 'errored'])],
|
|
84
86
|
[
|
|
@@ -91,15 +93,38 @@ const LEGAL_STAGE_TRANSITIONS = new Map([
|
|
|
91
93
|
// abstained, so no rollback decision is needed and the 演进智能体
|
|
92
94
|
// EVOLVING AGENT is never spawned.
|
|
93
95
|
['scored', new Set(['rolled-back', 'kept', 'abstained', 'errored'])],
|
|
94
|
-
// 'rolled-back'/'kept'
|
|
95
|
-
//
|
|
96
|
-
//
|
|
96
|
+
// 'rolled-back'/'kept' advance to 'evolving' BEFORE the 演进智能体 EVOLVING
|
|
97
|
+
// AGENT spawn (so a sibling can tell a live holder from a stale lock). They
|
|
98
|
+
// also retain the DIRECT transitions to the evolving outcomes + 'closed' so
|
|
99
|
+
// (a) an OLD episode record resumed from 'rolled-back'/'kept' (no 'evolving'
|
|
100
|
+
// stage) behaves exactly as before, and (b) the not-spawned
|
|
101
|
+
// finished-nothing-to-evolve case can still close directly.
|
|
97
102
|
[
|
|
98
103
|
'rolled-back',
|
|
99
|
-
new Set([
|
|
104
|
+
new Set([
|
|
105
|
+
'evolving',
|
|
106
|
+
'evolved',
|
|
107
|
+
'evolution-refused',
|
|
108
|
+
'abstained',
|
|
109
|
+
'closed',
|
|
110
|
+
'errored',
|
|
111
|
+
]),
|
|
100
112
|
],
|
|
101
113
|
[
|
|
102
114
|
'kept',
|
|
115
|
+
new Set([
|
|
116
|
+
'evolving',
|
|
117
|
+
'evolved',
|
|
118
|
+
'evolution-refused',
|
|
119
|
+
'abstained',
|
|
120
|
+
'closed',
|
|
121
|
+
'errored',
|
|
122
|
+
]),
|
|
123
|
+
],
|
|
124
|
+
// The 演进智能体 EVOLVING AGENT outcome (or a not-spawned close), or 'errored'
|
|
125
|
+
// on a thrown spawn/gate.
|
|
126
|
+
[
|
|
127
|
+
'evolving',
|
|
103
128
|
new Set(['evolved', 'evolution-refused', 'abstained', 'closed', 'errored']),
|
|
104
129
|
],
|
|
105
130
|
['evolved', new Set(['closed'])],
|
|
@@ -107,8 +132,9 @@ const LEGAL_STAGE_TRANSITIONS = new Map([
|
|
|
107
132
|
['abstained', new Set(['closed'])],
|
|
108
133
|
['closed', new Set()],
|
|
109
134
|
// 'errored' is terminal EXCEPT for an operator resume re-drive back to the
|
|
110
|
-
// last good pre-error stage (scored/rolled-back/kept); no other
|
|
111
|
-
|
|
135
|
+
// last good pre-error stage (scored/rolled-back/kept/evolving); no other
|
|
136
|
+
// caller leaves it.
|
|
137
|
+
['errored', new Set(['scored', 'rolled-back', 'kept', 'evolving'])],
|
|
112
138
|
]);
|
|
113
139
|
/**
|
|
114
140
|
* True iff `(from -> to)` is a legal transition in the episode stage machine.
|
|
@@ -387,6 +413,7 @@ const ALLOWED_PATCH_KEYS = new Set([
|
|
|
387
413
|
'baselineSkippedReason',
|
|
388
414
|
'advantage',
|
|
389
415
|
'terminalError',
|
|
416
|
+
'evolvingHeartbeatAt',
|
|
390
417
|
]);
|
|
391
418
|
/** Validate an {@link EpisodeStagePatch} fail-closed; returns the merge slice. */
|
|
392
419
|
function validateStagePatch(patch, episodeId) {
|
|
@@ -394,7 +421,7 @@ function validateStagePatch(patch, episodeId) {
|
|
|
394
421
|
for (const key of Object.keys(patch)) {
|
|
395
422
|
if (!ALLOWED_PATCH_KEYS.has(key)) {
|
|
396
423
|
throw new Error(`Illegal episode patch field for ${episodeId}: "${key}" ` +
|
|
397
|
-
`(allowed: policyVersionBaseline, baselineSkippedReason, advantage, terminalError)`);
|
|
424
|
+
`(allowed: policyVersionBaseline, baselineSkippedReason, advantage, terminalError, evolvingHeartbeatAt)`);
|
|
398
425
|
}
|
|
399
426
|
}
|
|
400
427
|
if ('policyVersionBaseline' in patch) {
|
|
@@ -425,6 +452,13 @@ function validateStagePatch(patch, episodeId) {
|
|
|
425
452
|
}
|
|
426
453
|
merge.terminalError = v;
|
|
427
454
|
}
|
|
455
|
+
if ('evolvingHeartbeatAt' in patch) {
|
|
456
|
+
const v = patch.evolvingHeartbeatAt;
|
|
457
|
+
if (typeof v !== 'string' || v.length === 0) {
|
|
458
|
+
throw new Error(`Invalid patch for ${episodeId}: evolvingHeartbeatAt must be a non-empty string`);
|
|
459
|
+
}
|
|
460
|
+
merge.evolvingHeartbeatAt = v;
|
|
461
|
+
}
|
|
428
462
|
return merge;
|
|
429
463
|
}
|
|
430
464
|
/**
|
|
@@ -436,7 +470,8 @@ function validateStagePatch(patch, episodeId) {
|
|
|
436
470
|
* advancing to a stage not reachable from the current one throws.
|
|
437
471
|
* - Appends `{stage, at}` to `stageHistory`.
|
|
438
472
|
* - Merges the allowlisted `patch` fields (`policyVersionBaseline`,
|
|
439
|
-
* `baselineSkippedReason`, `advantage`, `terminalError`)
|
|
473
|
+
* `baselineSkippedReason`, `advantage`, `terminalError`, `evolvingHeartbeatAt`)
|
|
474
|
+
* in the same write.
|
|
440
475
|
* - Bumps `updatedAt`.
|
|
441
476
|
*/
|
|
442
477
|
export async function advanceEpisodeStage(opts) {
|