synergyspec-selfevolving 2.1.0 → 2.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/learn.js +29 -3
- package/dist/commands/self-evolution-episode.js +37 -1
- package/dist/core/fitness/health/local-source.d.ts +11 -0
- package/dist/core/fitness/health/local-source.js +53 -1
- package/dist/core/project-config.d.ts +5 -0
- package/dist/core/project-config.js +23 -1
- package/dist/core/self-evolution/critic-agent.d.ts +16 -1
- package/dist/core/self-evolution/critic-agent.js +87 -17
- package/dist/core/self-evolution/episode-orchestrator.d.ts +28 -0
- package/dist/core/self-evolution/episode-orchestrator.js +369 -220
- package/dist/core/self-evolution/episode-store.d.ts +41 -2
- package/dist/core/self-evolution/episode-store.js +33 -9
- package/dist/core/self-evolution/evolving-agent.d.ts +51 -2
- package/dist/core/self-evolution/evolving-agent.js +45 -4
- package/dist/core/self-evolution/host-harness.d.ts +43 -0
- package/dist/core/self-evolution/host-harness.js +192 -0
- package/dist/core/self-evolution/reward-agent.d.ts +68 -0
- package/dist/core/self-evolution/reward-agent.js +92 -23
- package/dist/core/self-evolution/reward-aggregator.d.ts +26 -7
- package/dist/core/self-evolution/reward-aggregator.js +78 -20
- package/dist/core/self-evolution/verdict.d.ts +3 -2
- package/dist/core/self-evolution/verdict.js +4 -1
- package/dist/dashboard/react-client.js +2 -1
- package/dist/ui/ascii-patterns.d.ts +7 -8
- package/dist/ui/ascii-patterns.js +54 -120
- package/dist/ui/welcome-screen.d.ts +8 -0
- package/dist/ui/welcome-screen.js +2 -2
- package/package.json +1 -1
|
@@ -27,6 +27,18 @@ import { reconcilePrediction, summarizeCalibration, } from './policy/prediction-
|
|
|
27
27
|
export async function captureMainArm(opts) {
|
|
28
28
|
const sample = opts.report.fitnessSample;
|
|
29
29
|
const facts = sample?.trajectoryFacts;
|
|
30
|
+
// ④ Observable degrade: a verified:false arm — whether because NO
|
|
31
|
+
// observed-trajectory facts were captured, OR facts exist but the runner's
|
|
32
|
+
// pass/fail was not derivable (`facts.verified !== true`) — is surfaced on
|
|
33
|
+
// stderr so it is never SILENT. A wedged/missing/unextractable runner is the
|
|
34
|
+
// most common loop-stall cause, and a silent false reads identically to a real
|
|
35
|
+
// miss. The arm's recorded `verified` collapses to false in BOTH cases (see the
|
|
36
|
+
// `objective.verified` below), so EVERY verified:false arm warns exactly once;
|
|
37
|
+
// a genuinely verified arm (`facts.verified === true`) stays quiet.
|
|
38
|
+
if (!facts || facts.verified !== true) {
|
|
39
|
+
// eslint-disable-next-line no-console
|
|
40
|
+
console.warn(`[episode-orchestrator] observed grading unavailable for change "${opts.changeName}" — recording verified:false (observed run not verified)`);
|
|
41
|
+
}
|
|
30
42
|
// Honesty: prefer the OBSERVED pass rate (a real runner ran), else the
|
|
31
43
|
// authored test-report summary; null when neither parsed (never fabricated) —
|
|
32
44
|
// the exact precedence learn.ts uses to compute the loss.
|
|
@@ -80,14 +92,30 @@ export async function captureMainArm(opts) {
|
|
|
80
92
|
* Whether the episode SKIPS the rollback decision + 演进智能体 EVOLVING AGENT:
|
|
81
93
|
* the judge 弃权 abstained (no nameable gap), found no gaps, OR returned the
|
|
82
94
|
* ⑤ `insufficient-signal` verdict (within the A/A noise floor, or a blocked
|
|
83
|
-
* tamper).
|
|
95
|
+
* tamper). These mean "do not evolve on this episode".
|
|
96
|
+
*
|
|
97
|
+
* EXCEPTION (cold-start bootstrap): `insufficient-signal` is honored ONLY as a
|
|
98
|
+
* genuine can't-tell — a within-noise-floor result (the baseline ran) or a blocked
|
|
99
|
+
* tamper. On a baseline-skipped episode (policyVersions.baseline === null) there is
|
|
100
|
+
* no comparison to be uncertain about, so a stray `insufficient-signal` emitted
|
|
101
|
+
* alongside real gaps (and no tamper) must NOT block: the first v0→v1 evolution has
|
|
102
|
+
* to be reachable from absolute signal, or a fresh target stays at v0 forever. This
|
|
103
|
+
* is defense-in-depth behind {@link deriveSingleSampleVerdict}, which already drops
|
|
104
|
+
* a volunteered verdict to `undefined` on a baseline skip.
|
|
84
105
|
*/
|
|
85
106
|
function shouldSkipEvolution(diagnosis) {
    // No diagnosis at all (null OR undefined) -> nothing to evolve on. The
    // nullish check also covers `undefined`, which the strict `=== null` test
    // let through to a TypeError on the property reads below.
    if (diagnosis == null) {
        return true;
    }
    // The judge abstained, or named no gaps. A missing / non-array gap list is
    // treated the same as an empty one instead of throwing on `.length`.
    const hasGaps = Array.isArray(diagnosis.gaps) && diagnosis.gaps.length > 0;
    if (diagnosis.abstained || !hasGaps) {
        return true;
    }
    // Any verdict other than 'insufficient-signal' lets evolution proceed.
    if (diagnosis.verdict !== 'insufficient-signal') {
        return false;
    }
    // 'insufficient-signal' blocks evolution, EXCEPT on a baseline-skipped
    // episode (policyVersions.baseline === null) with no suspected test tamper:
    // there was no comparison to be uncertain about, so real gaps must still be
    // able to drive the first evolution.
    const baselineSkipped = diagnosis.policyVersions?.baseline === null;
    const tamperSuspected = diagnosis.integrity?.testTamperSuspected ?? false;
    return !(baselineSkipped && !tamperSuspected);
}
|
|
92
120
|
/**
|
|
93
121
|
* Count the consecutive trailing rolled-back episodes in the 版本账本 ledger.
|
|
@@ -152,6 +180,17 @@ function deriveEpisodeId(changeName, now) {
|
|
|
152
180
|
.replace(/-{2,}/g, '-')
|
|
153
181
|
.replace(/^-+|-+$/g, '');
|
|
154
182
|
}
|
|
183
|
+
/**
 * Build the text stored as the `terminalError` note for a thrown step.
 *
 * A message containing "timed out" (case-insensitive) is prefixed with a
 * `timeout: ` marker so a timed-out step can be told apart from any other
 * failure when the note is read back. Labelling is idempotent: a message that
 * already starts with `timeout:` is returned unchanged rather than being
 * prefixed a second time.
 *
 * Pure: no I/O, and the thrown value is never mutated.
 *
 * @param {unknown} err - The thrown value. Error instances and error-like
 *   objects carrying a string `message` contribute that message; anything else
 *   is stringified with `String(err)`.
 * @returns {string} The message, prefixed with `timeout: ` when it names a timeout.
 */
function terminalErrorLabel(err) {
    // Accept error-like plain objects too (e.g. a thrown `{ message }`), which
    // would otherwise collapse to the useless "[object Object]".
    const isErrorLike = err instanceof Error ||
        (typeof err === 'object' && err !== null && typeof err.message === 'string');
    const msg = isErrorLike ? err.message : String(err);
    // Already labelled (e.g. a re-thrown, previously labelled message): keep as-is.
    if (/^timeout:/i.test(msg)) {
        return msg;
    }
    return /timed out/i.test(msg) ? `timeout: ${msg}` : msg;
}
|
|
155
194
|
/**
|
|
156
195
|
* Run ONE episode through the loop in the strict, durably-persisted order
|
|
157
196
|
* documented at the top of this module. See {@link RunEpisodeResult}.
|
|
@@ -251,149 +290,180 @@ async function runEpisodeAfterCreate(opts) {
|
|
|
251
290
|
objective: opts.mainArm.objective,
|
|
252
291
|
});
|
|
253
292
|
await advanceEpisodeStage({ repoRoot, episodeId, stage: 'main-arm-captured' });
|
|
254
|
-
//
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
targetId,
|
|
261
|
-
changeName: opts.changeName,
|
|
262
|
-
episodeId,
|
|
263
|
-
baselineVersion: shouldCritic.baselineVersion,
|
|
264
|
-
...(opts.critic?.baselineMode ? { baselineMode: opts.critic.baselineMode } : {}),
|
|
265
|
-
spawn: opts.spawn,
|
|
266
|
-
});
|
|
267
|
-
}
|
|
268
|
-
else {
|
|
269
|
-
baselineSkipped = true;
|
|
270
|
-
await advanceEpisodeStage({
|
|
271
|
-
repoRoot,
|
|
272
|
-
episodeId,
|
|
273
|
-
stage: 'baseline-skipped',
|
|
274
|
-
patch: { baselineSkippedReason: shouldCritic.reason },
|
|
275
|
-
});
|
|
276
|
-
}
|
|
277
|
-
// ── e: 奖励智能体 REWARD AGENT — score + diagnosis.json + advance 'scored' ────
|
|
278
|
-
// ④ Compute the tamper hint first (unless tamperCheck is 'off'); the ensemble
|
|
279
|
-
// injects it into the judge prompt + the diagnosis integrity, and BLOCK mode
|
|
280
|
-
// forces an insufficient-signal verdict (no extra spawns at the default
|
|
281
|
-
// samples=1, flag-only).
|
|
282
|
-
const tamperMode = opts.reward?.tamperCheck ?? 'flag';
|
|
283
|
-
const integrityHint = tamperMode === 'off' ? null : await detectTestTamper({ changeDirPath: opts.changeDirPath });
|
|
284
|
-
const reward = await runRewardAgentEnsemble({
|
|
285
|
-
repoRoot,
|
|
286
|
-
episodeId,
|
|
287
|
-
spawn: opts.spawn,
|
|
288
|
-
...(opts.reward ? { reward: opts.reward } : {}),
|
|
289
|
-
integrityHint,
|
|
290
|
-
});
|
|
291
|
-
const diagnosis = reward.diagnosis;
|
|
292
|
-
advantage = diagnosis.advantage;
|
|
293
|
-
// 预测校准: settle the PRIOR 'evolve' step's checkable prediction against this
|
|
294
|
-
// episode's measured arm delta (verifiable per-metric main−baseline, NOT the
|
|
295
|
-
// judge's reward) and log the residual. Best-effort — a calibration miss must
|
|
296
|
-
// never fail the episode.
|
|
293
|
+
// Steps d–g spawn the three agents (any of which may THROW — a wedged/crashed
|
|
294
|
+
// host CLI, a never-validating reward output, an evolving-agent invocation
|
|
295
|
+
// error). A thrown agent step must record a DURABLE terminal 'errored' stage
|
|
296
|
+
// (with the error text) so the episode is never orphaned at 'kept'/'scored'
|
|
297
|
+
// (indistinguishable from a still-running episode — ses_1330/1331). The lock
|
|
298
|
+
// release stays in runEpisode's finally; the re-throw below reaches it.
|
|
297
299
|
try {
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
// ── f: DECISION (every step durably persisted before the next) ───────────────
|
|
304
|
-
if (shouldSkipEvolution(diagnosis)) {
|
|
305
|
-
// 弃权 abstained / no nameable gap / insufficient-signal → no rollback
|
|
306
|
-
// decision; SKIP evolution. A BLOCKED tamper records a reject-buffer entry so
|
|
307
|
-
// the gamed attempt is visible to future episodes' 演进智能体.
|
|
308
|
-
if (tamperMode === 'block' &&
|
|
309
|
-
diagnosis.integrity?.testTamperSuspected &&
|
|
310
|
-
diagnosis.verdict === 'insufficient-signal') {
|
|
311
|
-
const head = await currentPolicyVersion(repoRoot, targetId);
|
|
312
|
-
await appendRejectBufferEntry(repoRoot, {
|
|
313
|
-
schemaVersion: 1,
|
|
314
|
-
at: new Date().toISOString(),
|
|
315
|
-
episodeId,
|
|
316
|
-
targetId,
|
|
317
|
-
// No version moved (the main arm's tampered tests are not a policy edit);
|
|
318
|
-
// record at the current head so the entry is informational, not a rollback.
|
|
319
|
-
fromVersion: head ?? 0,
|
|
320
|
-
toVersion: head ?? 0,
|
|
321
|
-
advantage: diagnosis.advantage,
|
|
322
|
-
rewardMain: diagnosis.rewardMain,
|
|
323
|
-
rewardBaseline: diagnosis.rewardBaseline,
|
|
324
|
-
textualGradientTried: diagnosis.textualGradient ?? '',
|
|
325
|
-
editSummary: buildRejectEditSummary(diagnosis),
|
|
326
|
-
reason: 'tamper-suspected',
|
|
327
|
-
});
|
|
328
|
-
}
|
|
329
|
-
decision = 'abstained';
|
|
330
|
-
await advanceEpisodeStage({ repoRoot, episodeId, stage: 'abstained' });
|
|
331
|
-
}
|
|
332
|
-
else {
|
|
333
|
-
const badAdvantage = advantage !== null && advantage < threshold;
|
|
334
|
-
const ep = await readEpisode(repoRoot, episodeId);
|
|
335
|
-
const headBeforeRollback = await currentPolicyVersion(repoRoot, targetId);
|
|
336
|
-
// Resolve the rollback target: the policy the CRITIC AGENT reran
|
|
337
|
-
// (`policyVersionBaseline`) when it is a valid EARLIER version, else the
|
|
338
|
-
// version immediately before the head (the prior good policy the bad edit
|
|
339
|
-
// advanced past). `rollbackPolicyVersion` requires `toVersion < head`.
|
|
340
|
-
const rollbackTarget = resolveRollbackTarget(ep.policyVersionBaseline, headBeforeRollback);
|
|
341
|
-
if (badAdvantage && rollbackTarget !== null && headBeforeRollback !== null) {
|
|
342
|
-
// (i) ROLLBACK first — durable on disk. `advantage` is non-null inside the
|
|
343
|
-
// badAdvantage branch; `?? undefined` satisfies the optional `number` param.
|
|
344
|
-
await rollbackPolicyVersion({
|
|
300
|
+
// ── d: CRITIC AGENT(基线智能体 baseline agent)or skip ───────────────────────
|
|
301
|
+
const shouldCritic = await shouldRunCriticAgent({ repoRoot, targetId });
|
|
302
|
+
if (shouldCritic.run && shouldCritic.baselineVersion !== null) {
|
|
303
|
+
// runCriticAgent advances the episode to 'baseline-arm-captured'.
|
|
304
|
+
await runCriticAgent({
|
|
345
305
|
repoRoot,
|
|
346
306
|
targetId,
|
|
307
|
+
changeName: opts.changeName,
|
|
347
308
|
episodeId,
|
|
348
|
-
|
|
349
|
-
|
|
309
|
+
baselineVersion: shouldCritic.baselineVersion,
|
|
310
|
+
...(opts.critic?.baselineMode ? { baselineMode: opts.critic.baselineMode } : {}),
|
|
311
|
+
spawn: opts.spawn,
|
|
312
|
+
...(opts.agentTimeoutMs !== undefined ? { timeoutMs: opts.agentTimeoutMs } : {}),
|
|
313
|
+
...(opts.harness ? { harness: opts.harness } : {}),
|
|
350
314
|
});
|
|
351
|
-
// (ii) THEN append the 否决缓冲 reject-buffer entry — durable on disk —
|
|
352
|
-
// BEFORE the 演进智能体 EVOLVING AGENT is even called, so the entry written
|
|
353
|
-
// THIS episode is in its fresh-from-disk prompt.
|
|
354
|
-
const rejectEntry = {
|
|
355
|
-
schemaVersion: 1,
|
|
356
|
-
at: new Date().toISOString(),
|
|
357
|
-
episodeId,
|
|
358
|
-
targetId,
|
|
359
|
-
// fromVersion = the version we rolled back TO (the prior good policy);
|
|
360
|
-
// toVersion = the (now rolled-back) version the rejected edit reached.
|
|
361
|
-
fromVersion: rollbackTarget,
|
|
362
|
-
toVersion: headBeforeRollback,
|
|
363
|
-
advantage,
|
|
364
|
-
rewardMain: diagnosis.rewardMain,
|
|
365
|
-
rewardBaseline: diagnosis.rewardBaseline,
|
|
366
|
-
textualGradientTried: diagnosis.textualGradient ?? '',
|
|
367
|
-
editSummary: buildRejectEditSummary(diagnosis),
|
|
368
|
-
reason: 'bad-advantage',
|
|
369
|
-
};
|
|
370
|
-
await appendRejectBufferEntry(repoRoot, rejectEntry);
|
|
371
|
-
decision = 'rolled-back';
|
|
372
|
-
await advanceEpisodeStage({ repoRoot, episodeId, stage: 'rolled-back' });
|
|
373
315
|
}
|
|
374
316
|
else {
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
317
|
+
baselineSkipped = true;
|
|
318
|
+
await advanceEpisodeStage({
|
|
319
|
+
repoRoot,
|
|
320
|
+
episodeId,
|
|
321
|
+
stage: 'baseline-skipped',
|
|
322
|
+
patch: { baselineSkippedReason: shouldCritic.reason },
|
|
323
|
+
});
|
|
379
324
|
}
|
|
380
|
-
// ──
|
|
381
|
-
//
|
|
382
|
-
//
|
|
383
|
-
//
|
|
384
|
-
//
|
|
385
|
-
const
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
const calibrationNote = await summarizeCalibration(repoRoot, targetId);
|
|
389
|
-
evolution = await runEvolvingAgent({
|
|
325
|
+
// ── e: 奖励智能体 REWARD AGENT — score + diagnosis.json + advance 'scored' ────
|
|
326
|
+
// ④ Compute the tamper hint first (unless tamperCheck is 'off'); the ensemble
|
|
327
|
+
// injects it into the judge prompt + the diagnosis integrity, and BLOCK mode
|
|
328
|
+
// forces an insufficient-signal verdict (no extra spawns at the default
|
|
329
|
+
// samples=1, flag-only).
|
|
330
|
+
const tamperMode = opts.reward?.tamperCheck ?? 'flag';
|
|
331
|
+
const integrityHint = tamperMode === 'off' ? null : await detectTestTamper({ changeDirPath: opts.changeDirPath });
|
|
332
|
+
const reward = await runRewardAgentEnsemble({
|
|
390
333
|
repoRoot,
|
|
391
334
|
episodeId,
|
|
392
|
-
targetId,
|
|
393
|
-
editBudget: scheduledBudget,
|
|
394
|
-
...(calibrationNote ? { calibrationNote } : {}),
|
|
395
335
|
spawn: opts.spawn,
|
|
336
|
+
...(opts.reward ? { reward: opts.reward } : {}),
|
|
337
|
+
integrityHint,
|
|
338
|
+
...(opts.agentTimeoutMs !== undefined ? { timeoutMs: opts.agentTimeoutMs } : {}),
|
|
339
|
+
...(opts.harness ? { harness: opts.harness } : {}),
|
|
396
340
|
});
|
|
341
|
+
const diagnosis = reward.diagnosis;
|
|
342
|
+
advantage = diagnosis.advantage;
|
|
343
|
+
// 预测校准: settle the PRIOR 'evolve' step's checkable prediction against this
|
|
344
|
+
// episode's measured arm delta (verifiable per-metric main−baseline, NOT the
|
|
345
|
+
// judge's reward) and log the residual. Best-effort — a calibration miss must
|
|
346
|
+
// never fail the episode.
|
|
347
|
+
try {
|
|
348
|
+
await reconcilePrediction({ repoRoot, targetId, episodeId });
|
|
349
|
+
}
|
|
350
|
+
catch {
|
|
351
|
+
// best-effort: the prediction-reconcile ledger is advisory only
|
|
352
|
+
}
|
|
353
|
+
// ── f: DECISION (every step durably persisted before the next) ───────────────
|
|
354
|
+
if (shouldSkipEvolution(diagnosis)) {
|
|
355
|
+
// 弃权 abstained / no nameable gap / insufficient-signal → no rollback
|
|
356
|
+
// decision; SKIP evolution. A BLOCKED tamper records a reject-buffer entry so
|
|
357
|
+
// the gamed attempt is visible to future episodes' 演进智能体.
|
|
358
|
+
if (tamperMode === 'block' &&
|
|
359
|
+
diagnosis.integrity?.testTamperSuspected &&
|
|
360
|
+
diagnosis.verdict === 'insufficient-signal') {
|
|
361
|
+
const head = await currentPolicyVersion(repoRoot, targetId);
|
|
362
|
+
await appendRejectBufferEntry(repoRoot, {
|
|
363
|
+
schemaVersion: 1,
|
|
364
|
+
at: new Date().toISOString(),
|
|
365
|
+
episodeId,
|
|
366
|
+
targetId,
|
|
367
|
+
// No version moved (the main arm's tampered tests are not a policy edit);
|
|
368
|
+
// record at the current head so the entry is informational, not a rollback.
|
|
369
|
+
fromVersion: head ?? 0,
|
|
370
|
+
toVersion: head ?? 0,
|
|
371
|
+
advantage: diagnosis.advantage,
|
|
372
|
+
rewardMain: diagnosis.rewardMain,
|
|
373
|
+
rewardBaseline: diagnosis.rewardBaseline,
|
|
374
|
+
textualGradientTried: diagnosis.textualGradient ?? '',
|
|
375
|
+
editSummary: buildRejectEditSummary(diagnosis),
|
|
376
|
+
reason: 'tamper-suspected',
|
|
377
|
+
});
|
|
378
|
+
}
|
|
379
|
+
decision = 'abstained';
|
|
380
|
+
await advanceEpisodeStage({ repoRoot, episodeId, stage: 'abstained' });
|
|
381
|
+
}
|
|
382
|
+
else {
|
|
383
|
+
const badAdvantage = advantage !== null && advantage < threshold;
|
|
384
|
+
const ep = await readEpisode(repoRoot, episodeId);
|
|
385
|
+
const headBeforeRollback = await currentPolicyVersion(repoRoot, targetId);
|
|
386
|
+
// Resolve the rollback target: the policy the CRITIC AGENT reran
|
|
387
|
+
// (`policyVersionBaseline`) when it is a valid EARLIER version, else the
|
|
388
|
+
// version immediately before the head (the prior good policy the bad edit
|
|
389
|
+
// advanced past). `rollbackPolicyVersion` requires `toVersion < head`.
|
|
390
|
+
const rollbackTarget = resolveRollbackTarget(ep.policyVersionBaseline, headBeforeRollback);
|
|
391
|
+
if (badAdvantage && rollbackTarget !== null && headBeforeRollback !== null) {
|
|
392
|
+
// (i) ROLLBACK first — durable on disk. `advantage` is non-null inside the
|
|
393
|
+
// badAdvantage branch; `?? undefined` satisfies the optional `number` param.
|
|
394
|
+
await rollbackPolicyVersion({
|
|
395
|
+
repoRoot,
|
|
396
|
+
targetId,
|
|
397
|
+
episodeId,
|
|
398
|
+
toVersion: rollbackTarget,
|
|
399
|
+
advantage: advantage ?? undefined,
|
|
400
|
+
});
|
|
401
|
+
// (ii) THEN append the 否决缓冲 reject-buffer entry — durable on disk —
|
|
402
|
+
// BEFORE the 演进智能体 EVOLVING AGENT is even called, so the entry written
|
|
403
|
+
// THIS episode is in its fresh-from-disk prompt.
|
|
404
|
+
const rejectEntry = {
|
|
405
|
+
schemaVersion: 1,
|
|
406
|
+
at: new Date().toISOString(),
|
|
407
|
+
episodeId,
|
|
408
|
+
targetId,
|
|
409
|
+
// fromVersion = the version we rolled back TO (the prior good policy);
|
|
410
|
+
// toVersion = the (now rolled-back) version the rejected edit reached.
|
|
411
|
+
fromVersion: rollbackTarget,
|
|
412
|
+
toVersion: headBeforeRollback,
|
|
413
|
+
advantage,
|
|
414
|
+
rewardMain: diagnosis.rewardMain,
|
|
415
|
+
rewardBaseline: diagnosis.rewardBaseline,
|
|
416
|
+
textualGradientTried: diagnosis.textualGradient ?? '',
|
|
417
|
+
editSummary: buildRejectEditSummary(diagnosis),
|
|
418
|
+
reason: 'bad-advantage',
|
|
419
|
+
};
|
|
420
|
+
await appendRejectBufferEntry(repoRoot, rejectEntry);
|
|
421
|
+
decision = 'rolled-back';
|
|
422
|
+
await advanceEpisodeStage({ repoRoot, episodeId, stage: 'rolled-back' });
|
|
423
|
+
}
|
|
424
|
+
else {
|
|
425
|
+
// Good advantage, OR no earlier version to roll back to (e.g. head is v0):
|
|
426
|
+
// keep the current head.
|
|
427
|
+
decision = 'kept';
|
|
428
|
+
await advanceEpisodeStage({ repoRoot, episodeId, stage: 'kept' });
|
|
429
|
+
}
|
|
430
|
+
// ── g: ONLY AFTER (f) persisted 'rolled-back'/'kept' ───────────────────────
|
|
431
|
+
// runEvolvingAgent reads the reject-buffer FRESH from disk (the entry just
|
|
432
|
+
// written THIS episode is in its prompt). Never parallelized with (f).
|
|
433
|
+
// 步长: after a rollback, shrink the edit budget (smaller step after a step
|
|
434
|
+
// that lost ground). 预测校准: pass the proposer's recent prediction record.
|
|
435
|
+
const scheduledBudget = decision === 'rolled-back'
|
|
436
|
+
? scheduledEditBudget(await readPolicyLedger(repoRoot, targetId), editBudget)
|
|
437
|
+
: editBudget;
|
|
438
|
+
const calibrationNote = await summarizeCalibration(repoRoot, targetId);
|
|
439
|
+
evolution = await runEvolvingAgent({
|
|
440
|
+
repoRoot,
|
|
441
|
+
episodeId,
|
|
442
|
+
targetId,
|
|
443
|
+
editBudget: scheduledBudget,
|
|
444
|
+
...(calibrationNote ? { calibrationNote } : {}),
|
|
445
|
+
spawn: opts.spawn,
|
|
446
|
+
...(opts.agentTimeoutMs !== undefined ? { timeoutMs: opts.agentTimeoutMs } : {}),
|
|
447
|
+
...(opts.harness ? { harness: opts.harness } : {}),
|
|
448
|
+
});
|
|
449
|
+
}
|
|
450
|
+
}
|
|
451
|
+
catch (err) {
|
|
452
|
+
// A thrown agent step (d–g) records a DURABLE terminal 'errored' stage so the
|
|
453
|
+
// episode is never orphaned at 'kept'/'scored' (which is indistinguishable
|
|
454
|
+
// from a still-running episode — the ses_1330/1331 wsgidav orphan). The
|
|
455
|
+
// 'errored' stage is reachable from EVERY non-terminal stage; the patch
|
|
456
|
+
// carries the error text (prefixed `timeout:` when the throw was a host-agent
|
|
457
|
+
// timeout, so a timed-out episode is distinguishable from a hard crash on
|
|
458
|
+
// disk). Best-effort: a failed record write must not mask the original throw,
|
|
459
|
+
// which still propagates to runEpisode's finally (lock release).
|
|
460
|
+
await advanceEpisodeStage({
|
|
461
|
+
repoRoot,
|
|
462
|
+
episodeId,
|
|
463
|
+
stage: 'errored',
|
|
464
|
+
patch: { terminalError: terminalErrorLabel(err) },
|
|
465
|
+
}).catch(() => { });
|
|
466
|
+
throw err;
|
|
397
467
|
}
|
|
398
468
|
// ── h (stage half): advance 'closed' (best-effort) ───────────────────────────
|
|
399
469
|
await closeEpisodeBestEffort(repoRoot, episodeId);
|
|
@@ -401,11 +471,25 @@ async function runEpisodeAfterCreate(opts) {
|
|
|
401
471
|
return { episodeId, baselineSkipped, advantage, decision, evolution, newPolicyVersion };
|
|
402
472
|
}
|
|
403
473
|
/**
|
|
404
|
-
* Advance the episode to 'closed' from whatever terminal-ish stage it reached
|
|
405
|
-
*
|
|
406
|
-
*
|
|
407
|
-
*
|
|
408
|
-
*
|
|
474
|
+
* Advance the episode to 'closed' from whatever terminal-ish stage it reached,
|
|
475
|
+
* best-effort.
|
|
476
|
+
*
|
|
477
|
+
* Closable stages:
|
|
478
|
+
* - evolved | evolution-refused | abstained — the 演进智能体 EVOLVING AGENT
|
|
479
|
+
* reached a definite outcome (or the judge 弃权 abstained), the normal close.
|
|
480
|
+
* - kept | rolled-back — the 演进智能体 returned not-spawned (its diagnosis
|
|
481
|
+
* abstained-after-gap-check, no gaps, or the target resolved to no editable
|
|
482
|
+
* local files), so the episode never advanced past the decision. By the time
|
|
483
|
+
* this runs (AFTER runEvolvingAgent returned), a stage still at 'kept'/
|
|
484
|
+
* 'rolled-back' can ONLY mean not-spawned — a success advances 'evolved', a
|
|
485
|
+
* refusal advances 'evolution-refused', and a throw is caught upstream and
|
|
486
|
+
* records 'errored' + rethrows so this close is never reached. So a leftover
|
|
487
|
+
* kept/rolled-back at close time IS the finished-nothing-to-evolve case and
|
|
488
|
+
* must close, not rest forever at a non-terminal stage (the exact ambiguity
|
|
489
|
+
* the 'errored' stage was meant to remove).
|
|
490
|
+
*
|
|
491
|
+
* Any other (genuinely non-closable) stage is left as-is rather than throwing, so
|
|
492
|
+
* the close never masks the real episode outcome.
|
|
409
493
|
*/
|
|
410
494
|
async function closeEpisodeBestEffort(repoRoot, episodeId) {
|
|
411
495
|
const ep = await readEpisode(repoRoot, episodeId);
|
|
@@ -413,6 +497,9 @@ async function closeEpisodeBestEffort(repoRoot, episodeId) {
|
|
|
413
497
|
'evolved',
|
|
414
498
|
'evolution-refused',
|
|
415
499
|
'abstained',
|
|
500
|
+
// not-spawned 演进智能体 leaves the episode here — close the finished episode.
|
|
501
|
+
'kept',
|
|
502
|
+
'rolled-back',
|
|
416
503
|
]);
|
|
417
504
|
if (closable.has(ep.stage)) {
|
|
418
505
|
await advanceEpisodeStage({ repoRoot, episodeId, stage: 'closed' });
|
|
@@ -501,6 +588,16 @@ async function ensureRejectBufferEntry(repoRoot, opts) {
|
|
|
501
588
|
* - 'scored' → run the decision (f) then the 演进智能体 (g).
|
|
502
589
|
* - 'rolled-back' / 'kept' → run the 演进智能体 EVOLVING AGENT (g) then close.
|
|
503
590
|
* - 'evolved'/'evolution-refused'/'abstained' → close.
|
|
591
|
+
* - 'errored' → RE-DRIVE from the last GOOD pre-error stage
|
|
592
|
+
* (an episode may have errored on a TRANSIENT
|
|
593
|
+
* cause — a one-off git/analyzer/agent timeout).
|
|
594
|
+
* The pre-error stage is the last `stageHistory`
|
|
595
|
+
* entry that is NOT 'errored'; when it is one of
|
|
596
|
+
* {'scored','rolled-back','kept'} (the
|
|
597
|
+
* resume-entry stages) we advance errored → that
|
|
598
|
+
* stage and fall through to the normal dispatch.
|
|
599
|
+
* Otherwise the pre-error stage is not
|
|
600
|
+
* auto-resumable and the episode is reported as-is.
|
|
504
601
|
* - earlier stages → not auto-resumable here (the arms / reward
|
|
505
602
|
* agent need their own re-entry); reported as-is.
|
|
506
603
|
*
|
|
@@ -516,78 +613,135 @@ export async function resumeEpisode(opts) {
|
|
|
516
613
|
const resumedFrom = ep.stage;
|
|
517
614
|
const targetId = ep.targetId;
|
|
518
615
|
let evolution = null;
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
616
|
+
// The effective stage we dispatch on. Normally the episode's current stage;
|
|
617
|
+
// for an 'errored' episode we attempt to RE-DRIVE from the last good pre-error
|
|
618
|
+
// stage (a transient git/analyzer/agent failure should be retryable via an
|
|
619
|
+
// operator resume). 'errored' stays terminal for every OTHER caller — only this
|
|
620
|
+
// resume path may re-drive it, via the errored → {scored,rolled-back,kept}
|
|
621
|
+
// transitions the stage machine allows ONLY for operator recovery.
|
|
622
|
+
let stage = ep.stage;
|
|
623
|
+
if (ep.stage === 'errored') {
|
|
624
|
+
const preError = [...ep.stageHistory]
|
|
625
|
+
.reverse()
|
|
626
|
+
.find((h) => h.stage !== 'errored')?.stage;
|
|
627
|
+
if (preError === 'scored' ||
|
|
628
|
+
preError === 'rolled-back' ||
|
|
629
|
+
preError === 'kept') {
|
|
630
|
+
// Re-open the errored episode at its last auto-resumable stage, then fall
|
|
631
|
+
// through to the normal dispatch for that stage.
|
|
632
|
+
await advanceEpisodeStage({ repoRoot, episodeId, stage: preError });
|
|
633
|
+
stage = preError;
|
|
524
634
|
}
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
// = priorRollback.version - 1 (the head before the rollback)
|
|
543
|
-
// fromVersion = the prior good policy restored (one before that head)
|
|
544
|
-
const toVersion = priorRollback.version - 1;
|
|
545
|
-
const fromVersion = resolveRollbackTarget(ep.policyVersionBaseline, toVersion);
|
|
546
|
-
await ensureRejectBufferEntry(repoRoot, {
|
|
547
|
-
episodeId,
|
|
548
|
-
targetId,
|
|
549
|
-
fromVersion: fromVersion ?? toVersion,
|
|
550
|
-
toVersion,
|
|
551
|
-
advantage,
|
|
552
|
-
diagnosis,
|
|
553
|
-
});
|
|
554
|
-
await advanceEpisodeStage({ repoRoot, episodeId, stage: 'rolled-back' });
|
|
635
|
+
// Otherwise the pre-error stage is not auto-resumable (e.g. a reward throw at
|
|
636
|
+
// 'baseline-skipped'); leave the episode at 'errored' and report it as-is.
|
|
637
|
+
}
|
|
638
|
+
// The decision (f) + 演进智能体 EVOLVING AGENT (g) re-runs below can THROW — a
|
|
639
|
+
// wedged/crashed host CLI (CanonicalProposerInvocationError), a timeout, or an
|
|
640
|
+
// observed-GREEN gate throw. UNCAUGHT, that leaves the episode DURABLY stuck at
|
|
641
|
+
// a non-terminal stage ('scored'/'rolled-back'/'kept' — the orphan state fix ❷
|
|
642
|
+
// eliminates for runEpisode). Record the SAME terminal 'errored' stage here
|
|
643
|
+
// (the transition map already allows scored/rolled-back/kept → 'errored'), then
|
|
644
|
+
// re-throw. Resume holds NO in-flight lock, so this is a durable-stage fix, not
|
|
645
|
+
// a leak fix. Best-effort write: a failed record must not mask the original throw.
|
|
646
|
+
try {
|
|
647
|
+
if (stage === 'scored') {
|
|
648
|
+
// Re-run the decision (f) from the on-disk diagnosis, then (g).
|
|
649
|
+
const diagnosis = await readDiagnosisForResume(repoRoot, episodeId);
|
|
650
|
+
if (shouldSkipEvolution(diagnosis)) {
|
|
651
|
+
await advanceEpisodeStage({ repoRoot, episodeId, stage: 'abstained' });
|
|
555
652
|
}
|
|
556
|
-
else {
|
|
557
|
-
|
|
558
|
-
const
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
653
|
+
else if (diagnosis) {
|
|
654
|
+
// (shouldSkipEvolution returns true for null, so diagnosis is non-null here)
|
|
655
|
+
const advantage = diagnosis.advantage;
|
|
656
|
+
const badAdvantage = advantage !== null && advantage < threshold;
|
|
657
|
+
// Crash-resume dedup: a 'rollback' ledger entry whose episodeId is THIS
|
|
658
|
+
// episode means runEpisode already applied the rollback before the host
|
|
659
|
+
// crashed (the rollback head is monotonic — re-calling rollbackPolicyVersion
|
|
660
|
+
// would stack a SECOND, duplicate rollback version). When present, reuse its
|
|
661
|
+
// recorded version axis and SKIP the re-rollback; only ensure the
|
|
662
|
+
// reject-buffer entry + the 'rolled-back' stage advance complete.
|
|
663
|
+
const ledger = await readPolicyLedger(repoRoot, targetId);
|
|
664
|
+
const priorRollback = ledger.find((e) => e.action === 'rollback' && e.episodeId === episodeId);
|
|
665
|
+
if (priorRollback) {
|
|
666
|
+
// The prior rollback already advanced to `priorRollback.version`, rolling
|
|
667
|
+
// FORWARD to the content of the version immediately before the rejected
|
|
668
|
+
// edit's head. Reconstruct the reject-buffer axis from that entry:
|
|
669
|
+
// toVersion = the (rolled-back) version the rejected edit reached
|
|
670
|
+
// = priorRollback.version - 1 (the head before the rollback)
|
|
671
|
+
// fromVersion = the prior good policy restored (one before that head)
|
|
672
|
+
const toVersion = priorRollback.version - 1;
|
|
673
|
+
const fromVersion = resolveRollbackTarget(ep.policyVersionBaseline, toVersion);
|
|
567
674
|
await ensureRejectBufferEntry(repoRoot, {
|
|
568
675
|
episodeId,
|
|
569
676
|
targetId,
|
|
570
|
-
fromVersion:
|
|
571
|
-
toVersion
|
|
677
|
+
fromVersion: fromVersion ?? toVersion,
|
|
678
|
+
toVersion,
|
|
572
679
|
advantage,
|
|
573
680
|
diagnosis,
|
|
574
681
|
});
|
|
575
682
|
await advanceEpisodeStage({ repoRoot, episodeId, stage: 'rolled-back' });
|
|
576
683
|
}
|
|
577
684
|
else {
|
|
578
|
-
await
|
|
685
|
+
const headBeforeRollback = await currentPolicyVersion(repoRoot, targetId);
|
|
686
|
+
const rollbackTarget = resolveRollbackTarget(ep.policyVersionBaseline, headBeforeRollback);
|
|
687
|
+
if (badAdvantage && rollbackTarget !== null && headBeforeRollback !== null) {
|
|
688
|
+
await rollbackPolicyVersion({
|
|
689
|
+
repoRoot,
|
|
690
|
+
targetId,
|
|
691
|
+
episodeId,
|
|
692
|
+
toVersion: rollbackTarget,
|
|
693
|
+
advantage: advantage ?? undefined,
|
|
694
|
+
});
|
|
695
|
+
await ensureRejectBufferEntry(repoRoot, {
|
|
696
|
+
episodeId,
|
|
697
|
+
targetId,
|
|
698
|
+
fromVersion: rollbackTarget,
|
|
699
|
+
toVersion: headBeforeRollback,
|
|
700
|
+
advantage,
|
|
701
|
+
diagnosis,
|
|
702
|
+
});
|
|
703
|
+
await advanceEpisodeStage({ repoRoot, episodeId, stage: 'rolled-back' });
|
|
704
|
+
}
|
|
705
|
+
else {
|
|
706
|
+
await advanceEpisodeStage({ repoRoot, episodeId, stage: 'kept' });
|
|
707
|
+
}
|
|
708
|
+
}
|
|
709
|
+
// Prediction calibration (idempotent if the original run already settled it) + step-size schedule
|
|
710
|
+
// + calibration note, mirroring runEpisode's (g) step.
|
|
711
|
+
try {
|
|
712
|
+
await reconcilePrediction({ repoRoot, targetId, episodeId });
|
|
579
713
|
}
|
|
714
|
+
catch {
|
|
715
|
+
// best-effort: advisory only
|
|
716
|
+
}
|
|
717
|
+
const afterDecision = await readEpisode(repoRoot, episodeId);
|
|
718
|
+
const scheduledBudget = afterDecision.stage === 'rolled-back'
|
|
719
|
+
? scheduledEditBudget(await readPolicyLedger(repoRoot, targetId), editBudget)
|
|
720
|
+
: editBudget;
|
|
721
|
+
const calibrationNote = await summarizeCalibration(repoRoot, targetId);
|
|
722
|
+
evolution = await runEvolvingAgent({
|
|
723
|
+
repoRoot,
|
|
724
|
+
episodeId,
|
|
725
|
+
targetId,
|
|
726
|
+
editBudget: scheduledBudget,
|
|
727
|
+
...(calibrationNote ? { calibrationNote } : {}),
|
|
728
|
+
spawn: opts.spawn,
|
|
729
|
+
...(opts.agentTimeoutMs !== undefined ? { timeoutMs: opts.agentTimeoutMs } : {}),
|
|
730
|
+
...(opts.harness ? { harness: opts.harness } : {}),
|
|
731
|
+
});
|
|
580
732
|
}
|
|
581
|
-
|
|
582
|
-
|
|
733
|
+
await closeEpisodeBestEffort(repoRoot, episodeId);
|
|
734
|
+
}
|
|
735
|
+
else if (stage === 'rolled-back' || stage === 'kept') {
|
|
736
|
+
// The decision already ran (and the original episode settled the prediction);
|
|
737
|
+
// re-settle idempotently for the crash window, then schedule + calibrate.
|
|
583
738
|
try {
|
|
584
739
|
await reconcilePrediction({ repoRoot, targetId, episodeId });
|
|
585
740
|
}
|
|
586
741
|
catch {
|
|
587
742
|
// best-effort: advisory only
|
|
588
743
|
}
|
|
589
|
-
const
|
|
590
|
-
const scheduledBudget = afterDecision.stage === 'rolled-back'
|
|
744
|
+
const scheduledBudget = stage === 'rolled-back'
|
|
591
745
|
? scheduledEditBudget(await readPolicyLedger(repoRoot, targetId), editBudget)
|
|
592
746
|
: editBudget;
|
|
593
747
|
const calibrationNote = await summarizeCalibration(repoRoot, targetId);
|
|
@@ -598,39 +752,34 @@ export async function resumeEpisode(opts) {
|
|
|
598
752
|
editBudget: scheduledBudget,
|
|
599
753
|
...(calibrationNote ? { calibrationNote } : {}),
|
|
600
754
|
spawn: opts.spawn,
|
|
755
|
+
...(opts.agentTimeoutMs !== undefined ? { timeoutMs: opts.agentTimeoutMs } : {}),
|
|
756
|
+
...(opts.harness ? { harness: opts.harness } : {}),
|
|
601
757
|
});
|
|
758
|
+
await closeEpisodeBestEffort(repoRoot, episodeId);
|
|
602
759
|
}
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
// re-settle idempotently for the crash window, then schedule + calibrate.
|
|
608
|
-
try {
|
|
609
|
-
await reconcilePrediction({ repoRoot, targetId, episodeId });
|
|
610
|
-
}
|
|
611
|
-
catch {
|
|
612
|
-
// best-effort: advisory only
|
|
760
|
+
else if (stage === 'evolved' ||
|
|
761
|
+
stage === 'evolution-refused' ||
|
|
762
|
+
stage === 'abstained') {
|
|
763
|
+
await closeEpisodeBestEffort(repoRoot, episodeId);
|
|
613
764
|
}
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
765
|
+
// earlier stages (and a non-auto-resumable 'errored'): not auto-resumable here
|
|
766
|
+
// — reported as-is.
|
|
767
|
+
}
|
|
768
|
+
catch (err) {
|
|
769
|
+
// A thrown decision/evolving step records a DURABLE terminal 'errored' stage so
|
|
770
|
+
// the resumed episode is never left stuck at 'scored'/'rolled-back'/'kept'
|
|
771
|
+
// (indistinguishable from a still-running episode). Mirrors
|
|
772
|
+
// runEpisodeAfterCreate's catch — including the `timeout:` marker so a timed-out
|
|
773
|
+
// resume is distinguishable from a hard crash. Best-effort: a failed record must
|
|
774
|
+
// not mask the original throw, which still propagates to the caller.
|
|
775
|
+
await advanceEpisodeStage({
|
|
619
776
|
repoRoot,
|
|
620
777
|
episodeId,
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
});
|
|
626
|
-
await closeEpisodeBestEffort(repoRoot, episodeId);
|
|
627
|
-
}
|
|
628
|
-
else if (ep.stage === 'evolved' ||
|
|
629
|
-
ep.stage === 'evolution-refused' ||
|
|
630
|
-
ep.stage === 'abstained') {
|
|
631
|
-
await closeEpisodeBestEffort(repoRoot, episodeId);
|
|
778
|
+
stage: 'errored',
|
|
779
|
+
patch: { terminalError: terminalErrorLabel(err) },
|
|
780
|
+
}).catch(() => { });
|
|
781
|
+
throw err;
|
|
632
782
|
}
|
|
633
|
-
// earlier stages: not auto-resumable here — reported as-is.
|
|
634
783
|
const after = await readEpisode(repoRoot, episodeId);
|
|
635
784
|
return { episodeId, resumedFrom, stage: after.stage, evolution };
|
|
636
785
|
}
|