synergyspec-selfevolving 2.1.1 → 2.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/learn.js +29 -3
- package/dist/commands/self-evolution-episode.js +37 -1
- package/dist/core/fitness/health/local-source.d.ts +11 -0
- package/dist/core/fitness/health/local-source.js +53 -1
- package/dist/core/project-config.d.ts +5 -0
- package/dist/core/project-config.js +23 -1
- package/dist/core/self-evolution/critic-agent.d.ts +16 -1
- package/dist/core/self-evolution/critic-agent.js +87 -17
- package/dist/core/self-evolution/episode-orchestrator.d.ts +28 -0
- package/dist/core/self-evolution/episode-orchestrator.js +349 -216
- package/dist/core/self-evolution/episode-store.d.ts +41 -2
- package/dist/core/self-evolution/episode-store.js +33 -9
- package/dist/core/self-evolution/evolving-agent.d.ts +51 -2
- package/dist/core/self-evolution/evolving-agent.js +45 -4
- package/dist/core/self-evolution/host-harness.d.ts +43 -0
- package/dist/core/self-evolution/host-harness.js +192 -0
- package/dist/core/self-evolution/reward-agent.d.ts +68 -0
- package/dist/core/self-evolution/reward-agent.js +76 -21
- package/dist/core/self-evolution/reward-aggregator.d.ts +26 -7
- package/dist/core/self-evolution/reward-aggregator.js +78 -20
- package/dist/core/self-evolution/verdict.d.ts +3 -2
- package/dist/core/self-evolution/verdict.js +4 -1
- package/dist/dashboard/react-client.js +2 -1
- package/package.json +1 -1
|
@@ -27,6 +27,18 @@ import { reconcilePrediction, summarizeCalibration, } from './policy/prediction-
|
|
|
27
27
|
export async function captureMainArm(opts) {
|
|
28
28
|
const sample = opts.report.fitnessSample;
|
|
29
29
|
const facts = sample?.trajectoryFacts;
|
|
30
|
+
// ④ Observable degrade: a verified:false arm — whether because NO
|
|
31
|
+
// observed-trajectory facts were captured, OR facts exist but the runner's
|
|
32
|
+
// pass/fail was not derivable (`facts.verified !== true`) — is surfaced on
|
|
33
|
+
// stderr so it is never SILENT. A wedged/missing/unextractable runner is the
|
|
34
|
+
// most common loop-stall cause, and a silent false reads identically to a real
|
|
35
|
+
// miss. The arm's recorded `verified` collapses to false in BOTH cases (see the
|
|
36
|
+
// `objective.verified` below), so EVERY verified:false arm warns exactly once;
|
|
37
|
+
// a genuinely verified arm (`facts.verified === true`) stays quiet.
|
|
38
|
+
if (!facts || facts.verified !== true) {
|
|
39
|
+
// eslint-disable-next-line no-console
|
|
40
|
+
console.warn(`[episode-orchestrator] observed grading unavailable for change "${opts.changeName}" — recording verified:false (observed run not verified)`);
|
|
41
|
+
}
|
|
30
42
|
// Honesty: prefer the OBSERVED pass rate (a real runner ran), else the
|
|
31
43
|
// authored test-report summary; null when neither parsed (never fabricated) —
|
|
32
44
|
// the exact precedence learn.ts uses to compute the loss.
|
|
@@ -168,6 +180,17 @@ function deriveEpisodeId(changeName, now) {
|
|
|
168
180
|
.replace(/-{2,}/g, '-')
|
|
169
181
|
.replace(/^-+|-+$/g, '');
|
|
170
182
|
}
|
|
183
|
+
/**
 * Build the `terminalError` note recorded on an episode when a step throws.
 *
 * On disk a timeout would otherwise read identically to a hard crash, so a
 * message that mentions a timeout (matched case-insensitively on "timed out",
 * e.g. `headless agent timed out after Nms`) is PREFIXED with a `timeout: `
 * marker. A timed-out episode is then distinguishable from a genuine crash in
 * episode.json.
 *
 * This helper is called from inside `catch` blocks right before the original
 * error is re-thrown, so it must be TOTAL: it never throws, whatever was
 * thrown. A value that cannot be converted to a string (a null-prototype
 * object, an object whose `toString` throws) yields a fixed placeholder
 * instead of raising a second error that would mask the original one.
 *
 * The marker is applied at most once: a message that already starts with
 * `timeout: ` is returned unchanged.
 *
 * Pure (no I/O, no mutation of `err`).
 *
 * @param {unknown} err - the thrown value; not necessarily an `Error`.
 * @returns {string} the error text, prefixed with `timeout: ` when it names a timeout.
 */
function terminalErrorLabel(err) {
    let msg;
    try {
        // `String(...)` also normalises a non-string `message` (the regex test
        // below would otherwise coerce it implicitly, and could throw).
        msg = err instanceof Error ? String(err.message) : String(err);
    }
    catch {
        // String() throws a TypeError for values with no usable toString;
        // never let that replace the error we are trying to record.
        msg = 'non-stringifiable thrown value';
    }
    // Idempotent: do not stack `timeout: timeout: ...` on an already-labelled message.
    if (msg.startsWith('timeout: ')) {
        return msg;
    }
    return /timed out/i.test(msg) ? `timeout: ${msg}` : msg;
}
|
|
171
194
|
/**
|
|
172
195
|
* Run ONE episode through the loop in the strict, durably-persisted order
|
|
173
196
|
* documented at the top of this module. See {@link RunEpisodeResult}.
|
|
@@ -267,149 +290,180 @@ async function runEpisodeAfterCreate(opts) {
|
|
|
267
290
|
objective: opts.mainArm.objective,
|
|
268
291
|
});
|
|
269
292
|
await advanceEpisodeStage({ repoRoot, episodeId, stage: 'main-arm-captured' });
|
|
270
|
-
//
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
targetId,
|
|
277
|
-
changeName: opts.changeName,
|
|
278
|
-
episodeId,
|
|
279
|
-
baselineVersion: shouldCritic.baselineVersion,
|
|
280
|
-
...(opts.critic?.baselineMode ? { baselineMode: opts.critic.baselineMode } : {}),
|
|
281
|
-
spawn: opts.spawn,
|
|
282
|
-
});
|
|
283
|
-
}
|
|
284
|
-
else {
|
|
285
|
-
baselineSkipped = true;
|
|
286
|
-
await advanceEpisodeStage({
|
|
287
|
-
repoRoot,
|
|
288
|
-
episodeId,
|
|
289
|
-
stage: 'baseline-skipped',
|
|
290
|
-
patch: { baselineSkippedReason: shouldCritic.reason },
|
|
291
|
-
});
|
|
292
|
-
}
|
|
293
|
-
// ── e: 奖励智能体 REWARD AGENT — score + diagnosis.json + advance 'scored' ────
|
|
294
|
-
// ④ Compute the tamper hint first (unless tamperCheck is 'off'); the ensemble
|
|
295
|
-
// injects it into the judge prompt + the diagnosis integrity, and BLOCK mode
|
|
296
|
-
// forces an insufficient-signal verdict (no extra spawns at the default
|
|
297
|
-
// samples=1, flag-only).
|
|
298
|
-
const tamperMode = opts.reward?.tamperCheck ?? 'flag';
|
|
299
|
-
const integrityHint = tamperMode === 'off' ? null : await detectTestTamper({ changeDirPath: opts.changeDirPath });
|
|
300
|
-
const reward = await runRewardAgentEnsemble({
|
|
301
|
-
repoRoot,
|
|
302
|
-
episodeId,
|
|
303
|
-
spawn: opts.spawn,
|
|
304
|
-
...(opts.reward ? { reward: opts.reward } : {}),
|
|
305
|
-
integrityHint,
|
|
306
|
-
});
|
|
307
|
-
const diagnosis = reward.diagnosis;
|
|
308
|
-
advantage = diagnosis.advantage;
|
|
309
|
-
// 预测校准: settle the PRIOR 'evolve' step's checkable prediction against this
|
|
310
|
-
// episode's measured arm delta (verifiable per-metric main−baseline, NOT the
|
|
311
|
-
// judge's reward) and log the residual. Best-effort — a calibration miss must
|
|
312
|
-
// never fail the episode.
|
|
293
|
+
// Steps d–g spawn the three agents (any of which may THROW — a wedged/crashed
|
|
294
|
+
// host CLI, a never-validating reward output, an evolving-agent invocation
|
|
295
|
+
// error). A thrown agent step must record a DURABLE terminal 'errored' stage
|
|
296
|
+
// (with the error text) so the episode is never orphaned at 'kept'/'scored'
|
|
297
|
+
// (indistinguishable from a still-running episode — ses_1330/1331). The lock
|
|
298
|
+
// release stays in runEpisode's finally; the re-throw below reaches it.
|
|
313
299
|
try {
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
// ── f: DECISION (every step durably persisted before the next) ───────────────
|
|
320
|
-
if (shouldSkipEvolution(diagnosis)) {
|
|
321
|
-
// 弃权 abstained / no nameable gap / insufficient-signal → no rollback
|
|
322
|
-
// decision; SKIP evolution. A BLOCKED tamper records a reject-buffer entry so
|
|
323
|
-
// the gamed attempt is visible to future episodes' 演进智能体.
|
|
324
|
-
if (tamperMode === 'block' &&
|
|
325
|
-
diagnosis.integrity?.testTamperSuspected &&
|
|
326
|
-
diagnosis.verdict === 'insufficient-signal') {
|
|
327
|
-
const head = await currentPolicyVersion(repoRoot, targetId);
|
|
328
|
-
await appendRejectBufferEntry(repoRoot, {
|
|
329
|
-
schemaVersion: 1,
|
|
330
|
-
at: new Date().toISOString(),
|
|
331
|
-
episodeId,
|
|
332
|
-
targetId,
|
|
333
|
-
// No version moved (the main arm's tampered tests are not a policy edit);
|
|
334
|
-
// record at the current head so the entry is informational, not a rollback.
|
|
335
|
-
fromVersion: head ?? 0,
|
|
336
|
-
toVersion: head ?? 0,
|
|
337
|
-
advantage: diagnosis.advantage,
|
|
338
|
-
rewardMain: diagnosis.rewardMain,
|
|
339
|
-
rewardBaseline: diagnosis.rewardBaseline,
|
|
340
|
-
textualGradientTried: diagnosis.textualGradient ?? '',
|
|
341
|
-
editSummary: buildRejectEditSummary(diagnosis),
|
|
342
|
-
reason: 'tamper-suspected',
|
|
343
|
-
});
|
|
344
|
-
}
|
|
345
|
-
decision = 'abstained';
|
|
346
|
-
await advanceEpisodeStage({ repoRoot, episodeId, stage: 'abstained' });
|
|
347
|
-
}
|
|
348
|
-
else {
|
|
349
|
-
const badAdvantage = advantage !== null && advantage < threshold;
|
|
350
|
-
const ep = await readEpisode(repoRoot, episodeId);
|
|
351
|
-
const headBeforeRollback = await currentPolicyVersion(repoRoot, targetId);
|
|
352
|
-
// Resolve the rollback target: the policy the CRITIC AGENT reran
|
|
353
|
-
// (`policyVersionBaseline`) when it is a valid EARLIER version, else the
|
|
354
|
-
// version immediately before the head (the prior good policy the bad edit
|
|
355
|
-
// advanced past). `rollbackPolicyVersion` requires `toVersion < head`.
|
|
356
|
-
const rollbackTarget = resolveRollbackTarget(ep.policyVersionBaseline, headBeforeRollback);
|
|
357
|
-
if (badAdvantage && rollbackTarget !== null && headBeforeRollback !== null) {
|
|
358
|
-
// (i) ROLLBACK first — durable on disk. `advantage` is non-null inside the
|
|
359
|
-
// badAdvantage branch; `?? undefined` satisfies the optional `number` param.
|
|
360
|
-
await rollbackPolicyVersion({
|
|
300
|
+
// ── d: CRITIC AGENT(基线智能体 baseline agent)or skip ───────────────────────
|
|
301
|
+
const shouldCritic = await shouldRunCriticAgent({ repoRoot, targetId });
|
|
302
|
+
if (shouldCritic.run && shouldCritic.baselineVersion !== null) {
|
|
303
|
+
// runCriticAgent advances the episode to 'baseline-arm-captured'.
|
|
304
|
+
await runCriticAgent({
|
|
361
305
|
repoRoot,
|
|
362
306
|
targetId,
|
|
307
|
+
changeName: opts.changeName,
|
|
363
308
|
episodeId,
|
|
364
|
-
|
|
365
|
-
|
|
309
|
+
baselineVersion: shouldCritic.baselineVersion,
|
|
310
|
+
...(opts.critic?.baselineMode ? { baselineMode: opts.critic.baselineMode } : {}),
|
|
311
|
+
spawn: opts.spawn,
|
|
312
|
+
...(opts.agentTimeoutMs !== undefined ? { timeoutMs: opts.agentTimeoutMs } : {}),
|
|
313
|
+
...(opts.harness ? { harness: opts.harness } : {}),
|
|
366
314
|
});
|
|
367
|
-
// (ii) THEN append the 否决缓冲 reject-buffer entry — durable on disk —
|
|
368
|
-
// BEFORE the 演进智能体 EVOLVING AGENT is even called, so the entry written
|
|
369
|
-
// THIS episode is in its fresh-from-disk prompt.
|
|
370
|
-
const rejectEntry = {
|
|
371
|
-
schemaVersion: 1,
|
|
372
|
-
at: new Date().toISOString(),
|
|
373
|
-
episodeId,
|
|
374
|
-
targetId,
|
|
375
|
-
// fromVersion = the version we rolled back TO (the prior good policy);
|
|
376
|
-
// toVersion = the (now rolled-back) version the rejected edit reached.
|
|
377
|
-
fromVersion: rollbackTarget,
|
|
378
|
-
toVersion: headBeforeRollback,
|
|
379
|
-
advantage,
|
|
380
|
-
rewardMain: diagnosis.rewardMain,
|
|
381
|
-
rewardBaseline: diagnosis.rewardBaseline,
|
|
382
|
-
textualGradientTried: diagnosis.textualGradient ?? '',
|
|
383
|
-
editSummary: buildRejectEditSummary(diagnosis),
|
|
384
|
-
reason: 'bad-advantage',
|
|
385
|
-
};
|
|
386
|
-
await appendRejectBufferEntry(repoRoot, rejectEntry);
|
|
387
|
-
decision = 'rolled-back';
|
|
388
|
-
await advanceEpisodeStage({ repoRoot, episodeId, stage: 'rolled-back' });
|
|
389
315
|
}
|
|
390
316
|
else {
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
317
|
+
baselineSkipped = true;
|
|
318
|
+
await advanceEpisodeStage({
|
|
319
|
+
repoRoot,
|
|
320
|
+
episodeId,
|
|
321
|
+
stage: 'baseline-skipped',
|
|
322
|
+
patch: { baselineSkippedReason: shouldCritic.reason },
|
|
323
|
+
});
|
|
395
324
|
}
|
|
396
|
-
// ──
|
|
397
|
-
//
|
|
398
|
-
//
|
|
399
|
-
//
|
|
400
|
-
//
|
|
401
|
-
const
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
const calibrationNote = await summarizeCalibration(repoRoot, targetId);
|
|
405
|
-
evolution = await runEvolvingAgent({
|
|
325
|
+
// ── e: 奖励智能体 REWARD AGENT — score + diagnosis.json + advance 'scored' ────
|
|
326
|
+
// ④ Compute the tamper hint first (unless tamperCheck is 'off'); the ensemble
|
|
327
|
+
// injects it into the judge prompt + the diagnosis integrity, and BLOCK mode
|
|
328
|
+
// forces an insufficient-signal verdict (no extra spawns at the default
|
|
329
|
+
// samples=1, flag-only).
|
|
330
|
+
const tamperMode = opts.reward?.tamperCheck ?? 'flag';
|
|
331
|
+
const integrityHint = tamperMode === 'off' ? null : await detectTestTamper({ changeDirPath: opts.changeDirPath });
|
|
332
|
+
const reward = await runRewardAgentEnsemble({
|
|
406
333
|
repoRoot,
|
|
407
334
|
episodeId,
|
|
408
|
-
targetId,
|
|
409
|
-
editBudget: scheduledBudget,
|
|
410
|
-
...(calibrationNote ? { calibrationNote } : {}),
|
|
411
335
|
spawn: opts.spawn,
|
|
336
|
+
...(opts.reward ? { reward: opts.reward } : {}),
|
|
337
|
+
integrityHint,
|
|
338
|
+
...(opts.agentTimeoutMs !== undefined ? { timeoutMs: opts.agentTimeoutMs } : {}),
|
|
339
|
+
...(opts.harness ? { harness: opts.harness } : {}),
|
|
412
340
|
});
|
|
341
|
+
const diagnosis = reward.diagnosis;
|
|
342
|
+
advantage = diagnosis.advantage;
|
|
343
|
+
// 预测校准: settle the PRIOR 'evolve' step's checkable prediction against this
|
|
344
|
+
// episode's measured arm delta (verifiable per-metric main−baseline, NOT the
|
|
345
|
+
// judge's reward) and log the residual. Best-effort — a calibration miss must
|
|
346
|
+
// never fail the episode.
|
|
347
|
+
try {
|
|
348
|
+
await reconcilePrediction({ repoRoot, targetId, episodeId });
|
|
349
|
+
}
|
|
350
|
+
catch {
|
|
351
|
+
// best-effort: the prediction-reconcile ledger is advisory only
|
|
352
|
+
}
|
|
353
|
+
// ── f: DECISION (every step durably persisted before the next) ───────────────
|
|
354
|
+
if (shouldSkipEvolution(diagnosis)) {
|
|
355
|
+
// 弃权 abstained / no nameable gap / insufficient-signal → no rollback
|
|
356
|
+
// decision; SKIP evolution. A BLOCKED tamper records a reject-buffer entry so
|
|
357
|
+
// the gamed attempt is visible to future episodes' 演进智能体.
|
|
358
|
+
if (tamperMode === 'block' &&
|
|
359
|
+
diagnosis.integrity?.testTamperSuspected &&
|
|
360
|
+
diagnosis.verdict === 'insufficient-signal') {
|
|
361
|
+
const head = await currentPolicyVersion(repoRoot, targetId);
|
|
362
|
+
await appendRejectBufferEntry(repoRoot, {
|
|
363
|
+
schemaVersion: 1,
|
|
364
|
+
at: new Date().toISOString(),
|
|
365
|
+
episodeId,
|
|
366
|
+
targetId,
|
|
367
|
+
// No version moved (the main arm's tampered tests are not a policy edit);
|
|
368
|
+
// record at the current head so the entry is informational, not a rollback.
|
|
369
|
+
fromVersion: head ?? 0,
|
|
370
|
+
toVersion: head ?? 0,
|
|
371
|
+
advantage: diagnosis.advantage,
|
|
372
|
+
rewardMain: diagnosis.rewardMain,
|
|
373
|
+
rewardBaseline: diagnosis.rewardBaseline,
|
|
374
|
+
textualGradientTried: diagnosis.textualGradient ?? '',
|
|
375
|
+
editSummary: buildRejectEditSummary(diagnosis),
|
|
376
|
+
reason: 'tamper-suspected',
|
|
377
|
+
});
|
|
378
|
+
}
|
|
379
|
+
decision = 'abstained';
|
|
380
|
+
await advanceEpisodeStage({ repoRoot, episodeId, stage: 'abstained' });
|
|
381
|
+
}
|
|
382
|
+
else {
|
|
383
|
+
const badAdvantage = advantage !== null && advantage < threshold;
|
|
384
|
+
const ep = await readEpisode(repoRoot, episodeId);
|
|
385
|
+
const headBeforeRollback = await currentPolicyVersion(repoRoot, targetId);
|
|
386
|
+
// Resolve the rollback target: the policy the CRITIC AGENT reran
|
|
387
|
+
// (`policyVersionBaseline`) when it is a valid EARLIER version, else the
|
|
388
|
+
// version immediately before the head (the prior good policy the bad edit
|
|
389
|
+
// advanced past). `rollbackPolicyVersion` requires `toVersion < head`.
|
|
390
|
+
const rollbackTarget = resolveRollbackTarget(ep.policyVersionBaseline, headBeforeRollback);
|
|
391
|
+
if (badAdvantage && rollbackTarget !== null && headBeforeRollback !== null) {
|
|
392
|
+
// (i) ROLLBACK first — durable on disk. `advantage` is non-null inside the
|
|
393
|
+
// badAdvantage branch; `?? undefined` satisfies the optional `number` param.
|
|
394
|
+
await rollbackPolicyVersion({
|
|
395
|
+
repoRoot,
|
|
396
|
+
targetId,
|
|
397
|
+
episodeId,
|
|
398
|
+
toVersion: rollbackTarget,
|
|
399
|
+
advantage: advantage ?? undefined,
|
|
400
|
+
});
|
|
401
|
+
// (ii) THEN append the 否决缓冲 reject-buffer entry — durable on disk —
|
|
402
|
+
// BEFORE the 演进智能体 EVOLVING AGENT is even called, so the entry written
|
|
403
|
+
// THIS episode is in its fresh-from-disk prompt.
|
|
404
|
+
const rejectEntry = {
|
|
405
|
+
schemaVersion: 1,
|
|
406
|
+
at: new Date().toISOString(),
|
|
407
|
+
episodeId,
|
|
408
|
+
targetId,
|
|
409
|
+
// fromVersion = the version we rolled back TO (the prior good policy);
|
|
410
|
+
// toVersion = the (now rolled-back) version the rejected edit reached.
|
|
411
|
+
fromVersion: rollbackTarget,
|
|
412
|
+
toVersion: headBeforeRollback,
|
|
413
|
+
advantage,
|
|
414
|
+
rewardMain: diagnosis.rewardMain,
|
|
415
|
+
rewardBaseline: diagnosis.rewardBaseline,
|
|
416
|
+
textualGradientTried: diagnosis.textualGradient ?? '',
|
|
417
|
+
editSummary: buildRejectEditSummary(diagnosis),
|
|
418
|
+
reason: 'bad-advantage',
|
|
419
|
+
};
|
|
420
|
+
await appendRejectBufferEntry(repoRoot, rejectEntry);
|
|
421
|
+
decision = 'rolled-back';
|
|
422
|
+
await advanceEpisodeStage({ repoRoot, episodeId, stage: 'rolled-back' });
|
|
423
|
+
}
|
|
424
|
+
else {
|
|
425
|
+
// Good advantage, OR no earlier version to roll back to (e.g. head is v0):
|
|
426
|
+
// keep the current head.
|
|
427
|
+
decision = 'kept';
|
|
428
|
+
await advanceEpisodeStage({ repoRoot, episodeId, stage: 'kept' });
|
|
429
|
+
}
|
|
430
|
+
// ── g: ONLY AFTER (f) persisted 'rolled-back'/'kept' ───────────────────────
|
|
431
|
+
// runEvolvingAgent reads the reject-buffer FRESH from disk (the entry just
|
|
432
|
+
// written THIS episode is in its prompt). Never parallelized with (f).
|
|
433
|
+
// 步长: after a rollback, shrink the edit budget (smaller step after a step
|
|
434
|
+
// that lost ground). 预测校准: pass the proposer's recent prediction record.
|
|
435
|
+
const scheduledBudget = decision === 'rolled-back'
|
|
436
|
+
? scheduledEditBudget(await readPolicyLedger(repoRoot, targetId), editBudget)
|
|
437
|
+
: editBudget;
|
|
438
|
+
const calibrationNote = await summarizeCalibration(repoRoot, targetId);
|
|
439
|
+
evolution = await runEvolvingAgent({
|
|
440
|
+
repoRoot,
|
|
441
|
+
episodeId,
|
|
442
|
+
targetId,
|
|
443
|
+
editBudget: scheduledBudget,
|
|
444
|
+
...(calibrationNote ? { calibrationNote } : {}),
|
|
445
|
+
spawn: opts.spawn,
|
|
446
|
+
...(opts.agentTimeoutMs !== undefined ? { timeoutMs: opts.agentTimeoutMs } : {}),
|
|
447
|
+
...(opts.harness ? { harness: opts.harness } : {}),
|
|
448
|
+
});
|
|
449
|
+
}
|
|
450
|
+
}
|
|
451
|
+
catch (err) {
|
|
452
|
+
// A thrown agent step (d–g) records a DURABLE terminal 'errored' stage so the
|
|
453
|
+
// episode is never orphaned at 'kept'/'scored' (which is indistinguishable
|
|
454
|
+
// from a still-running episode — the ses_1330/1331 wsgidav orphan). The
|
|
455
|
+
// 'errored' stage is reachable from EVERY non-terminal stage; the patch
|
|
456
|
+
// carries the error text (prefixed `timeout:` when the throw was a host-agent
|
|
457
|
+
// timeout, so a timed-out episode is distinguishable from a hard crash on
|
|
458
|
+
// disk). Best-effort: a failed record write must not mask the original throw,
|
|
459
|
+
// which still propagates to runEpisode's finally (lock release).
|
|
460
|
+
await advanceEpisodeStage({
|
|
461
|
+
repoRoot,
|
|
462
|
+
episodeId,
|
|
463
|
+
stage: 'errored',
|
|
464
|
+
patch: { terminalError: terminalErrorLabel(err) },
|
|
465
|
+
}).catch(() => { });
|
|
466
|
+
throw err;
|
|
413
467
|
}
|
|
414
468
|
// ── h (stage half): advance 'closed' (best-effort) ───────────────────────────
|
|
415
469
|
await closeEpisodeBestEffort(repoRoot, episodeId);
|
|
@@ -417,11 +471,25 @@ async function runEpisodeAfterCreate(opts) {
|
|
|
417
471
|
return { episodeId, baselineSkipped, advantage, decision, evolution, newPolicyVersion };
|
|
418
472
|
}
|
|
419
473
|
/**
|
|
420
|
-
* Advance the episode to 'closed' from whatever terminal-ish stage it reached
|
|
421
|
-
*
|
|
422
|
-
*
|
|
423
|
-
*
|
|
424
|
-
*
|
|
474
|
+
* Advance the episode to 'closed' from whatever terminal-ish stage it reached,
|
|
475
|
+
* best-effort.
|
|
476
|
+
*
|
|
477
|
+
* Closable stages:
|
|
478
|
+
* - evolved | evolution-refused | abstained — the 演进智能体 EVOLVING AGENT
|
|
479
|
+
* reached a definite outcome (or the judge 弃权 abstained), the normal close.
|
|
480
|
+
* - kept | rolled-back — the 演进智能体 returned not-spawned (its diagnosis
|
|
481
|
+
* abstained-after-gap-check, no gaps, or the target resolved to no editable
|
|
482
|
+
* local files), so the episode never advanced past the decision. By the time
|
|
483
|
+
* this runs (AFTER runEvolvingAgent returned), a stage still at 'kept'/
|
|
484
|
+
* 'rolled-back' can ONLY mean not-spawned — a success advances 'evolved', a
|
|
485
|
+
* refusal advances 'evolution-refused', and a throw is caught upstream and
|
|
486
|
+
* records 'errored' + rethrows so this close is never reached. So a leftover
|
|
487
|
+
* kept/rolled-back at close time IS the finished-nothing-to-evolve case and
|
|
488
|
+
* must close, not rest forever at a non-terminal stage (the exact ambiguity
|
|
489
|
+
* the 'errored' stage was meant to remove).
|
|
490
|
+
*
|
|
491
|
+
* Any other (genuinely non-closable) stage is left as-is rather than throwing, so
|
|
492
|
+
* the close never masks the real episode outcome.
|
|
425
493
|
*/
|
|
426
494
|
async function closeEpisodeBestEffort(repoRoot, episodeId) {
|
|
427
495
|
const ep = await readEpisode(repoRoot, episodeId);
|
|
@@ -429,6 +497,9 @@ async function closeEpisodeBestEffort(repoRoot, episodeId) {
|
|
|
429
497
|
'evolved',
|
|
430
498
|
'evolution-refused',
|
|
431
499
|
'abstained',
|
|
500
|
+
// not-spawned 演进智能体 leaves the episode here — close the finished episode.
|
|
501
|
+
'kept',
|
|
502
|
+
'rolled-back',
|
|
432
503
|
]);
|
|
433
504
|
if (closable.has(ep.stage)) {
|
|
434
505
|
await advanceEpisodeStage({ repoRoot, episodeId, stage: 'closed' });
|
|
@@ -517,6 +588,16 @@ async function ensureRejectBufferEntry(repoRoot, opts) {
|
|
|
517
588
|
* - 'scored' → run the decision (f) then the 演进智能体 (g).
|
|
518
589
|
* - 'rolled-back' / 'kept' → run the 演进智能体 EVOLVING AGENT (g) then close.
|
|
519
590
|
* - 'evolved'/'evolution-refused'/'abstained' → close.
|
|
591
|
+
* - 'errored' → RE-DRIVE from the last GOOD pre-error stage
|
|
592
|
+
* (an episode may have errored on a TRANSIENT
|
|
593
|
+
* cause — a one-off git/analyzer/agent timeout).
|
|
594
|
+
* The pre-error stage is the last `stageHistory`
|
|
595
|
+
* entry that is NOT 'errored'; when it is one of
|
|
596
|
+
* {'scored','rolled-back','kept'} (the
|
|
597
|
+
* resume-entry stages) we advance errored → that
|
|
598
|
+
* stage and fall through to the normal dispatch.
|
|
599
|
+
* Otherwise the pre-error stage is not
|
|
600
|
+
* auto-resumable and the episode is reported as-is.
|
|
520
601
|
* - earlier stages → not auto-resumable here (the arms / reward
|
|
521
602
|
* agent need their own re-entry); reported as-is.
|
|
522
603
|
*
|
|
@@ -532,78 +613,135 @@ export async function resumeEpisode(opts) {
|
|
|
532
613
|
const resumedFrom = ep.stage;
|
|
533
614
|
const targetId = ep.targetId;
|
|
534
615
|
let evolution = null;
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
616
|
+
// The effective stage we dispatch on. Normally the episode's current stage;
|
|
617
|
+
// for an 'errored' episode we attempt to RE-DRIVE from the last good pre-error
|
|
618
|
+
// stage (a transient git/analyzer/agent failure should be retryable via an
|
|
619
|
+
// operator resume). 'errored' stays terminal for every OTHER caller — only this
|
|
620
|
+
// resume path may re-drive it, via the errored → {scored,rolled-back,kept}
|
|
621
|
+
// transitions the stage machine allows ONLY for operator recovery.
|
|
622
|
+
let stage = ep.stage;
|
|
623
|
+
if (ep.stage === 'errored') {
|
|
624
|
+
const preError = [...ep.stageHistory]
|
|
625
|
+
.reverse()
|
|
626
|
+
.find((h) => h.stage !== 'errored')?.stage;
|
|
627
|
+
if (preError === 'scored' ||
|
|
628
|
+
preError === 'rolled-back' ||
|
|
629
|
+
preError === 'kept') {
|
|
630
|
+
// Re-open the errored episode at its last auto-resumable stage, then fall
|
|
631
|
+
// through to the normal dispatch for that stage.
|
|
632
|
+
await advanceEpisodeStage({ repoRoot, episodeId, stage: preError });
|
|
633
|
+
stage = preError;
|
|
540
634
|
}
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
// = priorRollback.version - 1 (the head before the rollback)
|
|
559
|
-
// fromVersion = the prior good policy restored (one before that head)
|
|
560
|
-
const toVersion = priorRollback.version - 1;
|
|
561
|
-
const fromVersion = resolveRollbackTarget(ep.policyVersionBaseline, toVersion);
|
|
562
|
-
await ensureRejectBufferEntry(repoRoot, {
|
|
563
|
-
episodeId,
|
|
564
|
-
targetId,
|
|
565
|
-
fromVersion: fromVersion ?? toVersion,
|
|
566
|
-
toVersion,
|
|
567
|
-
advantage,
|
|
568
|
-
diagnosis,
|
|
569
|
-
});
|
|
570
|
-
await advanceEpisodeStage({ repoRoot, episodeId, stage: 'rolled-back' });
|
|
635
|
+
// Otherwise the pre-error stage is not auto-resumable (e.g. a reward throw at
|
|
636
|
+
// 'baseline-skipped'); leave the episode at 'errored' and report it as-is.
|
|
637
|
+
}
|
|
638
|
+
// The decision (f) + 演进智能体 EVOLVING AGENT (g) re-runs below can THROW — a
|
|
639
|
+
// wedged/crashed host CLI (CanonicalProposerInvocationError), a timeout, or an
|
|
640
|
+
// observed-GREEN gate throw. UNCAUGHT, that leaves the episode DURABLY stuck at
|
|
641
|
+
// a non-terminal stage ('scored'/'rolled-back'/'kept' — the orphan state fix ❷
|
|
642
|
+
// eliminates for runEpisode). Record the SAME terminal 'errored' stage here
|
|
643
|
+
// (the transition map already allows scored/rolled-back/kept → 'errored'), then
|
|
644
|
+
// re-throw. Resume holds NO in-flight lock, so this is a durable-stage fix, not
|
|
645
|
+
// a leak fix. Best-effort write: a failed record must not mask the original throw.
|
|
646
|
+
try {
|
|
647
|
+
if (stage === 'scored') {
|
|
648
|
+
// Re-run the decision (f) from the on-disk diagnosis, then (g).
|
|
649
|
+
const diagnosis = await readDiagnosisForResume(repoRoot, episodeId);
|
|
650
|
+
if (shouldSkipEvolution(diagnosis)) {
|
|
651
|
+
await advanceEpisodeStage({ repoRoot, episodeId, stage: 'abstained' });
|
|
571
652
|
}
|
|
572
|
-
else {
|
|
573
|
-
|
|
574
|
-
const
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
653
|
+
else if (diagnosis) {
|
|
654
|
+
// (shouldSkipEvolution returns true for null, so diagnosis is non-null here)
|
|
655
|
+
const advantage = diagnosis.advantage;
|
|
656
|
+
const badAdvantage = advantage !== null && advantage < threshold;
|
|
657
|
+
// Crash-resume dedup: a 'rollback' ledger entry whose episodeId is THIS
|
|
658
|
+
// episode means runEpisode already applied the rollback before the host
|
|
659
|
+
// crashed (the rollback head is monotonic — re-calling rollbackPolicyVersion
|
|
660
|
+
// would stack a SECOND, duplicate rollback version). When present, reuse its
|
|
661
|
+
// recorded version axis and SKIP the re-rollback; only ensure the
|
|
662
|
+
// reject-buffer entry + the 'rolled-back' stage advance complete.
|
|
663
|
+
const ledger = await readPolicyLedger(repoRoot, targetId);
|
|
664
|
+
const priorRollback = ledger.find((e) => e.action === 'rollback' && e.episodeId === episodeId);
|
|
665
|
+
if (priorRollback) {
|
|
666
|
+
// The prior rollback already advanced to `priorRollback.version`, rolling
|
|
667
|
+
// FORWARD to the content of the version immediately before the rejected
|
|
668
|
+
// edit's head. Reconstruct the reject-buffer axis from that entry:
|
|
669
|
+
// toVersion = the (rolled-back) version the rejected edit reached
|
|
670
|
+
// = priorRollback.version - 1 (the head before the rollback)
|
|
671
|
+
// fromVersion = the prior good policy restored (one before that head)
|
|
672
|
+
const toVersion = priorRollback.version - 1;
|
|
673
|
+
const fromVersion = resolveRollbackTarget(ep.policyVersionBaseline, toVersion);
|
|
583
674
|
await ensureRejectBufferEntry(repoRoot, {
|
|
584
675
|
episodeId,
|
|
585
676
|
targetId,
|
|
586
|
-
fromVersion:
|
|
587
|
-
toVersion
|
|
677
|
+
fromVersion: fromVersion ?? toVersion,
|
|
678
|
+
toVersion,
|
|
588
679
|
advantage,
|
|
589
680
|
diagnosis,
|
|
590
681
|
});
|
|
591
682
|
await advanceEpisodeStage({ repoRoot, episodeId, stage: 'rolled-back' });
|
|
592
683
|
}
|
|
593
684
|
else {
|
|
594
|
-
await
|
|
685
|
+
const headBeforeRollback = await currentPolicyVersion(repoRoot, targetId);
|
|
686
|
+
const rollbackTarget = resolveRollbackTarget(ep.policyVersionBaseline, headBeforeRollback);
|
|
687
|
+
if (badAdvantage && rollbackTarget !== null && headBeforeRollback !== null) {
|
|
688
|
+
await rollbackPolicyVersion({
|
|
689
|
+
repoRoot,
|
|
690
|
+
targetId,
|
|
691
|
+
episodeId,
|
|
692
|
+
toVersion: rollbackTarget,
|
|
693
|
+
advantage: advantage ?? undefined,
|
|
694
|
+
});
|
|
695
|
+
await ensureRejectBufferEntry(repoRoot, {
|
|
696
|
+
episodeId,
|
|
697
|
+
targetId,
|
|
698
|
+
fromVersion: rollbackTarget,
|
|
699
|
+
toVersion: headBeforeRollback,
|
|
700
|
+
advantage,
|
|
701
|
+
diagnosis,
|
|
702
|
+
});
|
|
703
|
+
await advanceEpisodeStage({ repoRoot, episodeId, stage: 'rolled-back' });
|
|
704
|
+
}
|
|
705
|
+
else {
|
|
706
|
+
await advanceEpisodeStage({ repoRoot, episodeId, stage: 'kept' });
|
|
707
|
+
}
|
|
708
|
+
}
|
|
709
|
+
// 预测校准 (idempotent if the original run already settled it) + 步长 schedule
|
|
710
|
+
// + calibration note, mirroring runEpisode's (g) step.
|
|
711
|
+
try {
|
|
712
|
+
await reconcilePrediction({ repoRoot, targetId, episodeId });
|
|
713
|
+
}
|
|
714
|
+
catch {
|
|
715
|
+
// best-effort: advisory only
|
|
595
716
|
}
|
|
717
|
+
const afterDecision = await readEpisode(repoRoot, episodeId);
|
|
718
|
+
const scheduledBudget = afterDecision.stage === 'rolled-back'
|
|
719
|
+
? scheduledEditBudget(await readPolicyLedger(repoRoot, targetId), editBudget)
|
|
720
|
+
: editBudget;
|
|
721
|
+
const calibrationNote = await summarizeCalibration(repoRoot, targetId);
|
|
722
|
+
evolution = await runEvolvingAgent({
|
|
723
|
+
repoRoot,
|
|
724
|
+
episodeId,
|
|
725
|
+
targetId,
|
|
726
|
+
editBudget: scheduledBudget,
|
|
727
|
+
...(calibrationNote ? { calibrationNote } : {}),
|
|
728
|
+
spawn: opts.spawn,
|
|
729
|
+
...(opts.agentTimeoutMs !== undefined ? { timeoutMs: opts.agentTimeoutMs } : {}),
|
|
730
|
+
...(opts.harness ? { harness: opts.harness } : {}),
|
|
731
|
+
});
|
|
596
732
|
}
|
|
597
|
-
|
|
598
|
-
|
|
733
|
+
await closeEpisodeBestEffort(repoRoot, episodeId);
|
|
734
|
+
}
|
|
735
|
+
else if (stage === 'rolled-back' || stage === 'kept') {
|
|
736
|
+
// The decision already ran (and the original episode settled the prediction);
|
|
737
|
+
// re-settle idempotently for the crash window, then schedule + calibrate.
|
|
599
738
|
try {
|
|
600
739
|
await reconcilePrediction({ repoRoot, targetId, episodeId });
|
|
601
740
|
}
|
|
602
741
|
catch {
|
|
603
742
|
// best-effort: advisory only
|
|
604
743
|
}
|
|
605
|
-
const
|
|
606
|
-
const scheduledBudget = afterDecision.stage === 'rolled-back'
|
|
744
|
+
const scheduledBudget = stage === 'rolled-back'
|
|
607
745
|
? scheduledEditBudget(await readPolicyLedger(repoRoot, targetId), editBudget)
|
|
608
746
|
: editBudget;
|
|
609
747
|
const calibrationNote = await summarizeCalibration(repoRoot, targetId);
|
|
@@ -614,39 +752,34 @@ export async function resumeEpisode(opts) {
|
|
|
614
752
|
editBudget: scheduledBudget,
|
|
615
753
|
...(calibrationNote ? { calibrationNote } : {}),
|
|
616
754
|
spawn: opts.spawn,
|
|
755
|
+
...(opts.agentTimeoutMs !== undefined ? { timeoutMs: opts.agentTimeoutMs } : {}),
|
|
756
|
+
...(opts.harness ? { harness: opts.harness } : {}),
|
|
617
757
|
});
|
|
758
|
+
await closeEpisodeBestEffort(repoRoot, episodeId);
|
|
618
759
|
}
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
// re-settle idempotently for the crash window, then schedule + calibrate.
|
|
624
|
-
try {
|
|
625
|
-
await reconcilePrediction({ repoRoot, targetId, episodeId });
|
|
760
|
+
else if (stage === 'evolved' ||
|
|
761
|
+
stage === 'evolution-refused' ||
|
|
762
|
+
stage === 'abstained') {
|
|
763
|
+
await closeEpisodeBestEffort(repoRoot, episodeId);
|
|
626
764
|
}
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
765
|
+
// earlier stages (and a non-auto-resumable 'errored'): not auto-resumable here
|
|
766
|
+
// — reported as-is.
|
|
767
|
+
}
|
|
768
|
+
catch (err) {
|
|
769
|
+
// A thrown decision/evolving step records a DURABLE terminal 'errored' stage so
|
|
770
|
+
// the resumed episode is never left stuck at 'scored'/'rolled-back'/'kept'
|
|
771
|
+
// (indistinguishable from a still-running episode). Mirrors
|
|
772
|
+
// runEpisodeAfterCreate's catch — including the `timeout:` marker so a timed-out
|
|
773
|
+
// resume is distinguishable from a hard crash. Best-effort: a failed record must
|
|
774
|
+
// not mask the original throw, which still propagates to the caller.
|
|
775
|
+
await advanceEpisodeStage({
|
|
635
776
|
repoRoot,
|
|
636
777
|
episodeId,
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
});
|
|
642
|
-
await closeEpisodeBestEffort(repoRoot, episodeId);
|
|
643
|
-
}
|
|
644
|
-
else if (ep.stage === 'evolved' ||
|
|
645
|
-
ep.stage === 'evolution-refused' ||
|
|
646
|
-
ep.stage === 'abstained') {
|
|
647
|
-
await closeEpisodeBestEffort(repoRoot, episodeId);
|
|
778
|
+
stage: 'errored',
|
|
779
|
+
patch: { terminalError: terminalErrorLabel(err) },
|
|
780
|
+
}).catch(() => { });
|
|
781
|
+
throw err;
|
|
648
782
|
}
|
|
649
|
-
// earlier stages: not auto-resumable here — reported as-is.
|
|
650
783
|
const after = await readEpisode(repoRoot, episodeId);
|
|
651
784
|
return { episodeId, resumedFrom, stage: after.stage, evolution };
|
|
652
785
|
}
|