synergyspec-selfevolving 2.0.0 → 2.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/learn.js +7 -0
- package/dist/commands/self-evolution-episode.js +8 -0
- package/dist/core/fitness/test-failures.js +10 -2
- package/dist/core/project-config.d.ts +17 -0
- package/dist/core/project-config.js +68 -0
- package/dist/core/self-evolution/critic-agent.d.ts +52 -10
- package/dist/core/self-evolution/critic-agent.js +109 -28
- package/dist/core/self-evolution/episode-orchestrator.d.ts +39 -2
- package/dist/core/self-evolution/episode-orchestrator.js +173 -10
- package/dist/core/self-evolution/evolving-agent.d.ts +63 -17
- package/dist/core/self-evolution/evolving-agent.js +106 -20
- package/dist/core/self-evolution/host-harness.d.ts +14 -14
- package/dist/core/self-evolution/host-harness.js +48 -22
- package/dist/core/self-evolution/index.d.ts +2 -0
- package/dist/core/self-evolution/index.js +2 -0
- package/dist/core/self-evolution/policy/prediction-reconcile.d.ts +54 -0
- package/dist/core/self-evolution/policy/prediction-reconcile.js +191 -0
- package/dist/core/self-evolution/policy/reject-buffer.d.ts +9 -2
- package/dist/core/self-evolution/policy/reject-buffer.js +4 -2
- package/dist/core/self-evolution/reward-agent.d.ts +159 -14
- package/dist/core/self-evolution/reward-agent.js +459 -69
- package/dist/core/self-evolution/reward-aggregator.d.ts +59 -0
- package/dist/core/self-evolution/reward-aggregator.js +262 -0
- package/dist/core/self-evolution/tamper-check.d.ts +24 -0
- package/dist/core/self-evolution/tamper-check.js +236 -0
- package/dist/core/templates/workflows/gen-tests.js +1 -1
- package/dist/core/templates/workflows/learn.js +7 -6
- package/dist/core/trajectory/scrub.d.ts +27 -0
- package/dist/core/trajectory/scrub.js +79 -0
- package/dist/core/trajectory/skeleton.d.ts +27 -1
- package/dist/core/trajectory/skeleton.js +152 -8
- package/dist/ui/ascii-patterns.d.ts +7 -8
- package/dist/ui/ascii-patterns.js +54 -120
- package/dist/ui/welcome-screen.d.ts +8 -0
- package/dist/ui/welcome-screen.js +2 -2
- package/package.json +1 -1
- package/dist/core/self-evolution/ga-selection.d.ts +0 -94
- package/dist/core/self-evolution/ga-selection.js +0 -153
- package/dist/core/self-evolution/proposer-agent.d.ts +0 -182
- package/dist/core/self-evolution/proposer-agent.js +0 -326
- package/dist/core/self-evolution/replay-runner.d.ts +0 -100
- package/dist/core/self-evolution/replay-runner.js +0 -170
- package/dist/core/self-evolution/replay.d.ts +0 -45
- package/dist/core/self-evolution/replay.js +0 -56
- package/dist/core/self-evolution/template-variants.d.ts +0 -62
- package/dist/core/self-evolution/template-variants.js +0 -171
- package/dist/core/self-evolution/trajectory.d.ts +0 -65
- package/dist/core/self-evolution/trajectory.js +0 -185
package/dist/commands/learn.js
CHANGED
|
@@ -179,12 +179,19 @@ export function registerLearnCommand(program, deps = {}) {
|
|
|
179
179
|
changeName: report.changeName,
|
|
180
180
|
report,
|
|
181
181
|
});
|
|
182
|
+
// Thread the loop-v2 reward judge-quality config (samples / noiseFloor /
|
|
183
|
+
// orderSwap / tamperCheck). Omitted ⇒ the orchestrator's single-sample,
|
|
184
|
+
// flag-only default (no extra spawns).
|
|
185
|
+
const episodeConfig = readProjectConfig(projectRoot);
|
|
182
186
|
episodeOutcome = await runEpisodeImpl({
|
|
183
187
|
repoRoot: projectRoot,
|
|
184
188
|
targetId: concreteEvolveTarget.targetId,
|
|
185
189
|
changeName: report.changeName,
|
|
186
190
|
changeDirPath: report.changeDir,
|
|
187
191
|
mainArm,
|
|
192
|
+
...(episodeConfig?.selfEvolution?.reward
|
|
193
|
+
? { reward: episodeConfig.selfEvolution.reward }
|
|
194
|
+
: {}),
|
|
188
195
|
});
|
|
189
196
|
}
|
|
190
197
|
if (options.json) {
|
|
@@ -9,6 +9,7 @@ lookupCanonicalTarget, listCanonicalTargets, DESIGN_ARTIFACT_TARGET_ID, } from '
|
|
|
9
9
|
import { generateLearnReport } from '../core/learn.js';
|
|
10
10
|
import { validateExplicitTrajectoryHandle } from '../core/learn/trajectory-discovery.js';
|
|
11
11
|
import { validateChangeExists } from './workflow/shared.js';
|
|
12
|
+
import { readProjectConfig } from '../core/project-config.js';
|
|
12
13
|
/**
|
|
13
14
|
* The 主智能体 MAIN AGENT arm is graded from a learn report exactly the way the
|
|
14
15
|
* `learn` command grades it (the orchestrator REUSES that grading; it never
|
|
@@ -148,6 +149,7 @@ export async function runEpisodeCommand(args, opts) {
|
|
|
148
149
|
// orchestrator's behavior when it is unaware of the flag.
|
|
149
150
|
let outcome;
|
|
150
151
|
try {
|
|
152
|
+
const episodeConfig = readProjectConfig(opts.repoRoot);
|
|
151
153
|
const episodeOptions = {
|
|
152
154
|
repoRoot: opts.repoRoot,
|
|
153
155
|
targetId,
|
|
@@ -155,6 +157,12 @@ export async function runEpisodeCommand(args, opts) {
|
|
|
155
157
|
changeDirPath,
|
|
156
158
|
mainArm,
|
|
157
159
|
...(args.noBaseline ? { skipBaseline: true } : {}),
|
|
160
|
+
...(episodeConfig?.selfEvolution?.reward
|
|
161
|
+
? { reward: episodeConfig.selfEvolution.reward }
|
|
162
|
+
: {}),
|
|
163
|
+
...(episodeConfig?.selfEvolution?.critic
|
|
164
|
+
? { critic: episodeConfig.selfEvolution.critic }
|
|
165
|
+
: {}),
|
|
158
166
|
};
|
|
159
167
|
outcome = await runEpisode(episodeOptions);
|
|
160
168
|
}
|
|
@@ -57,6 +57,14 @@ function findAssertion(lines, from) {
|
|
|
57
57
|
function cleanToken(value) {
|
|
58
58
|
return value.replace(/^[`'"]+|[`'"]+$/g, '');
|
|
59
59
|
}
|
|
60
|
+
/**
|
|
61
|
+
* POSIX-normalize a path so a Windows pytest path (`tests\test_x.py`) matches the
|
|
62
|
+
* already-POSIX-normalized file-edit paths in the action skeleton — the reward
|
|
63
|
+
* agent's renamed/edited-test caveat compares the two by exact string.
|
|
64
|
+
*/
|
|
65
|
+
function toPosix(p) {
|
|
66
|
+
return p.replace(/\\/g, '/');
|
|
67
|
+
}
|
|
60
68
|
/**
|
|
61
69
|
* Extract failing test ids + assertion lines from observed runner output.
|
|
62
70
|
* Returns `[]` when nothing is recognized. Deduplicates by testId, preserves
|
|
@@ -86,7 +94,7 @@ export function parseTestFailures(output) {
|
|
|
86
94
|
const inline = pytest[2]?.trim();
|
|
87
95
|
push({
|
|
88
96
|
testId,
|
|
89
|
-
file: testId.split('::')[0],
|
|
97
|
+
file: toPosix(testId.split('::')[0]),
|
|
90
98
|
...(inline
|
|
91
99
|
? { assertion: capAssertion(inline) }
|
|
92
100
|
: (() => {
|
|
@@ -98,7 +106,7 @@ export function parseTestFailures(output) {
|
|
|
98
106
|
}
|
|
99
107
|
const vitest = VITEST_FAIL_RE.exec(line);
|
|
100
108
|
if (vitest) {
|
|
101
|
-
const file = cleanToken(vitest[1]);
|
|
109
|
+
const file = toPosix(cleanToken(vitest[1]));
|
|
102
110
|
const rest = vitest[2]?.trim();
|
|
103
111
|
const testId = rest ? `${file} > ${rest}` : file;
|
|
104
112
|
const assertion = findAssertion(lines, i);
|
|
@@ -27,6 +27,23 @@ export declare const ProjectConfigSchema: z.ZodObject<{
|
|
|
27
27
|
focus: z.ZodOptional<z.ZodBoolean>;
|
|
28
28
|
advantageRollbackThreshold: z.ZodOptional<z.ZodNumber>;
|
|
29
29
|
editBudget: z.ZodOptional<z.ZodNumber>;
|
|
30
|
+
reward: z.ZodOptional<z.ZodObject<{
|
|
31
|
+
samples: z.ZodOptional<z.ZodNumber>;
|
|
32
|
+
noiseFloor: z.ZodOptional<z.ZodNumber>;
|
|
33
|
+
orderSwap: z.ZodOptional<z.ZodBoolean>;
|
|
34
|
+
requireCorrectnessGate: z.ZodOptional<z.ZodBoolean>;
|
|
35
|
+
tamperCheck: z.ZodOptional<z.ZodEnum<{
|
|
36
|
+
off: "off";
|
|
37
|
+
flag: "flag";
|
|
38
|
+
block: "block";
|
|
39
|
+
}>>;
|
|
40
|
+
}, z.core.$strip>>;
|
|
41
|
+
critic: z.ZodOptional<z.ZodObject<{
|
|
42
|
+
baselineMode: z.ZodOptional<z.ZodEnum<{
|
|
43
|
+
"re-test": "re-test";
|
|
44
|
+
"re-do": "re-do";
|
|
45
|
+
}>>;
|
|
46
|
+
}, z.core.$strip>>;
|
|
30
47
|
}, z.core.$strip>>;
|
|
31
48
|
health: z.ZodOptional<z.ZodObject<{
|
|
32
49
|
source: z.ZodDefault<z.ZodEnum<{
|
|
@@ -60,6 +60,42 @@ export const ProjectConfigSchema = z.object({
|
|
|
60
60
|
// 演进智能体 EVOLVING AGENT's ONE bounded edit may total. Default 40.
|
|
61
61
|
// Optional/omitted ⇒ the agent's DEFAULT_EVOLVING_AGENT_EDIT_BUDGET applies.
|
|
62
62
|
editBudget: z.number().optional(),
|
|
63
|
+
// Loop v2 — 奖励智能体 REWARD AGENT judge-quality knobs. ALL optional; omitted
|
|
64
|
+
// ⇒ the historical single-sample, flag-only behaviour (no extra LLM spawns).
|
|
65
|
+
reward: z
|
|
66
|
+
.object({
|
|
67
|
+
// ② How many judged duels per episode. Default 1 (single sample, no
|
|
68
|
+
// extra spawns). >1 enables the A/A noise floor + SPRT + order-swap.
|
|
69
|
+
samples: z.number().optional(),
|
|
70
|
+
// ② Minimum |advantage| to trust; within the floor ⇒ insufficient-signal.
|
|
71
|
+
// Omitted ⇒ measured from an A/A pair when samples>1, else unused.
|
|
72
|
+
noiseFloor: z.number().optional(),
|
|
73
|
+
// ③ Swap arm presentation order across samples to cancel position bias.
|
|
74
|
+
orderSwap: z.boolean().optional(),
|
|
75
|
+
// ① Enforce the correctness hard-gate inside the judge (default on).
|
|
76
|
+
requireCorrectnessGate: z.boolean().optional(),
|
|
77
|
+
// ④ Test-tamper handling: 'off' (no check), 'flag' (annotate only,
|
|
78
|
+
// default), or 'block' (force insufficient-signal + reject-buffer).
|
|
79
|
+
tamperCheck: z.enum(['off', 'flag', 'block']).optional(),
|
|
80
|
+
})
|
|
81
|
+
.optional(),
|
|
82
|
+
// Loop v2 — CRITIC AGENT(基线智能体 baseline agent)baseline construction.
|
|
83
|
+
// 're-do' (default): the baseline arm RE-DOES the change under the prior
|
|
84
|
+
// policy vN — it resets the change's GENERATED artifacts (design.md,
|
|
85
|
+
// tasks.md), re-authors design under the installed vN template, then
|
|
86
|
+
// re-implements → gen-test → run-test. So advantage = reward(主臂) −
|
|
87
|
+
// reward(基线臂) reflects the POLICY change, not re-run noise. Faithful
|
|
88
|
+
// when the change's implementation is still uncommitted at episode time
|
|
89
|
+
// (the workflow default) and an isolated git worktree can be created;
|
|
90
|
+
// on a non-git copy fallback it degrades to a re-measure (documented).
|
|
91
|
+
// 're-test': the prior behaviour — re-run the EXISTING change's tests
|
|
92
|
+
// under vN's template (cheaper, but an already-authored change does not
|
|
93
|
+
// exercise the design template). Omitted ⇒ 're-do'.
|
|
94
|
+
critic: z
|
|
95
|
+
.object({
|
|
96
|
+
baselineMode: z.enum(['re-test', 're-do']).optional(),
|
|
97
|
+
})
|
|
98
|
+
.optional(),
|
|
63
99
|
})
|
|
64
100
|
.optional()
|
|
65
101
|
.describe('Per-canonical-target self-evolution toggles'),
|
|
@@ -246,6 +282,38 @@ export function readProjectConfig(projectRoot) {
|
|
|
246
282
|
else if (rawSE.editBudget !== undefined) {
|
|
247
283
|
console.warn(`Invalid 'selfEvolution.editBudget' in config (must be a number), ignoring`);
|
|
248
284
|
}
|
|
285
|
+
// Loop v2 — 奖励智能体 REWARD AGENT knobs. Resilient: the `reward` object is
|
|
286
|
+
// validated as a whole; an invalid object is dropped with a warning (the
|
|
287
|
+
// judge/aggregator default applies). Omitted ⇒ undefined (single-sample,
|
|
288
|
+
// flag-only — byte-identical to configs that never set `reward`).
|
|
289
|
+
const rewardSchema = ProjectConfigSchema.shape.selfEvolution
|
|
290
|
+
.unwrap()
|
|
291
|
+
.shape.reward.unwrap();
|
|
292
|
+
const rewardResult = rewardSchema.safeParse(rawSE.reward);
|
|
293
|
+
if (rewardResult.success) {
|
|
294
|
+
if (Object.keys(rewardResult.data).length > 0) {
|
|
295
|
+
selfEvolution.reward = rewardResult.data;
|
|
296
|
+
}
|
|
297
|
+
}
|
|
298
|
+
else if (rawSE.reward !== undefined) {
|
|
299
|
+
console.warn(`Invalid 'selfEvolution.reward' in config (samples/noiseFloor numbers, ` +
|
|
300
|
+
`orderSwap/requireCorrectnessGate booleans, tamperCheck off|flag|block), ignoring`);
|
|
301
|
+
}
|
|
302
|
+
// Loop v2 — CRITIC AGENT knobs. Resilient: a bad value is dropped with a
|
|
303
|
+
// warning (the critic default 're-do' then applies). Omitted ⇒ undefined
|
|
304
|
+
// (byte-identical to configs that never set `critic`).
|
|
305
|
+
const criticSchema = ProjectConfigSchema.shape.selfEvolution
|
|
306
|
+
.unwrap()
|
|
307
|
+
.shape.critic.unwrap();
|
|
308
|
+
const criticResult = criticSchema.safeParse(rawSE.critic);
|
|
309
|
+
if (criticResult.success) {
|
|
310
|
+
if (Object.keys(criticResult.data).length > 0) {
|
|
311
|
+
selfEvolution.critic = criticResult.data;
|
|
312
|
+
}
|
|
313
|
+
}
|
|
314
|
+
else if (rawSE.critic !== undefined) {
|
|
315
|
+
console.warn(`Invalid 'selfEvolution.critic' in config (baselineMode must be 're-test' or 're-do'), ignoring`);
|
|
316
|
+
}
|
|
249
317
|
config.selfEvolution = selfEvolution;
|
|
250
318
|
}
|
|
251
319
|
else {
|
|
@@ -4,23 +4,30 @@
|
|
|
4
4
|
*
|
|
5
5
|
* The CRITIC AGENT is an AGENT with the SAME input/output as the 主智能体 MAIN
|
|
6
6
|
* AGENT (frozen actor; the user's host agent running the current 策略 policy
|
|
7
|
-
* vN+1). It
|
|
8
|
-
* ISOLATED worktree
|
|
9
|
-
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
12
|
-
*
|
|
7
|
+
* vN+1). It produces the baseline arm for LAST episode's 策略 policy vN on the
|
|
8
|
+
* SAME change in an ISOLATED worktree (by default RE-DOING the change under vN —
|
|
9
|
+
* see {@link CriticBaselineMode}), so the 奖励智能体 REWARD AGENT can later 算分
|
|
10
|
+
* calculate reward(主臂)&reward(基线臂) and advantage = reward(主臂) −
|
|
11
|
+
* reward(基线臂). Only its baseline trajectory survives — 产物即弃 (worktree
|
|
12
|
+
* artifacts discarded): the worktree is torn down in `finally`, and the single
|
|
13
|
+
* durable output is the `baseline-arm/` capture in the episode store.
|
|
13
14
|
*
|
|
14
15
|
* This module orchestrates ONE baseline arm:
|
|
15
|
-
* 1. create an isolated worktree OUTSIDE the repo (git worktree
|
|
16
|
-
*
|
|
16
|
+
* 1. create an isolated worktree OUTSIDE the repo (git worktree at detached
|
|
17
|
+
* HEAD — which excludes the change's still-uncommitted implementation — else
|
|
18
|
+
* a recursive file copy fallback),
|
|
17
19
|
* 2. make it runnable (node_modules junction/symlink + the untracked surfaces
|
|
18
20
|
* the rerun reads),
|
|
21
|
+
* 2b. ('re-do' mode, the default) reset the copied change dir to its INPUTS —
|
|
22
|
+
* remove the GENERATED artifacts (design.md, tasks.md) + reports — so the
|
|
23
|
+
* rerun re-authors them under the installed prior policy ({@link
|
|
24
|
+
* resetChangeArtifactsForRedo}),
|
|
19
25
|
* 3. INSTALL 策略 policy vN into the worktree from the byte-for-byte version
|
|
20
26
|
* snapshot, so the baseline arm reruns the PRIOR policy and not the live
|
|
21
27
|
* templates,
|
|
22
28
|
* 4. rerun headlessly via the host-aware {@link runHeadlessAgent} with
|
|
23
|
-
* cwd = worktree
|
|
29
|
+
* cwd = worktree ('re-do' regenerates design→tasks→impl→tests; 're-test'
|
|
30
|
+
* re-runs the existing change's tests), never editing canonical files,
|
|
24
31
|
* 5. persist the baseline arm (stdout always; the claude session transcript +
|
|
25
32
|
* action skeleton when discoverable; an `objective.json` shaped IDENTICALLY
|
|
26
33
|
* to the main arm's), and
|
|
@@ -32,6 +39,7 @@
|
|
|
32
39
|
* strips every arm/candidate word.
|
|
33
40
|
*/
|
|
34
41
|
import { spawn as nodeSpawn } from 'node:child_process';
|
|
42
|
+
import type { ObservedTestFailure } from '../trajectory/facts.js';
|
|
35
43
|
/** Error thrown when the worktree could not be created (git AND copy fallback failed). */
|
|
36
44
|
export declare class CriticWorktreeError extends Error {
|
|
37
45
|
constructor(message: string);
|
|
@@ -66,6 +74,18 @@ export interface ArmObjective {
|
|
|
66
74
|
verified: boolean;
|
|
67
75
|
observedStatus: 'success' | 'failure' | null;
|
|
68
76
|
measuredAt: string;
|
|
77
|
+
/**
|
|
78
|
+
* Whether a real test-runner invocation was OBSERVED in the trajectory (vs a
|
|
79
|
+
* self-reported pass rate). Lets the 奖励智能体 REWARD AGENT calibrate
|
|
80
|
+
* confidence on the correctness anchor (P2). Omitted on older captures.
|
|
81
|
+
*/
|
|
82
|
+
testRunObserved?: boolean;
|
|
83
|
+
/**
|
|
84
|
+
* Failing test ids (+ assertion lines) parsed from the OBSERVED runner output
|
|
85
|
+
* — the per-arm failure CONTENT the judge contrasts across arms (P1). Omitted
|
|
86
|
+
* when nothing was recognized (keeps JSON baselines stable).
|
|
87
|
+
*/
|
|
88
|
+
observedFailures?: ObservedTestFailure[];
|
|
69
89
|
}
|
|
70
90
|
export interface ShouldRunCriticAgentOptions {
|
|
71
91
|
repoRoot: string;
|
|
@@ -110,7 +130,22 @@ export declare function shouldRunCriticAgent(opts: ShouldRunCriticAgentOptions):
|
|
|
110
130
|
* canonical files, and to print the runner summary line verbatim as its final
|
|
111
131
|
* line.
|
|
112
132
|
*/
|
|
113
|
-
|
|
133
|
+
/**
|
|
134
|
+
* How the CRITIC AGENT builds the baseline arm:
|
|
135
|
+
* - 're-do' (default): RE-DO the change from its inputs under the prior policy
|
|
136
|
+
* vN — reset the GENERATED artifacts (design.md, tasks.md), re-author design
|
|
137
|
+
* under the installed vN template, then re-implement → gen-test → run-test.
|
|
138
|
+
* advantage = reward(主臂) − reward(基线臂) then reflects the POLICY change,
|
|
139
|
+
* because the design template is actually exercised (it shapes the freshly
|
|
140
|
+
* authored design). This is faithful when the change's implementation is
|
|
141
|
+
* still uncommitted at episode time (the workflow default — so the isolated
|
|
142
|
+
* git worktree, checked out at detached HEAD, holds the PRE-change code).
|
|
143
|
+
* - 're-test': re-run the EXISTING change's tests under vN's template. Cheaper,
|
|
144
|
+
* but an already-authored change never exercises the design template, so the
|
|
145
|
+
* baseline mostly re-measures the main arm's own work.
|
|
146
|
+
*/
|
|
147
|
+
export type CriticBaselineMode = 're-test' | 're-do';
|
|
148
|
+
export declare function assembleCriticPrompt(changeName: string, mode?: CriticBaselineMode): string;
|
|
114
149
|
export interface RunCriticAgentOptions {
|
|
115
150
|
repoRoot: string;
|
|
116
151
|
targetId: string;
|
|
@@ -118,6 +153,11 @@ export interface RunCriticAgentOptions {
|
|
|
118
153
|
episodeId: string;
|
|
119
154
|
/** LAST episode's policy version vN, from {@link shouldRunCriticAgent}. */
|
|
120
155
|
baselineVersion: number;
|
|
156
|
+
/**
|
|
157
|
+
* How the baseline arm is built (see {@link CriticBaselineMode}). Default
|
|
158
|
+
* 're-do' (regenerate the change under vN so the policy is exercised).
|
|
159
|
+
*/
|
|
160
|
+
baselineMode?: CriticBaselineMode;
|
|
121
161
|
/** Injectable spawn seam for tests; defaults to node's spawn. */
|
|
122
162
|
spawn?: typeof nodeSpawn;
|
|
123
163
|
/** Hard timeout per agent run (ms). Default 600000 (10 min). */
|
|
@@ -138,6 +178,8 @@ export interface RunCriticAgentResult {
|
|
|
138
178
|
worktreePath: string;
|
|
139
179
|
/** How the worktree was created. */
|
|
140
180
|
worktreeMode: 'git-worktree' | 'copy-fallback';
|
|
181
|
+
/** Which baseline construction ran (see {@link CriticBaselineMode}). */
|
|
182
|
+
baselineMode: CriticBaselineMode;
|
|
141
183
|
}
|
|
142
184
|
/**
|
|
143
185
|
* Run the CRITIC AGENT(基线智能体 baseline agent)'s full baseline arm and
|
|
@@ -4,23 +4,30 @@
|
|
|
4
4
|
*
|
|
5
5
|
* The CRITIC AGENT is an AGENT with the SAME input/output as the 主智能体 MAIN
|
|
6
6
|
* AGENT (frozen actor; the user's host agent running the current 策略 policy
|
|
7
|
-
* vN+1). It
|
|
8
|
-
* ISOLATED worktree
|
|
9
|
-
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
12
|
-
*
|
|
7
|
+
* vN+1). It produces the baseline arm for LAST episode's 策略 policy vN on the
|
|
8
|
+
* SAME change in an ISOLATED worktree (by default RE-DOING the change under vN —
|
|
9
|
+
* see {@link CriticBaselineMode}), so the 奖励智能体 REWARD AGENT can later 算分
|
|
10
|
+
* calculate reward(主臂)&reward(基线臂) and advantage = reward(主臂) −
|
|
11
|
+
* reward(基线臂). Only its baseline trajectory survives — 产物即弃 (worktree
|
|
12
|
+
* artifacts discarded): the worktree is torn down in `finally`, and the single
|
|
13
|
+
* durable output is the `baseline-arm/` capture in the episode store.
|
|
13
14
|
*
|
|
14
15
|
* This module orchestrates ONE baseline arm:
|
|
15
|
-
* 1. create an isolated worktree OUTSIDE the repo (git worktree
|
|
16
|
-
*
|
|
16
|
+
* 1. create an isolated worktree OUTSIDE the repo (git worktree at detached
|
|
17
|
+
* HEAD — which excludes the change's still-uncommitted implementation — else
|
|
18
|
+
* a recursive file copy fallback),
|
|
17
19
|
* 2. make it runnable (node_modules junction/symlink + the untracked surfaces
|
|
18
20
|
* the rerun reads),
|
|
21
|
+
* 2b. ('re-do' mode, the default) reset the copied change dir to its INPUTS —
|
|
22
|
+
* remove the GENERATED artifacts (design.md, tasks.md) + reports — so the
|
|
23
|
+
* rerun re-authors them under the installed prior policy ({@link
|
|
24
|
+
* resetChangeArtifactsForRedo}),
|
|
19
25
|
* 3. INSTALL 策略 policy vN into the worktree from the byte-for-byte version
|
|
20
26
|
* snapshot, so the baseline arm reruns the PRIOR policy and not the live
|
|
21
27
|
* templates,
|
|
22
28
|
* 4. rerun headlessly via the host-aware {@link runHeadlessAgent} with
|
|
23
|
-
* cwd = worktree
|
|
29
|
+
* cwd = worktree ('re-do' regenerates design→tasks→impl→tests; 're-test'
|
|
30
|
+
* re-runs the existing change's tests), never editing canonical files,
|
|
24
31
|
* 5. persist the baseline arm (stdout always; the claude session transcript +
|
|
25
32
|
* action skeleton when discoverable; an `objective.json` shaped IDENTICALLY
|
|
26
33
|
* to the main arm's), and
|
|
@@ -105,29 +112,79 @@ export async function shouldRunCriticAgent(opts) {
|
|
|
105
112
|
baselineVersion,
|
|
106
113
|
};
|
|
107
114
|
}
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
115
|
+
export function assembleCriticPrompt(changeName, mode = 're-do') {
|
|
116
|
+
if (mode === 're-test') {
|
|
117
|
+
return [
|
|
118
|
+
`You are RE-RUNNING an existing SynergySpec change end-to-end to measure its`,
|
|
119
|
+
`test outcome under the artifact templates already installed in your working`,
|
|
120
|
+
`directory. This is a measurement run only — do NOT modify any canonical`,
|
|
121
|
+
`workflow prompt, artifact template, or schema, and do NOT edit the frozen`,
|
|
122
|
+
`gen-test/run-test oracle.`,
|
|
123
|
+
``,
|
|
124
|
+
`Change name: ${changeName}`,
|
|
125
|
+
``,
|
|
126
|
+
`Run the change's tests (apply → gen-test → run-test) and output the test`,
|
|
127
|
+
`runner's SUMMARY LINE verbatim as the final line of your response, e.g.`,
|
|
128
|
+
`"Tests 12 passed | 1 failed (13)" or "5 passed, 0 failed in 0.4s".`,
|
|
129
|
+
].join('\n');
|
|
130
|
+
}
|
|
131
|
+
// 're-do' — regenerate the change end-to-end so the installed prior-policy
|
|
132
|
+
// design template is actually exercised.
|
|
117
133
|
return [
|
|
118
|
-
`You are RE-
|
|
119
|
-
`
|
|
120
|
-
`
|
|
121
|
-
`workflow prompt, artifact template, or schema, and do NOT edit the frozen`,
|
|
122
|
-
`gen-test/run-test oracle.`,
|
|
134
|
+
`You are RE-DOING an existing SynergySpec change from scratch under the`,
|
|
135
|
+
`artifact templates currently installed in your working directory, to measure`,
|
|
136
|
+
`the test outcome those templates produce. This is a measurement run.`,
|
|
123
137
|
``,
|
|
124
138
|
`Change name: ${changeName}`,
|
|
125
139
|
``,
|
|
126
|
-
`
|
|
127
|
-
`
|
|
128
|
-
`
|
|
140
|
+
`The change's INPUT artifacts (proposal.md, usecases.md, specs/) are present.`,
|
|
141
|
+
`Its design.md and tasks.md have been intentionally REMOVED so you regenerate`,
|
|
142
|
+
`them under the installed templates. Re-create the change end-to-end:`,
|
|
143
|
+
``,
|
|
144
|
+
`1. Regenerate the design — run`,
|
|
145
|
+
` synergyspec-selfevolving instructions design --change "${changeName}" --json`,
|
|
146
|
+
` read the returned template + dependency files (proposal.md, usecases.md),`,
|
|
147
|
+
` and author design.md using that template as the structure.`,
|
|
148
|
+
`2. Regenerate the tasks the same way`,
|
|
149
|
+
` (synergyspec-selfevolving instructions tasks --change "${changeName}" --json),`,
|
|
150
|
+
` then apply them — implement the code each task requires.`,
|
|
151
|
+
`3. Generate the change's tests (gen-test), then run the test runner (run-test).`,
|
|
152
|
+
``,
|
|
153
|
+
`Do NOT modify any canonical workflow prompt, artifact TEMPLATE, or schema, and`,
|
|
154
|
+
`do NOT edit the frozen gen-test/run-test oracle. Write ONLY the change's own`,
|
|
155
|
+
`artifacts (design.md, tasks.md in the change dir) and the implementation`,
|
|
156
|
+
`source the tasks require.`,
|
|
157
|
+
``,
|
|
158
|
+
`Output the test runner's SUMMARY LINE verbatim as the final line of your`,
|
|
159
|
+
`response, e.g. "Tests 12 passed | 1 failed (13)" or "5 passed, 0 failed in 0.4s".`,
|
|
129
160
|
].join('\n');
|
|
130
161
|
}
|
|
162
|
+
/**
|
|
163
|
+
* Generated artifacts a 're-do' baseline removes from the copied change dir
|
|
164
|
+
* before the rerun, so the agent re-authors them under the installed prior
|
|
165
|
+
* policy. design.md + tasks.md are the policy-shaped chain; the report files are
|
|
166
|
+
* post-implementation residue that would otherwise read the change as already
|
|
167
|
+
* applied (status keys doneness off file existence). The INPUT artifacts
|
|
168
|
+
* (proposal.md, usecases.md, specs/) — which define "the same task" — are KEPT.
|
|
169
|
+
*/
|
|
170
|
+
const REDO_REGENERATED_ARTIFACTS = [
|
|
171
|
+
'design.md',
|
|
172
|
+
'tasks.md',
|
|
173
|
+
'test-report.md',
|
|
174
|
+
'test-plan.md',
|
|
175
|
+
'spec-tests.md',
|
|
176
|
+
'spec-blast-radius.md',
|
|
177
|
+
'verification-report.md',
|
|
178
|
+
];
|
|
179
|
+
/**
|
|
180
|
+
* Reset a copied change dir to its inputs for a 're-do' baseline (see
|
|
181
|
+
* {@link REDO_REGENERATED_ARTIFACTS}). Best-effort: a missing artifact is fine.
|
|
182
|
+
*/
|
|
183
|
+
async function resetChangeArtifactsForRedo(changeDir) {
|
|
184
|
+
for (const rel of REDO_REGENERATED_ARTIFACTS) {
|
|
185
|
+
await fs.rm(path.join(changeDir, rel), { force: true }).catch(() => { });
|
|
186
|
+
}
|
|
187
|
+
}
|
|
131
188
|
const NODE_MODULES = 'node_modules';
|
|
132
189
|
const CONFIG_DIR = '.synergyspec-selfevolving';
|
|
133
190
|
const SCHEMAS_REL = path.join('synergyspec-selfevolving', 'schemas');
|
|
@@ -143,6 +200,7 @@ export async function runCriticAgent(opts) {
|
|
|
143
200
|
const spawnImpl = opts.spawn ?? nodeSpawn;
|
|
144
201
|
const timeoutMs = opts.timeoutMs ?? 600000;
|
|
145
202
|
const homeDir = opts.homeDir ?? os.homedir();
|
|
203
|
+
const baselineMode = opts.baselineMode ?? 're-do';
|
|
146
204
|
if (!Number.isInteger(opts.baselineVersion) || opts.baselineVersion < 0) {
|
|
147
205
|
throw new Error(`runCriticAgent requires a non-negative integer baselineVersion, got ${JSON.stringify(opts.baselineVersion)}`);
|
|
148
206
|
}
|
|
@@ -155,13 +213,31 @@ export async function runCriticAgent(opts) {
|
|
|
155
213
|
try {
|
|
156
214
|
// 1) Isolated worktree OUTSIDE the repo (git worktree --detach, else copy).
|
|
157
215
|
worktreeMode = await createIsolatedWorktree(repoRoot, worktreePath, spawnImpl);
|
|
216
|
+
// 're-do' fidelity needs the detached-HEAD tree (pre-change code). The copy
|
|
217
|
+
// fallback (non-git repo) brings the LIVE tree — including the change's
|
|
218
|
+
// uncommitted implementation — so it cannot reach the pre-change state and
|
|
219
|
+
// degrades to a re-measure. Surface that so a degraded baseline is not silent.
|
|
220
|
+
if (baselineMode === 're-do' && worktreeMode === 'copy-fallback') {
|
|
221
|
+
console.warn(`[critic] re-do baseline degraded for "${opts.changeName}": no git worktree ` +
|
|
222
|
+
`(copy fallback) — the change's implementation could not be isolated, so the ` +
|
|
223
|
+
`baseline re-measures rather than re-does. Use a git repo, or set ` +
|
|
224
|
+
`selfEvolution.critic.baselineMode: re-test to silence this.`);
|
|
225
|
+
}
|
|
158
226
|
// 2) Make it runnable: node_modules junction/symlink + untracked surfaces.
|
|
159
227
|
await makeWorktreeRunnable(repoRoot, worktreePath, opts.changeName);
|
|
228
|
+
// 2b) 're-do': reset the copied change dir to its inputs so the rerun
|
|
229
|
+
// RE-AUTHORS design+tasks under the prior policy (and re-implements on
|
|
230
|
+
// the pre-change code the detached-HEAD worktree already holds). The
|
|
231
|
+
// fidelity over 're-test' is that the design TEMPLATE is actually
|
|
232
|
+
// exercised, so advantage reflects the policy change, not re-run noise.
|
|
233
|
+
if (baselineMode === 're-do') {
|
|
234
|
+
await resetChangeArtifactsForRedo(path.join(worktreePath, 'synergyspec-selfevolving', 'changes', opts.changeName));
|
|
235
|
+
}
|
|
160
236
|
// 3) INSTALL 策略 policy vN (byte-for-byte snapshot files) — the fidelity
|
|
161
237
|
// fix the old GA replay never performed.
|
|
162
238
|
await installPolicyVersion(repoRoot, worktreePath, opts.targetId, opts.baselineVersion);
|
|
163
|
-
// 4) Rerun headlessly with cwd = worktree (
|
|
164
|
-
const prompt = assembleCriticPrompt(opts.changeName);
|
|
239
|
+
// 4) Rerun headlessly with cwd = worktree (re-do: regenerate; re-test: measure).
|
|
240
|
+
const prompt = assembleCriticPrompt(opts.changeName, baselineMode);
|
|
165
241
|
const run = await runHeadlessAgent(prompt, {
|
|
166
242
|
cwd: worktreePath,
|
|
167
243
|
spawn: spawnImpl,
|
|
@@ -212,6 +288,10 @@ export async function runCriticAgent(opts) {
|
|
|
212
288
|
verified,
|
|
213
289
|
observedStatus,
|
|
214
290
|
measuredAt,
|
|
291
|
+
...(facts ? { testRunObserved: facts.testRunObserved } : {}),
|
|
292
|
+
...(facts?.observedFailures && facts.observedFailures.length > 0
|
|
293
|
+
? { observedFailures: facts.observedFailures }
|
|
294
|
+
: {}),
|
|
215
295
|
};
|
|
216
296
|
// Transcript: the claude session `.jsonl` when discovered, else stdout.
|
|
217
297
|
let transcriptDiscovered = false;
|
|
@@ -256,6 +336,7 @@ export async function runCriticAgent(opts) {
|
|
|
256
336
|
transcriptDiscovered,
|
|
257
337
|
worktreePath,
|
|
258
338
|
worktreeMode,
|
|
339
|
+
baselineMode,
|
|
259
340
|
};
|
|
260
341
|
}
|
|
261
342
|
finally {
|
|
@@ -47,9 +47,10 @@
|
|
|
47
47
|
import { spawn as nodeSpawn } from 'node:child_process';
|
|
48
48
|
import type { LearnReport } from '../learn.js';
|
|
49
49
|
import type { TrajectorySource } from '../trajectory/source.js';
|
|
50
|
-
import { type PolicyResolveFiles } from './policy/policy-store.js';
|
|
50
|
+
import { type PolicyResolveFiles, type PolicyLedgerEntry } from './policy/policy-store.js';
|
|
51
51
|
import { type EpisodeStage } from './episode-store.js';
|
|
52
|
-
import { type ArmObjective } from './critic-agent.js';
|
|
52
|
+
import { type ArmObjective, type CriticBaselineMode } from './critic-agent.js';
|
|
53
|
+
import { type RewardConfig } from './reward-aggregator.js';
|
|
53
54
|
import { type RunEvolvingAgentResult } from './evolving-agent.js';
|
|
54
55
|
/** The 主智能体 MAIN AGENT (policy vN+1) capture the orchestrator records. */
|
|
55
56
|
export interface MainArmCapture {
|
|
@@ -107,6 +108,30 @@ export interface CaptureMainArmOptions {
|
|
|
107
108
|
export declare function captureMainArm(opts: CaptureMainArmOptions): Promise<MainArmCapture>;
|
|
108
109
|
/** The decision the orchestrator made on the main arm's edits. */
|
|
109
110
|
export type EpisodeDecision = 'rolled-back' | 'kept' | 'abstained';
|
|
111
|
+
/**
|
|
112
|
+
* Count the consecutive trailing rolled-back episodes in the 版本账本 ledger.
|
|
113
|
+
*
|
|
114
|
+
* A bad streak's ledger tail reads `…, evolve, rollback, evolve, rollback` — the
|
|
115
|
+
* 演进智能体 EVOLVING AGENT appends exactly one 'evolve' after each decision, so
|
|
116
|
+
* each counted rollback is reached by skipping the single 'evolve' that follows
|
|
117
|
+
* it. A 'kept' episode leaves a bare 'evolve' (no following rollback) which
|
|
118
|
+
* breaks the streak, as do 'init'/'refused'. Returns 0 when the head is not a
|
|
119
|
+
* rollback (the last episode kept). Pure.
|
|
120
|
+
*/
|
|
121
|
+
export declare function consecutiveRollbacks(ledger: readonly PolicyLedgerEntry[]): number;
|
|
122
|
+
/**
|
|
123
|
+
* 步长 step-size schedule for the 演进智能体 EVOLVING AGENT's edit budget L.
|
|
124
|
+
*
|
|
125
|
+
* Backtracking-line-search / trust-region move (and SkillOpt's decaying edit
|
|
126
|
+
* budget): after an edit LOST ground and was rolled back, the next edit should
|
|
127
|
+
* be SMALLER — a smaller blast radius is cheaper to undo and its cause is more
|
|
128
|
+
* legible, and it keeps a struggling lineage from drifting via repeated
|
|
129
|
+
* full-size swings. HALVE the base budget once per consecutive rolled-back
|
|
130
|
+
* episode, never below `minBudget` (itself clamped to `base`, so a caller-shrunk
|
|
131
|
+
* base is never RAISED). A healthy lineage (no trailing rollback) keeps `base`.
|
|
132
|
+
* Pure.
|
|
133
|
+
*/
|
|
134
|
+
export declare function scheduledEditBudget(ledger: readonly PolicyLedgerEntry[], base: number, minBudget?: number): number;
|
|
110
135
|
export interface RunEpisodeOptions {
|
|
111
136
|
repoRoot: string;
|
|
112
137
|
targetId: string;
|
|
@@ -123,6 +148,18 @@ export interface RunEpisodeOptions {
|
|
|
123
148
|
advantageRollbackThreshold?: number;
|
|
124
149
|
/** Edit budget L for the 演进智能体 EVOLVING AGENT. Default 40. */
|
|
125
150
|
editBudget?: number;
|
|
151
|
+
/**
|
|
152
|
+
* 奖励智能体 REWARD AGENT judge-quality knobs (from `selfEvolution.reward`).
|
|
153
|
+
* Omitted ⇒ single sample, flag-only tamper (historical, zero extra spawns).
|
|
154
|
+
*/
|
|
155
|
+
reward?: RewardConfig;
|
|
156
|
+
/**
|
|
157
|
+
* CRITIC AGENT(基线智能体)baseline construction (from `selfEvolution.critic`).
|
|
158
|
+
* Omitted ⇒ the critic's default 're-do' (regenerate the change under vN).
|
|
159
|
+
*/
|
|
160
|
+
critic?: {
|
|
161
|
+
baselineMode?: CriticBaselineMode;
|
|
162
|
+
};
|
|
126
163
|
/** Injectable spawn seam — threaded to ALL THREE agents. Defaults to node's spawn. */
|
|
127
164
|
spawn?: typeof nodeSpawn;
|
|
128
165
|
/** Injectable clock for the lock + episode id; defaults to `new Date()`. */
|