synergyspec-selfevolving 2.0.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. package/dist/commands/learn.js +7 -0
  2. package/dist/commands/self-evolution-episode.js +8 -0
  3. package/dist/core/fitness/test-failures.js +10 -2
  4. package/dist/core/project-config.d.ts +17 -0
  5. package/dist/core/project-config.js +68 -0
  6. package/dist/core/self-evolution/critic-agent.d.ts +52 -10
  7. package/dist/core/self-evolution/critic-agent.js +109 -28
  8. package/dist/core/self-evolution/episode-orchestrator.d.ts +39 -2
  9. package/dist/core/self-evolution/episode-orchestrator.js +157 -10
  10. package/dist/core/self-evolution/evolving-agent.d.ts +63 -17
  11. package/dist/core/self-evolution/evolving-agent.js +106 -20
  12. package/dist/core/self-evolution/host-harness.d.ts +14 -14
  13. package/dist/core/self-evolution/host-harness.js +48 -22
  14. package/dist/core/self-evolution/index.d.ts +2 -0
  15. package/dist/core/self-evolution/index.js +2 -0
  16. package/dist/core/self-evolution/policy/prediction-reconcile.d.ts +54 -0
  17. package/dist/core/self-evolution/policy/prediction-reconcile.js +191 -0
  18. package/dist/core/self-evolution/policy/reject-buffer.d.ts +9 -2
  19. package/dist/core/self-evolution/policy/reject-buffer.js +4 -2
  20. package/dist/core/self-evolution/reward-agent.d.ts +159 -14
  21. package/dist/core/self-evolution/reward-agent.js +445 -69
  22. package/dist/core/self-evolution/reward-aggregator.d.ts +59 -0
  23. package/dist/core/self-evolution/reward-aggregator.js +262 -0
  24. package/dist/core/self-evolution/tamper-check.d.ts +24 -0
  25. package/dist/core/self-evolution/tamper-check.js +236 -0
  26. package/dist/core/templates/workflows/gen-tests.js +1 -1
  27. package/dist/core/templates/workflows/learn.js +7 -6
  28. package/dist/core/trajectory/scrub.d.ts +27 -0
  29. package/dist/core/trajectory/scrub.js +79 -0
  30. package/dist/core/trajectory/skeleton.d.ts +27 -1
  31. package/dist/core/trajectory/skeleton.js +152 -8
  32. package/package.json +1 -1
  33. package/dist/core/self-evolution/ga-selection.d.ts +0 -94
  34. package/dist/core/self-evolution/ga-selection.js +0 -153
  35. package/dist/core/self-evolution/proposer-agent.d.ts +0 -182
  36. package/dist/core/self-evolution/proposer-agent.js +0 -326
  37. package/dist/core/self-evolution/replay-runner.d.ts +0 -100
  38. package/dist/core/self-evolution/replay-runner.js +0 -170
  39. package/dist/core/self-evolution/replay.d.ts +0 -45
  40. package/dist/core/self-evolution/replay.js +0 -56
  41. package/dist/core/self-evolution/template-variants.d.ts +0 -62
  42. package/dist/core/self-evolution/template-variants.js +0 -171
  43. package/dist/core/self-evolution/trajectory.d.ts +0 -65
  44. package/dist/core/self-evolution/trajectory.js +0 -185
@@ -179,12 +179,19 @@ export function registerLearnCommand(program, deps = {}) {
179
179
  changeName: report.changeName,
180
180
  report,
181
181
  });
182
+ // Thread the loop-v2 reward judge-quality config (samples / noiseFloor /
183
+ // orderSwap / tamperCheck). Omitted ⇒ the orchestrator's single-sample,
184
+ // flag-only default (no extra spawns).
185
+ const episodeConfig = readProjectConfig(projectRoot);
182
186
  episodeOutcome = await runEpisodeImpl({
183
187
  repoRoot: projectRoot,
184
188
  targetId: concreteEvolveTarget.targetId,
185
189
  changeName: report.changeName,
186
190
  changeDirPath: report.changeDir,
187
191
  mainArm,
192
+ ...(episodeConfig?.selfEvolution?.reward
193
+ ? { reward: episodeConfig.selfEvolution.reward }
194
+ : {}),
188
195
  });
189
196
  }
190
197
  if (options.json) {
@@ -9,6 +9,7 @@ lookupCanonicalTarget, listCanonicalTargets, DESIGN_ARTIFACT_TARGET_ID, } from '
9
9
  import { generateLearnReport } from '../core/learn.js';
10
10
  import { validateExplicitTrajectoryHandle } from '../core/learn/trajectory-discovery.js';
11
11
  import { validateChangeExists } from './workflow/shared.js';
12
+ import { readProjectConfig } from '../core/project-config.js';
12
13
  /**
13
14
  * The 主智能体 MAIN AGENT arm is graded from a learn report exactly the way the
14
15
  * `learn` command grades it (the orchestrator REUSES that grading; it never
@@ -148,6 +149,7 @@ export async function runEpisodeCommand(args, opts) {
148
149
  // orchestrator's behavior when it is unaware of the flag.
149
150
  let outcome;
150
151
  try {
152
+ const episodeConfig = readProjectConfig(opts.repoRoot);
151
153
  const episodeOptions = {
152
154
  repoRoot: opts.repoRoot,
153
155
  targetId,
@@ -155,6 +157,12 @@ export async function runEpisodeCommand(args, opts) {
155
157
  changeDirPath,
156
158
  mainArm,
157
159
  ...(args.noBaseline ? { skipBaseline: true } : {}),
160
+ ...(episodeConfig?.selfEvolution?.reward
161
+ ? { reward: episodeConfig.selfEvolution.reward }
162
+ : {}),
163
+ ...(episodeConfig?.selfEvolution?.critic
164
+ ? { critic: episodeConfig.selfEvolution.critic }
165
+ : {}),
158
166
  };
159
167
  outcome = await runEpisode(episodeOptions);
160
168
  }
@@ -57,6 +57,14 @@ function findAssertion(lines, from) {
57
57
  function cleanToken(value) {
58
58
  return value.replace(/^[`'"]+|[`'"]+$/g, '');
59
59
  }
60
+ /**
61
+ * POSIX-normalize a path so a Windows pytest path (`tests\test_x.py`) matches the
62
+ * already-POSIX-normalized file-edit paths in the action skeleton — the reward
63
+ * agent's renamed/edited-test caveat compares the two by exact string.
64
+ */
65
+ function toPosix(p) {
66
+ return p.replace(/\\/g, '/');
67
+ }
60
68
  /**
61
69
  * Extract failing test ids + assertion lines from observed runner output.
62
70
  * Returns `[]` when nothing is recognized. Deduplicates by testId, preserves
@@ -86,7 +94,7 @@ export function parseTestFailures(output) {
86
94
  const inline = pytest[2]?.trim();
87
95
  push({
88
96
  testId,
89
- file: testId.split('::')[0],
97
+ file: toPosix(testId.split('::')[0]),
90
98
  ...(inline
91
99
  ? { assertion: capAssertion(inline) }
92
100
  : (() => {
@@ -98,7 +106,7 @@ export function parseTestFailures(output) {
98
106
  }
99
107
  const vitest = VITEST_FAIL_RE.exec(line);
100
108
  if (vitest) {
101
- const file = cleanToken(vitest[1]);
109
+ const file = toPosix(cleanToken(vitest[1]));
102
110
  const rest = vitest[2]?.trim();
103
111
  const testId = rest ? `${file} > ${rest}` : file;
104
112
  const assertion = findAssertion(lines, i);
@@ -27,6 +27,23 @@ export declare const ProjectConfigSchema: z.ZodObject<{
27
27
  focus: z.ZodOptional<z.ZodBoolean>;
28
28
  advantageRollbackThreshold: z.ZodOptional<z.ZodNumber>;
29
29
  editBudget: z.ZodOptional<z.ZodNumber>;
30
+ reward: z.ZodOptional<z.ZodObject<{
31
+ samples: z.ZodOptional<z.ZodNumber>;
32
+ noiseFloor: z.ZodOptional<z.ZodNumber>;
33
+ orderSwap: z.ZodOptional<z.ZodBoolean>;
34
+ requireCorrectnessGate: z.ZodOptional<z.ZodBoolean>;
35
+ tamperCheck: z.ZodOptional<z.ZodEnum<{
36
+ off: "off";
37
+ flag: "flag";
38
+ block: "block";
39
+ }>>;
40
+ }, z.core.$strip>>;
41
+ critic: z.ZodOptional<z.ZodObject<{
42
+ baselineMode: z.ZodOptional<z.ZodEnum<{
43
+ "re-test": "re-test";
44
+ "re-do": "re-do";
45
+ }>>;
46
+ }, z.core.$strip>>;
30
47
  }, z.core.$strip>>;
31
48
  health: z.ZodOptional<z.ZodObject<{
32
49
  source: z.ZodDefault<z.ZodEnum<{
@@ -60,6 +60,42 @@ export const ProjectConfigSchema = z.object({
60
60
  // 演进智能体 EVOLVING AGENT's ONE bounded edit may total. Default 40.
61
61
  // Optional/omitted ⇒ the agent's DEFAULT_EVOLVING_AGENT_EDIT_BUDGET applies.
62
62
  editBudget: z.number().optional(),
63
+ // Loop v2 — 奖励智能体 REWARD AGENT judge-quality knobs. ALL optional; omitted
64
+ // ⇒ the historical single-sample, flag-only behaviour (no extra LLM spawns).
65
+ reward: z
66
+ .object({
67
+ // ② How many judged duels per episode. Default 1 (single sample, no
68
+ // extra spawns). >1 enables the A/A noise floor + SPRT + order-swap.
69
+ samples: z.number().optional(),
70
+ // ② Minimum |advantage| to trust; within the floor ⇒ insufficient-signal.
71
+ // Omitted ⇒ measured from an A/A pair when samples>1, else unused.
72
+ noiseFloor: z.number().optional(),
73
+ // ③ Swap arm presentation order across samples to cancel position bias.
74
+ orderSwap: z.boolean().optional(),
75
+ // ① Enforce the correctness hard-gate inside the judge (default on).
76
+ requireCorrectnessGate: z.boolean().optional(),
77
+ // ④ Test-tamper handling: 'off' (no check), 'flag' (annotate only,
78
+ // default), or 'block' (force insufficient-signal + reject-buffer).
79
+ tamperCheck: z.enum(['off', 'flag', 'block']).optional(),
80
+ })
81
+ .optional(),
82
+ // Loop v2 — CRITIC AGENT(基线智能体 baseline agent)baseline construction.
83
+ // 're-do' (default): the baseline arm RE-DOES the change under the prior
84
+ // policy vN — it resets the change's GENERATED artifacts (design.md,
85
+ // tasks.md), re-authors design under the installed vN template, then
86
+ // re-implements → gen-test → run-test. So advantage = reward(主臂) −
87
+ // reward(基线臂) reflects the POLICY change, not re-run noise. Faithful
88
+ // when the change's implementation is still uncommitted at episode time
89
+ // (the workflow default) and an isolated git worktree can be created;
90
+ // on a non-git copy fallback it degrades to a re-measure (documented).
91
+ // 're-test': the prior behaviour — re-run the EXISTING change's tests
92
+ // under vN's template (cheaper, but an already-authored change does not
93
+ // exercise the design template). Omitted ⇒ 're-do'.
94
+ critic: z
95
+ .object({
96
+ baselineMode: z.enum(['re-test', 're-do']).optional(),
97
+ })
98
+ .optional(),
63
99
  })
64
100
  .optional()
65
101
  .describe('Per-canonical-target self-evolution toggles'),
@@ -246,6 +282,38 @@ export function readProjectConfig(projectRoot) {
246
282
  else if (rawSE.editBudget !== undefined) {
247
283
  console.warn(`Invalid 'selfEvolution.editBudget' in config (must be a number), ignoring`);
248
284
  }
285
+ // Loop v2 — 奖励智能体 REWARD AGENT knobs. Resilient: each sub-field is
286
+ // validated independently; a bad value is dropped with a warning (the
287
+ // judge/aggregator default applies). Omitted ⇒ undefined (single-sample,
288
+ // flag-only — byte-identical to configs that never set `reward`).
289
+ const rewardSchema = ProjectConfigSchema.shape.selfEvolution
290
+ .unwrap()
291
+ .shape.reward.unwrap();
292
+ const rewardResult = rewardSchema.safeParse(rawSE.reward);
293
+ if (rewardResult.success) {
294
+ if (Object.keys(rewardResult.data).length > 0) {
295
+ selfEvolution.reward = rewardResult.data;
296
+ }
297
+ }
298
+ else if (rawSE.reward !== undefined) {
299
+ console.warn(`Invalid 'selfEvolution.reward' in config (samples/noiseFloor numbers, ` +
300
+ `orderSwap/requireCorrectnessGate booleans, tamperCheck off|flag|block), ignoring`);
301
+ }
302
+ // Loop v2 — CRITIC AGENT knobs. Resilient: a bad value is dropped with a
303
+ // warning (the critic default 're-do' then applies). Omitted ⇒ undefined
304
+ // (byte-identical to configs that never set `critic`).
305
+ const criticSchema = ProjectConfigSchema.shape.selfEvolution
306
+ .unwrap()
307
+ .shape.critic.unwrap();
308
+ const criticResult = criticSchema.safeParse(rawSE.critic);
309
+ if (criticResult.success) {
310
+ if (Object.keys(criticResult.data).length > 0) {
311
+ selfEvolution.critic = criticResult.data;
312
+ }
313
+ }
314
+ else if (rawSE.critic !== undefined) {
315
+ console.warn(`Invalid 'selfEvolution.critic' in config (baselineMode must be 're-test' or 're-do'), ignoring`);
316
+ }
249
317
  config.selfEvolution = selfEvolution;
250
318
  }
251
319
  else {
@@ -4,23 +4,30 @@
4
4
  *
5
5
  * The CRITIC AGENT is an AGENT with the SAME input/output as the 主智能体 MAIN
6
6
  * AGENT (frozen actor; the user's host agent running the current 策略 policy
7
- * vN+1). It reruns LAST episode's 策略 policy vN on the SAME change in an
8
- * ISOLATED worktree, so the 奖励智能体 REWARD AGENT can later 算分 calculate
9
- * reward(主臂)&reward(基线臂) and advantage = reward(主臂) − reward(基线臂).
10
- * Only its baseline trajectory survives — 产物即弃 (worktree artifacts
11
- * discarded): the worktree is torn down in `finally`, and the single durable
12
- * output is the `baseline-arm/` capture in the episode store.
7
+ * vN+1). It produces the baseline arm for LAST episode's 策略 policy vN on the
8
+ * SAME change in an ISOLATED worktree (by default RE-DOING the change under vN —
9
+ * see {@link CriticBaselineMode}), so the 奖励智能体 REWARD AGENT can later 算分
10
+ * calculate reward(主臂)&reward(基线臂) and advantage = reward(主臂) −
11
+ * reward(基线臂). Only its baseline trajectory survives — 产物即弃 (worktree
12
+ * artifacts discarded): the worktree is torn down in `finally`, and the single
13
+ * durable output is the `baseline-arm/` capture in the episode store.
13
14
  *
14
15
  * This module orchestrates ONE baseline arm:
15
- * 1. create an isolated worktree OUTSIDE the repo (git worktree, else a
16
- * recursive file copy fallback),
16
+ * 1. create an isolated worktree OUTSIDE the repo (git worktree at detached
17
+ * HEAD — which excludes the change's still-uncommitted implementation — else
18
+ * a recursive file copy fallback),
17
19
  * 2. make it runnable (node_modules junction/symlink + the untracked surfaces
18
20
  * the rerun reads),
21
+ * 2b. ('re-do' mode, the default) reset the copied change dir to its INPUTS —
22
+ * remove the GENERATED artifacts (design.md, tasks.md) + reports — so the
23
+ * rerun re-authors them under the installed prior policy ({@link
24
+ * resetChangeArtifactsForRedo}),
19
25
  * 3. INSTALL 策略 policy vN into the worktree from the byte-for-byte version
20
26
  * snapshot, so the baseline arm reruns the PRIOR policy and not the live
21
27
  * templates,
22
28
  * 4. rerun headlessly via the host-aware {@link runHeadlessAgent} with
23
- * cwd = worktree, measurement only, never editing canonical files,
29
+ * cwd = worktree ('re-do' regenerates design→tasks→impl→tests; 're-test'
30
+ * re-runs the existing change's tests), never editing canonical files,
24
31
  * 5. persist the baseline arm (stdout always; the claude session transcript +
25
32
  * action skeleton when discoverable; an `objective.json` shaped IDENTICALLY
26
33
  * to the main arm's), and
@@ -32,6 +39,7 @@
32
39
  * strips every arm/candidate word.
33
40
  */
34
41
  import { spawn as nodeSpawn } from 'node:child_process';
42
+ import type { ObservedTestFailure } from '../trajectory/facts.js';
35
43
  /** Error thrown when the worktree could not be created (git AND copy fallback failed). */
36
44
  export declare class CriticWorktreeError extends Error {
37
45
  constructor(message: string);
@@ -66,6 +74,18 @@ export interface ArmObjective {
66
74
  verified: boolean;
67
75
  observedStatus: 'success' | 'failure' | null;
68
76
  measuredAt: string;
77
+ /**
78
+ * Whether a real test-runner invocation was OBSERVED in the trajectory (vs a
79
+ * self-reported pass rate). Lets the 奖励智能体 REWARD AGENT calibrate
80
+ * confidence on the correctness anchor (P2). Omitted on older captures.
81
+ */
82
+ testRunObserved?: boolean;
83
+ /**
84
+ * Failing test ids (+ assertion lines) parsed from the OBSERVED runner output
85
+ * — the per-arm failure CONTENT the judge contrasts across arms (P1). Omitted
86
+ * when nothing was recognized (keeps JSON baselines stable).
87
+ */
88
+ observedFailures?: ObservedTestFailure[];
69
89
  }
70
90
  export interface ShouldRunCriticAgentOptions {
71
91
  repoRoot: string;
@@ -110,7 +130,22 @@ export declare function shouldRunCriticAgent(opts: ShouldRunCriticAgentOptions):
110
130
  * canonical files, and to print the runner summary line verbatim as its final
111
131
  * line.
112
132
  */
113
- export declare function assembleCriticPrompt(changeName: string): string;
133
+ /**
134
+ * How the CRITIC AGENT builds the baseline arm:
135
+ * - 're-do' (default): RE-DO the change from its inputs under the prior policy
136
+ * vN — reset the GENERATED artifacts (design.md, tasks.md), re-author design
137
+ * under the installed vN template, then re-implement → gen-test → run-test.
138
+ * advantage = reward(主臂) − reward(基线臂) then reflects the POLICY change,
139
+ * because the design template is actually exercised (it shapes the freshly
140
+ * authored design). This is faithful when the change's implementation is
141
+ * still uncommitted at episode time (the workflow default — so the isolated
142
+ * git worktree, checked out at detached HEAD, holds the PRE-change code).
143
+ * - 're-test': re-run the EXISTING change's tests under vN's template. Cheaper,
144
+ * but an already-authored change never exercises the design template, so the
145
+ * baseline mostly re-measures the main arm's own work.
146
+ */
147
+ export type CriticBaselineMode = 're-test' | 're-do';
148
+ export declare function assembleCriticPrompt(changeName: string, mode?: CriticBaselineMode): string;
114
149
  export interface RunCriticAgentOptions {
115
150
  repoRoot: string;
116
151
  targetId: string;
@@ -118,6 +153,11 @@ export interface RunCriticAgentOptions {
118
153
  episodeId: string;
119
154
  /** LAST episode's policy version vN, from {@link shouldRunCriticAgent}. */
120
155
  baselineVersion: number;
156
+ /**
157
+ * How the baseline arm is built (see {@link CriticBaselineMode}). Default
158
+ * 're-do' (regenerate the change under vN so the policy is exercised).
159
+ */
160
+ baselineMode?: CriticBaselineMode;
121
161
  /** Injectable spawn seam for tests; defaults to node's spawn. */
122
162
  spawn?: typeof nodeSpawn;
123
163
  /** Hard timeout per agent run (ms). Default 600000 (10 min). */
@@ -138,6 +178,8 @@ export interface RunCriticAgentResult {
138
178
  worktreePath: string;
139
179
  /** How the worktree was created. */
140
180
  worktreeMode: 'git-worktree' | 'copy-fallback';
181
+ /** Which baseline construction ran (see {@link CriticBaselineMode}). */
182
+ baselineMode: CriticBaselineMode;
141
183
  }
142
184
  /**
143
185
  * Run the CRITIC AGENT(基线智能体 baseline agent)'s full baseline arm and
@@ -4,23 +4,30 @@
4
4
  *
5
5
  * The CRITIC AGENT is an AGENT with the SAME input/output as the 主智能体 MAIN
6
6
  * AGENT (frozen actor; the user's host agent running the current 策略 policy
7
- * vN+1). It reruns LAST episode's 策略 policy vN on the SAME change in an
8
- * ISOLATED worktree, so the 奖励智能体 REWARD AGENT can later 算分 calculate
9
- * reward(主臂)&reward(基线臂) and advantage = reward(主臂) − reward(基线臂).
10
- * Only its baseline trajectory survives — 产物即弃 (worktree artifacts
11
- * discarded): the worktree is torn down in `finally`, and the single durable
12
- * output is the `baseline-arm/` capture in the episode store.
7
+ * vN+1). It produces the baseline arm for LAST episode's 策略 policy vN on the
8
+ * SAME change in an ISOLATED worktree (by default RE-DOING the change under vN —
9
+ * see {@link CriticBaselineMode}), so the 奖励智能体 REWARD AGENT can later 算分
10
+ * calculate reward(主臂)&reward(基线臂) and advantage = reward(主臂) −
11
+ * reward(基线臂). Only its baseline trajectory survives — 产物即弃 (worktree
12
+ * artifacts discarded): the worktree is torn down in `finally`, and the single
13
+ * durable output is the `baseline-arm/` capture in the episode store.
13
14
  *
14
15
  * This module orchestrates ONE baseline arm:
15
- * 1. create an isolated worktree OUTSIDE the repo (git worktree, else a
16
- * recursive file copy fallback),
16
+ * 1. create an isolated worktree OUTSIDE the repo (git worktree at detached
17
+ * HEAD — which excludes the change's still-uncommitted implementation — else
18
+ * a recursive file copy fallback),
17
19
  * 2. make it runnable (node_modules junction/symlink + the untracked surfaces
18
20
  * the rerun reads),
21
+ * 2b. ('re-do' mode, the default) reset the copied change dir to its INPUTS —
22
+ * remove the GENERATED artifacts (design.md, tasks.md) + reports — so the
23
+ * rerun re-authors them under the installed prior policy ({@link
24
+ * resetChangeArtifactsForRedo}),
19
25
  * 3. INSTALL 策略 policy vN into the worktree from the byte-for-byte version
20
26
  * snapshot, so the baseline arm reruns the PRIOR policy and not the live
21
27
  * templates,
22
28
  * 4. rerun headlessly via the host-aware {@link runHeadlessAgent} with
23
- * cwd = worktree, measurement only, never editing canonical files,
29
+ * cwd = worktree ('re-do' regenerates design→tasks→impl→tests; 're-test'
30
+ * re-runs the existing change's tests), never editing canonical files,
24
31
  * 5. persist the baseline arm (stdout always; the claude session transcript +
25
32
  * action skeleton when discoverable; an `objective.json` shaped IDENTICALLY
26
33
  * to the main arm's), and
@@ -105,29 +112,79 @@ export async function shouldRunCriticAgent(opts) {
105
112
  baselineVersion,
106
113
  };
107
114
  }
108
- /**
109
- * Assemble the CRITIC AGENT(基线智能体 baseline agent)rerun prompt. STRIPPED
110
- * of every arm/candidate word: the agent is simply told to re-run change
111
- * <changeName> end-to-end (apply → gen-test → run-test) under the templates
112
- * already installed in its working directory, measurement only, never editing
113
- * canonical files, and to print the runner summary line verbatim as its final
114
- * line.
115
- */
116
- export function assembleCriticPrompt(changeName) {
115
+ export function assembleCriticPrompt(changeName, mode = 're-do') {
116
+ if (mode === 're-test') {
117
+ return [
118
+ `You are RE-RUNNING an existing SynergySpec change end-to-end to measure its`,
119
+ `test outcome under the artifact templates already installed in your working`,
120
+ `directory. This is a measurement run only — do NOT modify any canonical`,
121
+ `workflow prompt, artifact template, or schema, and do NOT edit the frozen`,
122
+ `gen-test/run-test oracle.`,
123
+ ``,
124
+ `Change name: ${changeName}`,
125
+ ``,
126
+ `Run the change's tests (apply → gen-test → run-test) and output the test`,
127
+ `runner's SUMMARY LINE verbatim as the final line of your response, e.g.`,
128
+ `"Tests 12 passed | 1 failed (13)" or "5 passed, 0 failed in 0.4s".`,
129
+ ].join('\n');
130
+ }
131
+ // 're-do' — regenerate the change end-to-end so the installed prior-policy
132
+ // design template is actually exercised.
117
133
  return [
118
- `You are RE-RUNNING an existing SynergySpec change end-to-end to measure its`,
119
- `test outcome under the artifact templates already installed in your working`,
120
- `directory. This is a measurement run only — do NOT modify any canonical`,
121
- `workflow prompt, artifact template, or schema, and do NOT edit the frozen`,
122
- `gen-test/run-test oracle.`,
134
+ `You are RE-DOING an existing SynergySpec change from scratch under the`,
135
+ `artifact templates currently installed in your working directory, to measure`,
136
+ `the test outcome those templates produce. This is a measurement run.`,
123
137
  ``,
124
138
  `Change name: ${changeName}`,
125
139
  ``,
126
- `Run the change's tests (apply → gen-test → run-test) and output the test`,
127
- `runner's SUMMARY LINE verbatim as the final line of your response, e.g.`,
128
- `"Tests 12 passed | 1 failed (13)" or "5 passed, 0 failed in 0.4s".`,
140
+ `The change's INPUT artifacts (proposal.md, usecases.md, specs/) are present.`,
141
+ `Its design.md and tasks.md have been intentionally REMOVED so you regenerate`,
142
+ `them under the installed templates. Re-create the change end-to-end:`,
143
+ ``,
144
+ `1. Regenerate the design — run`,
145
+ ` synergyspec-selfevolving instructions design --change "${changeName}" --json`,
146
+ ` read the returned template + dependency files (proposal.md, usecases.md),`,
147
+ ` and author design.md using that template as the structure.`,
148
+ `2. Regenerate the tasks the same way`,
149
+ ` (synergyspec-selfevolving instructions tasks --change "${changeName}" --json),`,
150
+ ` then apply them — implement the code each task requires.`,
151
+ `3. Generate the change's tests (gen-test), then run the test runner (run-test).`,
152
+ ``,
153
+ `Do NOT modify any canonical workflow prompt, artifact TEMPLATE, or schema, and`,
154
+ `do NOT edit the frozen gen-test/run-test oracle. Write ONLY the change's own`,
155
+ `artifacts (design.md, tasks.md in the change dir) and the implementation`,
156
+ `source the tasks require.`,
157
+ ``,
158
+ `Output the test runner's SUMMARY LINE verbatim as the final line of your`,
159
+ `response, e.g. "Tests 12 passed | 1 failed (13)" or "5 passed, 0 failed in 0.4s".`,
129
160
  ].join('\n');
130
161
  }
162
+ /**
163
+ * Generated artifacts a 're-do' baseline removes from the copied change dir
164
+ * before the rerun, so the agent re-authors them under the installed prior
165
+ * policy. design.md + tasks.md are the policy-shaped chain; the report files are
166
+ * post-implementation residue that would otherwise read the change as already
167
+ * applied (status keys doneness off file existence). The INPUT artifacts
168
+ * (proposal.md, usecases.md, specs/) — which define "the same task" — are KEPT.
169
+ */
170
+ const REDO_REGENERATED_ARTIFACTS = [
171
+ 'design.md',
172
+ 'tasks.md',
173
+ 'test-report.md',
174
+ 'test-plan.md',
175
+ 'spec-tests.md',
176
+ 'spec-blast-radius.md',
177
+ 'verification-report.md',
178
+ ];
179
+ /**
180
+ * Reset a copied change dir to its inputs for a 're-do' baseline (see
181
+ * {@link REDO_REGENERATED_ARTIFACTS}). Best-effort: a missing artifact is fine.
182
+ */
183
+ async function resetChangeArtifactsForRedo(changeDir) {
184
+ for (const rel of REDO_REGENERATED_ARTIFACTS) {
185
+ await fs.rm(path.join(changeDir, rel), { force: true }).catch(() => { });
186
+ }
187
+ }
131
188
  const NODE_MODULES = 'node_modules';
132
189
  const CONFIG_DIR = '.synergyspec-selfevolving';
133
190
  const SCHEMAS_REL = path.join('synergyspec-selfevolving', 'schemas');
@@ -143,6 +200,7 @@ export async function runCriticAgent(opts) {
143
200
  const spawnImpl = opts.spawn ?? nodeSpawn;
144
201
  const timeoutMs = opts.timeoutMs ?? 600000;
145
202
  const homeDir = opts.homeDir ?? os.homedir();
203
+ const baselineMode = opts.baselineMode ?? 're-do';
146
204
  if (!Number.isInteger(opts.baselineVersion) || opts.baselineVersion < 0) {
147
205
  throw new Error(`runCriticAgent requires a non-negative integer baselineVersion, got ${JSON.stringify(opts.baselineVersion)}`);
148
206
  }
@@ -155,13 +213,31 @@ export async function runCriticAgent(opts) {
155
213
  try {
156
214
  // 1) Isolated worktree OUTSIDE the repo (git worktree --detach, else copy).
157
215
  worktreeMode = await createIsolatedWorktree(repoRoot, worktreePath, spawnImpl);
216
+ // 're-do' fidelity needs the detached-HEAD tree (pre-change code). The copy
217
+ // fallback (non-git repo) brings the LIVE tree — including the change's
218
+ // uncommitted implementation — so it cannot reach the pre-change state and
219
+ // degrades to a re-measure. Surface that so a degraded baseline is not silent.
220
+ if (baselineMode === 're-do' && worktreeMode === 'copy-fallback') {
221
+ console.warn(`[critic] re-do baseline degraded for "${opts.changeName}": no git worktree ` +
222
+ `(copy fallback) — the change's implementation could not be isolated, so the ` +
223
+ `baseline re-measures rather than re-does. Use a git repo, or set ` +
224
+ `selfEvolution.critic.baselineMode: re-test to silence this.`);
225
+ }
158
226
  // 2) Make it runnable: node_modules junction/symlink + untracked surfaces.
159
227
  await makeWorktreeRunnable(repoRoot, worktreePath, opts.changeName);
228
+ // 2b) 're-do': reset the copied change dir to its inputs so the rerun
229
+ // RE-AUTHORS design+tasks under the prior policy (and re-implements on
230
+ // the pre-change code the detached-HEAD worktree already holds). The
231
+ // fidelity over 're-test' is that the design TEMPLATE is actually
232
+ // exercised, so advantage reflects the policy change, not re-run noise.
233
+ if (baselineMode === 're-do') {
234
+ await resetChangeArtifactsForRedo(path.join(worktreePath, 'synergyspec-selfevolving', 'changes', opts.changeName));
235
+ }
160
236
  // 3) INSTALL 策略 policy vN (byte-for-byte snapshot files) — the fidelity
161
237
  // fix the old GA replay never performed.
162
238
  await installPolicyVersion(repoRoot, worktreePath, opts.targetId, opts.baselineVersion);
163
- // 4) Rerun headlessly with cwd = worktree (measurement only).
164
- const prompt = assembleCriticPrompt(opts.changeName);
239
+ // 4) Rerun headlessly with cwd = worktree (re-do: regenerate; re-test: measure).
240
+ const prompt = assembleCriticPrompt(opts.changeName, baselineMode);
165
241
  const run = await runHeadlessAgent(prompt, {
166
242
  cwd: worktreePath,
167
243
  spawn: spawnImpl,
@@ -212,6 +288,10 @@ export async function runCriticAgent(opts) {
212
288
  verified,
213
289
  observedStatus,
214
290
  measuredAt,
291
+ ...(facts ? { testRunObserved: facts.testRunObserved } : {}),
292
+ ...(facts?.observedFailures && facts.observedFailures.length > 0
293
+ ? { observedFailures: facts.observedFailures }
294
+ : {}),
215
295
  };
216
296
  // Transcript: the claude session `.jsonl` when discovered, else stdout.
217
297
  let transcriptDiscovered = false;
@@ -256,6 +336,7 @@ export async function runCriticAgent(opts) {
256
336
  transcriptDiscovered,
257
337
  worktreePath,
258
338
  worktreeMode,
339
+ baselineMode,
259
340
  };
260
341
  }
261
342
  finally {
@@ -47,9 +47,10 @@
47
47
  import { spawn as nodeSpawn } from 'node:child_process';
48
48
  import type { LearnReport } from '../learn.js';
49
49
  import type { TrajectorySource } from '../trajectory/source.js';
50
- import { type PolicyResolveFiles } from './policy/policy-store.js';
50
+ import { type PolicyResolveFiles, type PolicyLedgerEntry } from './policy/policy-store.js';
51
51
  import { type EpisodeStage } from './episode-store.js';
52
- import { type ArmObjective } from './critic-agent.js';
52
+ import { type ArmObjective, type CriticBaselineMode } from './critic-agent.js';
53
+ import { type RewardConfig } from './reward-aggregator.js';
53
54
  import { type RunEvolvingAgentResult } from './evolving-agent.js';
54
55
  /** The 主智能体 MAIN AGENT (policy vN+1) capture the orchestrator records. */
55
56
  export interface MainArmCapture {
@@ -107,6 +108,30 @@ export interface CaptureMainArmOptions {
107
108
  export declare function captureMainArm(opts: CaptureMainArmOptions): Promise<MainArmCapture>;
108
109
  /** The decision the orchestrator made on the main arm's edits. */
109
110
  export type EpisodeDecision = 'rolled-back' | 'kept' | 'abstained';
111
+ /**
112
+ * Count the consecutive trailing rolled-back episodes in the 版本账本 ledger.
113
+ *
114
+ * A bad streak's ledger tail reads `…, evolve, rollback, evolve, rollback` — the
115
+ * 演进智能体 EVOLVING AGENT appends exactly one 'evolve' after each decision, so
116
+ * each counted rollback is reached by skipping the single 'evolve' that follows
117
+ * it. A 'kept' episode leaves a bare 'evolve' (no following rollback) which
118
+ * breaks the streak, as do 'init'/'refused'. Returns 0 when the head is not a
119
+ * rollback (the last episode kept). Pure.
120
+ */
121
+ export declare function consecutiveRollbacks(ledger: readonly PolicyLedgerEntry[]): number;
122
+ /**
123
+ * 步长 step-size schedule for the 演进智能体 EVOLVING AGENT's edit budget L.
124
+ *
125
+ * Backtracking-line-search / trust-region move (and SkillOpt's decaying edit
126
+ * budget): after an edit LOST ground and was rolled back, the next edit should
127
+ * be SMALLER — a smaller blast radius is cheaper to undo and its cause is more
128
+ * legible, and it keeps a struggling lineage from drifting via repeated
129
+ * full-size swings. HALVE the base budget once per consecutive rolled-back
130
+ * episode, never below `minBudget` (itself clamped to `base`, so a caller-shrunk
131
+ * base is never RAISED). A healthy lineage (no trailing rollback) keeps `base`.
132
+ * Pure.
133
+ */
134
+ export declare function scheduledEditBudget(ledger: readonly PolicyLedgerEntry[], base: number, minBudget?: number): number;
110
135
  export interface RunEpisodeOptions {
111
136
  repoRoot: string;
112
137
  targetId: string;
@@ -123,6 +148,18 @@ export interface RunEpisodeOptions {
123
148
  advantageRollbackThreshold?: number;
124
149
  /** Edit budget L for the 演进智能体 EVOLVING AGENT. Default 40. */
125
150
  editBudget?: number;
151
+ /**
152
+ * 奖励智能体 REWARD AGENT judge-quality knobs (from `selfEvolution.reward`).
153
+ * Omitted ⇒ single sample, flag-only tamper (historical, zero extra spawns).
154
+ */
155
+ reward?: RewardConfig;
156
+ /**
157
+ * CRITIC AGENT(基线智能体)baseline construction (from `selfEvolution.critic`).
158
+ * Omitted ⇒ the critic's default 're-do' (regenerate the change under vN).
159
+ */
160
+ critic?: {
161
+ baselineMode?: CriticBaselineMode;
162
+ };
126
163
  /** Injectable spawn seam — threaded to ALL THREE agents. Defaults to node's spawn. */
127
164
  spawn?: typeof nodeSpawn;
128
165
  /** Injectable clock for the lock + episode id; defaults to `new Date()`. */