npm - synergyspec-selfevolving - Versions diffs - 2.0.0 → 2.1.0 - Mend

synergyspec-selfevolving 2.0.0 → 2.1.0

This diff compares the contents of two publicly available versions of the package as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in the public registry.

Files changed (44)

package/dist/commands/learn.js +7 -0
package/dist/commands/self-evolution-episode.js +8 -0
package/dist/core/fitness/test-failures.js +10 -2
package/dist/core/project-config.d.ts +17 -0
package/dist/core/project-config.js +68 -0
package/dist/core/self-evolution/critic-agent.d.ts +52 -10
package/dist/core/self-evolution/critic-agent.js +109 -28
package/dist/core/self-evolution/episode-orchestrator.d.ts +39 -2
package/dist/core/self-evolution/episode-orchestrator.js +157 -10
package/dist/core/self-evolution/evolving-agent.d.ts +63 -17
package/dist/core/self-evolution/evolving-agent.js +106 -20
package/dist/core/self-evolution/host-harness.d.ts +14 -14
package/dist/core/self-evolution/host-harness.js +48 -22
package/dist/core/self-evolution/index.d.ts +2 -0
package/dist/core/self-evolution/index.js +2 -0
package/dist/core/self-evolution/policy/prediction-reconcile.d.ts +54 -0
package/dist/core/self-evolution/policy/prediction-reconcile.js +191 -0
package/dist/core/self-evolution/policy/reject-buffer.d.ts +9 -2
package/dist/core/self-evolution/policy/reject-buffer.js +4 -2
package/dist/core/self-evolution/reward-agent.d.ts +159 -14
package/dist/core/self-evolution/reward-agent.js +445 -69
package/dist/core/self-evolution/reward-aggregator.d.ts +59 -0
package/dist/core/self-evolution/reward-aggregator.js +262 -0
package/dist/core/self-evolution/tamper-check.d.ts +24 -0
package/dist/core/self-evolution/tamper-check.js +236 -0
package/dist/core/templates/workflows/gen-tests.js +1 -1
package/dist/core/templates/workflows/learn.js +7 -6
package/dist/core/trajectory/scrub.d.ts +27 -0
package/dist/core/trajectory/scrub.js +79 -0
package/dist/core/trajectory/skeleton.d.ts +27 -1
package/dist/core/trajectory/skeleton.js +152 -8
package/package.json +1 -1
package/dist/core/self-evolution/ga-selection.d.ts +0 -94
package/dist/core/self-evolution/ga-selection.js +0 -153
package/dist/core/self-evolution/proposer-agent.d.ts +0 -182
package/dist/core/self-evolution/proposer-agent.js +0 -326
package/dist/core/self-evolution/replay-runner.d.ts +0 -100
package/dist/core/self-evolution/replay-runner.js +0 -170
package/dist/core/self-evolution/replay.d.ts +0 -45
package/dist/core/self-evolution/replay.js +0 -56
package/dist/core/self-evolution/template-variants.d.ts +0 -62
package/dist/core/self-evolution/template-variants.js +0 -171
package/dist/core/self-evolution/trajectory.d.ts +0 -65
package/dist/core/self-evolution/trajectory.js +0 -185

package/dist/commands/learn.js CHANGED Viewed

@@ -179,12 +179,19 @@ export function registerLearnCommand(program, deps = {}) {
                     changeName: report.changeName,
                     report,
                 });
+                // Thread the loop-v2 reward judge-quality config (samples / noiseFloor /
+                // orderSwap / tamperCheck). Omitted ⇒ the orchestrator's single-sample,
+                // flag-only default (no extra spawns).
+                const episodeConfig = readProjectConfig(projectRoot);
                 episodeOutcome = await runEpisodeImpl({
                     repoRoot: projectRoot,
                     targetId: concreteEvolveTarget.targetId,
                     changeName: report.changeName,
                     changeDirPath: report.changeDir,
                     mainArm,
+                    ...(episodeConfig?.selfEvolution?.reward
+                        ? { reward: episodeConfig.selfEvolution.reward }
+                        : {}),
                 });
             }
             if (options.json) {

package/dist/commands/self-evolution-episode.js CHANGED Viewed

@@ -9,6 +9,7 @@ lookupCanonicalTarget, listCanonicalTargets, DESIGN_ARTIFACT_TARGET_ID, } from '
 import { generateLearnReport } from '../core/learn.js';
 import { validateExplicitTrajectoryHandle } from '../core/learn/trajectory-discovery.js';
 import { validateChangeExists } from './workflow/shared.js';
+import { readProjectConfig } from '../core/project-config.js';
 /**
  * The 主智能体 MAIN AGENT arm is graded from a learn report exactly the way the
  * `learn` command grades it (the orchestrator REUSES that grading; it never
@@ -148,6 +149,7 @@ export async function runEpisodeCommand(args, opts) {
     // orchestrator's behavior when it is unaware of the flag.
     let outcome;
     try {
+        const episodeConfig = readProjectConfig(opts.repoRoot);
         const episodeOptions = {
             repoRoot: opts.repoRoot,
             targetId,
@@ -155,6 +157,12 @@ export async function runEpisodeCommand(args, opts) {
             changeDirPath,
             mainArm,
             ...(args.noBaseline ? { skipBaseline: true } : {}),
+            ...(episodeConfig?.selfEvolution?.reward
+                ? { reward: episodeConfig.selfEvolution.reward }
+                : {}),
+            ...(episodeConfig?.selfEvolution?.critic
+                ? { critic: episodeConfig.selfEvolution.critic }
+                : {}),
         };
         outcome = await runEpisode(episodeOptions);
     }

package/dist/core/fitness/test-failures.js CHANGED Viewed

@@ -57,6 +57,14 @@ function findAssertion(lines, from) {
 function cleanToken(value) {
     return value.replace(/^[`'"]+|[`'"]+$/g, '');
 }
+/**
+ * POSIX-normalize a path so a Windows pytest path (`tests\test_x.py`) matches the
+ * already-POSIX-normalized file-edit paths in the action skeleton — the reward
+ * agent's renamed/edited-test caveat compares the two by exact string.
+ */
+function toPosix(p) {
+    return p.replace(/\\/g, '/');
+}
 /**
  * Extract failing test ids + assertion lines from observed runner output.
  * Returns `[]` when nothing is recognized. Deduplicates by testId, preserves
@@ -86,7 +94,7 @@ export function parseTestFailures(output) {
             const inline = pytest[2]?.trim();
             push({
                 testId,
-                file: testId.split('::')[0],
+                file: toPosix(testId.split('::')[0]),
                 ...(inline
                     ? { assertion: capAssertion(inline) }
                     : (() => {
@@ -98,7 +106,7 @@ export function parseTestFailures(output) {
         }
         const vitest = VITEST_FAIL_RE.exec(line);
         if (vitest) {
-            const file = cleanToken(vitest[1]);
+            const file = toPosix(cleanToken(vitest[1]));
             const rest = vitest[2]?.trim();
             const testId = rest ? `${file} > ${rest}` : file;
             const assertion = findAssertion(lines, i);

package/dist/core/project-config.d.ts CHANGED Viewed

@@ -27,6 +27,23 @@ export declare const ProjectConfigSchema: z.ZodObject<{
         focus: z.ZodOptional<z.ZodBoolean>;
         advantageRollbackThreshold: z.ZodOptional<z.ZodNumber>;
         editBudget: z.ZodOptional<z.ZodNumber>;
+        reward: z.ZodOptional<z.ZodObject<{
+            samples: z.ZodOptional<z.ZodNumber>;
+            noiseFloor: z.ZodOptional<z.ZodNumber>;
+            orderSwap: z.ZodOptional<z.ZodBoolean>;
+            requireCorrectnessGate: z.ZodOptional<z.ZodBoolean>;
+            tamperCheck: z.ZodOptional<z.ZodEnum<{
+                off: "off";
+                flag: "flag";
+                block: "block";
+            }>>;
+        }, z.core.$strip>>;
+        critic: z.ZodOptional<z.ZodObject<{
+            baselineMode: z.ZodOptional<z.ZodEnum<{
+                "re-test": "re-test";
+                "re-do": "re-do";
+            }>>;
+        }, z.core.$strip>>;
     }, z.core.$strip>>;
     health: z.ZodOptional<z.ZodObject<{
         source: z.ZodDefault<z.ZodEnum<{

package/dist/core/project-config.js CHANGED Viewed

@@ -60,6 +60,42 @@ export const ProjectConfigSchema = z.object({
         // 演进智能体 EVOLVING AGENT's ONE bounded edit may total. Default 40.
         // Optional/omitted ⇒ the agent's DEFAULT_EVOLVING_AGENT_EDIT_BUDGET applies.
         editBudget: z.number().optional(),
+        // Loop v2 — 奖励智能体 REWARD AGENT judge-quality knobs. ALL optional; omitted
+        // ⇒ the historical single-sample, flag-only behaviour (no extra LLM spawns).
+        reward: z
+            .object({
+            // ② How many judged duels per episode. Default 1 (single sample, no
+            //   extra spawns). >1 enables the A/A noise floor + SPRT + order-swap.
+            samples: z.number().optional(),
+            // ② Minimum |advantage| to trust; within the floor ⇒ insufficient-signal.
+            //   Omitted ⇒ measured from an A/A pair when samples>1, else unused.
+            noiseFloor: z.number().optional(),
+            // ③ Swap arm presentation order across samples to cancel position bias.
+            orderSwap: z.boolean().optional(),
+            // ① Enforce the correctness hard-gate inside the judge (default on).
+            requireCorrectnessGate: z.boolean().optional(),
+            // ④ Test-tamper handling: 'off' (no check), 'flag' (annotate only,
+            //   default), or 'block' (force insufficient-signal + reject-buffer).
+            tamperCheck: z.enum(['off', 'flag', 'block']).optional(),
+        })
+            .optional(),
+        // Loop v2 — CRITIC AGENT（基线智能体 baseline agent）baseline construction.
+        //   're-do' (default): the baseline arm RE-DOES the change under the prior
+        //     policy vN — it resets the change's GENERATED artifacts (design.md,
+        //     tasks.md), re-authors design under the installed vN template, then
+        //     re-implements → gen-test → run-test. So advantage ＝ reward(主臂) −
+        //     reward(基线臂) reflects the POLICY change, not re-run noise. Faithful
+        //     when the change's implementation is still uncommitted at episode time
+        //     (the workflow default) and an isolated git worktree can be created;
+        //     on a non-git copy fallback it degrades to a re-measure (documented).
+        //   're-test': the prior behaviour — re-run the EXISTING change's tests
+        //     under vN's template (cheaper, but an already-authored change does not
+        //     exercise the design template). Omitted ⇒ 're-do'.
+        critic: z
+            .object({
+            baselineMode: z.enum(['re-test', 're-do']).optional(),
+        })
+            .optional(),
     })
         .optional()
         .describe('Per-canonical-target self-evolution toggles'),
@@ -246,6 +282,38 @@ export function readProjectConfig(projectRoot) {
                 else if (rawSE.editBudget !== undefined) {
                     console.warn(`Invalid 'selfEvolution.editBudget' in config (must be a number), ignoring`);
                 }
+                // Loop v2 — 奖励智能体 REWARD AGENT knobs. Resilient: each sub-field is
+                // validated independently; a bad value is dropped with a warning (the
+                // judge/aggregator default applies). Omitted ⇒ undefined (single-sample,
+                // flag-only — byte-identical to configs that never set `reward`).
+                const rewardSchema = ProjectConfigSchema.shape.selfEvolution
+                    .unwrap()
+                    .shape.reward.unwrap();
+                const rewardResult = rewardSchema.safeParse(rawSE.reward);
+                if (rewardResult.success) {
+                    if (Object.keys(rewardResult.data).length > 0) {
+                        selfEvolution.reward = rewardResult.data;
+                    }
+                }
+                else if (rawSE.reward !== undefined) {
+                    console.warn(`Invalid 'selfEvolution.reward' in config (samples/noiseFloor numbers, ` +
+                        `orderSwap/requireCorrectnessGate booleans, tamperCheck off|flag|block), ignoring`);
+                }
+                // Loop v2 — CRITIC AGENT knobs. Resilient: a bad value is dropped with a
+                // warning (the critic default 're-do' then applies). Omitted ⇒ undefined
+                // (byte-identical to configs that never set `critic`).
+                const criticSchema = ProjectConfigSchema.shape.selfEvolution
+                    .unwrap()
+                    .shape.critic.unwrap();
+                const criticResult = criticSchema.safeParse(rawSE.critic);
+                if (criticResult.success) {
+                    if (Object.keys(criticResult.data).length > 0) {
+                        selfEvolution.critic = criticResult.data;
+                    }
+                }
+                else if (rawSE.critic !== undefined) {
+                    console.warn(`Invalid 'selfEvolution.critic' in config (baselineMode must be 're-test' or 're-do'), ignoring`);
+                }
                 config.selfEvolution = selfEvolution;
             }
             else {

package/dist/core/self-evolution/critic-agent.d.ts CHANGED Viewed

@@ -4,23 +4,30 @@
  *
  * The CRITIC AGENT is an AGENT with the SAME input/output as the 主智能体 MAIN
  * AGENT (frozen actor; the user's host agent running the current 策略 policy
- * vN+1). It reruns LAST episode's 策略 policy vN on the SAME change in an
- * ISOLATED worktree, so the 奖励智能体 REWARD AGENT can later 算分 calculate
- * reward(主臂)＆reward(基线臂) and advantage ＝ reward(主臂) − reward(基线臂).
- * Only its baseline trajectory survives — 产物即弃 (worktree artifacts
- * discarded): the worktree is torn down in `finally`, and the single durable
- * output is the `baseline-arm/` capture in the episode store.
+ * vN+1). It produces the baseline arm for LAST episode's 策略 policy vN on the
+ * SAME change in an ISOLATED worktree (by default RE-DOING the change under vN —
+ * see {@link CriticBaselineMode}), so the 奖励智能体 REWARD AGENT can later 算分
+ * calculate reward(主臂)＆reward(基线臂) and advantage ＝ reward(主臂) −
+ * reward(基线臂). Only its baseline trajectory survives — 产物即弃 (worktree
+ * artifacts discarded): the worktree is torn down in `finally`, and the single
+ * durable output is the `baseline-arm/` capture in the episode store.
  *
  * This module orchestrates ONE baseline arm:
- *   1. create an isolated worktree OUTSIDE the repo (git worktree, else a
- *      recursive file copy fallback),
+ *   1. create an isolated worktree OUTSIDE the repo (git worktree at detached
+ *      HEAD — which excludes the change's still-uncommitted implementation — else
+ *      a recursive file copy fallback),
  *   2. make it runnable (node_modules junction/symlink + the untracked surfaces
  *      the rerun reads),
+ *   2b. ('re-do' mode, the default) reset the copied change dir to its INPUTS —
+ *      remove the GENERATED artifacts (design.md, tasks.md) + reports — so the
+ *      rerun re-authors them under the installed prior policy ({@link
+ *      resetChangeArtifactsForRedo}),
  *   3. INSTALL 策略 policy vN into the worktree from the byte-for-byte version
  *      snapshot, so the baseline arm reruns the PRIOR policy and not the live
  *      templates,
  *   4. rerun headlessly via the host-aware {@link runHeadlessAgent} with
- *      cwd = worktree, measurement only, never editing canonical files,
+ *      cwd = worktree ('re-do' regenerates design→tasks→impl→tests; 're-test'
+ *      re-runs the existing change's tests), never editing canonical files,
  *   5. persist the baseline arm (stdout always; the claude session transcript +
  *      action skeleton when discoverable; an `objective.json` shaped IDENTICALLY
  *      to the main arm's), and
@@ -32,6 +39,7 @@
  * strips every arm/candidate word.
  */
 import { spawn as nodeSpawn } from 'node:child_process';
+import type { ObservedTestFailure } from '../trajectory/facts.js';
 /** Error thrown when the worktree could not be created (git AND copy fallback failed). */
 export declare class CriticWorktreeError extends Error {
     constructor(message: string);
@@ -66,6 +74,18 @@ export interface ArmObjective {
     verified: boolean;
     observedStatus: 'success' | 'failure' | null;
     measuredAt: string;
+    /**
+     * Whether a real test-runner invocation was OBSERVED in the trajectory (vs a
+     * self-reported pass rate). Lets the 奖励智能体 REWARD AGENT calibrate
+     * confidence on the correctness anchor (P2). Omitted on older captures.
+     */
+    testRunObserved?: boolean;
+    /**
+     * Failing test ids (+ assertion lines) parsed from the OBSERVED runner output
+     * — the per-arm failure CONTENT the judge contrasts across arms (P1). Omitted
+     * when nothing was recognized (keeps JSON baselines stable).
+     */
+    observedFailures?: ObservedTestFailure[];
 }
 export interface ShouldRunCriticAgentOptions {
     repoRoot: string;
@@ -110,7 +130,22 @@ export declare function shouldRunCriticAgent(opts: ShouldRunCriticAgentOptions):
  * canonical files, and to print the runner summary line verbatim as its final
  * line.
  */
-export declare function assembleCriticPrompt(changeName: string): string;
+/**
+ * How the CRITIC AGENT builds the baseline arm:
+ *   - 're-do' (default): RE-DO the change from its inputs under the prior policy
+ *     vN — reset the GENERATED artifacts (design.md, tasks.md), re-author design
+ *     under the installed vN template, then re-implement → gen-test → run-test.
+ *     advantage ＝ reward(主臂) − reward(基线臂) then reflects the POLICY change,
+ *     because the design template is actually exercised (it shapes the freshly
+ *     authored design). This is faithful when the change's implementation is
+ *     still uncommitted at episode time (the workflow default — so the isolated
+ *     git worktree, checked out at detached HEAD, holds the PRE-change code).
+ *   - 're-test': re-run the EXISTING change's tests under vN's template. Cheaper,
+ *     but an already-authored change never exercises the design template, so the
+ *     baseline mostly re-measures the main arm's own work.
+ */
+export type CriticBaselineMode = 're-test' | 're-do';
+export declare function assembleCriticPrompt(changeName: string, mode?: CriticBaselineMode): string;
 export interface RunCriticAgentOptions {
     repoRoot: string;
     targetId: string;
@@ -118,6 +153,11 @@ export interface RunCriticAgentOptions {
     episodeId: string;
     /** LAST episode's policy version vN, from {@link shouldRunCriticAgent}. */
     baselineVersion: number;
+    /**
+     * How the baseline arm is built (see {@link CriticBaselineMode}). Default
+     * 're-do' (regenerate the change under vN so the policy is exercised).
+     */
+    baselineMode?: CriticBaselineMode;
     /** Injectable spawn seam for tests; defaults to node's spawn. */
     spawn?: typeof nodeSpawn;
     /** Hard timeout per agent run (ms). Default 600000 (10 min). */
@@ -138,6 +178,8 @@ export interface RunCriticAgentResult {
     worktreePath: string;
     /** How the worktree was created. */
     worktreeMode: 'git-worktree' | 'copy-fallback';
+    /** Which baseline construction ran (see {@link CriticBaselineMode}). */
+    baselineMode: CriticBaselineMode;
 }
 /**
  * Run the CRITIC AGENT（基线智能体 baseline agent）'s full baseline arm and

package/dist/core/self-evolution/critic-agent.js CHANGED Viewed

@@ -4,23 +4,30 @@
  *
  * The CRITIC AGENT is an AGENT with the SAME input/output as the 主智能体 MAIN
  * AGENT (frozen actor; the user's host agent running the current 策略 policy
- * vN+1). It reruns LAST episode's 策略 policy vN on the SAME change in an
- * ISOLATED worktree, so the 奖励智能体 REWARD AGENT can later 算分 calculate
- * reward(主臂)＆reward(基线臂) and advantage ＝ reward(主臂) − reward(基线臂).
- * Only its baseline trajectory survives — 产物即弃 (worktree artifacts
- * discarded): the worktree is torn down in `finally`, and the single durable
- * output is the `baseline-arm/` capture in the episode store.
+ * vN+1). It produces the baseline arm for LAST episode's 策略 policy vN on the
+ * SAME change in an ISOLATED worktree (by default RE-DOING the change under vN —
+ * see {@link CriticBaselineMode}), so the 奖励智能体 REWARD AGENT can later 算分
+ * calculate reward(主臂)＆reward(基线臂) and advantage ＝ reward(主臂) −
+ * reward(基线臂). Only its baseline trajectory survives — 产物即弃 (worktree
+ * artifacts discarded): the worktree is torn down in `finally`, and the single
+ * durable output is the `baseline-arm/` capture in the episode store.
  *
  * This module orchestrates ONE baseline arm:
- *   1. create an isolated worktree OUTSIDE the repo (git worktree, else a
- *      recursive file copy fallback),
+ *   1. create an isolated worktree OUTSIDE the repo (git worktree at detached
+ *      HEAD — which excludes the change's still-uncommitted implementation — else
+ *      a recursive file copy fallback),
  *   2. make it runnable (node_modules junction/symlink + the untracked surfaces
  *      the rerun reads),
+ *   2b. ('re-do' mode, the default) reset the copied change dir to its INPUTS —
+ *      remove the GENERATED artifacts (design.md, tasks.md) + reports — so the
+ *      rerun re-authors them under the installed prior policy ({@link
+ *      resetChangeArtifactsForRedo}),
  *   3. INSTALL 策略 policy vN into the worktree from the byte-for-byte version
  *      snapshot, so the baseline arm reruns the PRIOR policy and not the live
  *      templates,
  *   4. rerun headlessly via the host-aware {@link runHeadlessAgent} with
- *      cwd = worktree, measurement only, never editing canonical files,
+ *      cwd = worktree ('re-do' regenerates design→tasks→impl→tests; 're-test'
+ *      re-runs the existing change's tests), never editing canonical files,
  *   5. persist the baseline arm (stdout always; the claude session transcript +
  *      action skeleton when discoverable; an `objective.json` shaped IDENTICALLY
  *      to the main arm's), and
@@ -105,29 +112,79 @@ export async function shouldRunCriticAgent(opts) {
         baselineVersion,
     };
 }
-/**
- * Assemble the CRITIC AGENT（基线智能体 baseline agent）rerun prompt. STRIPPED
- * of every arm/candidate word: the agent is simply told to re-run change
- * <changeName> end-to-end (apply → gen-test → run-test) under the templates
- * already installed in its working directory, measurement only, never editing
- * canonical files, and to print the runner summary line verbatim as its final
- * line.
- */
-export function assembleCriticPrompt(changeName) {
+export function assembleCriticPrompt(changeName, mode = 're-do') {
+    if (mode === 're-test') {
+        return [
+            `You are RE-RUNNING an existing SynergySpec change end-to-end to measure its`,
+            `test outcome under the artifact templates already installed in your working`,
+            `directory. This is a measurement run only — do NOT modify any canonical`,
+            `workflow prompt, artifact template, or schema, and do NOT edit the frozen`,
+            `gen-test/run-test oracle.`,
+            ``,
+            `Change name: ${changeName}`,
+            ``,
+            `Run the change's tests (apply → gen-test → run-test) and output the test`,
+            `runner's SUMMARY LINE verbatim as the final line of your response, e.g.`,
+            `"Tests  12 passed | 1 failed (13)" or "5 passed, 0 failed in 0.4s".`,
+        ].join('\n');
+    }
+    // 're-do' — regenerate the change end-to-end so the installed prior-policy
+    // design template is actually exercised.
     return [
-        `You are RE-RUNNING an existing SynergySpec change end-to-end to measure its`,
-        `test outcome under the artifact templates already installed in your working`,
-        `directory. This is a measurement run only — do NOT modify any canonical`,
-        `workflow prompt, artifact template, or schema, and do NOT edit the frozen`,
-        `gen-test/run-test oracle.`,
+        `You are RE-DOING an existing SynergySpec change from scratch under the`,
+        `artifact templates currently installed in your working directory, to measure`,
+        `the test outcome those templates produce. This is a measurement run.`,
         ``,
         `Change name: ${changeName}`,
         ``,
-        `Run the change's tests (apply → gen-test → run-test) and output the test`,
-        `runner's SUMMARY LINE verbatim as the final line of your response, e.g.`,
-        `"Tests  12 passed | 1 failed (13)" or "5 passed, 0 failed in 0.4s".`,
+        `The change's INPUT artifacts (proposal.md, usecases.md, specs/) are present.`,
+        `Its design.md and tasks.md have been intentionally REMOVED so you regenerate`,
+        `them under the installed templates. Re-create the change end-to-end:`,
+        ``,
+        `1. Regenerate the design — run`,
+        `     synergyspec-selfevolving instructions design --change "${changeName}" --json`,
+        `   read the returned template + dependency files (proposal.md, usecases.md),`,
+        `   and author design.md using that template as the structure.`,
+        `2. Regenerate the tasks the same way`,
+        `   (synergyspec-selfevolving instructions tasks --change "${changeName}" --json),`,
+        `   then apply them — implement the code each task requires.`,
+        `3. Generate the change's tests (gen-test), then run the test runner (run-test).`,
+        ``,
+        `Do NOT modify any canonical workflow prompt, artifact TEMPLATE, or schema, and`,
+        `do NOT edit the frozen gen-test/run-test oracle. Write ONLY the change's own`,
+        `artifacts (design.md, tasks.md in the change dir) and the implementation`,
+        `source the tasks require.`,
+        ``,
+        `Output the test runner's SUMMARY LINE verbatim as the final line of your`,
+        `response, e.g. "Tests  12 passed | 1 failed (13)" or "5 passed, 0 failed in 0.4s".`,
     ].join('\n');
 }
+/**
+ * Generated artifacts a 're-do' baseline removes from the copied change dir
+ * before the rerun, so the agent re-authors them under the installed prior
+ * policy. design.md + tasks.md are the policy-shaped chain; the report files are
+ * post-implementation residue that would otherwise read the change as already
+ * applied (status keys doneness off file existence). The INPUT artifacts
+ * (proposal.md, usecases.md, specs/) — which define "the same task" — are KEPT.
+ */
+const REDO_REGENERATED_ARTIFACTS = [
+    'design.md',
+    'tasks.md',
+    'test-report.md',
+    'test-plan.md',
+    'spec-tests.md',
+    'spec-blast-radius.md',
+    'verification-report.md',
+];
+/**
+ * Reset a copied change dir to its inputs for a 're-do' baseline (see
+ * {@link REDO_REGENERATED_ARTIFACTS}). Best-effort: a missing artifact is fine.
+ */
+async function resetChangeArtifactsForRedo(changeDir) {
+    for (const rel of REDO_REGENERATED_ARTIFACTS) {
+        await fs.rm(path.join(changeDir, rel), { force: true }).catch(() => { });
+    }
+}
 const NODE_MODULES = 'node_modules';
 const CONFIG_DIR = '.synergyspec-selfevolving';
 const SCHEMAS_REL = path.join('synergyspec-selfevolving', 'schemas');
@@ -143,6 +200,7 @@ export async function runCriticAgent(opts) {
     const spawnImpl = opts.spawn ?? nodeSpawn;
     const timeoutMs = opts.timeoutMs ?? 600000;
     const homeDir = opts.homeDir ?? os.homedir();
+    const baselineMode = opts.baselineMode ?? 're-do';
     if (!Number.isInteger(opts.baselineVersion) || opts.baselineVersion < 0) {
         throw new Error(`runCriticAgent requires a non-negative integer baselineVersion, got ${JSON.stringify(opts.baselineVersion)}`);
     }
@@ -155,13 +213,31 @@ export async function runCriticAgent(opts) {
     try {
         // 1) Isolated worktree OUTSIDE the repo (git worktree --detach, else copy).
         worktreeMode = await createIsolatedWorktree(repoRoot, worktreePath, spawnImpl);
+        // 're-do' fidelity needs the detached-HEAD tree (pre-change code). The copy
+        // fallback (non-git repo) brings the LIVE tree — including the change's
+        // uncommitted implementation — so it cannot reach the pre-change state and
+        // degrades to a re-measure. Surface that so a degraded baseline is not silent.
+        if (baselineMode === 're-do' && worktreeMode === 'copy-fallback') {
+            console.warn(`[critic] re-do baseline degraded for "${opts.changeName}": no git worktree ` +
+                `(copy fallback) — the change's implementation could not be isolated, so the ` +
+                `baseline re-measures rather than re-does. Use a git repo, or set ` +
+                `selfEvolution.critic.baselineMode: re-test to silence this.`);
+        }
         // 2) Make it runnable: node_modules junction/symlink + untracked surfaces.
         await makeWorktreeRunnable(repoRoot, worktreePath, opts.changeName);
+        // 2b) 're-do': reset the copied change dir to its inputs so the rerun
+        //     RE-AUTHORS design+tasks under the prior policy (and re-implements on
+        //     the pre-change code the detached-HEAD worktree already holds). The
+        //     fidelity over 're-test' is that the design TEMPLATE is actually
+        //     exercised, so advantage reflects the policy change, not re-run noise.
+        if (baselineMode === 're-do') {
+            await resetChangeArtifactsForRedo(path.join(worktreePath, 'synergyspec-selfevolving', 'changes', opts.changeName));
+        }
         // 3) INSTALL 策略 policy vN (byte-for-byte snapshot files) — the fidelity
         //    fix the old GA replay never performed.
         await installPolicyVersion(repoRoot, worktreePath, opts.targetId, opts.baselineVersion);
-        // 4) Rerun headlessly with cwd = worktree (measurement only).
-        const prompt = assembleCriticPrompt(opts.changeName);
+        // 4) Rerun headlessly with cwd = worktree (re-do: regenerate; re-test: measure).
+        const prompt = assembleCriticPrompt(opts.changeName, baselineMode);
         const run = await runHeadlessAgent(prompt, {
             cwd: worktreePath,
             spawn: spawnImpl,
@@ -212,6 +288,10 @@ export async function runCriticAgent(opts) {
             verified,
             observedStatus,
             measuredAt,
+            ...(facts ? { testRunObserved: facts.testRunObserved } : {}),
+            ...(facts?.observedFailures && facts.observedFailures.length > 0
+                ? { observedFailures: facts.observedFailures }
+                : {}),
         };
         // Transcript: the claude session `.jsonl` when discovered, else stdout.
         let transcriptDiscovered = false;
@@ -256,6 +336,7 @@ export async function runCriticAgent(opts) {
             transcriptDiscovered,
             worktreePath,
             worktreeMode,
+            baselineMode,
         };
     }
     finally {

package/dist/core/self-evolution/episode-orchestrator.d.ts CHANGED Viewed

@@ -47,9 +47,10 @@
 import { spawn as nodeSpawn } from 'node:child_process';
 import type { LearnReport } from '../learn.js';
 import type { TrajectorySource } from '../trajectory/source.js';
-import { type PolicyResolveFiles } from './policy/policy-store.js';
+import { type PolicyResolveFiles, type PolicyLedgerEntry } from './policy/policy-store.js';
 import { type EpisodeStage } from './episode-store.js';
-import { type ArmObjective } from './critic-agent.js';
+import { type ArmObjective, type CriticBaselineMode } from './critic-agent.js';
+import { type RewardConfig } from './reward-aggregator.js';
 import { type RunEvolvingAgentResult } from './evolving-agent.js';
 /** The 主智能体 MAIN AGENT (policy vN+1) capture the orchestrator records. */
 export interface MainArmCapture {
@@ -107,6 +108,30 @@ export interface CaptureMainArmOptions {
 export declare function captureMainArm(opts: CaptureMainArmOptions): Promise<MainArmCapture>;
 /** The decision the orchestrator made on the main arm's edits. */
 export type EpisodeDecision = 'rolled-back' | 'kept' | 'abstained';
+/**
+ * Count the consecutive trailing rolled-back episodes in the 版本账本 ledger.
+ *
+ * A bad streak's ledger tail reads `…, evolve, rollback, evolve, rollback` — the
+ * 演进智能体 EVOLVING AGENT appends exactly one 'evolve' after each decision, so
+ * each counted rollback is reached by skipping the single 'evolve' that follows
+ * it. A 'kept' episode leaves a bare 'evolve' (no following rollback) which
+ * breaks the streak, as do 'init'/'refused'. Returns 0 when the head is not a
+ * rollback (the last episode kept). Pure.
+ */
+export declare function consecutiveRollbacks(ledger: readonly PolicyLedgerEntry[]): number;
+/**
+ * 步长 step-size schedule for the 演进智能体 EVOLVING AGENT's edit budget L.
+ *
+ * Backtracking-line-search / trust-region move (and SkillOpt's decaying edit
+ * budget): after an edit LOST ground and was rolled back, the next edit should
+ * be SMALLER — a smaller blast radius is cheaper to undo and its cause is more
+ * legible, and it keeps a struggling lineage from drifting via repeated
+ * full-size swings. HALVE the base budget once per consecutive rolled-back
+ * episode, never below `minBudget` (itself clamped to `base`, so a caller-shrunk
+ * base is never RAISED). A healthy lineage (no trailing rollback) keeps `base`.
+ * Pure.
+ */
+export declare function scheduledEditBudget(ledger: readonly PolicyLedgerEntry[], base: number, minBudget?: number): number;
 export interface RunEpisodeOptions {
     repoRoot: string;
     targetId: string;
@@ -123,6 +148,18 @@ export interface RunEpisodeOptions {
     advantageRollbackThreshold?: number;
     /** Edit budget L for the 演进智能体 EVOLVING AGENT. Default 40. */
     editBudget?: number;
+    /**
+     * 奖励智能体 REWARD AGENT judge-quality knobs (from `selfEvolution.reward`).
+     * Omitted ⇒ single sample, flag-only tamper (historical, zero extra spawns).
+     */
+    reward?: RewardConfig;
+    /**
+     * CRITIC AGENT（基线智能体）baseline construction (from `selfEvolution.critic`).
+     * Omitted ⇒ the critic's default 're-do' (regenerate the change under vN).
+     */
+    critic?: {
+        baselineMode?: CriticBaselineMode;
+    };
     /** Injectable spawn seam — threaded to ALL THREE agents. Defaults to node's spawn. */
     spawn?: typeof nodeSpawn;
     /** Injectable clock for the lock + episode id; defaults to `new Date()`. */