npm - synergyspec-selfevolving - Versions diffs - 1.4.0 → 2.1.0 - Mend

synergyspec-selfevolving 1.4.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (93)

package/README.md +31 -18
package/dist/commands/learn.d.ts +12 -1
package/dist/commands/learn.js +158 -11
package/dist/commands/self-evolution-episode.d.ts +177 -0
package/dist/commands/self-evolution-episode.js +431 -0
package/dist/commands/self-evolution.d.ts +12 -190
package/dist/commands/self-evolution.js +114 -866
package/dist/core/archive.d.ts +0 -1
package/dist/core/archive.js +0 -58
package/dist/core/artifact-graph/instruction-loader.d.ts +2 -4
package/dist/core/artifact-graph/instruction-loader.js +3 -31
package/dist/core/fitness/loss.d.ts +5 -5
package/dist/core/fitness/loss.js +4 -4
package/dist/core/fitness/test-failures.js +10 -2
package/dist/core/project-config.d.ts +19 -0
package/dist/core/project-config.js +96 -0
package/dist/core/self-evolution/candidate-fitness.d.ts +23 -1
package/dist/core/self-evolution/candidate-fitness.js +31 -5
package/dist/core/self-evolution/candidates.d.ts +0 -9
package/dist/core/self-evolution/critic-agent.d.ts +192 -0
package/dist/core/self-evolution/critic-agent.js +568 -0
package/dist/core/self-evolution/edits-contract.d.ts +53 -0
package/dist/core/self-evolution/edits-contract.js +89 -0
package/dist/core/self-evolution/episode-orchestrator.d.ts +234 -0
package/dist/core/self-evolution/episode-orchestrator.js +681 -0
package/dist/core/self-evolution/episode-store.d.ts +266 -0
package/dist/core/self-evolution/episode-store.js +573 -0
package/dist/core/self-evolution/evolution-switches.d.ts +1 -1
package/dist/core/self-evolution/evolution-switches.js +5 -10
package/dist/core/self-evolution/evolving-agent.d.ts +208 -0
package/dist/core/self-evolution/evolving-agent.js +535 -0
package/dist/core/self-evolution/host-harness.d.ts +14 -15
package/dist/core/self-evolution/host-harness.js +48 -23
package/dist/core/self-evolution/index.d.ts +11 -6
package/dist/core/self-evolution/index.js +20 -6
package/dist/core/self-evolution/line-diff.d.ts +60 -0
package/dist/core/self-evolution/line-diff.js +130 -0
package/dist/core/self-evolution/policy/fs-safe.d.ts +19 -0
package/dist/core/self-evolution/policy/fs-safe.js +89 -0
package/dist/core/self-evolution/policy/index.d.ts +13 -0
package/dist/core/self-evolution/policy/index.js +13 -0
package/dist/core/self-evolution/policy/policy-store.d.ts +217 -0
package/dist/core/self-evolution/policy/policy-store.js +774 -0
package/dist/core/self-evolution/policy/prediction-reconcile.d.ts +54 -0
package/dist/core/self-evolution/policy/prediction-reconcile.js +191 -0
package/dist/core/self-evolution/policy/reject-buffer.d.ts +55 -0
package/dist/core/self-evolution/policy/reject-buffer.js +170 -0
package/dist/core/self-evolution/promote.d.ts +1 -1
package/dist/core/self-evolution/promote.js +6 -33
package/dist/core/self-evolution/promotion.js +1 -2
package/dist/core/self-evolution/reward-agent.d.ts +379 -0
package/dist/core/self-evolution/reward-agent.js +940 -0
package/dist/core/self-evolution/reward-aggregator.d.ts +59 -0
package/dist/core/self-evolution/reward-aggregator.js +262 -0
package/dist/core/self-evolution/scope-gate.d.ts +66 -0
package/dist/core/self-evolution/scope-gate.js +107 -0
package/dist/core/self-evolution/success-channel.js +2 -2
package/dist/core/self-evolution/tamper-check.d.ts +24 -0
package/dist/core/self-evolution/tamper-check.js +236 -0
package/dist/core/self-evolution/tool-evolution.js +2 -13
package/dist/core/self-evolution/verdict.d.ts +8 -5
package/dist/core/self-evolution/verdict.js +4 -7
package/dist/core/templates/workflows/gen-tests.js +1 -1
package/dist/core/templates/workflows/learn.d.ts +3 -2
package/dist/core/templates/workflows/learn.js +21 -18
package/dist/core/templates/workflows/self-evolving.d.ts +6 -4
package/dist/core/templates/workflows/self-evolving.js +62 -172
package/dist/core/trajectory/scrub.d.ts +27 -0
package/dist/core/trajectory/scrub.js +79 -0
package/dist/core/trajectory/skeleton.d.ts +27 -1
package/dist/core/trajectory/skeleton.js +152 -8
package/dist/dashboard/data.d.ts +25 -51
package/dist/dashboard/data.js +68 -180
package/dist/dashboard/react-client.js +458 -503
package/dist/dashboard/react-styles.js +3 -3
package/dist/dashboard/server.js +23 -17
package/dist/ui/ascii-patterns.d.ts +7 -15
package/dist/ui/ascii-patterns.js +123 -54
package/dist/ui/welcome-screen.d.ts +0 -14
package/dist/ui/welcome-screen.js +16 -35
package/package.json +1 -1
package/dist/core/self-evolution/ga-selection.d.ts +0 -94
package/dist/core/self-evolution/ga-selection.js +0 -153
package/dist/core/self-evolution/proposer-agent.d.ts +0 -182
package/dist/core/self-evolution/proposer-agent.js +0 -326
package/dist/core/self-evolution/replay-runner.d.ts +0 -100
package/dist/core/self-evolution/replay-runner.js +0 -170
package/dist/core/self-evolution/replay.d.ts +0 -45
package/dist/core/self-evolution/replay.js +0 -56
package/dist/core/self-evolution/template-variants.d.ts +0 -62
package/dist/core/self-evolution/template-variants.js +0 -171
package/dist/core/self-evolution/trajectory.d.ts +0 -65
package/dist/core/self-evolution/trajectory.js +0 -185

package/dist/core/self-evolution/evolving-agent.js ADDED Viewed

@@ -0,0 +1,535 @@
+import { promises as fs } from 'node:fs';
+import * as path from 'node:path';
+import { runHeadlessAgent } from './host-harness.js';
+import { evaluateToolEvolutionCandidate, } from './tool-evolution.js';
+import { validateCandidateEdits, CanonicalProposerNoOp, CanonicalProposerOutputInvalid, CanonicalProposerInvocationError, renderUnifiedDiff, } from './edits-contract.js';
+import { requireCanonicalTarget } from './canonical-targets.js';
+import { resolveTargetLocalFiles } from './local-targets.js';
+import { isEvidenceComplete } from './promote.js';
+import { renderDoNotPruneBlock, readProtections, listExemplarFiles, } from './success-channel.js';
+import { readRejectBuffer } from './policy/reject-buffer.js';
+import { advancePolicyVersion, recordEvolutionRefused, } from './policy/policy-store.js';
+import { advanceEpisodeStage, episodeDir, readEpisode, } from './episode-store.js';
+import { countChangedLines, } from './line-diff.js';
+import { checkScopeWithinDiagnosis, } from './scope-gate.js';
+/**
+ * Default edit budget L: at most this many changed lines (added + removed).
+ * `runEvolvingAgent` uses this value when the caller supplies no `editBudget`.
+ */
+export const DEFAULT_EVOLVING_AGENT_EDIT_BUDGET = 40;
+/**
+ * Floor for the orchestrator's failure-driven 步长 step-size schedule: after a
+ * rolled-back edit the next episode's budget is halved toward this floor (never
+ * below it), so a struggling lineage takes smaller, more legible steps instead
+ * of another full-size swing — the backtracking-line-search / trust-region move
+ * (shrink the step after a step that lost ground; SkillOpt's decaying edit
+ * budget). The flat default above stays the ceiling for a healthy lineage. The
+ * schedule itself lives in the orchestrator (`scheduledEditBudget`); the
+ * 演进智能体 EVOLVING AGENT just receives the resolved budget as `editBudget`.
+ *
+ * Nothing in this module reads this constant or clamps `editBudget` to it; it
+ * is only declared and exported here.
+ */
+export const MIN_EVOLVING_AGENT_EDIT_BUDGET = 8;
+/**
+ * Most recent 否决缓冲 reject-buffer entries surfaced in the prompt. Passed as
+ * the third argument to `readRejectBuffer` in `runEvolvingAgent`.
+ */
+const REJECT_BUFFER_PROMPT_LIMIT = 5;
+/**
+ * Accepted values for a gap's optional `weaknessClass`. `normalizeDiagnosis`
+ * copies the field through only when its value is a member of this set.
+ */
+const WEAKNESS_CLASSES = new Set([
+    'forgetting',
+    'boundary',
+    'rare',
+    'logic',
+    'verbosity',
+    'other',
+]);
+/**
+ * Accepted values for a gap's optional `severity`. `normalizeDiagnosis` copies
+ * the field through only when its value is a member of this set.
+ */
+const GAP_SEVERITIES = new Set(['high', 'medium', 'low']);
+/**
+ * Coerce an arbitrary parsed JSON value into the diagnosis shape this module
+ * works with. Anything that is not an object is treated as `{}`.
+ *
+ * Field rules:
+ *   - `abstained` is true only when the input carries the literal `true`;
+ *   - `gaps` keeps only entries whose `file` AND `section` are both strings;
+ *     `weaknessClass` / `severity` are copied only when they are strings found
+ *     in WEAKNESS_CLASSES / GAP_SEVERITIES, otherwise left off the gap;
+ *   - `errors` keeps only the string entries (empty array if not an array);
+ *   - `textualGradient` is the input string, or '' when it is not a string;
+ *   - `advantage` is the input number when finite, otherwise null.
+ *
+ * @param raw Parsed contents of a diagnosis file (any JSON value).
+ * @returns `{ abstained, gaps, errors, textualGradient, advantage }`.
+ */
+function normalizeDiagnosis(raw) {
+    const o = (raw && typeof raw === 'object' ? raw : {});
+    const gaps = [];
+    if (Array.isArray(o.gaps)) {
+        for (const g of o.gaps) {
+            const file = g?.file;
+            const section = g?.section;
+            if (typeof file === 'string' && typeof section === 'string') {
+                const gap = { file, section };
+                // Carry the OPTIONAL weaknessClass/severity through when present and
+                // valid; tolerate absence (old diagnoses have neither).
+                const wc = g?.weaknessClass;
+                if (typeof wc === 'string' && WEAKNESS_CLASSES.has(wc)) {
+                    gap.weaknessClass = wc;
+                }
+                const sev = g?.severity;
+                if (typeof sev === 'string' && GAP_SEVERITIES.has(sev)) {
+                    gap.severity = sev;
+                }
+                gaps.push(gap);
+            }
+        }
+    }
+    const errors = Array.isArray(o.errors)
+        ? o.errors.filter((e) => typeof e === 'string')
+        : [];
+    const advantageRaw = o.advantage;
+    const advantage = typeof advantageRaw === 'number' && Number.isFinite(advantageRaw) ? advantageRaw : null;
+    return {
+        abstained: o.abstained === true,
+        gaps,
+        errors,
+        textualGradient: typeof o.textualGradient === 'string' ? o.textualGradient : '',
+        advantage,
+    };
+}
+/**
+ * Fixed opening lines of the prompt (role statement). `preludeLines` spreads
+ * these first, ahead of the budget-dependent instructions.
+ */
+const PRELUDE_HEAD = [
+    'You are the 演进智能体 EVOLVING AGENT (optimizer.step) for a self-evolving',
+    'SynergySpec. The 策略 POLICY is the design template below (the 主智能体 MAIN',
+    "AGENT's 「权重」). The 奖励智能体 REWARD AGENT has already scored the last",
+    'episode and produced the DIAGNOSIS below; you NEVER score — you make ONE',
+    'bounded edit that acts on its 文本梯度 textual gradient.',
+];
+/**
+ * Build the instruction prelude of the prompt: the fixed PRELUDE_HEAD, the
+ * one-bounded-edit instruction with the changed-line budget interpolated, the
+ * refusal shape, the concrete-edit `json:patch` shape, and the rules list.
+ *
+ * Despite the name, this returns ONE string (the lines joined with '\n'), not
+ * an array.
+ *
+ * @param editBudget Maximum changed lines (added + removed) quoted to the model.
+ * @returns The prelude text as a single newline-joined string.
+ */
+function preludeLines(editBudget) {
+    return [
+        ...PRELUDE_HEAD,
+        '',
+        'Make EXACTLY ONE bounded edit totalling no more than ' +
+            `${editBudget} changed lines (added + removed), plus a checkable prediction.`,
+        'Stay STRICTLY inside the sections the DIAGNOSIS names — do not rewrite an',
+        'unrelated heading or key just because the file is editable.',
+        '',
+        'If the diagnosis is too weak to name a concrete edit, REFUSE — emit the',
+        'refusal shape instead of inventing a change:',
+        '',
+        '```json:patch',
+        '{"edits": [], "refusal": "<one sentence: why no concrete edit is named>"}',
+        '```',
+        '',
+        'Otherwise emit EXACTLY ONE fenced block tagged `json:patch` and nothing else:',
+        '',
+        '```json:patch',
+        '{"rationale": "<why this edit and the expected behavioral delta>",',
+        ' "prediction": {"metric": "loss" | "passRate" | "healthPenalty",',
+        '                "direction": "down" | "up",',
+        '                "checkBy": "<one sentence: how a later episode settles this>"},',
+        ' "edits": [{"relPath": "<one of the allowed files>", "content": "<FULL new file contents>"}]}',
+        '```',
+        '',
+        'Rules:',
+        '- Only edit files listed under "CANONICAL TARGET" below. Never invent paths.',
+        "- Each edit's `content` is the COMPLETE new file, not a patch fragment.",
+        '- You NEVER score and you NEVER touch the gate/oracle files.',
+    ].join('\n');
+}
+/**
+ * Severity rank used to ORDER gaps in the prompt. HIGH gaps render first so the
+ * bounded edit is aimed at the most-severe failure mode. A LOWER number sorts
+ * EARLIER. See {@link orderGapsForPrompt} for the exact deterministic rule,
+ * including the rank given to gaps that carry no severity.
+ */
+const SEVERITY_RANK = { high: 0, medium: 1, low: 2 };
+/**
+ * Order gaps for the prompt so HIGH severity comes first. Deterministic rule:
+ *   - HIGH (rank 0) before everything;
+ *   - explicit MEDIUM (rank 1) next;
+ *   - gaps WITHOUT a severity get rank 1.5, so they sort after every explicit
+ *     medium and before every explicit LOW (rank 2);
+ *   - the sort is STABLE — gaps of equal effective rank keep their original
+ *     relative order (`Array.prototype.sort` is stable in modern V8/Node).
+ * Pure: returns a new array, never mutates the input.
+ *
+ * @param gaps Array of gap objects; only `severity` is read.
+ * @returns A new array holding the same gap objects in prompt order.
+ */
+function orderGapsForPrompt(gaps) {
+    const rank = (g) => g.severity !== undefined ? SEVERITY_RANK[g.severity] : 1.5;
+    return [...gaps].sort((a, b) => rank(a) - rank(b));
+}
+/**
+ * Render the optional `[severity · weaknessClass]` annotation for a gap, with a
+ * trailing space, so the bounded edit is AIMED. Emits nothing (empty string)
+ * when BOTH fields are absent — no empty brackets, so an un-annotated gap
+ * renders exactly as before. When only one field is present the brackets hold
+ * just that field; when both are present severity comes first.
+ *
+ * @param g Gap object; reads `severity` and `weaknessClass`.
+ * @returns The bracketed tag followed by one space, or ''.
+ */
+function renderGapTag(g) {
+    const inner = [];
+    if (g.severity !== undefined)
+        inner.push(g.severity);
+    if (g.weaknessClass !== undefined)
+        inner.push(g.weaknessClass);
+    return inner.length > 0 ? `[${inner.join(' · ')}] ` : '';
+}
+/**
+ * Assemble the EVOLVING AGENT prompt. Order is stable and sections are
+ * omitted-when-empty so prompts on runs with no reject-buffer / no protections
+ * stay byte-identical. The editable files are fenced as `<<FILE: relPath>>`
+ * ... `<<END FILE>>`.
+ *
+ * Section order:
+ *   1. prelude (`preludeLines(input.editBudget)`);
+ *   2. `# CANONICAL TARGET` line (id, kind, comma-joined files);
+ *   3. every entry of `input.currentFiles`, fenced;
+ *   4. `# DIAGNOSIS` heading (always emitted), then — each only when present —
+ *      the advantage line (number only), the non-blank errors, the gaps ordered
+ *      by {@link orderGapsForPrompt}, and the trimmed textual gradient;
+ *   5. reject-buffer entries (only when `input.rejectBuffer` is non-empty); each
+ *      shows the tried textual gradient, falling back to
+ *      `editSummary.rationaleExcerpt` when that gradient trims to '';
+ *   6. prediction-calibration note (only when it trims to non-empty);
+ *   7. do-not-prune block (only when it trims to non-empty).
+ *
+ * A gap whose `section` is exactly '*' is rendered as "(whole file)".
+ *
+ * @param input `{ target, currentFiles, diagnosis, editBudget, rejectBuffer?,
+ *   calibrationNote?, doNotPrune? }`.
+ * @returns The full prompt as one newline-joined string.
+ */
+export function assembleEvolvingAgentPrompt(input) {
+    const parts = [preludeLines(input.editBudget), ''];
+    const t = input.target;
+    parts.push(`# CANONICAL TARGET: ${t.id} kind=${t.kind} files=${t.files.join(', ')}`, '');
+    for (const f of input.currentFiles) {
+        parts.push(`<<FILE: ${f.relPath}>>`, f.content, '<<END FILE>>', '');
+    }
+    const d = input.diagnosis;
+    parts.push('# DIAGNOSIS (from the 奖励智能体 REWARD AGENT — it scored, you edit)');
+    if (typeof d.advantage === 'number') {
+        parts.push(`advantage ＝ reward(主臂) − reward(基线臂): ${d.advantage.toFixed(3)} (negative ⇒ the last edit lost ground)`);
+    }
+    const errors = (d.errors ?? []).filter((e) => e && e.trim().length > 0);
+    if (errors.length > 0) {
+        parts.push('errors:');
+        for (const e of errors)
+            parts.push(`- ${e}`);
+    }
+    const gaps = orderGapsForPrompt(d.gaps ?? []);
+    if (gaps.length > 0) {
+        parts.push('gaps (your edit must stay inside these sections):');
+        for (const g of gaps) {
+            const where = g.section === '*' ? `${g.file} (whole file)` : `${g.file} §"${g.section}"`;
+            const tag = renderGapTag(g);
+            parts.push(`- ${tag}${where}`);
+        }
+    }
+    const gradient = (d.textualGradient ?? '').trim();
+    if (gradient.length > 0) {
+        parts.push('', '文本梯度 textual gradient:', gradient);
+    }
+    const rejects = input.rejectBuffer ?? [];
+    if (rejects.length > 0) {
+        parts.push('', '# 否决缓冲 REJECT-BUFFER (directions already vetoed; do not repeat them)');
+        for (const r of rejects) {
+            const adv = typeof r.advantage === 'number' ? ` advantage=${r.advantage.toFixed(3)}` : '';
+            parts.push(`- [${r.reason}${adv}] ${r.textualGradientTried.trim() || r.editSummary.rationaleExcerpt}`);
+        }
+    }
+    const calibration = (input.calibrationNote ?? '').trim();
+    if (calibration.length > 0) {
+        parts.push('', '# 预测校准 PREDICTION CALIBRATION (your earlier predictions, settled by measurement)');
+        parts.push('A repeatedly-refuted direction is a weak bet — weight your confidence (and', 'this edit) accordingly. This NEVER blocks you; it is advisory context only.', calibration);
+    }
+    const doNotPrune = (input.doNotPrune ?? '').trim();
+    if (doNotPrune.length > 0) {
+        parts.push('', '# DO-NOT-PRUNE (成功保护 — load-bearing in passing runs)');
+        parts.push('The sections below are implicated in verified-PASSING runs. They must not be', 'deleted or hollowed out — make your bounded edit elsewhere.', doNotPrune);
+    }
+    return parts.join('\n');
+}
+/** Accepted values for `prediction.metric`; enforced by `parsePrediction`. */
+const PREDICTION_METRICS = new Set(['loss', 'passRate', 'healthPenalty']);
+/**
+ * Parse the model's single `json:patch` block. Accepts EITHER the refusal shape
+ * (`{edits: [], refusal: string}`) OR a concrete edit (`{rationale, prediction,
+ * edits[]}`).
+ *
+ * Edits are NOT yet scope-validated here (the caller runs the static gate over
+ * them); this only enforces the SHAPE of the contract.
+ *
+ * Details worth knowing:
+ *   - the refusal branch is taken only when `edits` is EMPTY; a `refusal`
+ *     string that accompanies non-empty edits is ignored;
+ *   - each edit's `relPath` has backslashes replaced with '/';
+ *   - a missing or non-string `rationale` becomes ''.
+ *
+ * NOTE(review): the closing fence is matched lazily, i.e. at the FIRST triple
+ * backtick after the opening tag. An edit whose `content` itself contains a
+ * triple-backtick fence would therefore be cut short and surface as a JSON
+ * parse failure — confirm whether editable target files can contain fenced
+ * code blocks.
+ *
+ * @param text Raw agent stdout.
+ * @returns `{ kind: 'refusal', reason }` or
+ *   `{ kind: 'edit', rationale, prediction, edits }`.
+ * @throws {CanonicalProposerOutputInvalid} when there is no block or more than
+ *   one, the block is not parseable JSON or not an object, `edits` is not an
+ *   array, the prediction is missing/invalid, or an edit lacks a string
+ *   `relPath` / `content`.
+ * @throws {CanonicalProposerNoOp} when `edits` is empty and there is no
+ *   non-blank `refusal` string.
+ */
+export function parseEvolvingAgentResponse(text) {
+    const fenceRe = /```json:patch\s*([\s\S]*?)```/g;
+    const matches = [];
+    let m;
+    while ((m = fenceRe.exec(text)) !== null)
+        matches.push(m[1]);
+    if (matches.length === 0) {
+        throw new CanonicalProposerOutputInvalid('no `json:patch` fenced block found in response');
+    }
+    if (matches.length > 1) {
+        throw new CanonicalProposerOutputInvalid(`expected exactly 1 \`json:patch\` block, found ${matches.length}`);
+    }
+    let parsed;
+    try {
+        parsed = JSON.parse(matches[0].trim());
+    }
+    catch (err) {
+        throw new CanonicalProposerOutputInvalid(`failed to parse JSON inside patch block: ${err instanceof Error ? err.message : String(err)}`);
+    }
+    if (!parsed || typeof parsed !== 'object') {
+        throw new CanonicalProposerOutputInvalid('patch block must be a JSON object');
+    }
+    const o = parsed;
+    const rawEdits = o.edits;
+    if (!Array.isArray(rawEdits)) {
+        throw new CanonicalProposerOutputInvalid('patch block must contain an `edits` array');
+    }
+    // Refusal shape: empty edits + a refusal string.
+    const refusal = o.refusal;
+    if (rawEdits.length === 0) {
+        if (typeof refusal === 'string' && refusal.trim().length > 0) {
+            return { kind: 'refusal', reason: refusal.trim() };
+        }
+        // Empty edits with no refusal reason is a malformed no-op, not a refusal.
+        throw new CanonicalProposerNoOp();
+    }
+    // Concrete-edit shape: validate prediction + edit shapes.
+    const prediction = parsePrediction(o.prediction);
+    const edits = [];
+    for (const e of rawEdits) {
+        const relPath = e?.relPath;
+        const content = e?.content;
+        if (typeof relPath !== 'string' || typeof content !== 'string') {
+            throw new CanonicalProposerOutputInvalid('edit must have string relPath and string content');
+        }
+        edits.push({ relPath: relPath.replace(/\\/g, '/'), content });
+    }
+    const rationale = typeof o.rationale === 'string' ? o.rationale.trim() : '';
+    return { kind: 'edit', rationale, prediction, edits };
+}
+/**
+ * Validate the `prediction` object of a concrete edit and return a cleaned
+ * copy containing only `metric`, `direction` and the trimmed `checkBy`.
+ *
+ * @param raw The `prediction` value taken from the parsed patch block.
+ * @returns `{ metric, direction, checkBy }`.
+ * @throws {CanonicalProposerOutputInvalid} when `raw` is not an object, when
+ *   `metric` is not in PREDICTION_METRICS, when `direction` is neither 'down'
+ *   nor 'up', or when `checkBy` is not a non-blank string.
+ */
+function parsePrediction(raw) {
+    if (!raw || typeof raw !== 'object') {
+        throw new CanonicalProposerOutputInvalid('a concrete edit requires a `prediction` object {metric, direction, checkBy}');
+    }
+    const p = raw;
+    if (typeof p.metric !== 'string' || !PREDICTION_METRICS.has(p.metric)) {
+        throw new CanonicalProposerOutputInvalid("prediction.metric must be 'loss' | 'passRate' | 'healthPenalty'");
+    }
+    if (p.direction !== 'down' && p.direction !== 'up') {
+        throw new CanonicalProposerOutputInvalid("prediction.direction must be 'down' | 'up'");
+    }
+    if (typeof p.checkBy !== 'string' || p.checkBy.trim().length === 0) {
+        throw new CanonicalProposerOutputInvalid('prediction.checkBy must be a non-empty string');
+    }
+    return {
+        metric: p.metric,
+        direction: p.direction,
+        checkBy: p.checkBy.trim(),
+    };
+}
+/**
+ * Read + structurally normalize the episode's diagnosis.json (located directly
+ * under `episodeDir(repoRoot, episodeId)`).
+ *
+ * A missing file (ENOENT) and a file that is not valid JSON both yield an
+ * ABSTAINED diagnosis with no gaps rather than an error. Any other read error
+ * is rethrown. Valid JSON is passed through `normalizeDiagnosis`.
+ *
+ * @param repoRoot Repository root used to locate the episode directory.
+ * @param episodeId Episode whose diagnosis is read.
+ * @returns The normalized diagnosis object.
+ */
+async function readDiagnosis(repoRoot, episodeId) {
+    const file = path.join(episodeDir(repoRoot, episodeId), 'diagnosis.json');
+    let raw;
+    try {
+        raw = await fs.readFile(file, 'utf8');
+    }
+    catch (err) {
+        if (err.code === 'ENOENT') {
+            // No diagnosis ⇒ nothing to act on ⇒ treated as an abstain (refuse-to-spawn).
+            return { abstained: true, gaps: [], errors: [], textualGradient: '', advantage: null };
+        }
+        throw err;
+    }
+    let parsed;
+    try {
+        parsed = JSON.parse(raw);
+    }
+    catch {
+        // A corrupt diagnosis is not a nameable gap ⇒ abstain (refuse-to-spawn).
+        return { abstained: true, gaps: [], errors: [], textualGradient: '', advantage: null };
+    }
+    return normalizeDiagnosis(parsed);
+}
+/**
+ * Read the MAIN arm's objective.json as an {@link EvidenceReport}, from
+ * `<episodeDir>/main-arm/objective.json`.
+ *
+ * Returns null when the file does not exist (ENOENT) or is not valid JSON; any
+ * other read error is rethrown. The parsed value is returned as-is — its shape
+ * is NOT validated here.
+ *
+ * @param repoRoot Repository root used to locate the episode directory.
+ * @param episodeId Episode whose main-arm objective is read.
+ * @returns The parsed JSON value, or null.
+ */
+async function readMainArmObjective(repoRoot, episodeId) {
+    const file = path.join(episodeDir(repoRoot, episodeId), 'main-arm', 'objective.json');
+    let raw;
+    try {
+        raw = await fs.readFile(file, 'utf8');
+    }
+    catch (err) {
+        if (err.code === 'ENOENT')
+            return null;
+        throw err;
+    }
+    try {
+        return JSON.parse(raw);
+    }
+    catch {
+        return null;
+    }
+}
+/**
+ * Build the static-gate ToolEvolutionCandidate from the parsed edit and return
+ * the result of `evaluateToolEvolutionCandidate` for it (the caller reads
+ * `passed`, `score` and `findings` from that result).
+ *
+ * The diff is one `renderUnifiedDiff` per edit, joined with '\n'. An edit whose
+ * `relPath` is not among `currentFiles` is diffed against an empty string.
+ * Keys of the lookup map have backslashes replaced with '/'.
+ *
+ * @param edit Parsed concrete edit (`rationale`, `prediction`, `edits`).
+ * @param currentFiles Current `{ relPath, content }` entries of the target.
+ * @returns Whatever `evaluateToolEvolutionCandidate` returns.
+ */
+function buildToolEvolutionReport(edit, currentFiles) {
+    const oldByPath = new Map(currentFiles.map((f) => [f.relPath.replace(/\\/g, '/'), f.content]));
+    const diff = edit.edits
+        .map((e) => renderUnifiedDiff(e.relPath, oldByPath.get(e.relPath) ?? '', e.content))
+        .join('\n');
+    return evaluateToolEvolutionCandidate({
+        changedFiles: edit.edits.map((e) => e.relPath),
+        diff,
+        // The rationale carries the user-facing "why"; the static guard scans it
+        // for a rationale signal. Default a generic line so a terse rationale does
+        // not spuriously fail the guard.
+        summary: edit.rationale || 'EVOLVING AGENT bounded edit acting on the textual gradient',
+        // The prediction IS the verification evidence (a falsifiable, checkable bet).
+        evidence: `prediction: ${edit.prediction.metric} ${edit.prediction.direction} — ${edit.prediction.checkBy} (verification: checked by a later episode's measurement)`,
+        requireDiff: true,
+    });
+}
+/**
+ * Compose the human-readable feedback for a repair re-prompt. Currently an
+ * identity function: the rejection reason is returned unchanged.
+ *
+ * @param reason Message of the error that rejected the previous attempt.
+ * @returns The same string.
+ */
+function gateFeedback(reason) {
+    return reason;
+}
+/**
+ * Run the 演进智能体 EVOLVING AGENT against an already-scored episode.
+ *
+ * Flow (as implemented below):
+ *   0. Code-side refuse-to-spawn: diagnosis.abstained, no gaps, or a target that
+ *      resolves to no editable local files ⇒ not-spawned.
+ *   1. Assemble + spawn (fresh context) and parse with bounded repair. INSIDE
+ *      the repair loop, in this order: shape parse → `validateCandidateEdits`
+ *      → ≤ L changed-line budget → 范围⊆诊断 scope gate → static guard
+ *      (tool-evolution). A CanonicalProposerOutputInvalid from any of these is
+ *      re-prompted with the violation appended; the agent is spawned at most
+ *      `maxRepairAttempts + 1` times.
+ *   2. Model refusal ⇒ {kind:'refused'} + a 'refused' ledger entry; the episode
+ *      stage advances to 'evolution-refused'. The observed-GREEN gate is NOT
+ *      consulted on this path.
+ *   3. POST-LOOP gate: observed-GREEN, run once and not repairable.
+ *   4. All green ⇒ advancePolicyVersion writes the next version; episode stage
+ *      advances to 'evolved'.
+ *
+ * NOTE(review): observed-GREEN depends only on the pre-edit objective.json, yet
+ * it is evaluated after the agent has already been spawned — confirm that
+ * paying for the spawn before this check is intended.
+ *
+ * @param opts Options object. Reads `repoRoot`, `episodeId`, `targetId`;
+ *   optional `editBudget` (default DEFAULT_EVOLVING_AGENT_EDIT_BUDGET),
+ *   `maxRepairAttempts` (default 2, floored at 0), `spawn` and `binary`
+ *   (forwarded to `runHeadlessAgent`), `protections` and `exemplarPaths`
+ *   (override the on-disk reads), and `calibrationNote`.
+ * @returns `{ kind: 'not-spawned', reason }`,
+ *   `{ kind: 'refused', reason, ledgerEntry }`, or
+ *   `{ kind: 'evolved', ledgerEntry }`.
+ * @throws {CanonicalProposerInvocationError} when the agent run exits non-zero
+ *   or produces empty stdout (never repaired).
+ * @throws {CanonicalProposerOutputInvalid} when the last allowed attempt is
+ *   still rejected, or when the observed-GREEN gate fails.
+ * @throws Any other error raised inside the loop is rethrown without a repair
+ *   attempt, since only CanonicalProposerOutputInvalid instances are retried.
+ */
+export async function runEvolvingAgent(opts) {
+    const repoRoot = path.resolve(opts.repoRoot);
+    const { episodeId, targetId } = opts;
+    const editBudget = opts.editBudget ?? DEFAULT_EVOLVING_AGENT_EDIT_BUDGET;
+    const maxRepairAttempts = Math.max(0, opts.maxRepairAttempts ?? 2);
+    // Fail closed: the episode must exist (and tells us nothing else we need yet).
+    const episode = await readEpisode(repoRoot, episodeId);
+    void episode;
+    // ── 0. Code-side refuse-to-spawn ───────────────────────────────────────────
+    const diagnosis = await readDiagnosis(repoRoot, episodeId);
+    if (diagnosis.abstained) {
+        return {
+            kind: 'not-spawned',
+            reason: '奖励智能体 REWARD AGENT 弃权 abstained — no nameable gap to act on',
+        };
+    }
+    if (diagnosis.gaps.length === 0) {
+        return {
+            kind: 'not-spawned',
+            reason: 'diagnosis names no gaps — nothing for the 演进智能体 EVOLVING AGENT to scope an edit to',
+        };
+    }
+    // Resolve the target's editable local files (the lineage surface).
+    const target = requireCanonicalTarget(targetId);
+    const resolved = await resolveTargetLocalFiles(targetId, repoRoot);
+    const currentFiles = resolved.files.map((f) => ({
+        relPath: f.relPath,
+        content: f.content,
+    }));
+    if (currentFiles.length === 0) {
+        return {
+            kind: 'not-spawned',
+            reason: `target ${targetId} resolves to no editable local files in this repo`,
+        };
+    }
+    const allowedFiles = currentFiles.map((f) => f.relPath);
+    const rejectBuffer = await readRejectBuffer(repoRoot, targetId, REJECT_BUFFER_PROMPT_LIMIT);
+    // 成功保护 DO-NOT-PRUNE: read protections + exemplars FRESH from disk (mirroring
+    // the reject-buffer read just above), so the green-run-mined load-bearing
+    // sections are actually surfaced to the bounded edit. `learn` mines these
+    // BEFORE each episode, so this episode's protections are already on disk. The
+    // opts.* fields stay a hermetic TEST seam; an absent file reads as [].
+    const protections = opts.protections ?? (await readProtections(repoRoot, targetId));
+    const exemplarPaths = opts.exemplarPaths ?? (await listExemplarFiles(repoRoot, targetId));
+    const doNotPrune = renderDoNotPruneBlock(protections, exemplarPaths);
+    const basePrompt = assembleEvolvingAgentPrompt({
+        target,
+        currentFiles,
+        diagnosis,
+        editBudget,
+        rejectBuffer,
+        doNotPrune,
+        ...(opts.calibrationNote ? { calibrationNote: opts.calibrationNote } : {}),
+    });
+    // ── 1. Spawn + parse with bounded repair (parse / budget / gate-3) ──────────
+    let feedback = null;
+    let parsed = null;
+    let scopeResult = null;
+    for (let attempt = 0;; attempt++) {
+        const prompt = feedback === null
+            ? basePrompt
+            : `${basePrompt}\n\n# PREVIOUS ATTEMPT WAS REJECTED\n${feedback}\n` +
+                'Re-emit EXACTLY ONE ```json:patch fenced block — either a refusal ' +
+                '({"edits": [], "refusal": string}) or a single bounded edit ' +
+                '({"rationale", "prediction", "edits"}), staying inside the diagnosed ' +
+                'sections and within the changed-line budget.';
+        const run = await runHeadlessAgent(prompt, {
+            cwd: repoRoot,
+            spawn: opts.spawn,
+            binaryOverride: opts.binary,
+        });
+        if (run.exitCode !== 0 || run.stdout.length === 0) {
+            // Agent crash is NOT repaired (mirrors the proposer's invocation contract).
+            throw new CanonicalProposerInvocationError(run.stderr);
+        }
+        try {
+            const candidate = parseEvolvingAgentResponse(run.stdout);
+            if (candidate.kind === 'refusal') {
+                parsed = candidate;
+                break;
+            }
+            // Static-shape edit: validate scope-to-target + frozen freeze here so a
+            // bad path is a REPAIRABLE failure (same class as the proposer).
+            validateCandidateEdits(candidate.edits, allowedFiles);
+            // ≤ L budget (repairable).
+            const changed = countChangedLines(candidate.edits, currentFiles);
+            if (changed > editBudget) {
+                throw new CanonicalProposerOutputInvalid(`edit changes ${changed} lines, over the ${editBudget}-line budget (L) — make a smaller, more targeted edit`);
+            }
+            // 范围⊆诊断 (gate-3, repairable).
+            const scope = checkScopeWithinDiagnosis({
+                edits: candidate.edits,
+                currentFiles,
+                gaps: diagnosis.gaps,
+            });
+            if (!scope.pass) {
+                const where = scope.violations
+                    .map((v) => `${v.file} §"${v.section}"`)
+                    .join(', ');
+                throw new CanonicalProposerOutputInvalid(`edit touches sections outside the diagnosis (范围⊆诊断 violated): ${where} — only edit the diagnosed sections`);
+            }
+            // static guard (tool-evolution) — RUN INSIDE the repair loop so a
+            // content-driven failure (missing rationale / validation evidence / diff)
+            // becomes a REPAIRABLE re-prompt, bounded by maxRepairAttempts, rather than
+            // a throw past the loop. observed-GREEN stays OUTSIDE (the edit cannot
+            // influence the pre-edit main-arm objective — retrying it is a category error).
+            const toolReport = buildToolEvolutionReport(candidate, currentFiles);
+            if (!toolReport.passed) {
+                const findings = toolReport.findings.map((f) => ({
+                    severity: f.severity,
+                    code: f.code,
+                    message: f.message,
+                }));
+                const errs = findings
+                    .filter((f) => f.severity === 'error')
+                    .map((f) => `${f.code}: ${f.message}`);
+                throw new CanonicalProposerOutputInvalid(`static gate failed (score ${toolReport.score.toFixed(2)}): ${errs.join('; ') || 'score below threshold'}`);
+            }
+            parsed = candidate;
+            scopeResult = scope;
+            break;
+        }
+        catch (err) {
+            if (err instanceof CanonicalProposerOutputInvalid && attempt < maxRepairAttempts) {
+                feedback = gateFeedback(err.message);
+                continue;
+            }
+            throw err;
+        }
+    }
+    // ── 2. Model refusal ⇒ 'refused' ledger entry + episode 'evolution-refused' ─
+    if (parsed.kind === 'refusal') {
+        const ledgerEntry = await recordEvolutionRefused({
+            repoRoot,
+            targetId,
+            episodeId,
+            reason: parsed.reason,
+        });
+        await advanceEpisodeStage({ repoRoot, episodeId, stage: 'evolution-refused' });
+        return { kind: 'refused', reason: parsed.reason, ledgerEntry };
+    }
+    const edit = parsed; // narrowed to EvolvingAgentEdit
+    // scopeResult was set alongside the accepted parse; reasserted defensively.
+    if (!scopeResult || !scopeResult.pass) {
+        // Unreachable on the accept path; fail closed rather than evolve out of scope.
+        throw new CanonicalProposerOutputInvalid('范围⊆诊断 scope gate did not pass');
+    }
+    // ── 3. POST-LOOP GATE: observed-GREEN ───────────────────────────────────────
+    // static / 范围⊆诊断 / budget / valid-prediction were all enforced inside the
+    // bounded repair loop above (a content-driven failure was repairable there).
+    // observed-GREEN runs ONCE here and is NOT repairable: it reads the PRE-edit
+    // MAIN arm's objective.json, which the edit cannot influence — re-prompting it
+    // would be a category error.
+    const objective = await readMainArmObjective(repoRoot, episodeId);
+    if (!objective) {
+        throw new CanonicalProposerOutputInvalid('observed-GREEN gate: main-arm/objective.json is missing or unreadable — cannot confirm a verified green run');
+    }
+    const evidence = isEvidenceComplete(objective);
+    if (!evidence.ok) {
+        throw new CanonicalProposerOutputInvalid(`observed-GREEN gate failed: ${evidence.reason}`);
+    }
+    // ── 4. Write back the next policy version. NO candidate dir / sidecar / verdict. ─
+    const ledgerEntry = await advancePolicyVersion({
+        repoRoot,
+        targetId,
+        episodeId,
+        edits: edit.edits,
+        prediction: edit.prediction,
+    });
+    await advanceEpisodeStage({ repoRoot, episodeId, stage: 'evolved' });
+    return { kind: 'evolved', ledgerEntry };
+}
+//# sourceMappingURL=evolving-agent.js.map

package/dist/core/self-evolution/host-harness.d.ts CHANGED Viewed

@@ -1,13 +1,18 @@
 /**
  * Host-aware headless agent runner.
  *
- * The self-evolution fallback (canonical proposer + replay runner) shells out to
- * the HOST coding agent's CLI to do real work. Historically both sites hardcoded
- * `claude -p <prompt>`, which is Claude Code-specific: the OpenAI Codex CLI and
- * opencode CLI take different subcommands and pass the prompt differently (Codex
- * reads it from stdin; opencode takes it as a positional arg). Simply swapping
- * the binary to `codex`/`opencode` would break because the ARGS and the
- * prompt-passing mechanism differ per harness.
+ * The self-evolution loop (reward agent, evolving agent, critic agent) shells out
+ * to the HOST coding agent's CLI to do real work. Each harness takes a different
+ * subcommand, so simply swapping the binary to `codex`/`opencode` would break.
+ *
+ * The prompt is ALWAYS streamed over the child's stdin, never placed in argv.
+ * Loop-v2 prompts embed both arms' transcripts, the five change artifacts and the
+ * objective.json sidecars — routinely 100KB+. Passing a payload that large as a
+ * command-line argument overflows the OS argv limit and the spawn dies with
+ * `ENAMETOOLONG` (Windows `CreateProcess` caps the command line at ~32KB) before
+ * the agent ever runs. stdin has no such limit and all three CLIs read a piped
+ * prompt: `codex exec … -`, `claude -p` (bare), and `opencode run` (bare) each
+ * consume stdin as the prompt.
  *
  * This module centralizes:
  *   - resolving which harness the host is running ({@link resolveHostHarness}),
@@ -15,12 +20,6 @@
  *     ({@link buildHeadlessCommand}), and
  *   - a single spawn attempt that collects stdout/stderr and never rejects
  *     ({@link runHeadlessAgent}).
- *
- * Back-compat is load-bearing: when the harness resolves to 'claude' (the
- * default - no CODEX_ or OPENCODE_ env present, as in the unit tests), the built
- * command is byte-identical to the previous behavior - `binary` + `['-p', prompt]`
- * with no stdin - so the existing proposer-agent / replay-runner tests pass
- * unchanged.
  */
 import { spawn as nodeSpawn } from 'node:child_process';
 export type AgentHarness = 'claude' | 'codex' | 'opencode';
@@ -55,8 +54,8 @@ export interface HeadlessCommand {
  * token anywhere (so the caller streams the prompt to stdin instead).
  *
  * Otherwise the command is derived from the harness (default
- * {@link resolveHostHarness}). The 'claude' branch is byte-identical to the
- * previous hardcoded behavior.
+ * {@link resolveHostHarness}). Every harness streams the prompt over stdin
+ * (`useStdin: true`) so argv stays tiny regardless of prompt size.
  */
 export declare function buildHeadlessCommand(prompt: string, opts: {
     cwd: string;