synergyspec-selfevolving 1.4.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. package/README.md +31 -18
  2. package/dist/commands/learn.d.ts +12 -1
  3. package/dist/commands/learn.js +158 -11
  4. package/dist/commands/self-evolution-episode.d.ts +177 -0
  5. package/dist/commands/self-evolution-episode.js +431 -0
  6. package/dist/commands/self-evolution.d.ts +12 -190
  7. package/dist/commands/self-evolution.js +114 -866
  8. package/dist/core/archive.d.ts +0 -1
  9. package/dist/core/archive.js +0 -58
  10. package/dist/core/artifact-graph/instruction-loader.d.ts +2 -4
  11. package/dist/core/artifact-graph/instruction-loader.js +3 -31
  12. package/dist/core/fitness/loss.d.ts +5 -5
  13. package/dist/core/fitness/loss.js +4 -4
  14. package/dist/core/fitness/test-failures.js +10 -2
  15. package/dist/core/project-config.d.ts +19 -0
  16. package/dist/core/project-config.js +96 -0
  17. package/dist/core/self-evolution/candidate-fitness.d.ts +23 -1
  18. package/dist/core/self-evolution/candidate-fitness.js +31 -5
  19. package/dist/core/self-evolution/candidates.d.ts +0 -9
  20. package/dist/core/self-evolution/critic-agent.d.ts +192 -0
  21. package/dist/core/self-evolution/critic-agent.js +568 -0
  22. package/dist/core/self-evolution/edits-contract.d.ts +53 -0
  23. package/dist/core/self-evolution/edits-contract.js +89 -0
  24. package/dist/core/self-evolution/episode-orchestrator.d.ts +234 -0
  25. package/dist/core/self-evolution/episode-orchestrator.js +681 -0
  26. package/dist/core/self-evolution/episode-store.d.ts +266 -0
  27. package/dist/core/self-evolution/episode-store.js +573 -0
  28. package/dist/core/self-evolution/evolution-switches.d.ts +1 -1
  29. package/dist/core/self-evolution/evolution-switches.js +5 -10
  30. package/dist/core/self-evolution/evolving-agent.d.ts +208 -0
  31. package/dist/core/self-evolution/evolving-agent.js +535 -0
  32. package/dist/core/self-evolution/host-harness.d.ts +14 -15
  33. package/dist/core/self-evolution/host-harness.js +48 -23
  34. package/dist/core/self-evolution/index.d.ts +11 -6
  35. package/dist/core/self-evolution/index.js +20 -6
  36. package/dist/core/self-evolution/line-diff.d.ts +60 -0
  37. package/dist/core/self-evolution/line-diff.js +130 -0
  38. package/dist/core/self-evolution/policy/fs-safe.d.ts +19 -0
  39. package/dist/core/self-evolution/policy/fs-safe.js +89 -0
  40. package/dist/core/self-evolution/policy/index.d.ts +13 -0
  41. package/dist/core/self-evolution/policy/index.js +13 -0
  42. package/dist/core/self-evolution/policy/policy-store.d.ts +217 -0
  43. package/dist/core/self-evolution/policy/policy-store.js +774 -0
  44. package/dist/core/self-evolution/policy/prediction-reconcile.d.ts +54 -0
  45. package/dist/core/self-evolution/policy/prediction-reconcile.js +191 -0
  46. package/dist/core/self-evolution/policy/reject-buffer.d.ts +55 -0
  47. package/dist/core/self-evolution/policy/reject-buffer.js +170 -0
  48. package/dist/core/self-evolution/promote.d.ts +1 -1
  49. package/dist/core/self-evolution/promote.js +6 -33
  50. package/dist/core/self-evolution/promotion.js +1 -2
  51. package/dist/core/self-evolution/reward-agent.d.ts +379 -0
  52. package/dist/core/self-evolution/reward-agent.js +940 -0
  53. package/dist/core/self-evolution/reward-aggregator.d.ts +59 -0
  54. package/dist/core/self-evolution/reward-aggregator.js +262 -0
  55. package/dist/core/self-evolution/scope-gate.d.ts +66 -0
  56. package/dist/core/self-evolution/scope-gate.js +107 -0
  57. package/dist/core/self-evolution/success-channel.js +2 -2
  58. package/dist/core/self-evolution/tamper-check.d.ts +24 -0
  59. package/dist/core/self-evolution/tamper-check.js +236 -0
  60. package/dist/core/self-evolution/tool-evolution.js +2 -13
  61. package/dist/core/self-evolution/verdict.d.ts +8 -5
  62. package/dist/core/self-evolution/verdict.js +4 -7
  63. package/dist/core/templates/workflows/gen-tests.js +1 -1
  64. package/dist/core/templates/workflows/learn.d.ts +3 -2
  65. package/dist/core/templates/workflows/learn.js +21 -18
  66. package/dist/core/templates/workflows/self-evolving.d.ts +6 -4
  67. package/dist/core/templates/workflows/self-evolving.js +62 -172
  68. package/dist/core/trajectory/scrub.d.ts +27 -0
  69. package/dist/core/trajectory/scrub.js +79 -0
  70. package/dist/core/trajectory/skeleton.d.ts +27 -1
  71. package/dist/core/trajectory/skeleton.js +152 -8
  72. package/dist/dashboard/data.d.ts +25 -51
  73. package/dist/dashboard/data.js +68 -180
  74. package/dist/dashboard/react-client.js +458 -503
  75. package/dist/dashboard/react-styles.js +3 -3
  76. package/dist/dashboard/server.js +23 -17
  77. package/dist/ui/ascii-patterns.d.ts +7 -15
  78. package/dist/ui/ascii-patterns.js +123 -54
  79. package/dist/ui/welcome-screen.d.ts +0 -14
  80. package/dist/ui/welcome-screen.js +16 -35
  81. package/package.json +1 -1
  82. package/dist/core/self-evolution/ga-selection.d.ts +0 -94
  83. package/dist/core/self-evolution/ga-selection.js +0 -153
  84. package/dist/core/self-evolution/proposer-agent.d.ts +0 -182
  85. package/dist/core/self-evolution/proposer-agent.js +0 -326
  86. package/dist/core/self-evolution/replay-runner.d.ts +0 -100
  87. package/dist/core/self-evolution/replay-runner.js +0 -170
  88. package/dist/core/self-evolution/replay.d.ts +0 -45
  89. package/dist/core/self-evolution/replay.js +0 -56
  90. package/dist/core/self-evolution/template-variants.d.ts +0 -62
  91. package/dist/core/self-evolution/template-variants.js +0 -171
  92. package/dist/core/self-evolution/trajectory.d.ts +0 -65
  93. package/dist/core/self-evolution/trajectory.js +0 -185
@@ -0,0 +1,535 @@
1
+ import { promises as fs } from 'node:fs';
2
+ import * as path from 'node:path';
3
+ import { runHeadlessAgent } from './host-harness.js';
4
+ import { evaluateToolEvolutionCandidate, } from './tool-evolution.js';
5
+ import { validateCandidateEdits, CanonicalProposerNoOp, CanonicalProposerOutputInvalid, CanonicalProposerInvocationError, renderUnifiedDiff, } from './edits-contract.js';
6
+ import { requireCanonicalTarget } from './canonical-targets.js';
7
+ import { resolveTargetLocalFiles } from './local-targets.js';
8
+ import { isEvidenceComplete } from './promote.js';
9
+ import { renderDoNotPruneBlock, readProtections, listExemplarFiles, } from './success-channel.js';
10
+ import { readRejectBuffer } from './policy/reject-buffer.js';
11
+ import { advancePolicyVersion, recordEvolutionRefused, } from './policy/policy-store.js';
12
+ import { advanceEpisodeStage, episodeDir, readEpisode, } from './episode-store.js';
13
+ import { countChangedLines, } from './line-diff.js';
14
+ import { checkScopeWithinDiagnosis, } from './scope-gate.js';
15
/**
 * Default edit budget L: an edit may change at most this many lines, counting
 * added and removed lines together.
 */
export const DEFAULT_EVOLVING_AGENT_EDIT_BUDGET = 40;

/**
 * Lower bound of the orchestrator's failure-driven step-size schedule.
 *
 * After a rolled-back edit the next episode's budget is halved toward this
 * floor and never drops below it, so a struggling lineage takes smaller, more
 * legible steps instead of another full-size swing (the backtracking
 * line-search / trust-region idea: shrink the step after one that lost
 * ground). The default above remains the ceiling for a healthy lineage. The
 * schedule itself lives in the orchestrator (`scheduledEditBudget`); the
 * evolving agent only receives the already-resolved budget as `editBudget`.
 */
export const MIN_EVOLVING_AGENT_EDIT_BUDGET = 8;

/** Number of most-recent reject-buffer entries surfaced in the prompt. */
const REJECT_BUFFER_PROMPT_LIMIT = 5;
30
+ const WEAKNESS_CLASSES = new Set([
31
+ 'forgetting',
32
+ 'boundary',
33
+ 'rare',
34
+ 'logic',
35
+ 'verbosity',
36
+ 'other',
37
+ ]);
38
+ const GAP_SEVERITIES = new Set(['high', 'medium', 'low']);
39
+ function normalizeDiagnosis(raw) {
40
+ const o = (raw && typeof raw === 'object' ? raw : {});
41
+ const gaps = [];
42
+ if (Array.isArray(o.gaps)) {
43
+ for (const g of o.gaps) {
44
+ const file = g?.file;
45
+ const section = g?.section;
46
+ if (typeof file === 'string' && typeof section === 'string') {
47
+ const gap = { file, section };
48
+ // Carry the OPTIONAL weaknessClass/severity through when present and
49
+ // valid; tolerate absence (old diagnoses have neither).
50
+ const wc = g?.weaknessClass;
51
+ if (typeof wc === 'string' && WEAKNESS_CLASSES.has(wc)) {
52
+ gap.weaknessClass = wc;
53
+ }
54
+ const sev = g?.severity;
55
+ if (typeof sev === 'string' && GAP_SEVERITIES.has(sev)) {
56
+ gap.severity = sev;
57
+ }
58
+ gaps.push(gap);
59
+ }
60
+ }
61
+ }
62
+ const errors = Array.isArray(o.errors)
63
+ ? o.errors.filter((e) => typeof e === 'string')
64
+ : [];
65
+ const advantageRaw = o.advantage;
66
+ const advantage = typeof advantageRaw === 'number' && Number.isFinite(advantageRaw) ? advantageRaw : null;
67
+ return {
68
+ abstained: o.abstained === true,
69
+ gaps,
70
+ errors,
71
+ textualGradient: typeof o.textualGradient === 'string' ? o.textualGradient : '',
72
+ advantage,
73
+ };
74
+ }
75
+ const PRELUDE_HEAD = [
76
+ 'You are the 演进智能体 EVOLVING AGENT (optimizer.step) for a self-evolving',
77
+ 'SynergySpec. The 策略 POLICY is the design template below (the 主智能体 MAIN',
78
+ "AGENT's 「权重」). The 奖励智能体 REWARD AGENT has already scored the last",
79
+ 'episode and produced the DIAGNOSIS below; you NEVER score — you make ONE',
80
+ 'bounded edit that acts on its 文本梯度 textual gradient.',
81
+ ];
82
+ function preludeLines(editBudget) {
83
+ return [
84
+ ...PRELUDE_HEAD,
85
+ '',
86
+ 'Make EXACTLY ONE bounded edit totalling no more than ' +
87
+ `${editBudget} changed lines (added + removed), plus a checkable prediction.`,
88
+ 'Stay STRICTLY inside the sections the DIAGNOSIS names — do not rewrite an',
89
+ 'unrelated heading or key just because the file is editable.',
90
+ '',
91
+ 'If the diagnosis is too weak to name a concrete edit, REFUSE — emit the',
92
+ 'refusal shape instead of inventing a change:',
93
+ '',
94
+ '```json:patch',
95
+ '{"edits": [], "refusal": "<one sentence: why no concrete edit is named>"}',
96
+ '```',
97
+ '',
98
+ 'Otherwise emit EXACTLY ONE fenced block tagged `json:patch` and nothing else:',
99
+ '',
100
+ '```json:patch',
101
+ '{"rationale": "<why this edit and the expected behavioral delta>",',
102
+ ' "prediction": {"metric": "loss" | "passRate" | "healthPenalty",',
103
+ ' "direction": "down" | "up",',
104
+ ' "checkBy": "<one sentence: how a later episode settles this>"},',
105
+ ' "edits": [{"relPath": "<one of the allowed files>", "content": "<FULL new file contents>"}]}',
106
+ '```',
107
+ '',
108
+ 'Rules:',
109
+ '- Only edit files listed under "CANONICAL TARGET" below. Never invent paths.',
110
+ "- Each edit's `content` is the COMPLETE new file, not a patch fragment.",
111
+ '- You NEVER score and you NEVER touch the gate/oracle files.',
112
+ ].join('\n');
113
+ }
114
+ /**
115
+ * Severity rank used to ORDER gaps in the prompt. HIGH gaps render first so the
116
+ * bounded edit is aimed at the most-severe failure mode. See
117
+ * {@link orderGapsForPrompt} for the exact deterministic rule.
118
+ */
119
+ const SEVERITY_RANK = { high: 0, medium: 1, low: 2 };
120
+ /**
121
+ * Order gaps for the prompt so HIGH severity comes first. Deterministic rule:
122
+ * - HIGH (rank 0) before everything;
123
+ * - gaps WITHOUT a severity (rank 1.5) sit AFTER the high ones, interleaved
124
+ * between explicit medium (1) and low (2) by their original order;
125
+ * - the sort is STABLE — gaps of equal effective rank keep their original
126
+ * relative order (`Array.prototype.sort` is stable in modern V8/Node).
127
+ * Pure: returns a new array, never mutates the input.
128
+ */
129
+ function orderGapsForPrompt(gaps) {
130
+ const rank = (g) => g.severity !== undefined ? SEVERITY_RANK[g.severity] : 1.5;
131
+ return [...gaps].sort((a, b) => rank(a) - rank(b));
132
+ }
133
/**
 * Render the optional `[severity · weaknessClass]` annotation for a gap,
 * including one trailing space so it can be prefixed directly to the gap line.
 *
 * Returns the empty string when both fields are absent, so an un-annotated gap
 * renders with no empty brackets.
 *
 * @param g A diagnosis gap.
 * @returns The bracketed tag plus a trailing space, or `''`.
 */
function renderGapTag(g) {
    const labels = [g.severity, g.weaknessClass].filter((label) => label !== undefined);
    if (labels.length === 0) {
        return '';
    }
    return `[${labels.join(' · ')}] `;
}
147
+ /**
148
+ * Assemble the EVOLVING AGENT prompt. Order is stable and sections are
149
+ * omitted-when-empty so prompts on runs with no reject-buffer / no protections
150
+ * stay byte-identical. The editable files are fenced as `<<FILE: relPath>>`.
151
+ */
152
+ export function assembleEvolvingAgentPrompt(input) {
153
+ const parts = [preludeLines(input.editBudget), ''];
154
+ const t = input.target;
155
+ parts.push(`# CANONICAL TARGET: ${t.id} kind=${t.kind} files=${t.files.join(', ')}`, '');
156
+ for (const f of input.currentFiles) {
157
+ parts.push(`<<FILE: ${f.relPath}>>`, f.content, '<<END FILE>>', '');
158
+ }
159
+ const d = input.diagnosis;
160
+ parts.push('# DIAGNOSIS (from the 奖励智能体 REWARD AGENT — it scored, you edit)');
161
+ if (typeof d.advantage === 'number') {
162
+ parts.push(`advantage = reward(主臂) − reward(基线臂): ${d.advantage.toFixed(3)} (negative ⇒ the last edit lost ground)`);
163
+ }
164
+ const errors = (d.errors ?? []).filter((e) => e && e.trim().length > 0);
165
+ if (errors.length > 0) {
166
+ parts.push('errors:');
167
+ for (const e of errors)
168
+ parts.push(`- ${e}`);
169
+ }
170
+ const gaps = orderGapsForPrompt(d.gaps ?? []);
171
+ if (gaps.length > 0) {
172
+ parts.push('gaps (your edit must stay inside these sections):');
173
+ for (const g of gaps) {
174
+ const where = g.section === '*' ? `${g.file} (whole file)` : `${g.file} §"${g.section}"`;
175
+ const tag = renderGapTag(g);
176
+ parts.push(`- ${tag}${where}`);
177
+ }
178
+ }
179
+ const gradient = (d.textualGradient ?? '').trim();
180
+ if (gradient.length > 0) {
181
+ parts.push('', '文本梯度 textual gradient:', gradient);
182
+ }
183
+ const rejects = input.rejectBuffer ?? [];
184
+ if (rejects.length > 0) {
185
+ parts.push('', '# 否决缓冲 REJECT-BUFFER (directions already vetoed; do not repeat them)');
186
+ for (const r of rejects) {
187
+ const adv = typeof r.advantage === 'number' ? ` advantage=${r.advantage.toFixed(3)}` : '';
188
+ parts.push(`- [${r.reason}${adv}] ${r.textualGradientTried.trim() || r.editSummary.rationaleExcerpt}`);
189
+ }
190
+ }
191
+ const calibration = (input.calibrationNote ?? '').trim();
192
+ if (calibration.length > 0) {
193
+ parts.push('', '# 预测校准 PREDICTION CALIBRATION (your earlier predictions, settled by measurement)');
194
+ parts.push('A repeatedly-refuted direction is a weak bet — weight your confidence (and', 'this edit) accordingly. This NEVER blocks you; it is advisory context only.', calibration);
195
+ }
196
+ const doNotPrune = (input.doNotPrune ?? '').trim();
197
+ if (doNotPrune.length > 0) {
198
+ parts.push('', '# DO-NOT-PRUNE (成功保护 — load-bearing in passing runs)');
199
+ parts.push('The sections below are implicated in verified-PASSING runs. They must not be', 'deleted or hollowed out — make your bounded edit elsewhere.', doNotPrune);
200
+ }
201
+ return parts.join('\n');
202
+ }
203
+ const PREDICTION_METRICS = new Set(['loss', 'passRate', 'healthPenalty']);
204
+ /**
205
+ * Parse the model's single `json:patch` block. Accepts EITHER the refusal shape
206
+ * (`{edits: [], refusal: string}`) OR a concrete edit (`{rationale, prediction,
207
+ * edits[]}`). Throws {@link CanonicalProposerOutputInvalid} on a malformed
208
+ * block, the wrong block count, a missing/invalid prediction, or
209
+ * {@link CanonicalProposerNoOp} on empty edits WITHOUT a refusal reason.
210
+ *
211
+ * Edits are NOT yet scope-validated here (the caller runs the static gate over
212
+ * them); this only enforces the SHAPE of the contract.
213
+ */
214
+ export function parseEvolvingAgentResponse(text) {
215
+ const fenceRe = /```json:patch\s*([\s\S]*?)```/g;
216
+ const matches = [];
217
+ let m;
218
+ while ((m = fenceRe.exec(text)) !== null)
219
+ matches.push(m[1]);
220
+ if (matches.length === 0) {
221
+ throw new CanonicalProposerOutputInvalid('no `json:patch` fenced block found in response');
222
+ }
223
+ if (matches.length > 1) {
224
+ throw new CanonicalProposerOutputInvalid(`expected exactly 1 \`json:patch\` block, found ${matches.length}`);
225
+ }
226
+ let parsed;
227
+ try {
228
+ parsed = JSON.parse(matches[0].trim());
229
+ }
230
+ catch (err) {
231
+ throw new CanonicalProposerOutputInvalid(`failed to parse JSON inside patch block: ${err instanceof Error ? err.message : String(err)}`);
232
+ }
233
+ if (!parsed || typeof parsed !== 'object') {
234
+ throw new CanonicalProposerOutputInvalid('patch block must be a JSON object');
235
+ }
236
+ const o = parsed;
237
+ const rawEdits = o.edits;
238
+ if (!Array.isArray(rawEdits)) {
239
+ throw new CanonicalProposerOutputInvalid('patch block must contain an `edits` array');
240
+ }
241
+ // Refusal shape: empty edits + a refusal string.
242
+ const refusal = o.refusal;
243
+ if (rawEdits.length === 0) {
244
+ if (typeof refusal === 'string' && refusal.trim().length > 0) {
245
+ return { kind: 'refusal', reason: refusal.trim() };
246
+ }
247
+ // Empty edits with no refusal reason is a malformed no-op, not a refusal.
248
+ throw new CanonicalProposerNoOp();
249
+ }
250
+ // Concrete-edit shape: validate prediction + edit shapes.
251
+ const prediction = parsePrediction(o.prediction);
252
+ const edits = [];
253
+ for (const e of rawEdits) {
254
+ const relPath = e?.relPath;
255
+ const content = e?.content;
256
+ if (typeof relPath !== 'string' || typeof content !== 'string') {
257
+ throw new CanonicalProposerOutputInvalid('edit must have string relPath and string content');
258
+ }
259
+ edits.push({ relPath: relPath.replace(/\\/g, '/'), content });
260
+ }
261
+ const rationale = typeof o.rationale === 'string' ? o.rationale.trim() : '';
262
+ return { kind: 'edit', rationale, prediction, edits };
263
+ }
264
+ function parsePrediction(raw) {
265
+ if (!raw || typeof raw !== 'object') {
266
+ throw new CanonicalProposerOutputInvalid('a concrete edit requires a `prediction` object {metric, direction, checkBy}');
267
+ }
268
+ const p = raw;
269
+ if (typeof p.metric !== 'string' || !PREDICTION_METRICS.has(p.metric)) {
270
+ throw new CanonicalProposerOutputInvalid("prediction.metric must be 'loss' | 'passRate' | 'healthPenalty'");
271
+ }
272
+ if (p.direction !== 'down' && p.direction !== 'up') {
273
+ throw new CanonicalProposerOutputInvalid("prediction.direction must be 'down' | 'up'");
274
+ }
275
+ if (typeof p.checkBy !== 'string' || p.checkBy.trim().length === 0) {
276
+ throw new CanonicalProposerOutputInvalid('prediction.checkBy must be a non-empty string');
277
+ }
278
+ return {
279
+ metric: p.metric,
280
+ direction: p.direction,
281
+ checkBy: p.checkBy.trim(),
282
+ };
283
+ }
284
/**
 * Read the episode's `diagnosis.json` and structurally normalize it.
 *
 * A missing file (ENOENT) or a file that is not valid JSON yields an
 * abstaining diagnosis, which the caller treats as "refuse to spawn". Any
 * other read error is rethrown.
 *
 * @param repoRoot Repository root.
 * @param episodeId Episode whose diagnosis is read.
 * @returns The normalized diagnosis.
 */
async function readDiagnosis(repoRoot, episodeId) {
    // Fresh object per call so no caller can mutate a shared default.
    const abstain = () => ({ abstained: true, gaps: [], errors: [], textualGradient: '', advantage: null });
    const diagnosisPath = path.join(episodeDir(repoRoot, episodeId), 'diagnosis.json');

    let text;
    try {
        text = await fs.readFile(diagnosisPath, 'utf8');
    }
    catch (err) {
        if (err.code !== 'ENOENT') {
            throw err;
        }
        // No diagnosis on disk: nothing to act on.
        return abstain();
    }

    let payload;
    try {
        payload = JSON.parse(text);
    }
    catch {
        // A corrupt diagnosis names no gap either.
        return abstain();
    }
    return normalizeDiagnosis(payload);
}
308
/**
 * Read the MAIN arm's `main-arm/objective.json` for an episode.
 *
 * @param repoRoot Repository root.
 * @param episodeId Episode whose objective is read.
 * @returns The parsed JSON value, or `null` when the file does not exist
 *   (ENOENT) or is not valid JSON. Any other read error is rethrown.
 */
async function readMainArmObjective(repoRoot, episodeId) {
    const objectivePath = path.join(episodeDir(repoRoot, episodeId), 'main-arm', 'objective.json');

    let text;
    try {
        text = await fs.readFile(objectivePath, 'utf8');
    }
    catch (err) {
        if (err.code !== 'ENOENT') {
            throw err;
        }
        return null;
    }

    try {
        return JSON.parse(text);
    }
    catch {
        // Unparseable content is reported the same way as a missing file.
        return null;
    }
}
327
/**
 * Run the static gate over a parsed edit.
 *
 * Renders one unified diff per edited file against its current content (an
 * edit for a path with no current content diffs against the empty string),
 * then passes the changed paths, the joined diff, a summary and the prediction
 * as evidence to `evaluateToolEvolutionCandidate`.
 *
 * @param edit Parsed concrete edit (`rationale`, `prediction`, `edits`).
 * @param currentFiles Current `{relPath, content}` files of the target.
 * @returns Whatever `evaluateToolEvolutionCandidate` returns.
 */
function buildToolEvolutionReport(edit, currentFiles) {
    // Index current contents by forward-slash path.
    const previousContent = new Map();
    for (const file of currentFiles) {
        previousContent.set(file.relPath.replace(/\\/g, '/'), file.content);
    }

    const changedFiles = [];
    const fileDiffs = [];
    for (const { relPath, content } of edit.edits) {
        changedFiles.push(relPath);
        fileDiffs.push(renderUnifiedDiff(relPath, previousContent.get(relPath) ?? '', content));
    }

    const { metric, direction, checkBy } = edit.prediction;
    return evaluateToolEvolutionCandidate({
        changedFiles,
        diff: fileDiffs.join('\n'),
        // The rationale carries the user-facing "why". A generic line stands in
        // for an empty rationale so a terse answer does not spuriously fail
        // the guard.
        summary: edit.rationale || 'EVOLVING AGENT bounded edit acting on the textual gradient',
        // The prediction doubles as the verification evidence: a falsifiable bet.
        evidence: `prediction: ${metric} ${direction} — ${checkBy} (verification: checked by a later episode's measurement)`,
        requireDiff: true,
    });
}
345
/**
 * Produce the feedback text appended to a repair re-prompt.
 *
 * Currently the rejection reason is passed through unchanged.
 *
 * @param reason Why the previous attempt was rejected.
 * @returns The feedback text (identical to `reason`).
 */
function gateFeedback(reason) {
    const feedbackText = reason;
    return feedbackText;
}
349
/**
 * Run the evolving agent against an already-scored episode.
 *
 * Flow:
 *  0. Code-side refuse-to-spawn: an abstained diagnosis, a diagnosis with no
 *     gaps, or a target with no editable local files returns
 *     `{kind: 'not-spawned'}` without invoking the agent.
 *  1. Assemble the prompt, spawn the headless agent and parse its answer.
 *     Parse failures, target-scope violations, an over-budget edit, a
 *     scope-outside-diagnosis edit and a static-gate failure are all
 *     REPAIRABLE: the agent is re-prompted with the violation appended, up to
 *     `maxRepairAttempts` times.
 *  2. A model refusal records a 'refused' ledger entry, moves the episode to
 *     'evolution-refused' and returns `{kind: 'refused'}`.
 *  3. The observed-GREEN gate runs once after the loop and is not repairable.
 *  4. All green: `advancePolicyVersion` writes the next version and the
 *     episode moves to 'evolved'.
 *
 * @param opts `{ repoRoot, episodeId, targetId, editBudget?,
 *   maxRepairAttempts?, protections?, exemplarPaths?, calibrationNote?,
 *   spawn?, binary? }`. `editBudget` defaults to
 *   `DEFAULT_EVOLVING_AGENT_EDIT_BUDGET` (also used when it is not a number
 *   or is NaN); `maxRepairAttempts` defaults to 2.
 * @returns `{kind: 'not-spawned', reason}`, `{kind: 'refused', reason,
 *   ledgerEntry}` or `{kind: 'evolved', ledgerEntry}`.
 * @throws {CanonicalProposerInvocationError} When the agent exits non-zero or
 *   prints nothing (never repaired).
 * @throws {CanonicalProposerOutputInvalid} When repairs are exhausted, or the
 *   observed-GREEN gate fails.
 */
export async function runEvolvingAgent(opts) {
    const repoRoot = path.resolve(opts.repoRoot);
    const { episodeId, targetId } = opts;
    // Fail closed on a non-numeric or NaN budget: `changed > NaN` is always
    // false, which would silently switch the budget gate off.
    const requestedBudget = opts.editBudget ?? DEFAULT_EVOLVING_AGENT_EDIT_BUDGET;
    const editBudget = typeof requestedBudget === 'number' && !Number.isNaN(requestedBudget)
        ? requestedBudget
        : DEFAULT_EVOLVING_AGENT_EDIT_BUDGET;
    const maxRepairAttempts = Math.max(0, opts.maxRepairAttempts ?? 2);

    // Fail closed: the episode must exist. The record itself is not needed
    // here; the call is made only so a missing episode surfaces as an error.
    await readEpisode(repoRoot, episodeId);

    // ── 0. Code-side refuse-to-spawn ─────────────────────────────────────────
    const diagnosis = await readDiagnosis(repoRoot, episodeId);
    if (diagnosis.abstained) {
        return {
            kind: 'not-spawned',
            reason: '奖励智能体 REWARD AGENT 弃权 abstained — no nameable gap to act on',
        };
    }
    if (diagnosis.gaps.length === 0) {
        return {
            kind: 'not-spawned',
            reason: 'diagnosis names no gaps — nothing for the 演进智能体 EVOLVING AGENT to scope an edit to',
        };
    }

    // Resolve the target's editable local files (the lineage surface).
    const target = requireCanonicalTarget(targetId);
    const resolved = await resolveTargetLocalFiles(targetId, repoRoot);
    const currentFiles = resolved.files.map((f) => ({
        relPath: f.relPath,
        content: f.content,
    }));
    if (currentFiles.length === 0) {
        return {
            kind: 'not-spawned',
            reason: `target ${targetId} resolves to no editable local files in this repo`,
        };
    }
    const allowedFiles = currentFiles.map((f) => f.relPath);
    const rejectBuffer = await readRejectBuffer(repoRoot, targetId, REJECT_BUFFER_PROMPT_LIMIT);

    // Do-not-prune: protections and exemplars are read fresh from disk unless
    // the caller injects them through opts (a hermetic test seam).
    const protections = opts.protections ?? (await readProtections(repoRoot, targetId));
    const exemplarPaths = opts.exemplarPaths ?? (await listExemplarFiles(repoRoot, targetId));
    const doNotPrune = renderDoNotPruneBlock(protections, exemplarPaths);

    const basePrompt = assembleEvolvingAgentPrompt({
        target,
        currentFiles,
        diagnosis,
        editBudget,
        rejectBuffer,
        doNotPrune,
        ...(opts.calibrationNote ? { calibrationNote: opts.calibrationNote } : {}),
    });

    // ── 1. Spawn + parse with bounded repair ─────────────────────────────────
    let feedback = null;
    let parsed = null;
    let scopeResult = null;
    for (let attempt = 0;; attempt++) {
        const prompt = feedback === null
            ? basePrompt
            : `${basePrompt}\n\n# PREVIOUS ATTEMPT WAS REJECTED\n${feedback}\n` +
                'Re-emit EXACTLY ONE ```json:patch fenced block — either a refusal ' +
                '({"edits": [], "refusal": string}) or a single bounded edit ' +
                '({"rationale", "prediction", "edits"}), staying inside the diagnosed ' +
                'sections and within the changed-line budget.';
        const run = await runHeadlessAgent(prompt, {
            cwd: repoRoot,
            spawn: opts.spawn,
            binaryOverride: opts.binary,
        });
        if (run.exitCode !== 0 || run.stdout.length === 0) {
            // An agent crash is an invocation failure, not a repairable answer.
            throw new CanonicalProposerInvocationError(run.stderr);
        }
        try {
            const candidate = parseEvolvingAgentResponse(run.stdout);
            if (candidate.kind === 'refusal') {
                parsed = candidate;
                break;
            }
            // Scope-to-target: a bad path is a repairable failure.
            validateCandidateEdits(candidate.edits, allowedFiles);
            // Changed-line budget (repairable).
            const changed = countChangedLines(candidate.edits, currentFiles);
            if (changed > editBudget) {
                throw new CanonicalProposerOutputInvalid(`edit changes ${changed} lines, over the ${editBudget}-line budget (L) — make a smaller, more targeted edit`);
            }
            // Scope ⊆ diagnosis (gate 3, repairable).
            const scope = checkScopeWithinDiagnosis({
                edits: candidate.edits,
                currentFiles,
                gaps: diagnosis.gaps,
            });
            if (!scope.pass) {
                const where = scope.violations
                    .map((v) => `${v.file} §"${v.section}"`)
                    .join(', ');
                throw new CanonicalProposerOutputInvalid(`edit touches sections outside the diagnosis (范围⊆诊断 violated): ${where} — only edit the diagnosed sections`);
            }
            // Static guard, kept INSIDE the loop so a content-driven failure
            // becomes a bounded re-prompt instead of escaping the loop.
            const toolReport = buildToolEvolutionReport(candidate, currentFiles);
            if (!toolReport.passed) {
                const errs = toolReport.findings
                    .filter((f) => f.severity === 'error')
                    .map((f) => `${f.code}: ${f.message}`);
                throw new CanonicalProposerOutputInvalid(`static gate failed (score ${toolReport.score.toFixed(2)}): ${errs.join('; ') || 'score below threshold'}`);
            }
            parsed = candidate;
            scopeResult = scope;
            break;
        }
        catch (err) {
            // Only contract violations are repairable, and only while attempts
            // remain; everything else propagates.
            if (err instanceof CanonicalProposerOutputInvalid && attempt < maxRepairAttempts) {
                feedback = gateFeedback(err.message);
                continue;
            }
            throw err;
        }
    }

    // ── 2. Model refusal ─────────────────────────────────────────────────────
    if (parsed.kind === 'refusal') {
        const ledgerEntry = await recordEvolutionRefused({
            repoRoot,
            targetId,
            episodeId,
            reason: parsed.reason,
        });
        await advanceEpisodeStage({ repoRoot, episodeId, stage: 'evolution-refused' });
        return { kind: 'refused', reason: parsed.reason, ledgerEntry };
    }
    const edit = parsed;
    // Set alongside the accepted parse; re-checked so an out-of-scope edit can
    // never be written even if the loop above is changed.
    if (!scopeResult || !scopeResult.pass) {
        throw new CanonicalProposerOutputInvalid('范围⊆诊断 scope gate did not pass');
    }

    // ── 3. Post-loop gate: observed-GREEN ────────────────────────────────────
    // Runs once and is not repairable: it reads the pre-edit MAIN arm's
    // objective.json, which the edit cannot influence.
    const objective = await readMainArmObjective(repoRoot, episodeId);
    if (!objective) {
        throw new CanonicalProposerOutputInvalid('observed-GREEN gate: main-arm/objective.json is missing or unreadable — cannot confirm a verified green run');
    }
    const evidence = isEvidenceComplete(objective);
    if (!evidence.ok) {
        throw new CanonicalProposerOutputInvalid(`observed-GREEN gate failed: ${evidence.reason}`);
    }

    // ── 4. Write back the next policy version ────────────────────────────────
    const ledgerEntry = await advancePolicyVersion({
        repoRoot,
        targetId,
        episodeId,
        edits: edit.edits,
        prediction: edit.prediction,
    });
    await advanceEpisodeStage({ repoRoot, episodeId, stage: 'evolved' });
    return { kind: 'evolved', ledgerEntry };
}
535
+ //# sourceMappingURL=evolving-agent.js.map
@@ -1,13 +1,18 @@
1
1
  /**
2
2
  * Host-aware headless agent runner.
3
3
  *
4
- * The self-evolution fallback (canonical proposer + replay runner) shells out to
5
- * the HOST coding agent's CLI to do real work. Historically both sites hardcoded
6
- * `claude -p <prompt>`, which is Claude Code-specific: the OpenAI Codex CLI and
7
- * opencode CLI take different subcommands and pass the prompt differently (Codex
8
- * reads it from stdin; opencode takes it as a positional arg). Simply swapping
9
- * the binary to `codex`/`opencode` would break because the ARGS and the
10
- * prompt-passing mechanism differ per harness.
4
+ * The self-evolution loop (reward agent, evolving agent, critic agent) shells out
5
+ * to the HOST coding agent's CLI to do real work. Each harness takes a different
6
+ * subcommand, so simply swapping the binary to `codex`/`opencode` would break.
7
+ *
8
+ * The prompt is ALWAYS streamed over the child's stdin, never placed in argv.
9
+ * Loop-v2 prompts embed both arms' transcripts, the five change artifacts and the
10
+ * objective.json sidecars, and routinely exceed 100KB. Passing a payload that large as a
11
+ * command-line argument overflows the OS argv limit and the spawn dies with
12
+ * `ENAMETOOLONG` (Windows `CreateProcess` caps the command line at ~32KB) before
13
+ * the agent ever runs. stdin has no such limit and all three CLIs read a piped
14
+ * prompt: `codex exec … -`, `claude -p` (bare), and `opencode run` (bare) each
15
+ * consume stdin as the prompt.
11
16
  *
12
17
  * This module centralizes:
13
18
  * - resolving which harness the host is running ({@link resolveHostHarness}),
@@ -15,12 +20,6 @@
15
20
  * ({@link buildHeadlessCommand}), and
16
21
  * - a single spawn attempt that collects stdout/stderr and never rejects
17
22
  * ({@link runHeadlessAgent}).
18
- *
19
- * Back-compat is load-bearing: when the harness resolves to 'claude' (the
20
- * default - no CODEX_ or OPENCODE_ env present, as in the unit tests), the built
21
- * command is byte-identical to the previous behavior - `binary` + `['-p', prompt]`
22
- * with no stdin - so the existing proposer-agent / replay-runner tests pass
23
- * unchanged.
24
23
  */
25
24
  import { spawn as nodeSpawn } from 'node:child_process';
26
25
  export type AgentHarness = 'claude' | 'codex' | 'opencode';
@@ -55,8 +54,8 @@ export interface HeadlessCommand {
55
54
  * token anywhere (so the caller streams the prompt to stdin instead).
56
55
  *
57
56
  * Otherwise the command is derived from the harness (default
58
- * {@link resolveHostHarness}). The 'claude' branch is byte-identical to the
59
- * previous hardcoded behavior.
57
+ * {@link resolveHostHarness}). Every harness streams the prompt over stdin
58
+ * (`useStdin: true`) so argv stays tiny regardless of prompt size.
60
59
  */
61
60
  export declare function buildHeadlessCommand(prompt: string, opts: {
62
61
  cwd: string;