synergyspec-selfevolving 1.4.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. package/README.md +31 -18
  2. package/dist/commands/learn.d.ts +12 -1
  3. package/dist/commands/learn.js +158 -11
  4. package/dist/commands/self-evolution-episode.d.ts +177 -0
  5. package/dist/commands/self-evolution-episode.js +431 -0
  6. package/dist/commands/self-evolution.d.ts +12 -190
  7. package/dist/commands/self-evolution.js +114 -866
  8. package/dist/core/archive.d.ts +0 -1
  9. package/dist/core/archive.js +0 -58
  10. package/dist/core/artifact-graph/instruction-loader.d.ts +2 -4
  11. package/dist/core/artifact-graph/instruction-loader.js +3 -31
  12. package/dist/core/fitness/loss.d.ts +5 -5
  13. package/dist/core/fitness/loss.js +4 -4
  14. package/dist/core/fitness/test-failures.js +10 -2
  15. package/dist/core/project-config.d.ts +19 -0
  16. package/dist/core/project-config.js +96 -0
  17. package/dist/core/self-evolution/candidate-fitness.d.ts +23 -1
  18. package/dist/core/self-evolution/candidate-fitness.js +31 -5
  19. package/dist/core/self-evolution/candidates.d.ts +0 -9
  20. package/dist/core/self-evolution/critic-agent.d.ts +192 -0
  21. package/dist/core/self-evolution/critic-agent.js +568 -0
  22. package/dist/core/self-evolution/edits-contract.d.ts +53 -0
  23. package/dist/core/self-evolution/edits-contract.js +89 -0
  24. package/dist/core/self-evolution/episode-orchestrator.d.ts +234 -0
  25. package/dist/core/self-evolution/episode-orchestrator.js +681 -0
  26. package/dist/core/self-evolution/episode-store.d.ts +266 -0
  27. package/dist/core/self-evolution/episode-store.js +573 -0
  28. package/dist/core/self-evolution/evolution-switches.d.ts +1 -1
  29. package/dist/core/self-evolution/evolution-switches.js +5 -10
  30. package/dist/core/self-evolution/evolving-agent.d.ts +208 -0
  31. package/dist/core/self-evolution/evolving-agent.js +535 -0
  32. package/dist/core/self-evolution/host-harness.d.ts +14 -15
  33. package/dist/core/self-evolution/host-harness.js +48 -23
  34. package/dist/core/self-evolution/index.d.ts +11 -6
  35. package/dist/core/self-evolution/index.js +20 -6
  36. package/dist/core/self-evolution/line-diff.d.ts +60 -0
  37. package/dist/core/self-evolution/line-diff.js +130 -0
  38. package/dist/core/self-evolution/policy/fs-safe.d.ts +19 -0
  39. package/dist/core/self-evolution/policy/fs-safe.js +89 -0
  40. package/dist/core/self-evolution/policy/index.d.ts +13 -0
  41. package/dist/core/self-evolution/policy/index.js +13 -0
  42. package/dist/core/self-evolution/policy/policy-store.d.ts +217 -0
  43. package/dist/core/self-evolution/policy/policy-store.js +774 -0
  44. package/dist/core/self-evolution/policy/prediction-reconcile.d.ts +54 -0
  45. package/dist/core/self-evolution/policy/prediction-reconcile.js +191 -0
  46. package/dist/core/self-evolution/policy/reject-buffer.d.ts +55 -0
  47. package/dist/core/self-evolution/policy/reject-buffer.js +170 -0
  48. package/dist/core/self-evolution/promote.d.ts +1 -1
  49. package/dist/core/self-evolution/promote.js +6 -33
  50. package/dist/core/self-evolution/promotion.js +1 -2
  51. package/dist/core/self-evolution/reward-agent.d.ts +379 -0
  52. package/dist/core/self-evolution/reward-agent.js +940 -0
  53. package/dist/core/self-evolution/reward-aggregator.d.ts +59 -0
  54. package/dist/core/self-evolution/reward-aggregator.js +262 -0
  55. package/dist/core/self-evolution/scope-gate.d.ts +66 -0
  56. package/dist/core/self-evolution/scope-gate.js +107 -0
  57. package/dist/core/self-evolution/success-channel.js +2 -2
  58. package/dist/core/self-evolution/tamper-check.d.ts +24 -0
  59. package/dist/core/self-evolution/tamper-check.js +236 -0
  60. package/dist/core/self-evolution/tool-evolution.js +2 -13
  61. package/dist/core/self-evolution/verdict.d.ts +8 -5
  62. package/dist/core/self-evolution/verdict.js +4 -7
  63. package/dist/core/templates/workflows/gen-tests.js +1 -1
  64. package/dist/core/templates/workflows/learn.d.ts +3 -2
  65. package/dist/core/templates/workflows/learn.js +21 -18
  66. package/dist/core/templates/workflows/self-evolving.d.ts +6 -4
  67. package/dist/core/templates/workflows/self-evolving.js +62 -172
  68. package/dist/core/trajectory/scrub.d.ts +27 -0
  69. package/dist/core/trajectory/scrub.js +79 -0
  70. package/dist/core/trajectory/skeleton.d.ts +27 -1
  71. package/dist/core/trajectory/skeleton.js +152 -8
  72. package/dist/dashboard/data.d.ts +25 -51
  73. package/dist/dashboard/data.js +68 -180
  74. package/dist/dashboard/react-client.js +458 -503
  75. package/dist/dashboard/react-styles.js +3 -3
  76. package/dist/dashboard/server.js +23 -17
  77. package/dist/ui/ascii-patterns.d.ts +7 -15
  78. package/dist/ui/ascii-patterns.js +123 -54
  79. package/dist/ui/welcome-screen.d.ts +0 -14
  80. package/dist/ui/welcome-screen.js +16 -35
  81. package/package.json +1 -1
  82. package/dist/core/self-evolution/ga-selection.d.ts +0 -94
  83. package/dist/core/self-evolution/ga-selection.js +0 -153
  84. package/dist/core/self-evolution/proposer-agent.d.ts +0 -182
  85. package/dist/core/self-evolution/proposer-agent.js +0 -326
  86. package/dist/core/self-evolution/replay-runner.d.ts +0 -100
  87. package/dist/core/self-evolution/replay-runner.js +0 -170
  88. package/dist/core/self-evolution/replay.d.ts +0 -45
  89. package/dist/core/self-evolution/replay.js +0 -56
  90. package/dist/core/self-evolution/template-variants.d.ts +0 -62
  91. package/dist/core/self-evolution/template-variants.js +0 -171
  92. package/dist/core/self-evolution/trajectory.d.ts +0 -65
  93. package/dist/core/self-evolution/trajectory.js +0 -185
@@ -0,0 +1,59 @@
1
+ /**
2
+ * 奖励智能体 REWARD AGENT — statistical layer (② A/A noise floor + SPRT).
3
+ *
4
+ * Wraps {@link scoreOnce} to run k judged duels for ONE episode, cancel the
5
+ * judge's position bias by swapping arm order across samples (③), size the
6
+ * judge's jitter with an A/A pair (the NOISE FLOOR), stop early with a sequential
7
+ * test (SPRT), and route a within-noise advantage to the ⑤ `insufficient-signal`
8
+ * verdict. It then writes the ONE aggregated `diagnosis.json` (means + `stats`)
9
+ * and advances the episode to `scored`.
10
+ *
11
+ * Defaults are zero-cost: `samples` defaults to 1, which runs exactly ONE duel
12
+ * and produces a diagnosis byte-compatible with {@link runRewardAgent} (no extra
13
+ * spawns, no `stats`). The orchestrator calls THIS as its single reward entry so
14
+ * the ④ tamper hint + block mode are honored uniformly; everything beyond a
15
+ * single sample is opt-in via `selfEvolution.reward`.
16
+ */
17
+ import { spawn as nodeSpawn } from 'node:child_process';
18
+ import { type RewardDiagnosis } from './reward-agent.js';
19
+ import { type EpisodeRecord } from './episode-store.js';
20
/**
 * Judge-quality settings for the loop-v2 reward step; mirrors the
 * `selfEvolution.reward` configuration section.
 */
export interface RewardConfig {
    /** How many judged duels to run per episode. Default 1 (one sample, no extra spawns). */
    samples?: number;
    /**
     * Smallest |advantage| that is trusted; anything within it is routed to
     * `insufficient-signal`. When omitted it is measured with an A/A duel.
     */
    noiseFloor?: number;
    /** Alternate the arm order between samples (③). Defaults to true when `samples > 1`. */
    orderSwap?: boolean;
    /** Reserved: the gate-not-blend rule is always enforced inside the judge. */
    requireCorrectnessGate?: boolean;
    /**
     * How a test-tamper hint is handled. Detection belongs to the orchestrator;
     * this setting only gates routing.
     */
    tamperCheck?: 'off' | 'flag' | 'block';
}
33
/** Inputs accepted by {@link runRewardAgentEnsemble}. */
export interface RunRewardAgentEnsembleOptions {
    repoRoot: string;
    episodeId: string;
    spawn?: typeof nodeSpawn;
    binary?: string;
    maxRepairAttempts?: number;
    /** Judge-quality knobs. Omitted ⇒ single sample, flag-only (the historical behavior). */
    reward?: RewardConfig;
    /** ④ Tamper hint computed upstream; it is injected into the judge prompt and the integrity record. */
    integrityHint?: {
        suspected: boolean;
        flags: string[];
    } | null;
}
47
/** What {@link runRewardAgentEnsemble} resolves with. */
export interface RunRewardAgentEnsembleResult {
    /** The aggregated diagnosis record. */
    diagnosis: RewardDiagnosis;
    /** Path of the written diagnosis file. */
    diagnosisPath: string;
    /** The episode record after the stage was advanced. */
    episode: EpisodeRecord;
}
52
/**
 * Score one episode with the reward judge as a k-sample ensemble (② noise
 * floor + SPRT).
 *
 * With `samples <= 1` exactly one duel runs and the diagnosis matches
 * {@link runRewardAgent}, plus any ④ tamper override. The episode dir is the
 * ONLY write path (`diagnosis.json` and the stage moving to `scored`).
 */
export declare function runRewardAgentEnsemble(opts: RunRewardAgentEnsembleOptions): Promise<RunRewardAgentEnsembleResult>;
59
+ //# sourceMappingURL=reward-aggregator.d.ts.map
@@ -0,0 +1,262 @@
1
+ import { loadRewardScoringContext, scoreOnce, deriveSingleSampleVerdict, buildAnchors, } from './reward-agent.js';
2
+ import { writeDiagnosis, advanceEpisodeStage, } from './episode-store.js';
3
// ── Wald SPRT configuration ──────────────────────────────────────────────────
// Every duel is modelled as one Bernoulli trial whose success is "main wins"
// (advantage > 0). The test pits H1 (p1 = 0.8: main is genuinely better)
// against H0 (p0 = 0.5: no real difference) with alpha = beta = 0.1.
const SPRT_P1 = 0.8;
const SPRT_P0 = 0.5;
const SPRT_ALPHA = 0.1;
const SPRT_BETA = 0.1;
// Cumulative log-likelihood ratio at or above this ⇒ accept H1 (main-better).
const SPRT_UPPER = Math.log((1 - SPRT_BETA) / SPRT_ALPHA);
// Cumulative log-likelihood ratio at or below this ⇒ accept H0 (no difference).
const SPRT_LOWER = Math.log(SPRT_BETA / (1 - SPRT_ALPHA));
12
/**
 * Clamp a number into the closed interval [0, 1].
 *
 * @param v Value to clamp.
 * @returns 0 for NaN or negative input, 1 for input above 1, otherwise `v` unchanged.
 */
function clamp01(v) {
    if (Number.isNaN(v)) {
        return 0;
    }
    if (v < 0) {
        return 0;
    }
    if (v > 1) {
        return 1;
    }
    return v;
}
17
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param xs Values to average.
 * @returns The mean, or 0 for an empty list.
 */
function mean(xs) {
    if (xs.length === 0) {
        return 0;
    }
    let total = 0;
    for (const x of xs) {
        total += x;
    }
    return total / xs.length;
}
20
/**
 * Sample standard deviation (divides by n − 1).
 *
 * @param xs Values to measure.
 * @returns The standard deviation, or 0 when fewer than two values are given.
 */
function stdev(xs) {
    const count = xs.length;
    if (count < 2) {
        return 0;
    }
    // Mean of the values (count >= 2 here, so no empty-list case).
    let total = 0;
    for (const x of xs) {
        total += x;
    }
    const centre = total / count;
    // Sum of squared deviations from the mean.
    let squares = 0;
    for (const x of xs) {
        squares += (x - centre) ** 2;
    }
    return Math.sqrt(squares / (count - 1));
}
27
/**
 * Log-likelihood-ratio contribution of one duel to the SPRT, treating the duel
 * as a Bernoulli "main wins" trial.
 *
 * @param mainWon Whether the main arm won this duel.
 * @returns `log(p1 / p0)` for a win, `log((1 − p1) / (1 − p0))` otherwise.
 */
function sprtIncrement(mainWon) {
    if (mainWon) {
        return Math.log(SPRT_P1 / SPRT_P0);
    }
    return Math.log((1 - SPRT_P1) / (1 - SPRT_P0));
}
33
/**
 * Run the reward judge as a k-sample ensemble (② noise floor + SPRT). At
 * `samples <= 1` this is a single duel whose diagnosis matches
 * {@link runRewardAgent} (plus any ④ tamper override). The ONLY write path is
 * the episode dir (`diagnosis.json` + stage → `scored`).
 *
 * @param opts Ensemble options: `repoRoot`, `episodeId`, optional spawn/binary
 *             overrides, the `reward` knobs, and the upstream `integrityHint`.
 * @returns The diagnosis, the path it was written to, and the advanced episode.
 */
export async function runRewardAgentEnsemble(opts) {
    const { repoRoot, episodeId } = opts;
    // A non-finite sample count (NaN / ±Infinity from a malformed config) would
    // either skip the loop entirely — leaving no duel to aggregate — or leave it
    // bounded only by the SPRT, so it falls back to the single-sample default.
    const requestedSamples = Number(opts.reward?.samples ?? 1);
    const samples = Number.isFinite(requestedSamples)
        ? Math.max(1, Math.floor(requestedSamples))
        : 1;
    const orderSwap = opts.reward?.orderSwap ?? samples > 1;
    // Block mode only bites when the upstream hint actually suspects tampering.
    const tamperBlock = (opts.reward?.tamperCheck ?? 'flag') === 'block' && (opts.integrityHint?.suspected ?? false);
    const integrityHint = opts.integrityHint ?? null;
    const ctx = await loadRewardScoringContext(repoRoot, episodeId);
    const basePromptInput = { ...ctx.promptInput, integrityHint };
    // ③ Odd-numbered samples present the baseline first when order swapping is on.
    const armOrderFor = (i) => orderSwap && i % 2 === 1 ? 'baseline-first' : 'main-first';
    const runDuel = (i) => scoreOnce({
        promptInput: { ...basePromptInput, armOrder: armOrderFor(i) },
        baselineSkipped: ctx.baselineSkipped,
        repoRoot,
        spawn: opts.spawn,
        binary: opts.binary,
        maxRepairAttempts: opts.maxRepairAttempts,
    });
    // ── run the duels, with SPRT early-stop when the baseline ran ────────────────
    const duels = [];
    let llr = 0;
    let sequentialDecision = samples <= 1 ? 'single' : 'continue';
    for (let i = 0; i < samples; i++) {
        const duel = await runDuel(i);
        duels.push(duel);
        // Duels without an advantage contribute nothing to the sequential test.
        if (samples > 1 && !ctx.baselineSkipped && duel.parsed.advantage !== null) {
            llr += sprtIncrement(duel.parsed.advantage > 0);
            if (llr >= SPRT_UPPER) {
                sequentialDecision = 'accept';
                break;
            }
            if (llr <= SPRT_LOWER) {
                sequentialDecision = 'reject';
                break;
            }
        }
    }
    const diagnosis = samples <= 1
        ? buildSingleSampleDiagnosis(ctx, duels[0], tamperBlock)
        : await buildAggregatedDiagnosis(opts, ctx, duels, sequentialDecision, tamperBlock);
    const diagnosisPath = await writeDiagnosis({ repoRoot, episodeId, diagnosis });
    const episode = await advanceEpisodeStage({
        repoRoot,
        episodeId,
        stage: 'scored',
        patch: { advantage: diagnosis.advantage },
    });
    return { diagnosis, diagnosisPath, episode };
}
87
/**
 * Build the diagnosis for the single-duel case — the same record
 * {@link runRewardAgent} produces, with the ④ block override applied on top.
 *
 * @param ctx         Scoring context (`episode` and `promptInput.anchors` are read).
 * @param duel        The one duel result (`parsed` + `integrity`).
 * @param tamperBlock When true, the verdict is forced to `insufficient-signal`,
 *                    the advantage is nulled and the block flag is added.
 */
function buildSingleSampleDiagnosis(ctx, duel, tamperBlock) {
    const judged = duel.parsed;
    // Always derive the judge's own verdict first; the block override replaces it.
    const judgedVerdict = deriveSingleSampleVerdict(judged);
    const verdict = tamperBlock ? 'insufficient-signal' : judgedVerdict;
    const advantage = tamperBlock ? null : judged.advantage;
    const integrity = tamperBlock ? withTamperBlockFlag(duel.integrity) : duel.integrity;
    const parts = {
        rewardMain: judged.rewardMain,
        rewardBaseline: judged.rewardBaseline,
        advantage,
        anchors: ctx.promptInput.anchors,
        errors: judged.errors,
        gaps: judged.gaps,
        textualGradient: judged.textualGradient,
        abstained: judged.abstained,
        abstainReason: judged.abstainReason,
        verdict,
        // A null confidence is normalized to "absent".
        confidence: judged.confidence ?? undefined,
        integrity,
    };
    return assembleDiagnosis(ctx.episode, parts);
}
113
/**
 * Aggregate k duels into one diagnosis (means + ② stats + verdict from noise floor).
 *
 * @param opts  Ensemble options; `opts.reward.noiseFloor`, when set, is used
 *              as-is, otherwise the floor is measured with one A/A duel.
 * @param ctx   Scoring context (`episode`, `promptInput`, `baselineSkipped`).
 * @param duels Non-empty list of duel results (`parsed` + `integrity`).
 * @param sequentialDecision SPRT outcome, recorded verbatim in `stats`.
 * @param tamperBlock When true, forces `insufficient-signal` and a null advantage.
 * @returns The assembled diagnosis record.
 */
async function buildAggregatedDiagnosis(opts, ctx, duels, sequentialDecision, tamperBlock) {
    const rewardMains = duels.map((d) => d.parsed.rewardMain);
    const rewardMain = mean(rewardMains);
    const advantages = duels
        .map((d) => d.parsed.advantage)
        .filter((a) => a !== null);
    // Representative duel: when the baseline ran, the one whose advantage is
    // closest to the mean; otherwise the one whose rewardMain is closest to the
    // mean. Its gaps/errors/gradient carry the diagnostic content.
    const advantageMean = advantages.length > 0 ? mean(advantages) : null;
    const target = advantageMean ?? rewardMain;
    // A duel without an advantage is pushed infinitely far away so it is never
    // preferred over one that has an advantage.
    const repValue = (d) => advantageMean !== null ? (d.parsed.advantage ?? Number.POSITIVE_INFINITY) : d.parsed.rewardMain;
    const rep = duels.reduce((best, d) => Math.abs(repValue(d) - target) < Math.abs(repValue(best) - target) ? d : best);
    // Baseline mean over the duels that actually reported a baseline score.
    // Counting a missing score as 0 would drag the mean down and make it
    // inconsistent with the advantage mean, which also skips missing values.
    const baselineScores = duels
        .map((d) => d.parsed.rewardBaseline)
        .filter((score) => score !== null && score !== undefined);
    const rewardBaseline = ctx.baselineSkipped || baselineScores.length === 0
        ? null
        : mean(baselineScores);
    // ② Noise floor: the configured value, else measured from ONE A/A duel
    // (main-vs-main) whose advantage should be ~0; its magnitude is the jitter.
    let noiseFloor = opts.reward?.noiseFloor ?? null;
    if (noiseFloor === null && !ctx.baselineSkipped) {
        noiseFloor = await measureNoiseFloor(opts, ctx);
    }
    const advantageStdev = advantages.length >= 2 ? stdev(advantages) : null;
    // ⑤ Verdict from the aggregated advantage vs the noise floor.
    let verdict;
    let advantage = advantageMean;
    if (advantageMean === null) {
        verdict = undefined; // no duel produced an advantage (e.g. baseline skipped)
    }
    else if (noiseFloor !== null && Math.abs(advantageMean) <= noiseFloor) {
        verdict = 'insufficient-signal';
        advantage = null;
    }
    else if (advantageMean > 1e-9) {
        verdict = 'main-better';
    }
    else if (advantageMean < -1e-9) {
        verdict = 'baseline-better';
    }
    else {
        verdict = 'tie';
    }
    // Confidence: 0 when insufficient-signal; else the signal's share of (signal + jitter).
    const confidence = advantageMean === null
        ? undefined
        : verdict === 'insufficient-signal'
            ? 0
            : clamp01(Math.abs(advantageMean) / (Math.abs(advantageMean) + (advantageStdev ?? 0) + 1e-9));
    // Merge integrity across duels: the tamper and divergence fields come from
    // the representative duel; flags are the de-duplicated union over all duels.
    const flags = Array.from(new Set(duels.flatMap((d) => d.integrity.flags)));
    let integrity = {
        testTamperSuspected: rep.integrity.testTamperSuspected,
        judgeVerifierDivergence: rep.integrity.judgeVerifierDivergence,
        flags,
    };
    if (tamperBlock) {
        verdict = 'insufficient-signal';
        advantage = null;
        integrity = withTamperBlockFlag(integrity);
    }
    return assembleDiagnosis(ctx.episode, {
        rewardMain,
        rewardBaseline,
        advantage,
        anchors: ctx.promptInput.anchors,
        errors: rep.parsed.errors,
        gaps: rep.parsed.gaps,
        textualGradient: rep.parsed.textualGradient,
        abstained: rep.parsed.abstained,
        abstainReason: rep.parsed.abstainReason,
        verdict,
        confidence,
        integrity,
        stats: {
            samples: duels.length,
            advantageMean,
            advantageStdev,
            noiseFloor,
            sequentialDecision,
        },
    });
}
197
/**
 * Measure the judge's jitter with a single A/A duel: the main arm is scored
 * against itself and the magnitude of the resulting advantage is returned.
 *
 * @param opts Ensemble options (repo root plus spawn/binary/repair settings).
 * @param ctx  Scoring context whose `promptInput` supplies the main arm.
 * @returns `|advantage|`, or null when the duel reports no advantage or fails.
 */
async function measureNoiseFloor(opts, ctx) {
    const { mainArm, policyVersions } = ctx.promptInput;
    // Both sides of the A/A duel are the main arm, so every baseline-facing
    // field is overwritten with its main counterpart.
    const promptInput = {
        ...ctx.promptInput,
        integrityHint: null,
        baselineArm: mainArm,
        policyVersions: {
            main: policyVersions.main,
            baseline: policyVersions.main,
        },
        anchors: buildAnchors(mainArm.objective, mainArm.objective),
        armOrder: 'main-first',
    };
    let aaDuel;
    try {
        aaDuel = await scoreOnce({
            promptInput,
            baselineSkipped: false,
            repoRoot: opts.repoRoot,
            spawn: opts.spawn,
            binary: opts.binary,
            maxRepairAttempts: opts.maxRepairAttempts,
        });
    }
    catch {
        // A/A is a calibration aid, not load-bearing: when it fails the floor
        // stays unmeasured (null ⇒ no insufficient-signal gating from the floor).
        return null;
    }
    const jitter = aaDuel.parsed.advantage;
    if (jitter === null) {
        return null;
    }
    return Math.abs(jitter);
}
229
/**
 * Return a copy of `integrity` whose `flags` contain the tamper-block marker.
 * The flag is appended only when absent; when it is already present the
 * original `flags` array is reused as-is.
 */
function withTamperBlockFlag(integrity) {
    const blockFlag = 'tamper: blocked (tamperCheck=block) — forced insufficient-signal';
    const alreadyFlagged = integrity.flags.indexOf(blockFlag) !== -1;
    const flags = alreadyFlagged ? integrity.flags : integrity.flags.concat(blockFlag);
    return { ...integrity, flags };
}
236
/**
 * Assemble the {@link RewardDiagnosis} record (schema 2) from the episode's
 * identity fields and the already-computed parts. `abstainReason`, `verdict`,
 * `confidence` and `stats` are emitted only when defined. Key order is kept
 * stable because it determines the serialized form.
 */
function assembleDiagnosis(episode, parts) {
    // Yields `{ [key]: value }`, or nothing at all when the value is undefined.
    const optional = (key, value) => (value === undefined ? {} : { [key]: value });
    const policyVersions = {
        main: episode.policyVersionMain,
        baseline: episode.policyVersionBaseline,
    };
    return {
        schemaVersion: 2,
        episodeId: episode.episodeId,
        changeName: episode.changeName,
        targetId: episode.targetId,
        policyVersions,
        rewardMain: parts.rewardMain,
        rewardBaseline: parts.rewardBaseline,
        advantage: parts.advantage,
        anchors: parts.anchors,
        errors: parts.errors,
        gaps: parts.gaps,
        textualGradient: parts.textualGradient,
        abstained: parts.abstained,
        ...optional('abstainReason', parts.abstainReason),
        ...optional('verdict', parts.verdict),
        ...optional('confidence', parts.confidence),
        integrity: parts.integrity,
        ...optional('stats', parts.stats),
    };
}
262
+ //# sourceMappingURL=reward-aggregator.js.map
@@ -0,0 +1,66 @@
1
+ /**
2
+ * 范围⊆诊断 scope⊆diagnosis gate for the 演进智能体 EVOLVING AGENT — loop v2
3
+ * (self-evolution as in-context RL).
4
+ *
5
+ * The 奖励智能体 REWARD AGENT names a set of GAPS, each anchored to a (file,
6
+ * section) the 文本梯度 textual gradient points at. The EVOLVING AGENT's ONE
7
+ * bounded edit must stay INSIDE those named sections — it may not wander off
8
+ * and rewrite an unrelated heading just because the file is editable. This gate
9
+ * is the check: from the line diff, compute each changed range's ENCLOSING
10
+ * section, and PASS iff every (file, section) the edit touches is covered by
11
+ * some diagnosis gap.
12
+ *
13
+ * Section addressing:
14
+ * - `.md` files: the nearest PRECEDING markdown heading of any `#`-level
15
+ * (`# …`, `## …`, …). A change before the first heading has section `''`
16
+ * (the file preamble).
17
+ * - YAML / other files: the nearest preceding TOP-LEVEL key (`key:` at column
18
+ * 0). A change before the first top-level key has section `''`.
19
+ *
20
+ * Coverage:
21
+ * - a gap `{file: '*'}` covers ANY file;
22
+ * - a gap `{section: '*'}` covers the WHOLE file;
23
+ * - otherwise the gap's `file` AND `section` must match exactly.
24
+ *
25
+ * Pure + dependency-free (golden-testable). Re-uses {@link lineDiff} so its
26
+ * notion of "changed lines" is identical to the ≤ L budget check.
27
+ */
28
+ import { type DiffEdit } from './line-diff.js';
29
/** One (file, section) pair that the REWARD AGENT's diagnosis directs the edit at. */
export interface DiagnosisGap {
    /** Repo-relative POSIX path; `'*'` covers any file. */
    file: string;
    /** Section anchor — markdown heading text or YAML top-level key; `'*'` is the wildcard. */
    section: string;
}
36
/** A changed (file, section) that the edit touched without any gap covering it. */
export interface ScopeViolation {
    file: string;
    section: string;
    /** Out-of-scope line spans in the proposed file; 1-based and inclusive. */
    lines: Array<{
        startLine: number;
        endLine: number;
    }>;
}
46
/** Outcome of the scope⊆diagnosis gate. */
export interface ScopeGateResult {
    /** True when there are no violations. */
    pass: boolean;
    /** The uncovered (file, section) spots, if any. */
    violations: ScopeViolation[];
}
50
/** Inputs to {@link checkScopeWithinDiagnosis}. */
export interface CheckScopeWithinDiagnosisInput {
    /** Full-file replacements the EVOLVING AGENT proposes to write. */
    edits: readonly DiffEdit[];
    /** What those same files currently contain on disk (the diff's "before" side). */
    currentFiles: readonly DiffEdit[];
    /** Diagnosis gaps; the edit's scope has to be a subset of these. */
    gaps: readonly DiagnosisGap[];
}
58
/**
 * Apply the 范围⊆诊断 scope⊆diagnosis gate.
 *
 * Each edit is diffed against its current content; every changed range is
 * mapped to its enclosing section, and any (file, section) that no gap covers
 * is reported. The result is `{pass, violations}`. A pure deletion (no
 * inserted range) counts as in-scope by construction — it removes named or
 * unnamed lines but introduces no new out-of-scope section — so it is never
 * flagged.
 */
export declare function checkScopeWithinDiagnosis(input: CheckScopeWithinDiagnosisInput): ScopeGateResult;
66
+ //# sourceMappingURL=scope-gate.d.ts.map
@@ -0,0 +1,107 @@
1
+ /**
2
+ * 范围⊆诊断 scope⊆diagnosis gate for the 演进智能体 EVOLVING AGENT — loop v2
3
+ * (self-evolution as in-context RL).
4
+ *
5
+ * The 奖励智能体 REWARD AGENT names a set of GAPS, each anchored to a (file,
6
+ * section) the 文本梯度 textual gradient points at. The EVOLVING AGENT's ONE
7
+ * bounded edit must stay INSIDE those named sections — it may not wander off
8
+ * and rewrite an unrelated heading just because the file is editable. This gate
9
+ * is the check: from the line diff, compute each changed range's ENCLOSING
10
+ * section, and PASS iff every (file, section) the edit touches is covered by
11
+ * some diagnosis gap.
12
+ *
13
+ * Section addressing:
14
+ * - `.md` files: the nearest PRECEDING markdown heading of any `#`-level
15
+ * (`# …`, `## …`, …). A change before the first heading has section `''`
16
+ * (the file preamble).
17
+ * - YAML / other files: the nearest preceding TOP-LEVEL key (`key:` at column
18
+ * 0). A change before the first top-level key has section `''`.
19
+ *
20
+ * Coverage:
21
+ * - a gap `{file: '*'}` covers ANY file;
22
+ * - a gap `{section: '*'}` covers the WHOLE file;
23
+ * - otherwise the gap's `file` AND `section` must match exactly.
24
+ *
25
+ * Pure + dependency-free (golden-testable). Re-uses {@link lineDiff} so its
26
+ * notion of "changed lines" is identical to the ≤ L budget check.
27
+ */
28
+ import { lineDiff } from './line-diff.js';
29
/** Convert every backslash in a path to a forward slash. */
function toPosix(p) {
    return p.split('\\').join('/');
}
32
/**
 * The enclosing section of 1-based `line` in the proposed file. For `.md`
 * files it is the text of the nearest heading at or before that line;
 * otherwise it is the nearest top-level key (`key:` at column 0) at or before
 * it. Returns `''` when no anchor precedes the line.
 *
 * Lines inside markdown fenced code blocks (``` or ~~~) are not treated as
 * headings, so a `# shell comment` in a code sample cannot masquerade as a
 * section.
 *
 * @param relPath       Repo-relative path; only its `.md` suffix is inspected.
 * @param proposedLines The proposed file content, one entry per line.
 * @param line          1-based line number of the change.
 * @returns The section anchor text, or `''`.
 */
function enclosingSection(relPath, proposedLines, line) {
    const isMarkdown = /\.md$/i.test(relPath);
    // Patterns are created once, outside the scan loop.
    const headingPattern = /^#{1,6}\s+(.+?)\s*$/;
    // Top-level key: an identifier-ish key at column 0 followed by ':'.
    const topLevelKeyPattern = /^([^\s:#][^:]*):(?:\s|$)/;
    // Fence marker: up to three spaces of indent, then 3+ backticks or tildes.
    const fencePattern = /^ {0,3}(`{3,}|~{3,})/;
    let section = '';
    // Marker text of the currently open fence, or null when outside any fence.
    let openFence = null;
    // Scan from the file top down to (and including) the changed line; the last
    // matching anchor at-or-before it is the enclosing section.
    const upto = Math.min(line, proposedLines.length);
    for (let idx = 0; idx < upto; idx++) {
        const text = proposedLines[idx];
        if (!isMarkdown) {
            const keyMatch = topLevelKeyPattern.exec(text);
            if (keyMatch) {
                section = keyMatch[1].trim();
            }
            continue;
        }
        const fence = fencePattern.exec(text);
        if (openFence !== null) {
            // A fence closes on a marker of the same character, at least as long
            // as the opener, with nothing but whitespace after it.
            const closes = fence !== null &&
                fence[1][0] === openFence[0] &&
                fence[1].length >= openFence.length &&
                text.slice(fence[0].length).trim() === '';
            if (closes) {
                openFence = null;
            }
            continue;
        }
        if (fence !== null) {
            openFence = fence[1];
            continue;
        }
        const headingMatch = headingPattern.exec(text);
        if (headingMatch) {
            section = headingMatch[1];
        }
    }
    return section;
}
59
/**
 * Whether at least one gap covers `(file, section)`: the gap's file must be
 * `'*'` or equal `file` once its backslashes are normalized to `/`, and its
 * section must be `'*'` or equal `section`.
 */
function isCovered(file, section, gaps) {
    return gaps.some((gap) => {
        const fileMatches = gap.file === '*' || gap.file.replace(/\\/g, '/') === file;
        if (!fileMatches) {
            return false;
        }
        return gap.section === '*' || gap.section === section;
    });
}
70
/**
 * Run the 范围⊆诊断 scope⊆diagnosis gate. For every edit, diff it against its
 * current content, resolve each changed range's enclosing section, and flag any
 * (file, section) not covered by a gap. Returns `{pass, violations}`; a pure
 * deletion (no inserted range) is in-scope by construction (it removes named or
 * unnamed lines but introduces no new out-of-scope section) and is not flagged.
 *
 * @param input `edits` (proposed full-file contents), `currentFiles` (contents
 *              to diff against; a missing file is treated as empty) and `gaps`.
 * @returns `pass` is true iff `violations` is empty; violations are grouped
 *          per (file, section) with one line span per offending range.
 */
export function checkScopeWithinDiagnosis(input) {
    const currentByPath = new Map(input.currentFiles.map((f) => [toPosix(f.relPath), f.content]));
    // Accumulate violations keyed by file+section so the lines coalesce per spot.
    const violationByKey = new Map();
    for (const edit of input.edits) {
        const rel = toPosix(edit.relPath);
        const current = currentByPath.get(rel) ?? '';
        // One trailing newline is dropped so it does not yield a phantom last line.
        const proposedLines = edit.content.length === 0 ? [] : edit.content.replace(/\n$/, '').split('\n');
        const d = lineDiff(current, edit.content);
        for (const range of d.changedRanges) {
            // NOTE(review): the section is resolved from the range's first line
            // only; a range that runs past a later heading is attributed wholly
            // to the section it starts in — confirm this is intended.
            const section = enclosingSection(rel, proposedLines, range.startLine);
            if (isCovered(rel, section, input.gaps))
                continue;
            // The NUL separator keeps distinct (file, section) pairs from
            // producing the same key: plain concatenation would merge e.g.
            // ('a.md', 'b') with ('a.mdb', '') and report the spans under the
            // wrong file/section.
            const key = `${rel}\u0000${section}`;
            const span = { startLine: range.startLine, endLine: range.endLine };
            const existing = violationByKey.get(key);
            if (existing) {
                existing.lines.push(span);
            }
            else {
                violationByKey.set(key, { file: rel, section, lines: [span] });
            }
        }
    }
    const violations = [...violationByKey.values()];
    return { pass: violations.length === 0, violations };
}
107
+ //# sourceMappingURL=scope-gate.js.map
@@ -229,7 +229,7 @@ export async function mineSuccessSignals(opts) {
229
229
  const walkedPaths = [];
230
230
  const minedKeys = new Map();
231
231
  const protect = (targetId, section) => {
232
- const key = `${targetId}${section}`;
232
+ const key = `${targetId} ${section}`;
233
233
  if (!minedKeys.has(key))
234
234
  minedKeys.set(key, { targetId, section });
235
235
  };
@@ -282,7 +282,7 @@ export async function mineSuccessSignals(opts) {
282
282
  }
283
283
  }
284
284
  // Best-effort persist (health-baseline.ts pattern): a failed side-write must
285
- // never fail the learn/auto-evolve run that triggered the mining.
285
+ // never fail the learn/episode run that triggered the mining.
286
286
  let protectionsWritten = 0;
287
287
  try {
288
288
  const file = protectionsPath(projectRoot);
@@ -0,0 +1,24 @@
1
/** Input to {@link detectTestTamper}. */
export interface DetectTestTamperInput {
    /**
     * Absolute path of the change dir — the directory holding the 5 artifacts,
     * `test-report.md` and any test files.
     */
    changeDirPath: string;
}
5
/** Outcome of the test-tamper detection. */
export interface TamperResult {
    suspected: boolean;
    /**
     * Human-readable reasons; empty when clean. Example:
     * "spec-tests.md declares 36 scenarios but test-report shows 4 collected".
     */
    flags: string[];
}
10
/**
 * Count how many scenarios/tests a `spec-tests.md` declares.
 *
 * The canonical gen-tests format is a "Requirement Traceability Matrix"
 * markdown table with one row per mapped (requirement, test) pair. Data rows
 * are counted; the header, the separator, and any row whose test cell is "—"
 * or blank (a step with no applicable test) are skipped.
 *
 * @returns The number of counted rows, or null when no countable table is found.
 */
export declare function countDeclaredScenarios(specTestsText: string): number | null;
18
/**
 * Check whether a change's tests look tampered with in order to fake a pass.
 *
 * Only files under `input.changeDirPath` are read, and every missing file is
 * tolerated. `suspected` is true iff at least one strong heuristic fired.
 */
export declare function detectTestTamper(input: DetectTestTamperInput): Promise<TamperResult>;
24
+ //# sourceMappingURL=tamper-check.d.ts.map