synergyspec-selfevolving 1.4.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +31 -18
- package/dist/commands/learn.d.ts +12 -1
- package/dist/commands/learn.js +158 -11
- package/dist/commands/self-evolution-episode.d.ts +177 -0
- package/dist/commands/self-evolution-episode.js +431 -0
- package/dist/commands/self-evolution.d.ts +12 -190
- package/dist/commands/self-evolution.js +114 -866
- package/dist/core/archive.d.ts +0 -1
- package/dist/core/archive.js +0 -58
- package/dist/core/artifact-graph/instruction-loader.d.ts +2 -4
- package/dist/core/artifact-graph/instruction-loader.js +3 -31
- package/dist/core/fitness/loss.d.ts +5 -5
- package/dist/core/fitness/loss.js +4 -4
- package/dist/core/fitness/test-failures.js +10 -2
- package/dist/core/project-config.d.ts +19 -0
- package/dist/core/project-config.js +96 -0
- package/dist/core/self-evolution/candidate-fitness.d.ts +23 -1
- package/dist/core/self-evolution/candidate-fitness.js +31 -5
- package/dist/core/self-evolution/candidates.d.ts +0 -9
- package/dist/core/self-evolution/critic-agent.d.ts +192 -0
- package/dist/core/self-evolution/critic-agent.js +568 -0
- package/dist/core/self-evolution/edits-contract.d.ts +53 -0
- package/dist/core/self-evolution/edits-contract.js +89 -0
- package/dist/core/self-evolution/episode-orchestrator.d.ts +234 -0
- package/dist/core/self-evolution/episode-orchestrator.js +681 -0
- package/dist/core/self-evolution/episode-store.d.ts +266 -0
- package/dist/core/self-evolution/episode-store.js +573 -0
- package/dist/core/self-evolution/evolution-switches.d.ts +1 -1
- package/dist/core/self-evolution/evolution-switches.js +5 -10
- package/dist/core/self-evolution/evolving-agent.d.ts +208 -0
- package/dist/core/self-evolution/evolving-agent.js +535 -0
- package/dist/core/self-evolution/host-harness.d.ts +14 -15
- package/dist/core/self-evolution/host-harness.js +48 -23
- package/dist/core/self-evolution/index.d.ts +11 -6
- package/dist/core/self-evolution/index.js +20 -6
- package/dist/core/self-evolution/line-diff.d.ts +60 -0
- package/dist/core/self-evolution/line-diff.js +130 -0
- package/dist/core/self-evolution/policy/fs-safe.d.ts +19 -0
- package/dist/core/self-evolution/policy/fs-safe.js +89 -0
- package/dist/core/self-evolution/policy/index.d.ts +13 -0
- package/dist/core/self-evolution/policy/index.js +13 -0
- package/dist/core/self-evolution/policy/policy-store.d.ts +217 -0
- package/dist/core/self-evolution/policy/policy-store.js +774 -0
- package/dist/core/self-evolution/policy/prediction-reconcile.d.ts +54 -0
- package/dist/core/self-evolution/policy/prediction-reconcile.js +191 -0
- package/dist/core/self-evolution/policy/reject-buffer.d.ts +55 -0
- package/dist/core/self-evolution/policy/reject-buffer.js +170 -0
- package/dist/core/self-evolution/promote.d.ts +1 -1
- package/dist/core/self-evolution/promote.js +6 -33
- package/dist/core/self-evolution/promotion.js +1 -2
- package/dist/core/self-evolution/reward-agent.d.ts +379 -0
- package/dist/core/self-evolution/reward-agent.js +940 -0
- package/dist/core/self-evolution/reward-aggregator.d.ts +59 -0
- package/dist/core/self-evolution/reward-aggregator.js +262 -0
- package/dist/core/self-evolution/scope-gate.d.ts +66 -0
- package/dist/core/self-evolution/scope-gate.js +107 -0
- package/dist/core/self-evolution/success-channel.js +2 -2
- package/dist/core/self-evolution/tamper-check.d.ts +24 -0
- package/dist/core/self-evolution/tamper-check.js +236 -0
- package/dist/core/self-evolution/tool-evolution.js +2 -13
- package/dist/core/self-evolution/verdict.d.ts +8 -5
- package/dist/core/self-evolution/verdict.js +4 -7
- package/dist/core/templates/workflows/gen-tests.js +1 -1
- package/dist/core/templates/workflows/learn.d.ts +3 -2
- package/dist/core/templates/workflows/learn.js +21 -18
- package/dist/core/templates/workflows/self-evolving.d.ts +6 -4
- package/dist/core/templates/workflows/self-evolving.js +62 -172
- package/dist/core/trajectory/scrub.d.ts +27 -0
- package/dist/core/trajectory/scrub.js +79 -0
- package/dist/core/trajectory/skeleton.d.ts +27 -1
- package/dist/core/trajectory/skeleton.js +152 -8
- package/dist/dashboard/data.d.ts +25 -51
- package/dist/dashboard/data.js +68 -180
- package/dist/dashboard/react-client.js +458 -503
- package/dist/dashboard/react-styles.js +3 -3
- package/dist/dashboard/server.js +23 -17
- package/dist/ui/ascii-patterns.d.ts +7 -15
- package/dist/ui/ascii-patterns.js +123 -54
- package/dist/ui/welcome-screen.d.ts +0 -14
- package/dist/ui/welcome-screen.js +16 -35
- package/package.json +1 -1
- package/dist/core/self-evolution/ga-selection.d.ts +0 -94
- package/dist/core/self-evolution/ga-selection.js +0 -153
- package/dist/core/self-evolution/proposer-agent.d.ts +0 -182
- package/dist/core/self-evolution/proposer-agent.js +0 -326
- package/dist/core/self-evolution/replay-runner.d.ts +0 -100
- package/dist/core/self-evolution/replay-runner.js +0 -170
- package/dist/core/self-evolution/replay.d.ts +0 -45
- package/dist/core/self-evolution/replay.js +0 -56
- package/dist/core/self-evolution/template-variants.d.ts +0 -62
- package/dist/core/self-evolution/template-variants.js +0 -171
- package/dist/core/self-evolution/trajectory.d.ts +0 -65
- package/dist/core/self-evolution/trajectory.js +0 -185
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* 奖励智能体 REWARD AGENT — statistical layer (② A/A noise floor + SPRT).
|
|
3
|
+
*
|
|
4
|
+
* Wraps {@link scoreOnce} to run k judged duels for ONE episode, cancel the
|
|
5
|
+
* judge's position bias by swapping arm order across samples (③), size the
|
|
6
|
+
* judge's jitter with an A/A pair (the NOISE FLOOR), stop early with a sequential
|
|
7
|
+
* test (SPRT), and route a within-noise advantage to the ⑤ `insufficient-signal`
|
|
8
|
+
* verdict. It then writes the ONE aggregated `diagnosis.json` (means + `stats`)
|
|
9
|
+
* and advances the episode to `scored`.
|
|
10
|
+
*
|
|
11
|
+
* Defaults are zero-cost: `samples` defaults to 1, which runs exactly ONE duel
|
|
12
|
+
* and produces a diagnosis byte-compatible with {@link runRewardAgent} (no extra
|
|
13
|
+
* spawns, no `stats`). The orchestrator calls THIS as its single reward entry so
|
|
14
|
+
* the ④ tamper hint + block mode are honored uniformly; everything beyond a
|
|
15
|
+
* single sample is opt-in via `selfEvolution.reward`.
|
|
16
|
+
*/
|
|
17
|
+
import { spawn as nodeSpawn } from 'node:child_process';
|
|
18
|
+
import { type RewardDiagnosis } from './reward-agent.js';
|
|
19
|
+
import { type EpisodeRecord } from './episode-store.js';
|
|
20
|
+
/**
 * Judge-quality settings for the loop-v2 reward step. The shape mirrors the
 * `selfEvolution.reward` configuration block; every field is optional.
 */
export interface RewardConfig {
    /** Number of judged duels per episode. Defaults to 1 (one sample, no extra spawns). */
    samples?: number;
    /**
     * Smallest |advantage| that is trusted; a value within it is reported as
     * insufficient-signal. Measured with an A/A duel when omitted.
     */
    noiseFloor?: number;
    /** Alternate the arm order from sample to sample (③). Defaults to true when `samples > 1`. */
    orderSwap?: boolean;
    /** Reserved: the gate-not-blend rule is always enforced inside the judge. */
    requireCorrectnessGate?: boolean;
    /** Test-tamper handling. Detection belongs to the orchestrator; this only gates routing. */
    tamperCheck?: 'block' | 'flag' | 'off';
}
|
|
33
|
+
/** Options accepted by {@link runRewardAgentEnsemble}. */
export interface RunRewardAgentEnsembleOptions {
    repoRoot: string;
    episodeId: string;
    spawn?: typeof nodeSpawn;
    binary?: string;
    maxRepairAttempts?: number;
    /** Judge-quality knobs. When omitted: a single sample, flag-only (the historical behavior). */
    reward?: RewardConfig;
    /** ④ Tamper hint computed upstream; it is injected into the judge prompt and the integrity record. */
    integrityHint?: { suspected: boolean; flags: string[] } | null;
}
|
|
47
|
+
/** What {@link runRewardAgentEnsemble} resolves with. */
export interface RunRewardAgentEnsembleResult {
    /** The single aggregated diagnosis produced for the episode. */
    diagnosis: RewardDiagnosis;
    /** Location of the written `diagnosis.json`. */
    diagnosisPath: string;
    /** The episode record after it was advanced to the `scored` stage. */
    episode: EpisodeRecord;
}
|
|
52
|
+
/**
 * Scores one episode with the reward judge run as a k-sample ensemble
 * (② noise floor + SPRT).
 *
 * When `samples <= 1` exactly one duel is run and the diagnosis matches
 * {@link runRewardAgent}, plus any ④ tamper override. The only location
 * written to is the episode directory: `diagnosis.json` is stored there and
 * the stage moves to `scored`.
 */
export declare function runRewardAgentEnsemble(opts: RunRewardAgentEnsembleOptions): Promise<RunRewardAgentEnsembleResult>;
|
|
59
|
+
//# sourceMappingURL=reward-aggregator.d.ts.map
|
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
import { loadRewardScoringContext, scoreOnce, deriveSingleSampleVerdict, buildAnchors, } from './reward-agent.js';
|
|
2
|
+
import { writeDiagnosis, advanceEpisodeStage, } from './episode-store.js';
|
|
3
|
+
// ── Wald SPRT configuration ──────────────────────────────────────────────────
// Every duel is modelled as one Bernoulli trial "main wins" (advantage > 0):
//   H0: p = SPRT_P0 = 0.5 — no real difference between the arms
//   H1: p = SPRT_P1 = 0.8 — main is genuinely better
// Both error rates are 0.1 (alpha = beta).
const SPRT_P0 = 0.5;
const SPRT_P1 = 0.8;
const SPRT_BETA = 0.1;
const SPRT_ALPHA = 0.1;
// Decision thresholds applied to the cumulative log-likelihood ratio.
const SPRT_LOWER = Math.log(SPRT_BETA / (1 - SPRT_ALPHA)); // at or below: accept H0 (no difference)
const SPRT_UPPER = Math.log((1 - SPRT_BETA) / SPRT_ALPHA); // at or above: accept H1 (main-better)
|
|
12
|
+
/**
 * Restrict a number to the closed interval [0, 1].
 *
 * @param v value to clamp
 * @returns 0 for NaN or anything below 0, 1 for anything above 1, otherwise `v` itself
 */
function clamp01(v) {
    if (Number.isNaN(v)) {
        return 0;
    }
    if (v < 0) {
        return 0;
    }
    if (v > 1) {
        return 1;
    }
    return v;
}
|
|
17
|
+
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param xs numbers to average
 * @returns the average, or 0 for an empty list
 */
function mean(xs) {
    if (xs.length === 0) {
        return 0;
    }
    let total = 0;
    xs.forEach((x) => {
        total = total + x;
    });
    return total / xs.length;
}
|
|
20
|
+
/**
 * Sample standard deviation (divides by n - 1).
 *
 * @param xs numbers to measure
 * @returns the standard deviation, or 0 when fewer than two values are given
 */
function stdev(xs) {
    const count = xs.length;
    if (count < 2) {
        return 0;
    }
    const center = mean(xs);
    let squaredDeviations = 0;
    xs.forEach((x) => {
        squaredDeviations = squaredDeviations + (x - center) ** 2;
    });
    return Math.sqrt(squaredDeviations / (count - 1));
}
|
|
27
|
+
/**
 * Log-likelihood-ratio contribution of one duel to the SPRT, treating the duel
 * as a Bernoulli "main wins" trial.
 *
 * @param mainWon whether the main arm won the duel
 * @returns log(p1/p0) for a win, log((1-p1)/(1-p0)) otherwise
 */
function sprtIncrement(mainWon) {
    if (mainWon) {
        return Math.log(SPRT_P1 / SPRT_P0);
    }
    return Math.log((1 - SPRT_P1) / (1 - SPRT_P0));
}
|
|
33
|
+
/**
 * Run the reward judge as a k-sample ensemble (② noise floor + SPRT) for one
 * episode, write the resulting diagnosis, and advance the episode to `scored`.
 *
 * With an effective sample count of 1 exactly one duel is run and the
 * diagnosis comes from `buildSingleSampleDiagnosis` (it matches
 * {@link runRewardAgent}, plus any ④ tamper override). With more samples the
 * duels are aggregated by `buildAggregatedDiagnosis`. The ONLY write path is
 * the episode dir (`diagnosis.json` + stage → `scored`).
 *
 * @param opts repo root, episode id, optional spawn/binary/repair overrides,
 *   the `reward` judge-quality knobs and the upstream `integrityHint`
 * @returns the diagnosis, the path it was written to, and the advanced episode record
 */
export async function runRewardAgentEnsemble(opts) {
    const { repoRoot, episodeId } = opts;
    // A NaN or infinite `samples` (e.g. from a malformed config) must not reach
    // the loop: NaN would run zero duels and then crash while aggregating, and
    // Infinity could loop without bound. Both fall back to a single duel.
    const flooredSamples = Math.floor(opts.reward?.samples ?? 1);
    const samples = Number.isFinite(flooredSamples) ? Math.max(1, flooredSamples) : 1;
    const orderSwap = opts.reward?.orderSwap ?? samples > 1;
    // Block mode only takes effect when the upstream hint actually suspects tampering.
    const tamperBlock = (opts.reward?.tamperCheck ?? 'flag') === 'block' && (opts.integrityHint?.suspected ?? false);
    const integrityHint = opts.integrityHint ?? null;
    const ctx = await loadRewardScoringContext(repoRoot, episodeId);
    const basePromptInput = { ...ctx.promptInput, integrityHint };
    // ③ With order swapping on, odd-indexed duels present the baseline first.
    const armOrderFor = (i) => orderSwap && i % 2 === 1 ? 'baseline-first' : 'main-first';
    const runDuel = (i) => scoreOnce({
        promptInput: { ...basePromptInput, armOrder: armOrderFor(i) },
        baselineSkipped: ctx.baselineSkipped,
        repoRoot,
        spawn: opts.spawn,
        binary: opts.binary,
        maxRepairAttempts: opts.maxRepairAttempts,
    });
    // ── run the duels, with SPRT early-stop when the baseline ran ────────────────
    const duels = [];
    let llr = 0;
    // Stays 'continue' when all samples ran without crossing either threshold.
    let sequentialDecision = samples <= 1 ? 'single' : 'continue';
    for (let i = 0; i < samples; i++) {
        const duel = await runDuel(i);
        duels.push(duel);
        // Only duels that produced an advantage contribute evidence to the SPRT.
        if (samples > 1 && !ctx.baselineSkipped && duel.parsed.advantage !== null) {
            llr += sprtIncrement(duel.parsed.advantage > 0);
            if (llr >= SPRT_UPPER) {
                sequentialDecision = 'accept';
                break;
            }
            if (llr <= SPRT_LOWER) {
                sequentialDecision = 'reject';
                break;
            }
        }
    }
    const diagnosis = samples <= 1
        ? buildSingleSampleDiagnosis(ctx, duels[0], tamperBlock)
        : await buildAggregatedDiagnosis(opts, ctx, duels, sequentialDecision, tamperBlock);
    const diagnosisPath = await writeDiagnosis({ repoRoot, episodeId, diagnosis });
    const episode = await advanceEpisodeStage({
        repoRoot,
        episodeId,
        stage: 'scored',
        patch: { advantage: diagnosis.advantage },
    });
    return { diagnosis, diagnosisPath, episode };
}
|
|
87
|
+
/**
 * Diagnosis for the one-duel case: the same content as {@link runRewardAgent}
 * produces, with the ④ block override applied on top.
 *
 * When `tamperBlock` is set the verdict becomes `insufficient-signal`, the
 * advantage is nulled and the block flag is added to the integrity record.
 *
 * @param ctx scoring context (supplies the episode and the anchors)
 * @param duel the single judged duel (`parsed` + `integrity`)
 * @param tamperBlock whether the tamper block override applies
 */
function buildSingleSampleDiagnosis(ctx, duel, tamperBlock) {
    const judged = duel.parsed;
    const derivedVerdict = deriveSingleSampleVerdict(judged);
    return assembleDiagnosis(ctx.episode, {
        rewardMain: judged.rewardMain,
        rewardBaseline: judged.rewardBaseline,
        advantage: tamperBlock ? null : judged.advantage,
        anchors: ctx.promptInput.anchors,
        errors: judged.errors,
        gaps: judged.gaps,
        textualGradient: judged.textualGradient,
        abstained: judged.abstained,
        abstainReason: judged.abstainReason,
        verdict: tamperBlock ? 'insufficient-signal' : derivedVerdict,
        confidence: judged.confidence ?? undefined,
        integrity: tamperBlock ? withTamperBlockFlag(duel.integrity) : duel.integrity,
    });
}
|
|
113
|
+
/**
 * Aggregate k duels into one diagnosis: mean rewards, the ② `stats` block and
 * a verdict derived from the mean advantage versus the noise floor.
 *
 * The diagnostic content (errors, gaps, textual gradient, abstain fields) is
 * taken from one representative duel rather than merged.
 *
 * @param opts ensemble options (reads `reward.noiseFloor`; passed on to the A/A measurement)
 * @param ctx scoring context (episode, anchors, `baselineSkipped`)
 * @param duels the judged duels; must contain at least one entry
 * @param sequentialDecision the SPRT outcome recorded in `stats`
 * @param tamperBlock whether the ④ tamper block override applies
 * @throws {TypeError} when `duels` is empty
 */
async function buildAggregatedDiagnosis(opts, ctx, duels, sequentialDecision, tamperBlock) {
    if (duels.length === 0) {
        // Fail with a clear message instead of the opaque "reduce of empty array" error.
        throw new TypeError('buildAggregatedDiagnosis requires at least one duel');
    }
    const rewardMains = duels.map((d) => d.parsed.rewardMain);
    const rewardMain = mean(rewardMains);
    const advantages = duels
        .map((d) => d.parsed.advantage)
        .filter((a) => a !== null);
    // Representative duel: when the baseline ran, the one whose advantage is
    // closest to the mean; otherwise the one whose rewardMain is closest to the
    // mean. Its gaps/errors/gradient carry the diagnostic content.
    const advantageMean = advantages.length > 0 ? mean(advantages) : null;
    const target = advantageMean ?? rewardMain;
    // A duel without an advantage is pushed infinitely far away so it is never
    // preferred over one that has an advantage.
    const repValue = (d) => advantageMean !== null ? (d.parsed.advantage ?? Number.POSITIVE_INFINITY) : d.parsed.rewardMain;
    const rep = duels.reduce((best, d) => Math.abs(repValue(d) - target) < Math.abs(repValue(best) - target) ? d : best);
    // Average only the baseline scores that exist. Counting a missing score as
    // 0 would drag the mean down; with no scores at all the value is null.
    const baselineScores = ctx.baselineSkipped
        ? []
        : duels.map((d) => d.parsed.rewardBaseline).filter((b) => b !== null && b !== undefined);
    const rewardBaseline = baselineScores.length > 0 ? mean(baselineScores) : null;
    // ② Noise floor: the configured value, else measured from ONE A/A duel
    // (main-vs-main) whose advantage should be ~0; its magnitude is the jitter.
    let noiseFloor = opts.reward?.noiseFloor ?? null;
    if (noiseFloor === null && !ctx.baselineSkipped) {
        noiseFloor = await measureNoiseFloor(opts, ctx);
    }
    const advantageStdev = advantages.length >= 2 ? stdev(advantages) : null;
    // ⑤ Verdict from the aggregated advantage vs the noise floor.
    let verdict;
    let advantage = advantageMean;
    if (advantageMean === null) {
        verdict = undefined; // no duel produced an advantage (e.g. baseline skipped)
    }
    else if (noiseFloor !== null && Math.abs(advantageMean) <= noiseFloor) {
        verdict = 'insufficient-signal';
        advantage = null;
    }
    else if (advantageMean > 1e-9) {
        verdict = 'main-better';
    }
    else if (advantageMean < -1e-9) {
        verdict = 'baseline-better';
    }
    else {
        verdict = 'tie';
    }
    // Merge integrity across duels: tamper/divergence fields from the rep;
    // flags = de-duplicated union over all duels.
    const flags = Array.from(new Set(duels.flatMap((d) => d.integrity.flags)));
    let integrity = {
        testTamperSuspected: rep.integrity.testTamperSuspected,
        judgeVerifierDivergence: rep.integrity.judgeVerifierDivergence,
        flags,
    };
    if (tamperBlock) {
        verdict = 'insufficient-signal';
        advantage = null;
        integrity = withTamperBlockFlag(integrity);
    }
    // Confidence is derived AFTER the tamper override so a forced
    // insufficient-signal verdict also reports 0. Otherwise it is the signal's
    // share of (signal + jitter).
    const confidence = advantageMean === null
        ? undefined
        : verdict === 'insufficient-signal'
            ? 0
            : clamp01(Math.abs(advantageMean) / (Math.abs(advantageMean) + (advantageStdev ?? 0) + 1e-9));
    return assembleDiagnosis(ctx.episode, {
        rewardMain,
        rewardBaseline,
        advantage,
        anchors: ctx.promptInput.anchors,
        errors: rep.parsed.errors,
        gaps: rep.parsed.gaps,
        textualGradient: rep.parsed.textualGradient,
        abstained: rep.parsed.abstained,
        abstainReason: rep.parsed.abstainReason,
        verdict,
        confidence,
        integrity,
        stats: {
            samples: duels.length,
            advantageMean,
            advantageStdev,
            noiseFloor,
            sequentialDecision,
        },
    });
}
|
|
197
|
+
/**
 * Estimate the judge's jitter with a single A/A duel: the main arm is scored
 * against itself and the magnitude of the reported advantage is returned.
 *
 * @param opts ensemble options (repo root and spawn/binary/repair overrides)
 * @param ctx scoring context whose prompt input supplies the main arm
 * @returns |advantage| of the A/A duel, or null when the duel reports no
 *   advantage or scoring fails
 */
async function measureNoiseFloor(opts, ctx) {
    const { promptInput } = ctx;
    const { mainArm } = promptInput;
    const selfAnchors = buildAnchors(mainArm.objective, mainArm.objective);
    // Same prompt as a real duel, except both arms (and both policy versions)
    // are the main one and no tamper hint is passed.
    const aaPromptInput = {
        ...promptInput,
        integrityHint: null,
        baselineArm: mainArm,
        policyVersions: {
            main: promptInput.policyVersions.main,
            baseline: promptInput.policyVersions.main,
        },
        anchors: selfAnchors,
        armOrder: 'main-first',
    };
    const { repoRoot, spawn, binary, maxRepairAttempts } = opts;
    try {
        const aa = await scoreOnce({
            promptInput: aaPromptInput,
            baselineSkipped: false,
            repoRoot,
            spawn,
            binary,
            maxRepairAttempts,
        });
        const jitter = aa.parsed.advantage;
        if (jitter === null) {
            return null;
        }
        return Math.abs(jitter);
    }
    catch {
        // Deliberately best-effort: A/A is only a calibration aid, so a failure
        // leaves the floor unmeasured (null ⇒ the floor gates nothing).
        return null;
    }
}
|
|
229
|
+
/**
 * Return a copy of `integrity` whose `flags` include the tamper-block marker.
 * The input is not mutated; when the marker is already present the existing
 * `flags` array is reused as-is.
 *
 * @param integrity integrity record with a `flags` string array
 */
function withTamperBlockFlag(integrity) {
    const blockMarker = 'tamper: blocked (tamperCheck=block) — forced insufficient-signal';
    const alreadyMarked = integrity.flags.includes(blockMarker);
    const flags = alreadyMarked ? integrity.flags : integrity.flags.concat(blockMarker);
    return { ...integrity, flags };
}
|
|
236
|
+
/**
 * Compose the {@link RewardDiagnosis} record (schema 2) from an episode and the
 * already-computed parts.
 *
 * `abstainReason`, `verdict`, `confidence` and `stats` are left out of the
 * record entirely when undefined. Key order is fixed so the serialized output
 * is stable.
 *
 * @param episode supplies the ids and the main/baseline policy versions
 * @param parts rewards, advantage, anchors, diagnostic content and integrity
 */
function assembleDiagnosis(episode, parts) {
    // Yields `{ [key]: value }`, or nothing at all when the value is undefined.
    const whenDefined = (key, value) => (value === undefined ? {} : { [key]: value });
    const { episodeId, changeName, targetId } = episode;
    const { rewardMain, rewardBaseline, advantage, anchors, errors, gaps, textualGradient, abstained } = parts;
    return {
        schemaVersion: 2,
        episodeId,
        changeName,
        targetId,
        policyVersions: {
            main: episode.policyVersionMain,
            baseline: episode.policyVersionBaseline,
        },
        rewardMain,
        rewardBaseline,
        advantage,
        anchors,
        errors,
        gaps,
        textualGradient,
        abstained,
        ...whenDefined('abstainReason', parts.abstainReason),
        ...whenDefined('verdict', parts.verdict),
        ...whenDefined('confidence', parts.confidence),
        integrity: parts.integrity,
        ...whenDefined('stats', parts.stats),
    };
}
|
|
262
|
+
//# sourceMappingURL=reward-aggregator.js.map
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* 范围⊆诊断 scope⊆diagnosis gate for the 演进智能体 EVOLVING AGENT — loop v2
|
|
3
|
+
* (self-evolution as in-context RL).
|
|
4
|
+
*
|
|
5
|
+
* The 奖励智能体 REWARD AGENT names a set of GAPS, each anchored to a (file,
|
|
6
|
+
* section) the 文本梯度 textual gradient points at. The EVOLVING AGENT's ONE
|
|
7
|
+
* bounded edit must stay INSIDE those named sections — it may not wander off
|
|
8
|
+
* and rewrite an unrelated heading just because the file is editable. This gate
|
|
9
|
+
* is the check: from the line diff, compute each changed range's ENCLOSING
|
|
10
|
+
* section, and PASS iff every (file, section) the edit touches is covered by
|
|
11
|
+
* some diagnosis gap.
|
|
12
|
+
*
|
|
13
|
+
* Section addressing:
|
|
14
|
+
* - `.md` files: the nearest PRECEDING markdown heading of any `#`-level
|
|
15
|
+
* (`# …`, `## …`, …). A change before the first heading has section `''`
|
|
16
|
+
* (the file preamble).
|
|
17
|
+
* - YAML / other files: the nearest preceding TOP-LEVEL key (`key:` at column
|
|
18
|
+
* 0). A change before the first top-level key has section `''`.
|
|
19
|
+
*
|
|
20
|
+
* Coverage:
|
|
21
|
+
* - a gap `{file: '*'}` covers ANY file;
|
|
22
|
+
* - a gap `{section: '*'}` covers the WHOLE file;
|
|
23
|
+
* - otherwise the gap's `file` AND `section` must match exactly.
|
|
24
|
+
*
|
|
25
|
+
* Pure + dependency-free (golden-testable). Re-uses {@link lineDiff} so its
|
|
26
|
+
* notion of "changed lines" is identical to the ≤ L budget check.
|
|
27
|
+
*/
|
|
28
|
+
import { type DiffEdit } from './line-diff.js';
|
|
29
|
+
/**
 * One (file, section) pair that the REWARD AGENT's diagnosis directs the edit
 * at. Either field may be the wildcard `'*'`.
 */
export interface DiagnosisGap {
    /** Repo-relative POSIX path; `'*'` covers any file. */
    file: string;
    /** Section anchor: markdown heading text or YAML top-level key; `'*'` covers the whole file. */
    section: string;
}
|
|
36
|
+
/** A (file, section) that the edit changed although no diagnosis gap covers it. */
export interface ScopeViolation {
    file: string;
    section: string;
    /** Out-of-scope line spans in the proposed file; both bounds are 1-based and inclusive. */
    lines: Array<{
        startLine: number;
        endLine: number;
    }>;
}
|
|
46
|
+
/** Outcome of the scope⊆diagnosis gate. */
export interface ScopeGateResult {
    /** True when the edit produced no violations. */
    pass: boolean;
    /** Every uncovered (file, section) the edit touched; empty on a pass. */
    violations: ScopeViolation[];
}
|
|
50
|
+
/** Input to {@link checkScopeWithinDiagnosis}. */
export interface CheckScopeWithinDiagnosisInput {
    /** Full-file replacements the EVOLVING AGENT proposes to write. */
    edits: readonly DiffEdit[];
    /** On-disk contents of those same files right now; the line diff is taken against them. */
    currentFiles: readonly DiffEdit[];
    /** Diagnosis gaps; the scope of the edit has to be a subset of these. */
    gaps: readonly DiagnosisGap[];
}
|
|
58
|
+
/**
 * The scope⊆diagnosis gate.
 *
 * Each edit is diffed against its current content, the enclosing section of
 * every changed range is resolved, and any (file, section) that no gap covers
 * is reported. The result is `{pass, violations}`.
 *
 * A pure deletion (one with no inserted range) counts as in-scope by
 * construction and is never flagged: it removes lines, named or unnamed, but
 * introduces no new out-of-scope section.
 */
export declare function checkScopeWithinDiagnosis(input: CheckScopeWithinDiagnosisInput): ScopeGateResult;
|
|
66
|
+
//# sourceMappingURL=scope-gate.d.ts.map
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* 范围⊆诊断 scope⊆diagnosis gate for the 演进智能体 EVOLVING AGENT — loop v2
|
|
3
|
+
* (self-evolution as in-context RL).
|
|
4
|
+
*
|
|
5
|
+
* The 奖励智能体 REWARD AGENT names a set of GAPS, each anchored to a (file,
|
|
6
|
+
* section) the 文本梯度 textual gradient points at. The EVOLVING AGENT's ONE
|
|
7
|
+
* bounded edit must stay INSIDE those named sections — it may not wander off
|
|
8
|
+
* and rewrite an unrelated heading just because the file is editable. This gate
|
|
9
|
+
* is the check: from the line diff, compute each changed range's ENCLOSING
|
|
10
|
+
* section, and PASS iff every (file, section) the edit touches is covered by
|
|
11
|
+
* some diagnosis gap.
|
|
12
|
+
*
|
|
13
|
+
* Section addressing:
|
|
14
|
+
* - `.md` files: the nearest PRECEDING markdown heading of any `#`-level
|
|
15
|
+
* (`# …`, `## …`, …). A change before the first heading has section `''`
|
|
16
|
+
* (the file preamble).
|
|
17
|
+
* - YAML / other files: the nearest preceding TOP-LEVEL key (`key:` at column
|
|
18
|
+
* 0). A change before the first top-level key has section `''`.
|
|
19
|
+
*
|
|
20
|
+
* Coverage:
|
|
21
|
+
* - a gap `{file: '*'}` covers ANY file;
|
|
22
|
+
* - a gap `{section: '*'}` covers the WHOLE file;
|
|
23
|
+
* - otherwise the gap's `file` AND `section` must match exactly.
|
|
24
|
+
*
|
|
25
|
+
* Pure + dependency-free (golden-testable). Re-uses {@link lineDiff} so its
|
|
26
|
+
* notion of "changed lines" is identical to the ≤ L budget check.
|
|
27
|
+
*/
|
|
28
|
+
import { lineDiff } from './line-diff.js';
|
|
29
|
+
/**
 * Normalize a path to POSIX separators by turning every backslash into `/`.
 *
 * @param p path that may contain Windows-style separators
 */
function toPosix(p) {
    const segments = p.split('\\');
    return segments.join('/');
}
|
|
32
|
+
/**
 * The enclosing section of 1-based `line` in the proposed content.
 *
 * - `.md` files: the text of the nearest heading (`#` to `######` at column 0)
 *   at or before the line. Lines inside a fenced code block (``` or ~~~) are
 *   never treated as headings, so a `# comment` in a shell snippet does not
 *   open a new section.
 * - any other file: the nearest top-level key (`key:` at column 0) at or
 *   before the line.
 *
 * @param relPath path of the file; only its `.md` suffix (case-insensitive) is inspected
 * @param proposedLines the proposed file content split into lines
 * @param line 1-based line number; values past the end are clamped to the last line
 * @returns the section name, or `''` when no anchor precedes the line
 */
function enclosingSection(relPath, proposedLines, line) {
    const isMarkdown = /\.md$/i.test(relPath);
    const headingPattern = /^#{1,6}\s+(.+?)\s*$/;
    // Fence marker: up to 3 spaces of indent, then 3+ backticks or tildes.
    const fencePattern = /^ {0,3}(`{3,}|~{3,})(.*)$/;
    // Top-level key: an identifier-ish key at column 0 followed by ':'.
    const topLevelKeyPattern = /^([^\s:#][^:]*):(?:\s|$)/;
    let section = '';
    // Marker of the fenced block currently open ('' when outside any fence).
    let openFence = '';
    // Scan from the file top down to (and including) the changed line; the last
    // matching anchor at-or-before it is the enclosing section.
    const upto = Math.min(line, proposedLines.length);
    for (let idx = 0; idx < upto; idx++) {
        const text = proposedLines[idx];
        if (!isMarkdown) {
            const k = topLevelKeyPattern.exec(text);
            if (k)
                section = k[1].trim();
            continue;
        }
        const fence = fencePattern.exec(text);
        if (openFence !== '') {
            // A fence closes only on the same marker character, at least as long
            // as the opener, with nothing but whitespace after it.
            if (fence && fence[1][0] === openFence[0] && fence[1].length >= openFence.length && fence[2].trim() === '') {
                openFence = '';
            }
            continue;
        }
        // A backtick fence's info string may not itself contain a backtick.
        if (fence && !(fence[1][0] === '`' && fence[2].includes('`'))) {
            openFence = fence[1];
            continue;
        }
        const h = headingPattern.exec(text);
        if (h)
            section = h[1];
    }
    return section;
}
|
|
59
|
+
/**
 * Whether at least one gap covers `(file, section)`.
 *
 * A gap matches the file when its `file` is `'*'` or equals `file` after
 * separator normalization, and matches the section when its `section` is
 * `'*'` or equals `section` exactly.
 */
function isCovered(file, section, gaps) {
    return gaps.some((gap) => {
        const fileMatches = gap.file === '*' || toPosix(gap.file) === file;
        if (!fileMatches) {
            return false;
        }
        return gap.section === '*' || gap.section === section;
    });
}
|
|
70
|
+
/**
 * Run the scope⊆diagnosis gate. For every edit, diff it against its current
 * content, resolve each changed range's enclosing section, and flag any
 * (file, section) not covered by a gap.
 *
 * A pure deletion (no inserted range) is in-scope by construction (it removes
 * named or unnamed lines but introduces no new out-of-scope section) and is
 * not flagged. A file absent from `currentFiles` is diffed against empty
 * content.
 *
 * NOTE(review): each range is attributed to the section enclosing its FIRST
 * line only. A range that starts in a covered section and runs on into another
 * one is not reported for the later section — confirm this is intended.
 *
 * @param input proposed edits, the current file contents and the diagnosis gaps
 * @returns `{pass, violations}`; `pass` is true iff there are no violations
 */
export function checkScopeWithinDiagnosis(input) {
    const currentByPath = new Map(input.currentFiles.map((f) => [toPosix(f.relPath), f.content]));
    // Accumulate violations keyed by file+section so the lines coalesce per spot.
    const violationByKey = new Map();
    for (const edit of input.edits) {
        const rel = toPosix(edit.relPath);
        const current = currentByPath.get(rel) ?? '';
        // Drop one trailing newline so it does not produce a phantom empty last line.
        const proposedLines = edit.content.length === 0 ? [] : edit.content.replace(/\n$/, '').split('\n');
        const d = lineDiff(current, edit.content);
        for (const range of d.changedRanges) {
            const section = enclosingSection(rel, proposedLines, range.startLine);
            if (isCovered(rel, section, input.gaps))
                continue;
            // Encode the pair unambiguously: plain concatenation would let two
            // different (file, section) pairs collide on the same key.
            const key = JSON.stringify([rel, section]);
            const existing = violationByKey.get(key);
            if (existing) {
                existing.lines.push({ startLine: range.startLine, endLine: range.endLine });
            }
            else {
                violationByKey.set(key, {
                    file: rel,
                    section,
                    lines: [{ startLine: range.startLine, endLine: range.endLine }],
                });
            }
        }
    }
    const violations = [...violationByKey.values()];
    return { pass: violations.length === 0, violations };
}
|
|
107
|
+
//# sourceMappingURL=scope-gate.js.map
|
|
@@ -229,7 +229,7 @@ export async function mineSuccessSignals(opts) {
|
|
|
229
229
|
const walkedPaths = [];
|
|
230
230
|
const minedKeys = new Map();
|
|
231
231
|
const protect = (targetId, section) => {
|
|
232
|
-
const key = `${targetId}
|
|
232
|
+
const key = `${targetId} ${section}`;
|
|
233
233
|
if (!minedKeys.has(key))
|
|
234
234
|
minedKeys.set(key, { targetId, section });
|
|
235
235
|
};
|
|
@@ -282,7 +282,7 @@ export async function mineSuccessSignals(opts) {
|
|
|
282
282
|
}
|
|
283
283
|
}
|
|
284
284
|
// Best-effort persist (health-baseline.ts pattern): a failed side-write must
|
|
285
|
-
// never fail the learn/
|
|
285
|
+
// never fail the learn/episode run that triggered the mining.
|
|
286
286
|
let protectionsWritten = 0;
|
|
287
287
|
try {
|
|
288
288
|
const file = protectionsPath(projectRoot);
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/** Input to {@link detectTestTamper}. */
export interface DetectTestTamperInput {
    /** Absolute path of the change directory, which holds the 5 artifacts, `test-report.md` and any test files. */
    changeDirPath: string;
}
|
|
5
|
+
/** Outcome of the test-tamper detection. */
export interface TamperResult {
    suspected: boolean;
    /**
     * Human-readable reasons, one per finding — for example
     * "spec-tests.md declares 36 scenarios but test-report shows 4 collected".
     * Empty when nothing was found.
     */
    flags: string[];
}
|
|
10
|
+
/**
 * Counts how many scenarios/tests a `spec-tests.md` declares.
 *
 * The canonical gen-tests format is a "Requirement Traceability Matrix"
 * markdown table with one row per mapped (requirement, test) pair. The data
 * rows of that table are counted. The header and separator rows are skipped,
 * as is any row whose test cell is "—" or blank, which marks a step with no
 * applicable test.
 *
 * @param specTestsText the text of `spec-tests.md`
 * @returns the number of counted rows, or null when no countable table is found
 */
export declare function countDeclaredScenarios(specTestsText: string): number | null;
|
|
18
|
+
/**
 * Detects whether the tests of a change were tampered with to fake a pass.
 *
 * Only files under `input.changeDirPath` are read, and every one of them may
 * be missing without causing a failure.
 *
 * @returns a result whose `suspected` is true iff at least one strong heuristic fired
 */
export declare function detectTestTamper(input: DetectTestTamperInput): Promise<TamperResult>;
|
|
24
|
+
//# sourceMappingURL=tamper-check.d.ts.map
|