synergyspec-selfevolving 1.4.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. package/README.md +31 -18
  2. package/dist/commands/learn.d.ts +12 -1
  3. package/dist/commands/learn.js +158 -11
  4. package/dist/commands/self-evolution-episode.d.ts +177 -0
  5. package/dist/commands/self-evolution-episode.js +431 -0
  6. package/dist/commands/self-evolution.d.ts +12 -190
  7. package/dist/commands/self-evolution.js +114 -866
  8. package/dist/core/archive.d.ts +0 -1
  9. package/dist/core/archive.js +0 -58
  10. package/dist/core/artifact-graph/instruction-loader.d.ts +2 -4
  11. package/dist/core/artifact-graph/instruction-loader.js +3 -31
  12. package/dist/core/fitness/loss.d.ts +5 -5
  13. package/dist/core/fitness/loss.js +4 -4
  14. package/dist/core/fitness/test-failures.js +10 -2
  15. package/dist/core/project-config.d.ts +19 -0
  16. package/dist/core/project-config.js +96 -0
  17. package/dist/core/self-evolution/candidate-fitness.d.ts +23 -1
  18. package/dist/core/self-evolution/candidate-fitness.js +31 -5
  19. package/dist/core/self-evolution/candidates.d.ts +0 -9
  20. package/dist/core/self-evolution/critic-agent.d.ts +192 -0
  21. package/dist/core/self-evolution/critic-agent.js +568 -0
  22. package/dist/core/self-evolution/edits-contract.d.ts +53 -0
  23. package/dist/core/self-evolution/edits-contract.js +89 -0
  24. package/dist/core/self-evolution/episode-orchestrator.d.ts +234 -0
  25. package/dist/core/self-evolution/episode-orchestrator.js +681 -0
  26. package/dist/core/self-evolution/episode-store.d.ts +266 -0
  27. package/dist/core/self-evolution/episode-store.js +573 -0
  28. package/dist/core/self-evolution/evolution-switches.d.ts +1 -1
  29. package/dist/core/self-evolution/evolution-switches.js +5 -10
  30. package/dist/core/self-evolution/evolving-agent.d.ts +208 -0
  31. package/dist/core/self-evolution/evolving-agent.js +535 -0
  32. package/dist/core/self-evolution/host-harness.d.ts +14 -15
  33. package/dist/core/self-evolution/host-harness.js +48 -23
  34. package/dist/core/self-evolution/index.d.ts +11 -6
  35. package/dist/core/self-evolution/index.js +20 -6
  36. package/dist/core/self-evolution/line-diff.d.ts +60 -0
  37. package/dist/core/self-evolution/line-diff.js +130 -0
  38. package/dist/core/self-evolution/policy/fs-safe.d.ts +19 -0
  39. package/dist/core/self-evolution/policy/fs-safe.js +89 -0
  40. package/dist/core/self-evolution/policy/index.d.ts +13 -0
  41. package/dist/core/self-evolution/policy/index.js +13 -0
  42. package/dist/core/self-evolution/policy/policy-store.d.ts +217 -0
  43. package/dist/core/self-evolution/policy/policy-store.js +774 -0
  44. package/dist/core/self-evolution/policy/prediction-reconcile.d.ts +54 -0
  45. package/dist/core/self-evolution/policy/prediction-reconcile.js +191 -0
  46. package/dist/core/self-evolution/policy/reject-buffer.d.ts +55 -0
  47. package/dist/core/self-evolution/policy/reject-buffer.js +170 -0
  48. package/dist/core/self-evolution/promote.d.ts +1 -1
  49. package/dist/core/self-evolution/promote.js +6 -33
  50. package/dist/core/self-evolution/promotion.js +1 -2
  51. package/dist/core/self-evolution/reward-agent.d.ts +379 -0
  52. package/dist/core/self-evolution/reward-agent.js +940 -0
  53. package/dist/core/self-evolution/reward-aggregator.d.ts +59 -0
  54. package/dist/core/self-evolution/reward-aggregator.js +262 -0
  55. package/dist/core/self-evolution/scope-gate.d.ts +66 -0
  56. package/dist/core/self-evolution/scope-gate.js +107 -0
  57. package/dist/core/self-evolution/success-channel.js +2 -2
  58. package/dist/core/self-evolution/tamper-check.d.ts +24 -0
  59. package/dist/core/self-evolution/tamper-check.js +236 -0
  60. package/dist/core/self-evolution/tool-evolution.js +2 -13
  61. package/dist/core/self-evolution/verdict.d.ts +8 -5
  62. package/dist/core/self-evolution/verdict.js +4 -7
  63. package/dist/core/templates/workflows/gen-tests.js +1 -1
  64. package/dist/core/templates/workflows/learn.d.ts +3 -2
  65. package/dist/core/templates/workflows/learn.js +21 -18
  66. package/dist/core/templates/workflows/self-evolving.d.ts +6 -4
  67. package/dist/core/templates/workflows/self-evolving.js +62 -172
  68. package/dist/core/trajectory/scrub.d.ts +27 -0
  69. package/dist/core/trajectory/scrub.js +79 -0
  70. package/dist/core/trajectory/skeleton.d.ts +27 -1
  71. package/dist/core/trajectory/skeleton.js +152 -8
  72. package/dist/dashboard/data.d.ts +25 -51
  73. package/dist/dashboard/data.js +68 -180
  74. package/dist/dashboard/react-client.js +458 -503
  75. package/dist/dashboard/react-styles.js +3 -3
  76. package/dist/dashboard/server.js +23 -17
  77. package/dist/ui/ascii-patterns.d.ts +7 -15
  78. package/dist/ui/ascii-patterns.js +123 -54
  79. package/dist/ui/welcome-screen.d.ts +0 -14
  80. package/dist/ui/welcome-screen.js +16 -35
  81. package/package.json +1 -1
  82. package/dist/core/self-evolution/ga-selection.d.ts +0 -94
  83. package/dist/core/self-evolution/ga-selection.js +0 -153
  84. package/dist/core/self-evolution/proposer-agent.d.ts +0 -182
  85. package/dist/core/self-evolution/proposer-agent.js +0 -326
  86. package/dist/core/self-evolution/replay-runner.d.ts +0 -100
  87. package/dist/core/self-evolution/replay-runner.js +0 -170
  88. package/dist/core/self-evolution/replay.d.ts +0 -45
  89. package/dist/core/self-evolution/replay.js +0 -56
  90. package/dist/core/self-evolution/template-variants.d.ts +0 -62
  91. package/dist/core/self-evolution/template-variants.js +0 -171
  92. package/dist/core/self-evolution/trajectory.d.ts +0 -65
  93. package/dist/core/self-evolution/trajectory.js +0 -185
@@ -0,0 +1,940 @@
1
+ import { promises as fs } from 'node:fs';
2
+ import * as path from 'node:path';
3
+ import { runHeadlessAgent } from './host-harness.js';
4
+ import { readEpisode, episodeDir, writeDiagnosis, advanceEpisodeStage, } from './episode-store.js';
5
+ import { scrub } from '../trajectory/scrub.js';
6
/**
 * Error thrown when the reward agent's reply does not satisfy the output
 * contract. The supplied detail is appended to a fixed prefix so the message
 * always identifies the reward agent as the source.
 */
export class RewardAgentOutputInvalid extends Error {
    constructor(message) {
        const detail = `reward agent output invalid: ${message}`;
        super(detail);
        this.name = 'RewardAgentOutputInvalid';
    }
}
12
/**
 * Error carrying a failed reward-agent invocation. The constructor argument
 * (named `stderr`) is embedded in the message after a fixed prefix.
 */
export class RewardAgentInvocationError extends Error {
    constructor(stderr) {
        const detail = `reward agent invocation failed: ${stderr}`;
        super(detail);
        this.name = 'RewardAgentInvocationError';
    }
}
18
+ // ── bounding ───────────────────────────────────────────────────────────────
19
+ // Mirror how learn.ts/skeleton bound things: cap each excerpt so a huge
20
+ // transcript or artifact cannot blow the judge's context. Bounds are generous
21
+ // but finite; truncation appends a visible marker.
22
// Total character budget for one arm's transcript excerpt.
const MAX_TRANSCRIPT_CHARS = 24_000;
// Character budget for one artifact excerpt.
const MAX_ARTIFACT_CHARS = 12_000;
// Appended to a head-only excerpt so the cut is visible to the reader.
const TRUNCATION_MARKER = '\n…[truncated]…';
// P5: transcript bound is HEAD+TAIL (not head-only) so the decisive FINAL test
// outcome at the end of a long run survives. head + tail = MAX_TRANSCRIPT_CHARS.
const TRANSCRIPT_HEAD_CHARS = 14_000;
const TRANSCRIPT_TAIL_CHARS = MAX_TRANSCRIPT_CHARS - TRANSCRIPT_HEAD_CHARS;
/**
 * Cap `text` at `max` characters, keeping the head and appending
 * {@link TRUNCATION_MARKER} when something was cut.
 *
 * @param {string} text - The text to bound.
 * @param {number} max - Maximum length of the returned string.
 * @returns {string} `text` unchanged when it fits; otherwise a string of at
 *   most `max` characters.
 */
function boundExcerpt(text, max) {
    if (text.length <= max) {
        return text;
    }
    const headLen = max - TRUNCATION_MARKER.length;
    if (headLen < 0) {
        // The budget cannot even hold the marker. A negative end index would make
        // `slice` count from the END of the string and blow the budget, so fall
        // back to a plain hard cut.
        return text.slice(0, Math.max(0, max));
    }
    return text.slice(0, headLen) + TRUNCATION_MARKER;
}
/**
 * P5: keep the FIRST {@link TRANSCRIPT_HEAD_CHARS} and the LAST
 * {@link TRANSCRIPT_TAIL_CHARS} of a transcript with an explicit elision marker
 * in between — the head-only bound silently dropped the final runner result,
 * the single most decisive region for a final-verdict judge. Same total budget.
 *
 * @param {string} text - The raw transcript.
 * @returns {string} `text` unchanged when it fits in
 *   {@link MAX_TRANSCRIPT_CHARS}; otherwise head + marker + tail.
 */
function boundTranscriptExcerpt(text) {
    if (text.length <= MAX_TRANSCRIPT_CHARS) {
        return text;
    }
    // NOTE: this count is computed from the nominal tail size; the tail actually
    // kept is shorter by the marker's length, so the true elision is slightly
    // larger than the number printed.
    const elided = text.length - TRANSCRIPT_HEAD_CHARS - TRANSCRIPT_TAIL_CHARS;
    const marker = `\n…[${elided} chars elided]…\n`;
    // Charge the marker to the tail so head + marker + tail ≤ MAX_TRANSCRIPT_CHARS
    // exactly (the marker is no longer free overage on top of the budget).
    const tailLen = Math.max(0, TRANSCRIPT_TAIL_CHARS - marker.length);
    const head = text.slice(0, TRANSCRIPT_HEAD_CHARS);
    const tail = text.slice(text.length - tailLen);
    return head + marker + tail;
}
50
// Fixed instruction preamble for the reward-agent (LLM-as-judge) prompt. It
// states the judge's role and the two arms being compared, the scoring rubric
// (correctness first), the evidence-anchoring and failure-attribution rules,
// the optional weakness classification, and the `json:diagnosis` output
// contract. Written one prompt line per array element and joined with '\n'.
const PRELUDE = [
    'You are the 奖励智能体 REWARD AGENT — an LLM AS JUDGE for one episode of a',
    'self-evolution loop run as in-context RL. Two arms ran the SAME change:',
    ' - MAIN ARM — 主智能体 MAIN AGENT, the frozen actor on the current policy vN+1.',
    ' - BASELINE ARM — CRITIC AGENT(基线智能体 baseline agent), an agent with the',
    ' SAME input/output as the main agent that reran the LAST',
    ' episode\'s policy vN on the SAME change.',
    '',
    'Your job:',
    '- CALCULATE 算分 reward(主臂) and reward(基线臂), each a number in [0,1],',
    ' ANCHORED on the OBJECTIVE EVIDENCE below (tests · health · 轨迹度量 trajectory',
    ' metrics). The anchors guide the score; the score itself is YOUR judgment.',
    '- advantage = reward(主臂) − reward(基线臂).',
    '- FIND errors and NAME gaps, each with a suggested direction — together the',
    ' 文本梯度 textual gradient: a short instruction for how the policy (the design',
    ' template) should change to close the gap.',
    '- You NEVER edit any file. You only score and diagnose.',
    '- 弃权 ABSTAIN when there is NO nameable gap: set "abstained": true, leave',
    ' "gaps" empty and "textualGradient" null, and give a one-line "abstainReason".',
    ' Otherwise "abstained" must be false and "textualGradient" must be non-empty.',
    '',
    'SCORING RUBRIC — score each arm against these dimensions, in priority order:',
    ' 1. CORRECTNESS (dominant) — does the code pass its tests? A LOWER pass rate',
    ' CANNOT be outscored by a cleaner/shorter solution. Correctness is a GATE,',
    ' not one term among many: if one arm has a strictly lower pass rate, its',
    ' reward MUST be lower and the advantage must point AWAY from it.',
    ' 2. CODE HEALTH — structural erosion (complexity/nesting/duplication).',
    ' 3. VERBOSITY / COMPRESSION — prefer the arm that is NOT bloated or redundant;',
    ' do NOT reward length. A longer transcript or artifact is not "more thorough".',
    ' 4. TRAJECTORY ECONOMY — fewer wasted tool calls / dead ends, all else equal.',
    '',
    'REFERENCE-GROUND your scores: the OBJECTIVE EVIDENCE block below carries the',
    'measured loss/passRate/health for each arm. Your reward ordering MUST track',
    'that evidence — if the anchors say one arm is clearly better on correctness,',
    'your reward for it must be higher. Do NOT let presentation order or length',
    'move the score.',
    '',
    'DROP UNEXPLAINABLE FAILURES: only blame an arm for a failure you can tie to a',
    'verbatim quoted span in a real file. A failure you cannot explain (flaky,',
    'environmental, or unrelated to the change) must NOT lower that arm\'s reward.',
    'But an EXPLAINED failure is still a failure: a real, reproduced test failure',
    'must lower the arm\'s reward even when its error message makes it understandable.',
    '',
    'WEAKNESS-CLASS each gap (optional but encouraged): "forgetting" (the baseline',
    'did this right and the main arm lost it), "boundary" (edge/limit case),',
    '"rare" (low-frequency scenario), "logic" (outright wrong), "verbosity"',
    '(bloat to prune), or "other"; plus a "severity" of "high"|"medium"|"low".',
    '',
    'Output contract: emit EXACTLY ONE fenced block tagged `json:diagnosis` and',
    'nothing else, of the form:',
    '',
    '```json:diagnosis',
    '{"rewardMain": <0..1>, "rewardBaseline": <0..1 | null>,',
    ' "advantage": <rewardMain − rewardBaseline | null>,',
    ' "verdict": "main-better"|"baseline-better"|"tie"|"insufficient-signal"|"no-gap",',
    ' "confidence": <0..1 | null>,',
    ' "errors": [{"arm": "main"|"baseline", "description": "<what is wrong>",',
    ' "evidence": {"file": "<path>", "quote": "<verbatim span>"}}],',
    ' "gaps": [{"file": "<target file>", "section": "<heading | *>",',
    ' "description": "<the nameable gap>",',
    ' "weaknessClass": "forgetting"|"boundary"|"rare"|"logic"|"verbosity"|"other",',
    ' "severity": "high"|"medium"|"low"}],',
    ' "textualGradient": "<one-paragraph direction | null>",',
    ' "abstained": <true|false>, "abstainReason": "<only when abstained>"}',
    '```',
    '',
    '"verdict", "confidence", and the per-gap "weaknessClass"/"severity" are',
    'OPTIONAL — omit any you are unsure of; never invent a value to fill a slot.',
].join('\n');
119
+ /**
120
+ * Assemble the 奖励智能体 REWARD AGENT prompt. Pure (no I/O); exported for golden
121
+ * tests. Ordered blocks:
122
+ * 1. PRELUDE — the judge contract.
123
+ * 2. MAIN ARM (主智能体, policy vN+1) — skeleton + bounded transcript excerpt.
124
+ * 3. BASELINE ARM (CRITIC AGENT(基线智能体), policy vN) — same; OMITTED
125
+ * ENTIRELY when the baseline arm was skipped (a one-line note replaces it,
126
+ * demanding null rewardBaseline/advantage).
127
+ * 4. ARTIFACTS — the 5 artifacts + test-report.md, bounded excerpts.
128
+ * 5. OBJECTIVE EVIDENCE — both arms' anchors verbatim (the on-disk loss/health
129
+ * numbers that anchor the score).
130
+ */
131
export function assembleRewardAgentPrompt(input) {
    const lines = [
        PRELUDE,
        '',
        `# EPISODE: change=${input.changeName} target=${input.targetId} policy(main=${fmtVersion(input.policyVersions.main)}, baseline=${fmtVersion(input.policyVersions.baseline)})`,
        '',
    ];
    // Tamper signal (when computed upstream) goes BEFORE the arms so the judge
    // has it in mind while scoring; nothing is emitted unless it is suspected.
    const hint = input.integrityHint;
    if (hint && hint.suspected) {
        const flagLines = hint.flags.map((f) => ` - ${f}`);
        lines.push(
            '# ⚠ INTEGRITY SIGNAL — possible test tampering on the MAIN ARM',
            'A pre-scoring check flagged that the main arm may have weakened its own',
            'tests (e.g. deleted assertions, hard-coded expecteds, skips):',
            ...flagLines,
            'Do NOT reward passing tests that were weakened. If the green is bought by',
            'gutting the test, the main arm\'s reward must reflect that, not the pass rate.',
            '',
        );
    }
    // The two arm sections are rendered lazily so they can be emitted in either
    // presentation order (order-swap).
    const renderMain = () => [
        `# MAIN ARM (主智能体 MAIN AGENT, policy v${fmtVersion(input.policyVersions.main)})`,
        ...renderArmBody(input.mainArm),
        '',
    ];
    const renderBaseline = () => input.baselineArm === null
        ? [
            '# BASELINE ARM — SKIPPED',
            'The CRITIC AGENT(基线智能体 baseline agent)did NOT run for this episode',
            '(no prior policy version in the 版本账本 ledger, or skipped by policy).',
            'NO comparison is possible: set "rewardBaseline" to null and "advantage" to',
            'null. Score ONLY the main arm and diagnose against the artifacts below.',
            '',
        ]
        : [
            `# BASELINE ARM (CRITIC AGENT(基线智能体 baseline agent), policy v${fmtVersion(input.policyVersions.baseline)})`,
            ...renderArmBody(input.baselineArm),
            '',
        ];
    const [renderFirst, renderSecond] = input.armOrder === 'baseline-first'
        ? [renderBaseline, renderMain]
        : [renderMain, renderBaseline];
    lines.push(...renderFirst(), ...renderSecond());
    // Failing-test contrast always follows both arms, whatever their order.
    lines.push(...renderFailingTestContrast(input));
    // Artifacts, each capped at MAX_ARTIFACT_CHARS.
    lines.push('# ARTIFACTS (the 5 artifacts + test-report.md, from the change dir)');
    if (input.artifacts.length === 0) {
        lines.push('(no artifacts were readable in the change dir)');
    }
    else {
        const artifactLines = input.artifacts.flatMap((artifact) => [
            `<<FILE: ${artifact.file}>>`,
            boundExcerpt(artifact.content, MAX_ARTIFACT_CHARS),
            '<<END FILE>>',
            '',
        ]);
        lines.push(...artifactLines);
    }
    // Objective evidence: each arm's objective (secrets redacted), then the anchors.
    lines.push(
        '# OBJECTIVE EVIDENCE (anchors — tests · health · 轨迹度量; lower loss is better)',
        'main arm objective.json:',
        jsonBlock(scrubObjectiveForPrompt(input.mainArm.objective)),
    );
    if (input.baselineArm === null) {
        lines.push('baseline arm objective.json: (none — baseline arm was skipped; rewardBaseline=null, advantage=null)');
    }
    else {
        lines.push(
            'baseline arm objective.json:',
            jsonBlock(scrubObjectiveForPrompt(input.baselineArm.objective)),
        );
    }
    lines.push('', 'anchors (mapped):', jsonBlock(input.anchors));
    return lines.join('\n');
}
202
/** Render a policy version for the prompt; `null` is shown as 'unknown'. */
function fmtVersion(v) {
    if (v === null) {
        return 'unknown';
    }
    return String(v);
}
205
/** Pretty-print `value` as 2-space-indented JSON inside a ```json fence. */
function jsonBlock(value) {
    const fence = '```';
    const body = JSON.stringify(value, null, 2);
    return `${fence}json\n${body}\n${fence}`;
}
208
/**
 * Render the prompt lines for one arm: a framing line, the verification
 * summary, the skeleton (or a "none" note) and the bounded, secret-scrubbed
 * transcript excerpt (or a "none captured" note).
 */
function renderArmBody(arm) {
    // Score-vs-diagnosis firewall: everything rendered here is DIAGNOSTIC CONTEXT
    // for naming gaps, not a scoring input — the framing line says so explicitly
    // and points the judge at OBJECTIVE EVIDENCE for the correctness gate.
    const lines = [
        'context for the DIAGNOSIS only (does NOT change the correctness gate — anchor scores on OBJECTIVE EVIDENCE):',
        // P2: verification provenance + truncation magnitude for this arm.
        renderVerificationLine(arm),
    ];
    if (arm.skeleton === null) {
        lines.push('skeleton.json: (none)');
    }
    else {
        lines.push(
            'skeleton.json (action sequence, error tails, plan/effort signals — diagnostic only):',
            jsonBlock(arm.skeleton),
        );
    }
    const transcript = arm.transcript;
    if (transcript === null || transcript.length === 0) {
        lines.push('transcript: (none captured)');
        return lines;
    }
    // Scrub AFTER bounding: cheaper than scrubbing the raw transcript, and the
    // budget has already been enforced on the bounded text.
    lines.push(
        'transcript excerpt (secrets redacted):',
        '```',
        scrub(boundTranscriptExcerpt(transcript)),
        '```',
    );
    return lines;
}
239
+ /** P2: one-line verification + truncation-magnitude summary for an arm. */
240
function renderVerificationLine(arm) {
    // Tri-state rendering: only strict booleans map to yes/no.
    const triState = (flag) => {
        if (flag === true) {
            return 'yes';
        }
        if (flag === false) {
            return 'no';
        }
        return 'unknown';
    };
    const objective = arm.objective;
    const observed = triState(objective.testRunObserved);
    const status = objective.observedStatus ?? 'null';
    const verified = triState(objective.verified);
    // Skeleton magnitude: shown vs pre-truncation event count.
    const magnitude = skeletonMagnitude(arm.skeleton);
    let skel = 'none';
    if (magnitude) {
        skel = magnitude.truncated
            ? `${magnitude.shown} of ${magnitude.pre} events (truncated)`
            : `${magnitude.shown} events`;
    }
    // Transcript magnitude: how much of the transcript the excerpt carries.
    let tx;
    if (arm.transcript === null || arm.transcript.length === 0) {
        tx = 'none';
    }
    else if (arm.transcript.length > MAX_TRANSCRIPT_CHARS) {
        tx = `${MAX_TRANSCRIPT_CHARS} of ${arm.transcript.length} chars (head+tail)`;
    }
    else {
        tx = `${arm.transcript.length} chars`;
    }
    return `verification: tests OBSERVED=${observed} · observedStatus=${status} · passRate VERIFIED=${verified} · skeleton=${skel} · transcript=${tx}`;
}
258
+ /** Structural read of an arm's persisted ActionSkeleton (typed as opaque object here). */
259
function skeletonMagnitude(skeleton) {
    // Anything that is not a non-null object carries no magnitude information.
    if (typeof skeleton !== 'object' || skeleton === null) {
        return null;
    }
    const { events, preTruncationEventCount, truncated } = skeleton;
    const shown = Array.isArray(events) ? events.length : 0;
    // Fall back to the shown count when no pre-truncation count was recorded.
    const pre = typeof preTruncationEventCount === 'number' ? preTruncationEventCount : shown;
    return { shown, pre, truncated: truncated === true };
}
267
+ /**
268
+ * Redact secrets out of an objective before its JSON is dumped into the prompt —
269
+ * the only free-text field is `observedFailures` (testId/assertion parsed from
270
+ * raw runner output, which can echo a credential). Numeric/enum fields are left
271
+ * as-is. Returns the same object unchanged when there is nothing to scrub.
272
+ */
273
function scrubObjectiveForPrompt(obj) {
    const failures = obj.observedFailures;
    // Nothing to redact: hand back the very same object.
    if (!failures || failures.length === 0) {
        return obj;
    }
    // Rebuild each failure entry with testId/assertion scrubbed; `file` is
    // carried over untouched, and optional keys are kept only when defined.
    const redacted = failures.map((failure) => {
        const entry = { testId: scrub(failure.testId) };
        if (failure.file !== undefined) {
            entry.file = failure.file;
        }
        if (failure.assertion !== undefined) {
            entry.assertion = scrub(failure.assertion);
        }
        return entry;
    });
    return { ...obj, observedFailures: redacted };
}
285
+ /** POSIX file paths the arm EDITED, read from its persisted skeleton's file-edit events. */
286
function editedFilesFromSkeleton(skeleton) {
    const edited = new Set();
    const events = skeleton && typeof skeleton === 'object' ? skeleton.events : undefined;
    if (!Array.isArray(events)) {
        return edited;
    }
    // Keep only well-formed file-edit events that carry a string path.
    const isFileEdit = (ev) => Boolean(ev)
        && typeof ev === 'object'
        && ev.kind === 'file-edit'
        && typeof ev.file === 'string';
    events.filter(isFileEdit).forEach((ev) => edited.add(ev.file));
    return edited;
}
302
+ /**
303
+ * P1: a CONTRASTED failing-test block — the literal regression signal a paired
304
+ * judge needs. Lists each arm's parsed failing tests and the set MAIN fails that
305
+ * BASELINE passed, with a renamed/edited-test CAVEAT (a main-only failure whose
306
+ * test file the main arm edited is a LEAD, not a confirmed regression). Rendered
307
+ * only when at least one arm parsed failures (keeps the prompt stable otherwise).
308
+ */
309
function renderFailingTestContrast(input) {
    const mainFailures = input.mainArm.objective.observedFailures ?? [];
    const baselineFailures = input.baselineArm?.objective.observedFailures ?? [];
    // Neither arm parsed a failure: emit nothing so the prompt stays stable.
    if (mainFailures.length === 0 && baselineFailures.length === 0) {
        return [];
    }
    // testIds and assertion lines come from raw runner output, so every one is
    // scrubbed before it reaches the prompt.
    const listIds = (failures) => (failures.length === 0
        ? '(none)'
        : failures.map((failure) => scrub(failure.testId)).join(', '));
    const lines = [
        '# FAILING-TEST CONTRAST (parsed from OBSERVED runner output — may be incomplete)',
        'Diagnostic context only: this sharpens the gap you name; it must NOT soften the correctness penalty.',
        'An explained, reproduced failure is still a failure — a strictly lower pass rate must score lower.',
        `MAIN failing tests (${mainFailures.length}): ${listIds(mainFailures)}`,
    ];
    if (input.baselineArm === null) {
        lines.push('BASELINE failing tests: (baseline arm skipped — no contrast possible)', '');
        return lines;
    }
    lines.push(`BASELINE failing tests (${baselineFailures.length}): ${listIds(baselineFailures)}`);
    const failedInBaseline = new Set(baselineFailures.map((failure) => failure.testId));
    const editedByMain = editedFilesFromSkeleton(input.mainArm.skeleton);
    const newlyFailing = mainFailures.filter((failure) => !failedInBaseline.has(failure.testId));
    if (newlyFailing.length === 0) {
        lines.push('newly-failing in MAIN: (none — nothing fails in main that the baseline passed)');
    }
    else {
        lines.push('newly-failing in MAIN (possible regression vs the baseline):');
        const bullets = newlyFailing.map((failure) => {
            // A main-only failure in a test file the main arm edited is a lead,
            // not a confirmed regression — flag it as such.
            const editedTestFile = failure.file !== undefined && editedByMain.has(failure.file);
            const caveat = editedTestFile
                ? ' (⚠ this test file was EDITED by the main arm — possibly renamed/changed, NOT a confirmed regression)'
                : '';
            const detail = failure.assertion ? ` — ${scrub(failure.assertion)}` : '';
            return ` - ${scrub(failure.testId)}${caveat}${detail}`;
        });
        lines.push(...bullets);
    }
    lines.push('');
    return lines;
}
348
+ /**
349
+ * Parse the judge's `json:diagnosis` block with a strict one-block discipline:
350
+ * exactly one fenced block, well-formed JSON, then fail-closed shape + range
351
+ * validation.
352
+ *
353
+ * Throws {@link RewardAgentOutputInvalid} on any violation (the repair loop
354
+ * re-prompts with the concrete message appended).
355
+ */
356
export function parseRewardAgentResponse(text) {
    // Collect the body of EVERY `json:diagnosis` fence so that a duplicated
    // block is detected rather than silently ignored.
    const fenceRe = /```json:diagnosis\s*([\s\S]*?)```/g;
    const blocks = [];
    for (let m = fenceRe.exec(text); m !== null; m = fenceRe.exec(text)) {
        blocks.push(m[1]);
    }
    if (blocks.length === 0) {
        throw new RewardAgentOutputInvalid('no `json:diagnosis` fenced block found in response');
    }
    if (blocks.length > 1) {
        throw new RewardAgentOutputInvalid(`expected exactly 1 \`json:diagnosis\` block, found ${blocks.length}`);
    }
    let parsed;
    try {
        parsed = JSON.parse(blocks[0].trim());
    }
    catch (err) {
        throw new RewardAgentOutputInvalid(`failed to parse JSON inside diagnosis block: ${err instanceof Error ? err.message : String(err)}`);
    }
    // Arrays are `typeof 'object'` too; reject them here so the caller gets the
    // accurate "must be a JSON object" message instead of a misleading
    // per-field complaint further down.
    if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
        throw new RewardAgentOutputInvalid('diagnosis block must be a JSON object');
    }
    const o = parsed;
    // Required scores. NOTE(review): `advantage` is only checked for
    // finite-or-null here — it is not cross-checked against
    // rewardMain − rewardBaseline, nor forced to null when rewardBaseline is
    // null. Confirm whether a downstream step enforces that.
    const rewardMain = requireReward(o.rewardMain, 'rewardMain');
    const rewardBaseline = requireRewardOrNull(o.rewardBaseline, 'rewardBaseline');
    const advantage = requireFiniteOrNull(o.advantage, 'advantage');
    const errors = parseErrors(o.errors);
    const gaps = parseGaps(o.gaps);
    const textualGradient = parseTextualGradient(o.textualGradient);
    if (typeof o.abstained !== 'boolean') {
        throw new RewardAgentOutputInvalid('"abstained" must be a boolean');
    }
    const abstained = o.abstained;
    let abstainReason;
    if (o.abstainReason !== undefined) {
        if (typeof o.abstainReason !== 'string') {
            throw new RewardAgentOutputInvalid('"abstainReason" must be a string when present');
        }
        abstainReason = o.abstainReason;
    }
    // Optional fields: `undefined` means "leave the key out of the result".
    const verdict = parseVerdict(o.verdict);
    const confidence = parseConfidence(o.confidence);
    // Abstain semantics: abstained ⇒ no gaps and no gradient; not abstained ⇒
    // a non-empty gradient is required (a score with no direction is useless).
    if (abstained) {
        if (gaps.length > 0) {
            throw new RewardAgentOutputInvalid('abstained=true requires an EMPTY "gaps" array (no nameable gap)');
        }
        if (textualGradient !== null) {
            throw new RewardAgentOutputInvalid('abstained=true requires "textualGradient": null (no direction when abstaining)');
        }
    }
    else if (textualGradient === null || textualGradient.length === 0) {
        throw new RewardAgentOutputInvalid('abstained=false requires a non-empty "textualGradient"');
    }
    return {
        rewardMain,
        rewardBaseline,
        advantage,
        errors,
        gaps,
        textualGradient,
        abstained,
        ...(abstainReason !== undefined ? { abstainReason } : {}),
        ...(verdict !== undefined ? { verdict } : {}),
        ...(confidence !== undefined ? { confidence } : {}),
    };
}
426
// The closed set of verdict labels the judge may emit.
const VERDICT_VALUES = ['main-better', 'baseline-better', 'tie', 'insufficient-signal', 'no-gap'];
/**
 * Validate the optional "verdict" field.
 * Returns `undefined` when the field is absent or null, the label itself when
 * it is one of {@link VERDICT_VALUES}, and throws otherwise.
 */
function parseVerdict(raw) {
    if (raw === null || raw === undefined) {
        return undefined;
    }
    const isKnownLabel = typeof raw === 'string' && VERDICT_VALUES.includes(raw);
    if (isKnownLabel) {
        return raw;
    }
    const allowed = VERDICT_VALUES.map((v) => `'${v}'`).join(', ');
    throw new RewardAgentOutputInvalid(`"verdict" must be one of ${allowed} when present`);
}
441
/**
 * Validate the optional "confidence" field. `undefined` and `null` pass
 * through unchanged; any other value must be a finite number in [0,1].
 */
function parseConfidence(raw) {
    if (raw === undefined || raw === null) {
        return raw;
    }
    const inUnitInterval = typeof raw === 'number' && Number.isFinite(raw) && raw >= 0 && raw <= 1;
    if (inUnitInterval) {
        return raw;
    }
    throw new RewardAgentOutputInvalid('"confidence" must be a number in [0,1] or null when present');
}
451
/**
 * Require `v` to be a finite number within [0,1] and return it. `field` is
 * used only to name the offending key in the error message.
 */
function requireReward(v, field) {
    const isFiniteNumber = typeof v === 'number' && Number.isFinite(v);
    if (!isFiniteNumber) {
        throw new RewardAgentOutputInvalid(`"${field}" must be a finite number in [0,1]`);
    }
    const outOfRange = v < 0 || v > 1;
    if (outOfRange) {
        throw new RewardAgentOutputInvalid(`"${field}" must be within [0,1], got ${v}`);
    }
    return v;
}
460
/** Like `requireReward`, except that an explicit `null` is accepted and returned. */
function requireRewardOrNull(v, field) {
    return v === null ? null : requireReward(v, field);
}
465
/**
 * Accept `null` or any finite number and return it; throw for anything else.
 * No range is enforced here.
 */
function requireFiniteOrNull(v, field) {
    if (v === null) {
        return null;
    }
    const isFiniteNumber = typeof v === 'number' && Number.isFinite(v);
    if (isFiniteNumber) {
        return v;
    }
    throw new RewardAgentOutputInvalid(`"${field}" must be a finite number or null`);
}
473
/**
 * Validate the "errors" array. An absent field yields an empty list; each entry
 * must name an arm ('main' | 'baseline'), carry a non-empty description, and
 * provide evidence with string "file" and "quote". Unknown keys are dropped.
 */
function parseErrors(raw) {
    if (raw === undefined) {
        return [];
    }
    if (!Array.isArray(raw)) {
        throw new RewardAgentOutputInvalid('"errors" must be an array');
    }
    const fail = (message) => {
        throw new RewardAgentOutputInvalid(message);
    };
    return raw.map((entry, index) => {
        const at = `errors[${index}]`;
        if (!entry || typeof entry !== 'object') {
            fail(`${at} must be an object`);
        }
        const { arm, description, evidence } = entry;
        if (arm !== 'main' && arm !== 'baseline') {
            fail(`${at}.arm must be 'main' or 'baseline'`);
        }
        if (typeof description !== 'string' || description.length === 0) {
            fail(`${at}.description must be a non-empty string`);
        }
        if (!evidence || typeof evidence !== 'object') {
            fail(`${at}.evidence must be an object`);
        }
        const { file, quote } = evidence;
        if (typeof file !== 'string' || typeof quote !== 'string') {
            fail(`${at}.evidence must have string "file" and string "quote"`);
        }
        return { arm, description, evidence: { file, quote } };
    });
}
503
/**
 * Validate the "gaps" array. An absent field yields an empty list; each entry
 * needs non-empty string "file", "section" and "description". The optional
 * "weaknessClass" / "severity" keys are validated by their own parsers and are
 * included in the result only when those parsers return a value.
 */
function parseGaps(raw) {
    if (raw === undefined) {
        return [];
    }
    if (!Array.isArray(raw)) {
        throw new RewardAgentOutputInvalid('"gaps" must be an array');
    }
    const isNonEmptyString = (value) => typeof value === 'string' && value.length > 0;
    return raw.map((entry, index) => {
        if (!entry || typeof entry !== 'object') {
            throw new RewardAgentOutputInvalid(`gaps[${index}] must be an object`);
        }
        const { file, section, description } = entry;
        if (!isNonEmptyString(file)) {
            throw new RewardAgentOutputInvalid(`gaps[${index}].file must be a non-empty string`);
        }
        if (!isNonEmptyString(section)) {
            throw new RewardAgentOutputInvalid(`gaps[${index}].section must be a non-empty string (a heading, or '*')`);
        }
        if (!isNonEmptyString(description)) {
            throw new RewardAgentOutputInvalid(`gaps[${index}].description must be a non-empty string`);
        }
        const gap = { file, section, description };
        const weaknessClass = parseWeaknessClass(entry.weaknessClass, index);
        const severity = parseSeverity(entry.severity, index);
        if (weaknessClass !== undefined) {
            gap.weaknessClass = weaknessClass;
        }
        if (severity !== undefined) {
            gap.severity = severity;
        }
        return gap;
    });
}
536
/** Closed list of strings accepted for a gap's optional `weaknessClass`. */
const WEAKNESS_VALUES = ['forgetting', 'boundary', 'rare', 'logic', 'verbosity', 'other'];
/** Closed list of strings accepted for a gap's optional `severity`. */
const SEVERITY_VALUES = [
    'high',
    'medium',
    'low',
];
545
/**
 * Validate a gap's optional `weaknessClass`.
 *
 * @param {unknown} raw - the raw value; `undefined`/`null` mean "not provided".
 * @param {number} i - index of the gap, used only in the error message.
 * @returns {string|undefined} the value unchanged, or `undefined` when absent.
 * @throws {RewardAgentOutputInvalid} when present but not one of WEAKNESS_VALUES.
 */
function parseWeaknessClass(raw, i) {
    if (raw == null) {
        return undefined;
    }
    const recognised = typeof raw === 'string' && WEAKNESS_VALUES.includes(raw);
    if (recognised) {
        return raw;
    }
    const allowed = WEAKNESS_VALUES.map((v) => `'${v}'`).join(', ');
    throw new RewardAgentOutputInvalid(`gaps[${i}].weaknessClass must be one of ${allowed} when present`);
}
553
/**
 * Validate a gap's optional `severity`.
 *
 * @param {unknown} raw - the raw value; `undefined`/`null` mean "not provided".
 * @param {number} i - index of the gap, used only in the error message.
 * @returns {string|undefined} the value unchanged, or `undefined` when absent.
 * @throws {RewardAgentOutputInvalid} when present but not one of SEVERITY_VALUES.
 */
function parseSeverity(raw, i) {
    if (raw == null) {
        return undefined;
    }
    const recognised = typeof raw === 'string' && SEVERITY_VALUES.includes(raw);
    if (recognised) {
        return raw;
    }
    throw new RewardAgentOutputInvalid(`gaps[${i}].severity must be one of 'high', 'medium', 'low' when present`);
}
561
/**
 * Normalise the optional "textualGradient" field.
 *
 * @param {unknown} raw - the raw value.
 * @returns {string|null} the trimmed text, or `null` when the value is
 *   `null`/`undefined` or contains only whitespace.
 * @throws {RewardAgentOutputInvalid} when the value is neither a string nor nullish.
 */
function parseTextualGradient(raw) {
    if (raw == null) {
        return null;
    }
    if (typeof raw === 'string') {
        const text = raw.trim();
        // A blank gradient is treated the same as no gradient at all.
        return text.length > 0 ? text : null;
    }
    throw new RewardAgentOutputInvalid('"textualGradient" must be a string or null');
}
570
+ // ── anchor mapping (defensive against both objective shapes) ─────────────────
571
/** Return `v` when it is a finite number; otherwise `null` (NaN, ±Infinity, non-numbers). */
function numberOrNull(v) {
    return typeof v === 'number' && Number.isFinite(v) ? v : null;
}
/**
 * Map an arm's objective to its (loss, passRate, healthPenalty, verbosity) anchors.
 *
 * Two objective shapes are tolerated for each field; the flat field always wins
 * and nested locations are consulted only when the flat one is not a finite
 * number:
 *  - loss:          `objective.loss` (number)  → `objective.loss.loss`
 *  - passRate:      `objective.passRate`       → `objective.testMetrics.passRate`
 *  - healthPenalty: `objective.healthPenalty`  → `objective.healthSignal`
 *                                              → `objective.loss.healthPenalty`
 *  - verbosity:     `objective.verbosity`      → `objective.health.verbosity`
 *                                              → `objective.healthDetail.verbosity`
 *
 * @param {unknown} objective - an arm's parsed objective; non-objects map to all-null.
 * @returns {{loss: number|null, passRate: number|null, healthPenalty: number|null, verbosity: number|null}}
 */
export function mapArmAnchors(objective) {
    if (!objective || typeof objective !== 'object') {
        return { loss: null, passRate: null, healthPenalty: null, verbosity: null };
    }
    // Treat a property as a nested block only when it is a non-null object.
    const asBlock = (value) => (value && typeof value === 'object' ? value : null);
    const lossBlock = asBlock(objective.loss);
    const testMetrics = asBlock(objective.testMetrics);
    const health = asBlock(objective.health);
    const healthDetail = asBlock(objective.healthDetail);
    // loss: flat number OR nested `.loss.loss`.
    const loss = lossBlock !== null
        ? numberOrNull(lossBlock.loss)
        : numberOrNull(objective.loss);
    // passRate: flat OR `testMetrics.passRate`.
    const passRate = numberOrNull(objective.passRate)
        ?? (testMetrics !== null ? numberOrNull(testMetrics.passRate) : null);
    // healthPenalty: flat OR `healthSignal` OR nested `.loss.healthPenalty`.
    const healthPenalty = numberOrNull(objective.healthPenalty)
        ?? numberOrNull(objective.healthSignal)
        ?? (lossBlock !== null ? numberOrNull(lossBlock.healthPenalty) : null);
    // verbosity: the separable verbosity sub-signal, when the capture provides it
    // (flat `verbosity`, or nested under a `health` / `healthDetail` block). Null
    // when absent. The `healthDetail` fallback was previously documented but
    // never read.
    const verbosity = numberOrNull(objective.verbosity)
        ?? (health !== null ? numberOrNull(health.verbosity) : null)
        ?? (healthDetail !== null ? numberOrNull(healthDetail.verbosity) : null);
    return { loss, passRate, healthPenalty, verbosity };
}
611
/**
 * Build the {@link DiagnosisAnchors} block from both arms' objectives.
 *
 * The main objective is always mapped. A `null` baseline objective produces
 * all-null baseline anchors without calling the mapper. The two verbosity keys
 * are included only when the corresponding arm's verbosity is non-null.
 *
 * @param {unknown} mainObjective - the main arm's objective.
 * @param {unknown} baselineObjective - the baseline arm's objective, or `null`.
 * @returns {object} the anchors record.
 */
export function buildAnchors(mainObjective, baselineObjective) {
    const mainArm = mapArmAnchors(mainObjective);
    let baselineArm;
    if (baselineObjective === null) {
        baselineArm = { loss: null, passRate: null, healthPenalty: null, verbosity: null };
    }
    else {
        baselineArm = mapArmAnchors(baselineObjective);
    }
    const anchors = {
        mainLoss: mainArm.loss,
        baselineLoss: baselineArm.loss,
        mainPassRate: mainArm.passRate,
        baselinePassRate: baselineArm.passRate,
        mainHealthPenalty: mainArm.healthPenalty,
        baselineHealthPenalty: baselineArm.healthPenalty,
    };
    if (mainArm.verbosity !== null) {
        anchors.mainVerbosity = mainArm.verbosity;
    }
    if (baselineArm.verbosity !== null) {
        anchors.baselineVerbosity = baselineArm.verbosity;
    }
    return anchors;
}
628
+ // ── episode / arm reading ────────────────────────────────────────────────
629
/** File name of an arm's skeleton capture inside its arm directory. */
const SKELETON_JSON_FILE = 'skeleton.json';
/** File name of an arm's objective capture inside its arm directory. */
const OBJECTIVE_JSON_FILE = 'objective.json';
/** Transcript file names, tried in this order (main arm jsonl, baseline stdout). */
const TRANSCRIPT_FILE_CANDIDATES = [
    'transcript.jsonl',
    'stdout.txt',
];
/** The 5 artifacts + test-report.md, in the order they appear in the prompt. */
const CHANGE_DIR_ARTIFACTS = ['proposal.md', 'usecases.md', 'design.md', 'tasks.md', 'spec-tests.md', 'test-report.md'];
642
/**
 * Read a UTF-8 text file, treating "file not found" as an absent value.
 *
 * @param {string} filePath - path of the file to read.
 * @returns {Promise<string|null>} the file's text, or `null` on ENOENT.
 * @throws any read error whose `code` is not 'ENOENT' is re-thrown unchanged.
 */
async function readFileIfExists(filePath) {
    let text;
    try {
        text = await fs.readFile(filePath, 'utf8');
    }
    catch (err) {
        if (err.code !== 'ENOENT') {
            throw err;
        }
        return null;
    }
    return text;
}
652
/**
 * Read and parse a JSON file, treating a missing file as an absent value.
 *
 * @param {string} filePath - path of the JSON file.
 * @returns {Promise<unknown|null>} the parsed value, or `null` when the file
 *   does not exist.
 * @throws {RewardAgentOutputInvalid} when the file exists but is not valid JSON;
 *   the message names only the file's basename.
 */
async function readJsonIfExists(filePath) {
    const text = await readFileIfExists(filePath);
    if (text === null) {
        return null;
    }
    let parsedValue;
    try {
        parsedValue = JSON.parse(text);
    }
    catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        throw new RewardAgentOutputInvalid(`unreadable ${path.basename(filePath)}: ${reason}`);
    }
    return parsedValue;
}
663
/**
 * Read one arm's skeleton + transcript + objective from `<episodeDir>/<arm>/`.
 *
 * - skeleton: parsed `skeleton.json` when it parses to an object, else `null`.
 * - transcript: text of the first existing file in TRANSCRIPT_FILE_CANDIDATES,
 *   else `null`.
 * - objective: parsed `objective.json`; it is required and must be a JSON
 *   object. Arrays are rejected too: `typeof [] === 'object'`, so the plain
 *   typeof check alone would let an array through despite the error message.
 *
 * @param {string} repoRoot - repository root used to locate the episode dir.
 * @param {string} episodeId - episode identifier.
 * @param {string} arm - arm sub-directory name (e.g. 'main-arm').
 * @returns {Promise<{skeleton: object|null, transcript: string|null, objective: object}>}
 * @throws {RewardAgentOutputInvalid} when the objective is missing or not an object.
 */
async function readArmCapture(repoRoot, episodeId, arm) {
    const armDir = path.join(episodeDir(repoRoot, episodeId), arm);
    const skeletonRaw = await readJsonIfExists(path.join(armDir, SKELETON_JSON_FILE));
    const skeleton = skeletonRaw && typeof skeletonRaw === 'object' ? skeletonRaw : null;
    let transcript = null;
    for (const fileName of TRANSCRIPT_FILE_CANDIDATES) {
        transcript = await readFileIfExists(path.join(armDir, fileName));
        if (transcript !== null) {
            break;
        }
    }
    const objectiveRaw = await readJsonIfExists(path.join(armDir, OBJECTIVE_JSON_FILE));
    if (objectiveRaw === null) {
        throw new RewardAgentOutputInvalid(`missing ${OBJECTIVE_JSON_FILE} for ${arm} of episode ${episodeId} — cannot score`);
    }
    if (typeof objectiveRaw !== 'object' || Array.isArray(objectiveRaw)) {
        throw new RewardAgentOutputInvalid(`${arm} ${OBJECTIVE_JSON_FILE} for episode ${episodeId} must be a JSON object`);
    }
    return { skeleton, transcript, objective: objectiveRaw };
}
683
/**
 * Read the 5 artifacts + test-report.md (those that exist) from the change dir.
 *
 * Files are read one at a time in CHANGE_DIR_ARTIFACTS order; missing files are
 * omitted from the result.
 *
 * @param {string} changeDirPath - directory containing the change artifacts.
 * @returns {Promise<Array<{file: string, content: string}>>}
 */
async function readChangeArtifacts(changeDirPath) {
    const found = [];
    for (let idx = 0; idx < CHANGE_DIR_ARTIFACTS.length; idx += 1) {
        const file = CHANGE_DIR_ARTIFACTS[idx];
        const content = await readFileIfExists(path.join(changeDirPath, file));
        if (content === null) {
            continue;
        }
        found.push({ file, content });
    }
    return found;
}
693
/**
 * Invoke the judge through the headless runner, with one retry.
 *
 * An attempt succeeds when it exits with code 0 and produced non-empty stdout.
 * After two unsuccessful attempts the error carries the second attempt's
 * stderr, falling back to the first attempt's when the second is empty.
 * (Original note: the judge runs as a sibling of the evolving agent, never
 * parent-child, each spawn in a fresh context; the retry policy mirrors the
 * proposer agent's invoke contract.)
 *
 * @param {{prompt: string, repoRoot: string, spawn?: unknown, binary?: unknown}} opts
 * @returns {Promise<string>} stdout of the first successful attempt.
 * @throws {RewardAgentInvocationError} when both attempts fail.
 */
async function invokeRewardAgent(opts) {
    const MAX_ATTEMPTS = 2;
    const failures = [];
    while (failures.length < MAX_ATTEMPTS) {
        const result = await runHeadlessAgent(opts.prompt, {
            cwd: opts.repoRoot,
            spawn: opts.spawn,
            binaryOverride: opts.binary,
        });
        const succeeded = result.exitCode === 0 && result.stdout.length > 0;
        if (succeeded) {
            return result.stdout;
        }
        failures.push(result);
    }
    const [first, second] = failures;
    throw new RewardAgentInvocationError(second.stderr || first.stderr);
}
711
/**
 * Score ONE judged duel.
 *
 * Invokes the judge, parses its response, validates it against the episode
 * (baseline-skipped state and anchors), and computes the ④ integrity signals.
 * When parsing or validation raises {@link RewardAgentOutputInvalid}, the judge
 * is re-prompted with the concrete error message appended, up to
 * `maxRepairAttempts` times (default 2, clamped at 0). Any other error, or an
 * invalid output once the repair budget is spent, propagates.
 *
 * Nothing is written here: this function only returns the parsed duel and its
 * integrity signals.
 *
 * @param {object} opts - `promptInput`, `baselineSkipped`, `repoRoot`, and the
 *   optional `spawn`, `binary`, `maxRepairAttempts`.
 * @returns {Promise<{parsed: object, integrity: object}>}
 */
export async function scoreOnce(opts) {
    const repairBudget = Math.max(0, opts.maxRepairAttempts ?? 2);
    const { anchors } = opts.promptInput;
    const basePrompt = assembleRewardAgentPrompt(opts.promptInput);
    // Base prompt on the first round; afterwards the last validation error is appended.
    const promptFor = (feedback) => {
        if (feedback === null) {
            return basePrompt;
        }
        return `${basePrompt}\n\n# PREVIOUS ATTEMPT FAILED VALIDATION\n${feedback}\n` +
            'Re-emit EXACTLY ONE ```json:diagnosis fenced block matching the contract above. ' +
            'When the baseline arm was skipped, "rewardBaseline" and "advantage" must both be null.';
    };
    let feedback = null;
    let attempt = 0;
    while (true) {
        const stdout = await invokeRewardAgent({
            prompt: promptFor(feedback),
            repoRoot: opts.repoRoot,
            spawn: opts.spawn,
            binary: opts.binary,
        });
        try {
            const parsed = parseRewardAgentResponse(stdout);
            validateAgainstEpisode(parsed, opts.baselineSkipped, anchors);
            const integrity = computeIntegrity(parsed, anchors, opts.promptInput.integrityHint ?? null);
            return { parsed, integrity };
        }
        catch (err) {
            const repairable = err instanceof RewardAgentOutputInvalid && attempt < repairBudget;
            if (!repairable) {
                throw err;
            }
            feedback = err.message;
        }
        attempt += 1;
    }
}
751
/**
 * Load everything the judge needs to score one episode WITHOUT spawning.
 *
 * Reads the episode record, the main arm capture, the baseline arm capture
 * (omitted when the episode's current stage or any stage-history entry is
 * 'baseline-skipped'), derives the anchors, and reads the change artifacts.
 *
 * @param {string} repoRoot - repository root.
 * @param {string} episodeId - episode identifier.
 * @returns {Promise<{episode: object, baselineSkipped: boolean, promptInput: object}>}
 */
export async function loadRewardScoringContext(repoRoot, episodeId) {
    const episode = await readEpisode(repoRoot, episodeId);
    // The baseline arm is SKIPPED exactly when the episode recorded it as such.
    const isSkipStage = (stage) => stage === 'baseline-skipped';
    const baselineSkipped = isSkipStage(episode.stage) ||
        episode.stageHistory.some((entry) => isSkipStage(entry.stage));
    const mainArm = await readArmCapture(repoRoot, episodeId, 'main-arm');
    let baselineArm = null;
    if (!baselineSkipped) {
        baselineArm = await readArmCapture(repoRoot, episodeId, 'baseline-arm');
    }
    const baselineObjective = baselineArm?.objective ?? null;
    const anchors = buildAnchors(mainArm.objective, baselineObjective);
    const artifacts = await readChangeArtifacts(episode.changeDirPath);
    const policyVersions = {
        main: episode.policyVersionMain,
        baseline: episode.policyVersionBaseline,
    };
    const promptInput = {
        changeName: episode.changeName,
        targetId: episode.targetId,
        policyVersions,
        mainArm,
        baselineArm,
        artifacts,
        anchors,
    };
    return { episode, baselineSkipped, promptInput };
}
783
/**
 * Run the reward agent end-to-end for one episode (single sample):
 *   1. load the scoring context (episode, arms, anchors, artifacts),
 *   2. {@link scoreOnce}: one judged duel with the bounded repair loop,
 *   3. derive the ⑤ single-sample verdict,
 *   4. assemble the schema-2 diagnosis, write it via {@link writeDiagnosis},
 *      then advance the episode stage to `scored` with the advantage patched in.
 *
 * `abstainReason`, `verdict` and `confidence` are included in the diagnosis only
 * when defined; `integrity` is always the last key.
 *
 * Errors from scoring are not caught here and propagate to the caller.
 *
 * @param {object} opts - `repoRoot`, `episodeId`, and the optional `spawn`,
 *   `binary`, `maxRepairAttempts` forwarded to {@link scoreOnce}.
 * @returns {Promise<{diagnosis: object, diagnosisPath: unknown, episode: object}>}
 *   the diagnosis, whatever `writeDiagnosis` returned, and the advanced episode.
 */
export async function runRewardAgent(opts) {
    const { repoRoot, episodeId } = opts;
    const context = await loadRewardScoringContext(repoRoot, episodeId);
    const { episode, baselineSkipped, promptInput } = context;
    const scored = await scoreOnce({
        promptInput,
        baselineSkipped,
        repoRoot,
        spawn: opts.spawn,
        binary: opts.binary,
        maxRepairAttempts: opts.maxRepairAttempts,
    });
    const { parsed, integrity } = scored;
    const verdict = deriveSingleSampleVerdict(parsed);
    // Keys are added in the exact order of the serialised diagnosis.
    const diagnosis = {
        schemaVersion: 2,
        episodeId: episode.episodeId,
        changeName: episode.changeName,
        targetId: episode.targetId,
        policyVersions: {
            main: episode.policyVersionMain,
            baseline: episode.policyVersionBaseline,
        },
        rewardMain: parsed.rewardMain,
        rewardBaseline: parsed.rewardBaseline,
        advantage: parsed.advantage,
        anchors: promptInput.anchors,
        errors: parsed.errors,
        gaps: parsed.gaps,
        textualGradient: parsed.textualGradient,
        abstained: parsed.abstained,
    };
    if (parsed.abstainReason !== undefined) {
        diagnosis.abstainReason = parsed.abstainReason;
    }
    if (verdict !== undefined) {
        diagnosis.verdict = verdict;
    }
    if (parsed.confidence !== undefined) {
        diagnosis.confidence = parsed.confidence;
    }
    diagnosis.integrity = integrity;
    const diagnosisPath = await writeDiagnosis({ repoRoot, episodeId, diagnosis });
    const advanced = await advanceEpisodeStage({
        repoRoot,
        episodeId,
        stage: 'scored',
        patch: { advantage: diagnosis.advantage },
    });
    return { diagnosis, diagnosisPath, episode: advanced };
}
844
/**
 * Derive the ⑤ single-sample verdict from a parsed duel.
 *
 * Precedence:
 *   1. a `verdict` already present on `parsed` is returned as-is;
 *   2. an abstained duel is 'no-gap';
 *   3. a `null` advantage yields `undefined` (no verdict);
 *   4. otherwise the sign of the advantage decides, with a ±1e-9 dead band
 *      reported as 'tie'.
 *
 * @param {{verdict?: string, abstained?: boolean, advantage: number|null}} parsed
 * @returns {string|undefined}
 */
export function deriveSingleSampleVerdict(parsed) {
    const { verdict, abstained, advantage } = parsed;
    if (verdict !== undefined) {
        return verdict;
    }
    if (abstained) {
        return 'no-gap';
    }
    if (advantage === null) {
        return undefined;
    }
    const deadBand = 1e-9;
    if (advantage > deadBand) {
        return 'main-better';
    }
    return advantage < -deadBand ? 'baseline-better' : 'tie';
}
864
/**
 * Beyond this much sign-flipped disagreement between the judge's advantage and
 * the loss-implied advantage, ④ raises a divergence flag (the judge loved an arm
 * the verifier dislikes). The number itself is always recorded when computable.
 */
const DIVERGENCE_FLAG_TOLERANCE = 0.2;
/**
 * ④ Compute integrity signals from a parsed duel, its anchors, and the tamper hint.
 *
 * - `judgeVerifierDivergence` is `parsed.advantage − (baselineLoss − mainLoss)`
 *   when the advantage and both losses are non-null, else `null`.
 * - A divergence flag is pushed only when both advantages are non-zero, their
 *   signs differ, and the absolute divergence exceeds DIVERGENCE_FLAG_TOLERANCE.
 * - When the hint reports tampering, each of its flags is copied with a
 *   `tamper: ` prefix. A hint with no `flags` contributes none (previously
 *   iterating the missing list threw a TypeError).
 *
 * @param {{advantage: number|null}} parsed - the parsed duel.
 * @param {{mainLoss: number|null, baselineLoss: number|null}} anchors - loss anchors.
 * @param {{suspected?: boolean, flags?: Iterable<string>}|null|undefined} integrityHint
 * @returns {{testTamperSuspected: boolean, judgeVerifierDivergence: number|null, flags: string[]}}
 */
export function computeIntegrity(parsed, anchors, integrityHint) {
    const flags = [];
    let judgeVerifierDivergence = null;
    if (parsed.advantage !== null && anchors.mainLoss !== null && anchors.baselineLoss !== null) {
        // lower loss is better, so the verifier prefers the main arm by (baseline − main).
        const lossImpliedAdvantage = anchors.baselineLoss - anchors.mainLoss;
        judgeVerifierDivergence = parsed.advantage - lossImpliedAdvantage;
        const judgeSign = Math.sign(parsed.advantage);
        const verifierSign = Math.sign(lossImpliedAdvantage);
        if (judgeSign !== 0 &&
            verifierSign !== 0 &&
            judgeSign !== verifierSign &&
            Math.abs(judgeVerifierDivergence) > DIVERGENCE_FLAG_TOLERANCE) {
            flags.push(`judge⇄verifier divergence: judge advantage ${parsed.advantage.toFixed(3)} disagrees in ` +
                `sign with loss-implied ${lossImpliedAdvantage.toFixed(3)}`);
        }
    }
    const testTamperSuspected = integrityHint?.suspected ?? false;
    if (testTamperSuspected && integrityHint) {
        // Tolerate a hint that flags suspicion without listing individual flags.
        for (const f of integrityHint.flags ?? []) {
            flags.push(`tamper: ${f}`);
        }
    }
    return { testTamperSuspected, judgeVerifierDivergence, flags };
}
895
/**
 * Cross-check the parsed diagnosis against the episode's baseline state and
 * RECOMPUTE the advantage (advantage = reward(main arm) − reward(baseline arm)).
 *
 * - Baseline skipped: `rewardBaseline` and `advantage` must both be null;
 *   nothing else is checked.
 * - Baseline ran: both must be non-null, and the claimed advantage must match
 *   `rewardMain − rewardBaseline` within 1e-9.
 * - ① gate-not-blend: when both pass rates are known and the main arm's is
 *   strictly lower, an advantage above 1e-9 is rejected.
 *
 * @param {{rewardMain: number, rewardBaseline: number|null, advantage: number|null}} parsed
 * @param {boolean} baselineSkipped - whether the episode skipped the baseline arm.
 * @param {{mainPassRate: number|null, baselinePassRate: number|null}} anchors
 * @returns {void}
 * @throws {RewardAgentOutputInvalid} on any violation.
 */
function validateAgainstEpisode(parsed, baselineSkipped, anchors) {
    const EPSILON = 1e-9;
    if (baselineSkipped) {
        // Checked in this order so the first reported problem is rewardBaseline.
        for (const field of ['rewardBaseline', 'advantage']) {
            if (parsed[field] !== null) {
                throw new RewardAgentOutputInvalid(`baseline arm was skipped — "${field}" must be null (no comparison possible)`);
            }
        }
        return;
    }
    // Baseline ran: a numeric rewardBaseline is required to compute the advantage.
    const { rewardMain, rewardBaseline, advantage } = parsed;
    if (rewardBaseline === null) {
        throw new RewardAgentOutputInvalid('baseline arm RAN — "rewardBaseline" must be a number in [0,1], not null');
    }
    if (advantage === null) {
        throw new RewardAgentOutputInvalid('baseline arm RAN — "advantage" must be a number (rewardMain − rewardBaseline), not null');
    }
    const expected = rewardMain - rewardBaseline;
    const drift = Math.abs(expected - advantage);
    if (drift > EPSILON) {
        throw new RewardAgentOutputInvalid(`advantage mismatch: claimed ${advantage}, but rewardMain − rewardBaseline = ` +
            `${rewardMain} − ${rewardBaseline} = ${expected} ` +
            '(advantage = reward(主臂) − reward(基线臂))');
    }
    // ① gate-not-blend: a strictly lower MAIN pass rate cannot net a positive advantage.
    const { mainPassRate, baselinePassRate } = anchors;
    const mainRegressed = mainPassRate !== null &&
        baselinePassRate !== null &&
        mainPassRate < baselinePassRate;
    if (mainRegressed && advantage > EPSILON) {
        throw new RewardAgentOutputInvalid(`gate-not-blend: main pass rate ${mainPassRate} < baseline ${baselinePassRate}, so ` +
            `advantage must be <= 0 (a correctness regression cannot be outscored by cleaner code), ` +
            `got ${advantage}`);
    }
}
940
+ //# sourceMappingURL=reward-agent.js.map