@metaharness/weight-eft 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/export.js ADDED
@@ -0,0 +1,249 @@
1
+ // SPDX-License-Identifier: MIT
2
+ //
3
+ // export.ts — Archive → training-data exporter.
4
+ //
5
+ // Reads Darwin's trajectory archive (a DarwinTrajectory[]; reconstructable from
6
+ // Firestore darwin_runs + local prediction/trajectory artifacts) and emits two
7
+ // standard training sets:
8
+ //
9
+ // SFT = ALL gold-resolved trajectories (cheap-OWN *and* frontier-escalation).
10
+ // Frontier successes are included for OFF-POLICY-SAFE DISTILLATION —
11
+ // the cheap model learns to imitate a frontier success on issues it
12
+ // could not solve itself. SFT (max-likelihood) is off-policy-stable.
13
+ //
14
+ // DPO = ON-POLICY cheap-vs-cheap pairs ONLY. chosen = a resolved sample,
15
+ // rejected = an empty/failed sample by the SAME cheap model on the
16
+ // SAME instance (BoN-derived). We do NOT emit frontier-chosen-vs-cheap-
17
+ // rejected as DPO: an off-policy preference pair is unstable (the
18
+ // reference policy never produced the chosen completion). That signal
19
+ // goes to SFT instead.
20
+ //
21
+ // THE CONTAMINATION GUARD (ADR-198, the headline correctness property): strict
22
+ // train/eval instance-ID disjointness. Any trajectory whose instance_id is in
23
+ // the caller's evalHoldout is excluded; an overlap throws. Training on eval
24
+ // instances is fake lift.
25
+ //
26
+ // Two further guards: a LONG-CONTEXT filter (drop/truncate over-budget
27
+ // trajectories, never silently) and TOOL-CALL FIDELITY (tool_calls survive into
28
+ // SFT messages, never stringified).
29
+ import { detectRewardHack } from './reward-hack.js';
30
+ const DEFAULT_MAX_TOKENS = 28000;
31
/**
 * Rough token estimate for a message list, using a ~4-characters-per-token
 * heuristic instead of a tokenizer dependency. This is a budget gate, not a
 * billing figure, so over-estimating is the safe direction.
 *
 * Every message contributes its `content`, its `name`, each tool call's
 * function name / arguments / id, plus a flat 8 characters of structural
 * overhead.
 *
 * String fields are costed at their length. Non-string fields (for example a
 * content-part array, or tool arguments that were already parsed into an
 * object) are costed at their JSON-serialized length, so they can neither be
 * under-counted nor poison the total with `NaN`. Absent fields cost nothing.
 *
 * @param messages chat messages to size
 * @returns estimated token count (a non-negative integer)
 */
export function estimateTokens(messages) {
    // Character cost of one field; tolerant of null/undefined and of
    // structured (non-string) values.
    const fieldChars = (value) => {
        if (value == null)
            return 0;
        if (typeof value === 'string')
            return value.length;
        // JSON.stringify yields undefined for functions/symbols — cost those as 0.
        return (JSON.stringify(value) ?? '').length;
    };
    let chars = 0;
    for (const m of messages) {
        chars += fieldChars(m.content);
        chars += fieldChars(m.name);
        if (m.tool_calls) {
            for (const tc of m.tool_calls) {
                // Tool-call arguments cost context too, so they count.
                chars += fieldChars(tc.function.name) + fieldChars(tc.function.arguments) + fieldChars(tc.id);
            }
        }
        chars += 8; // per-message role/structural overhead
    }
    return Math.ceil(chars / 4);
}
52
/**
 * Truncate an over-length trajectory by dropping the OLDEST middle turns while
 * preserving the load-bearing ones: the leading messages up to and including
 * the first user turn (the issue) and the trailing message (the final patch).
 *
 * Always returns a new array; the message objects themselves are shared, so
 * structured `tool_calls` pass through untouched.
 *
 * Notes on behavior:
 *  - Lists of three messages or fewer are returned (as a copy) unchanged.
 *  - If there is no user turn, only the first message is kept as the head.
 *  - The head can never absorb the final message, so the tail is never
 *    duplicated.
 *  - The most recent middle turn is re-added even if that alone exceeds the
 *    budget, so the result can still be over `maxTokens`; callers that report
 *    sizes should re-estimate the returned list.
 *  - If turns were dropped and the kept window starts with `tool` results
 *    whose assistant tool-call turn was cut, those orphaned results are
 *    dropped as well.
 *
 * @param messages full trajectory messages
 * @param maxTokens token budget, measured with estimateTokens
 * @returns a truncated copy of `messages`
 */
function truncateTrajectory(messages, maxTokens) {
    if (messages.length <= 3)
        return [...messages];
    const lastIndex = messages.length - 1;
    // Head = everything up to and including the first user turn (the issue).
    // Clamp to lastIndex so the final message belongs to the tail only.
    const issueIndex = messages.findIndex((m) => m.role === 'user');
    const headEnd = Math.min(issueIndex < 0 ? 1 : issueIndex + 1, lastIndex);
    const head = messages.slice(0, headEnd);
    const tail = [messages[lastIndex]];
    const middle = messages.slice(headEnd, lastIndex);
    // Greedily re-add middle turns from the END (most recent context) until
    // the budget is hit. The first re-added turn is kept unconditionally.
    const kept = [];
    for (let i = middle.length - 1; i >= 0; i--) {
        const candidate = [...head, middle[i], ...kept, ...tail];
        if (kept.length > 0 && estimateTokens(candidate) > maxTokens)
            break;
        kept.unshift(middle[i]);
    }
    // Only when something was actually cut: a leading tool result has lost the
    // assistant turn that issued the call, so it is not a valid continuation.
    if (kept.length < middle.length) {
        while (kept.length > 0 && kept[0].role === 'tool')
            kept.shift();
    }
    return [...head, ...kept, ...tail];
}
78
/**
 * Cut a trajectory at its DPO (prompt, completion) boundary.
 *
 * The boundary is the first assistant turn: everything before it is the
 * prompt, everything from it onward is the completion. When no assistant turn
 * exists the whole list is the prompt and the completion is empty.
 *
 * @param messages full trajectory messages
 * @returns `{ prompt, completion }`, both fresh arrays
 */
function splitPromptCompletion(messages) {
    const assistantAt = messages.findIndex(({ role }) => role === 'assistant');
    // findIndex reports "not found" as -1; treat that as "boundary at the end".
    const boundary = assistantAt === -1 ? messages.length : assistantAt;
    const prompt = messages.slice(0, boundary);
    const completion = messages.slice(boundary);
    return { prompt, completion };
}
93
/**
 * THE CONTAMINATION GUARD. Fails loudly when any trajectory's `instance_id`
 * is also a member of the eval holdout, instead of silently filtering.
 *
 * @param trajectories trajectories destined for training
 * @param evalHoldout iterable of held-out instance ids
 * @throws Error naming the overlap count and up to ten of the offending ids
 */
export function assertTrainEvalDisjoint(trajectories, evalHoldout) {
    const heldOut = new Set(evalHoldout);
    // Collect distinct offending ids, in first-seen order.
    const leaked = new Set();
    for (const { instance_id: id } of trajectories) {
        if (heldOut.has(id))
            leaked.add(id);
    }
    if (leaked.size === 0)
        return;
    const preview = Array.from(leaked).slice(0, 10).join(', ');
    const more = leaked.size > 10 ? ', …' : '';
    const message = [
        `weight-eft contamination guard: ${leaked.size} training instance_id(s) overlap the eval holdout `,
        `(${preview}${more}). Training on eval instances is fake lift — refusing to export. `,
        `Exclude these instance_ids from the training archive or remove them from evalHoldout.`,
    ].join('');
    throw new Error(message);
}
112
/**
 * Build the SFT and DPO training sets from a Darwin trajectory archive.
 *
 * Stages, in order:
 *  1. Contamination guard — trajectories whose `instance_id` is in
 *     `options.evalHoldout` are excluded and counted; disjointness is then
 *     asserted on what remains.
 *  2. Long-context filter — a trajectory estimated above the token budget is
 *     either truncated (`options.truncateOverLength`) or dropped; every such
 *     decision is recorded in `report.notes`.
 *  2b. Reward-hacking filter (on unless `options.dropRewardHacked` is false) —
 *     the monitor inspects the ORIGINAL, untruncated trajectory, so evidence
 *     that truncation removed from the middle cannot slip past it.
 *  3. SFT — every remaining resolved trajectory with at least one message.
 *  4. DPO — cheap-tier only; within each (model, instance) group the
 *     lowest-`sample` resolved attempt is `chosen` and the lowest-`sample`
 *     unresolved attempt is `rejected`.
 *
 * The input array and its trajectory objects are not mutated.
 *
 * @param trajectories the input archive; may still contain holdout members,
 *   which this function excludes itself
 * @param options `evalHoldout` (iterable of instance ids), optional
 *   `maxTokens` (defaults to DEFAULT_MAX_TOKENS), optional
 *   `truncateOverLength`, optional `dropRewardHacked` (defaults to true)
 * @returns `{ sft, dpo, report }`
 * @throws Error if the post-exclusion set still overlaps the holdout
 */
export function exportTrainingData(trajectories, options) {
    const maxTokens = options.maxTokens ?? DEFAULT_MAX_TOKENS;
    const holdout = new Set(options.evalHoldout);
    const notes = [];
    const report = {
        totalTrajectories: trajectories.length,
        excludedByHoldout: 0,
        droppedOverLength: 0,
        truncatedOverLength: 0,
        droppedRewardHacked: 0,
        sftRows: 0,
        dpoRows: 0,
        sftInstanceIds: [],
        dpoInstanceIds: [],
        notes,
    };
    // 1) CONTAMINATION GUARD — exclude holdout members, then assert disjointness
    //    on what's left so an exclusion bug can never leak eval data downstream.
    const inDomain = [];
    for (const t of trajectories) {
        if (holdout.has(t.instance_id)) {
            report.excludedByHoldout++;
            continue;
        }
        inDomain.push(t);
    }
    assertTrainEvalDisjoint(inDomain, holdout);
    // 2) LONG-CONTEXT FILTER — drop or truncate over-budget trajectories, never
    //    silently. Each entry pairs the (possibly truncated) working copy with
    //    the untouched source trajectory, which stage 2b needs.
    const sized = [];
    for (const t of inDomain) {
        const tokens = estimateTokens(t.messages);
        if (tokens <= maxTokens) {
            sized.push({ trajectory: t, source: t });
            continue;
        }
        if (options.truncateOverLength) {
            const truncated = truncateTrajectory(t.messages, maxTokens);
            report.truncatedOverLength++;
            notes.push(`truncated ${t.instance_id} (${t.model}) ${tokens}→~${estimateTokens(truncated)} tok (budget ${maxTokens})`);
            sized.push({ trajectory: { ...t, messages: truncated }, source: t });
        }
        else {
            report.droppedOverLength++;
            notes.push(`dropped ${t.instance_id} (${t.model}) ${tokens} tok > budget ${maxTokens}`);
        }
    }
    // 2b) REWARD-HACKING FILTER. An archived "success" that reward-hacked would
    //     teach the model to reward-hack, so flagged trajectories are dropped
    //     before either set is built. The monitor is fed `source`, not the
    //     truncated copy: truncation removes the oldest middle turns, which is
    //     exactly where an offending tool call could otherwise hide.
    const dropHack = options.dropRewardHacked ?? true;
    const clean = [];
    for (const { trajectory, source } of sized) {
        if (!dropHack) {
            clean.push(trajectory);
            continue;
        }
        const findings = detectRewardHack(source);
        if (findings.length === 0) {
            clean.push(trajectory);
            continue;
        }
        report.droppedRewardHacked++;
        const kinds = [...new Set(findings.map((f) => f.kind))].join(',');
        const detail = findings[0].detail;
        notes.push(`reward-hack drop ${source.instance_id} (${source.model}) [${kinds}] — e.g. "${detail}" via ${findings[0].tool}`);
    }
    // 3) SFT — ALL resolved trajectories (cheap-own AND frontier-escalation).
    const sft = [];
    const sftIds = new Set();
    for (const t of clean) {
        if (!t.resolved)
            continue;
        if (t.messages.length === 0)
            continue; // a resolved attempt must have a trajectory
        // Tool-call fidelity: messages are passed through as-is — tool_calls
        // stay structured objects on the assistant turns, never stringified.
        sft.push({ messages: t.messages });
        sftIds.add(t.instance_id);
    }
    // 4) DPO — ON-POLICY cheap-vs-cheap pairs ONLY. Group cheap-tier
    //    trajectories by (model, instance); pair a resolved (chosen) with an
    //    unresolved (rejected) sample from the SAME model on the SAME instance.
    const dpo = [];
    const dpoIds = new Set();
    const cheapGroups = new Map();
    for (const t of clean) {
        if (t.tier !== 'cheap')
            continue; // ON-POLICY only — frontier never enters DPO
        // Serialize the pair so the key is unambiguous: plain concatenation
        // would let ("a","bc") and ("ab","c") land in the same group.
        const key = JSON.stringify([t.model, t.instance_id]);
        const arr = cheapGroups.get(key);
        if (arr)
            arr.push(t);
        else
            cheapGroups.set(key, [t]);
    }
    for (const group of cheapGroups.values()) {
        const chosen = group.filter((t) => t.resolved && t.messages.length > 0);
        const rejected = group.filter((t) => !t.resolved);
        if (chosen.length === 0 || rejected.length === 0)
            continue;
        // Deterministic pairing: lowest-sample resolved vs lowest-sample failed.
        const bySample = (a, b) => (a.sample ?? 0) - (b.sample ?? 0);
        const c = [...chosen].sort(bySample)[0];
        const r = [...rejected].sort(bySample)[0];
        const { prompt, completion: chosenCompletion } = splitPromptCompletion(c.messages);
        // An empty rejected trajectory is split on the prompt itself, which
        // yields an empty completion.
        const { completion: rejectedCompletion } = splitPromptCompletion(r.messages.length > 0 ? r.messages : prompt);
        dpo.push({
            prompt,
            chosen: chosenCompletion,
            rejected: rejectedCompletion,
        });
        dpoIds.add(c.instance_id);
    }
    report.sftRows = sft.length;
    report.dpoRows = dpo.length;
    report.sftInstanceIds = [...sftIds].sort();
    report.dpoInstanceIds = [...dpoIds].sort();
    return { sft, dpo, report };
}
241
/**
 * Serialize SFT rows to JSONL: one JSON document per line, each line
 * newline-terminated. An empty row list serializes to the empty string.
 */
export function sftToJsonl(rows) {
    if (!rows.length)
        return '';
    const lines = rows.map((row) => JSON.stringify(row));
    return lines.join('\n') + '\n';
}
245
/**
 * Serialize DPO rows to JSONL: one JSON document per line, each line
 * newline-terminated. An empty row list serializes to the empty string.
 */
export function dpoToJsonl(rows) {
    if (!rows.length)
        return '';
    const lines = rows.map((row) => JSON.stringify(row));
    return lines.join('\n') + '\n';
}
249
+ //# sourceMappingURL=export.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"export.js","sourceRoot":"","sources":["../src/export.ts"],"names":[],"mappings":"AAAA,+BAA+B;AAC/B,EAAE;AACF,gDAAgD;AAChD,EAAE;AACF,gFAAgF;AAChF,+EAA+E;AAC/E,0BAA0B;AAC1B,EAAE;AACF,iFAAiF;AACjF,8EAA8E;AAC9E,6EAA6E;AAC7E,8EAA8E;AAC9E,EAAE;AACF,4EAA4E;AAC5E,4EAA4E;AAC5E,iFAAiF;AACjF,2EAA2E;AAC3E,+EAA+E;AAC/E,gCAAgC;AAChC,EAAE;AACF,+EAA+E;AAC/E,8EAA8E;AAC9E,4EAA4E;AAC5E,0BAA0B;AAC1B,EAAE;AACF,uEAAuE;AACvE,gFAAgF;AAChF,oCAAoC;AAWpC,OAAO,EAAE,gBAAgB,EAAE,MAAM,kBAAkB,CAAC;AAEpD,MAAM,kBAAkB,GAAG,KAAK,CAAC;AAEjC;;;;;;GAMG;AACH,MAAM,UAAU,cAAc,CAAC,QAAuB;IACpD,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,KAAK,MAAM,CAAC,IAAI,QAAQ,EAAE,CAAC;QACzB,KAAK,IAAI,CAAC,CAAC,CAAC,OAAO,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC;QAClC,KAAK,IAAI,CAAC,CAAC,CAAC,IAAI,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC;QAC/B,IAAI,CAAC,CAAC,UAAU,EAAE,CAAC;YACjB,KAAK,MAAM,EAAE,IAAI,CAAC,CAAC,UAAU,EAAE,CAAC;gBAC9B,KAAK,IAAI,EAAE,CAAC,QAAQ,CAAC,IAAI,CAAC,MAAM,GAAG,EAAE,CAAC,QAAQ,CAAC,SAAS,CAAC,MAAM,GAAG,EAAE,CAAC,EAAE,CAAC,MAAM,CAAC;YACjF,CAAC;QACH,CAAC;QACD,KAAK,IAAI,CAAC,CAAC,CAAC,uCAAuC;IACrD,CAAC;IACD,OAAO,IAAI,CAAC,IAAI,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC;AAC9B,CAAC;AAED;;;;GAIG;AACH,SAAS,kBAAkB,CAAC,QAAuB,EAAE,SAAiB;IACpE,IAAI,QAAQ,CAAC,MAAM,IAAI,CAAC;QAAE,OAAO,QAAQ,CAAC;IAC1C,wEAAwE;IACxE,IAAI,OAAO,GAAG,CAAC,CAAC;IAChB,OAAO,OAAO,GAAG,QAAQ,CAAC,MAAM,IAAI,QAAQ,CAAC,OAAO,CAAC,CAAC,IAAI,KAAK,MAAM;QAAE,OAAO,EAAE,CAAC;IACjF,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,GAAG,CAAC,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,8BAA8B;IAChF,MAAM,IAAI,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;IACxC,MAAM,IAAI,GAAG,CAAC,QAAQ,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC;IAC7C,MAAM,MAAM,GAAG,QAAQ,CAAC,KAAK,CAAC,OAAO,EAAE,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAC5D,gFAAgF;IAChF,MAAM,IAAI,GAAkB,EAAE,CAAC;IAC/B,KAAK,IAAI,CAAC,GAAG,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAC5C,MAAM,SAAS,GAAG,CAAC,GAAG,IAAI,EAAE,GAAG,IAAI,CAAC,KAAK,EAAE,EAAE,MAAM,CAAC,CAAC,CAAC,EAAE,GAAG,IAAI,CAAC,CAAC;QACjE,IAAI,cAAc,CAAC,SAAS,CAAC,GAAG,SAAS,IA
AI,IAAI,CAAC,MAAM,GAAG,CAAC;YAAE,MAAM;QACpE,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;IAC1B,CAAC;IACD,OAAO,CAAC,GAAG,IAAI,EAAE,GAAG,IAAI,EAAE,GAAG,IAAI,CAAC,CAAC;AACrC,CAAC;AAED;;;;;GAKG;AACH,SAAS,qBAAqB,CAAC,QAAuB;IACpD,IAAI,cAAc,GAAG,QAAQ,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,WAAW,CAAC,CAAC;IACvE,IAAI,cAAc,GAAG,CAAC;QAAE,cAAc,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,uCAAuC;IACjG,OAAO;QACL,MAAM,EAAE,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,cAAc,CAAC;QACzC,UAAU,EAAE,QAAQ,CAAC,KAAK,CAAC,cAAc,CAAC;KAC3C,CAAC;AACJ,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,uBAAuB,CACrC,YAAgC,EAChC,WAA6B;IAE7B,MAAM,OAAO,GAAG,IAAI,GAAG,CAAC,WAAW,CAAC,CAAC;IACrC,MAAM,OAAO,GAAG,IAAI,GAAG,EAAU,CAAC;IAClC,KAAK,MAAM,CAAC,IAAI,YAAY,EAAE,CAAC;QAC7B,IAAI,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,WAAW,CAAC;YAAE,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC;IAC7D,CAAC;IACD,IAAI,OAAO,CAAC,IAAI,GAAG,CAAC,EAAE,CAAC;QACrB,MAAM,MAAM,GAAG,CAAC,GAAG,OAAO,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACpD,MAAM,IAAI,KAAK,CACb,mCAAmC,OAAO,CAAC,IAAI,oDAAoD;YACjG,IAAI,MAAM,GAAG,OAAO,CAAC,IAAI,GAAG,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,mEAAmE;YAC9G,uFAAuF,CAC1F,CAAC;IACJ,CAAC;AACH,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,kBAAkB,CAChC,YAAgC,EAChC,OAAsB;IAEtB,MAAM,SAAS,GAAG,OAAO,CAAC,SAAS,IAAI,kBAAkB,CAAC;IAC1D,MAAM,OAAO,GAAG,IAAI,GAAG,CAAC,OAAO,CAAC,WAAW,CAAC,CAAC;IAC7C,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,MAAM,MAAM,GAAiB;QAC3B,iBAAiB,EAAE,YAAY,CAAC,MAAM;QACtC,iBAAiB,EAAE,CAAC;QACpB,iBAAiB,EAAE,CAAC;QACpB,mBAAmB,EAAE,CAAC;QACtB,mBAAmB,EAAE,CAAC;QACtB,OAAO,EAAE,CAAC;QACV,OAAO,EAAE,CAAC;QACV,cAAc,EAAE,EAAE;QAClB,cAAc,EAAE,EAAE;QAClB,KAAK;KACN,CAAC;IAEF,6EAA6E;IAC7E,6EAA6E;IAC7E,MAAM,QAAQ,GAAuB,EAAE,CAAC;IACxC,KAAK,MAAM,CAAC,IAAI,YAAY,EAAE,CAAC;QAC7B,IAAI,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,WAAW,CAAC,EAAE,CAAC;YAC/B,MAAM,CAAC,iBAAiB,EAAE,CAAC;YAC3B,SAAS;QACX,CAAC;QACD,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACnB,CAAC;IACD,uBAAuB,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IAE3C,4EAA4E;IAC5E,+EAA+E;IAC/E,MAAM,KAAK,GAAuB,EAAE,CAAC;IAC
rC,KAAK,MAAM,CAAC,IAAI,QAAQ,EAAE,CAAC;QACzB,MAAM,MAAM,GAAG,cAAc,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC;QAC1C,IAAI,MAAM,IAAI,SAAS,EAAE,CAAC;YACxB,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YACd,SAAS;QACX,CAAC;QACD,IAAI,OAAO,CAAC,kBAAkB,EAAE,CAAC;YAC/B,MAAM,SAAS,GAAG,kBAAkB,CAAC,CAAC,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;YAC5D,MAAM,CAAC,mBAAmB,EAAE,CAAC;YAC7B,KAAK,CAAC,IAAI,CACR,aAAa,CAAC,CAAC,WAAW,KAAK,CAAC,CAAC,KAAK,KAAK,MAAM,KAAK,cAAc,CAAC,SAAS,CAAC,gBAAgB,SAAS,GAAG,CAC5G,CAAC;YACF,KAAK,CAAC,IAAI,CAAC,EAAE,GAAG,CAAC,EAAE,QAAQ,EAAE,SAAS,EAAE,CAAC,CAAC;QAC5C,CAAC;aAAM,CAAC;YACN,MAAM,CAAC,iBAAiB,EAAE,CAAC;YAC3B,KAAK,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,WAAW,KAAK,CAAC,CAAC,KAAK,KAAK,MAAM,iBAAiB,SAAS,EAAE,CAAC,CAAC;QAC1F,CAAC;IACH,CAAC;IAED,2EAA2E;IAC3E,4EAA4E;IAC5E,4EAA4E;IAC5E,4EAA4E;IAC5E,2EAA2E;IAC3E,MAAM,QAAQ,GAAG,OAAO,CAAC,gBAAgB,IAAI,IAAI,CAAC;IAClD,MAAM,KAAK,GAAuB,EAAE,CAAC;IACrC,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;QACtB,IAAI,CAAC,QAAQ,EAAE,CAAC;YACd,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YACd,SAAS;QACX,CAAC;QACD,MAAM,QAAQ,GAAG,gBAAgB,CAAC,CAAC,CAAC,CAAC;QACrC,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC1B,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YACd,SAAS;QACX,CAAC;QACD,MAAM,CAAC,mBAAmB,EAAE,CAAC;QAC7B,MAAM,KAAK,GAAG,CAAC,GAAG,IAAI,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAClE,MAAM,MAAM,GAAG,QAAQ,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;QAClC,KAAK,CAAC,IAAI,CACR,oBAAoB,CAAC,CAAC,WAAW,KAAK,CAAC,CAAC,KAAK,MAAM,KAAK,aAAa,MAAM,SAAS,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CACvG,CAAC;IACJ,CAAC;IAED,+EAA+E;IAC/E,MAAM,GAAG,GAAa,EAAE,CAAC;IACzB,MAAM,MAAM,GAAG,IAAI,GAAG,EAAU,CAAC;IACjC,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;QACtB,IAAI,CAAC,CAAC,CAAC,QAAQ;YAAE,SAAS;QAC1B,IAAI,CAAC,CAAC,QAAQ,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS,CAAC,4CAA4C;QACnF,yEAAyE;QACzE,gEAAgE;QAChE,GAAG,CAAC,IAAI,CAAC,EAAE,QAAQ,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC;QACnC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC;IAC5B,CAAC;IAED,8EAA8E;IAC9E,yEAAyE;IACzE,iEAAiE;IACjE,MAAM,GAAG,GAAa,EAAE,CAAC;IACzB
,MAAM,MAAM,GAAG,IAAI,GAAG,EAAU,CAAC;IACjC,MAAM,WAAW,GAAG,IAAI,GAAG,EAA8B,CAAC;IAC1D,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;QACtB,IAAI,CAAC,CAAC,IAAI,KAAK,OAAO;YAAE,SAAS,CAAC,6CAA6C;QAC/E,MAAM,GAAG,GAAG,GAAG,CAAC,CAAC,KAAK,IAAI,CAAC,CAAC,WAAW,EAAE,CAAC;QAC1C,MAAM,GAAG,GAAG,WAAW,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QACjC,IAAI,GAAG;YAAE,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;;YAChB,WAAW,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;IACjC,CAAC;IACD,KAAK,MAAM,KAAK,IAAI,WAAW,CAAC,MAAM,EAAE,EAAE,CAAC;QACzC,MAAM,MAAM,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,IAAI,CAAC,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QACxE,MAAM,QAAQ,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC;QAClD,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAC3D,yEAAyE;QACzE,MAAM,QAAQ,GAAG,CAAC,CAAmB,EAAE,CAAmB,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC,CAAC;QACjG,MAAM,CAAC,GAAG,CAAC,GAAG,MAAM,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;QACxC,MAAM,CAAC,GAAG,CAAC,GAAG,QAAQ,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;QAC1C,MAAM,EAAE,MAAM,EAAE,UAAU,EAAE,gBAAgB,EAAE,GAAG,qBAAqB,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC;QACnF,MAAM,EAAE,UAAU,EAAE,kBAAkB,EAAE,GAAG,qBAAqB,CAC9D,CAAC,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,MAAM,CAC5C,CAAC;QACF,GAAG,CAAC,IAAI,CAAC;YACP,MAAM;YACN,MAAM,EAAE,gBAAgB;YACxB,QAAQ,EAAE,kBAAkB;SAC7B,CAAC,CAAC;QACH,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC;IAC5B,CAAC;IAED,MAAM,CAAC,OAAO,GAAG,GAAG,CAAC,MAAM,CAAC;IAC5B,MAAM,CAAC,OAAO,GAAG,GAAG,CAAC,MAAM,CAAC;IAC5B,MAAM,CAAC,cAAc,GAAG,CAAC,GAAG,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC;IAC3C,MAAM,CAAC,cAAc,GAAG,CAAC,GAAG,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC;IAE3C,OAAO,EAAE,GAAG,EAAE,GAAG,EAAE,MAAM,EAAE,CAAC;AAC9B,CAAC;AAED,sDAAsD;AACtD,MAAM,UAAU,UAAU,CAAC,IAAc;IACvC,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,IAA
I,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;AACnF,CAAC;AAED,sDAAsD;AACtD,MAAM,UAAU,UAAU,CAAC,IAAc;IACvC,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;AACnF,CAAC"}
@@ -0,0 +1,38 @@
1
+ /**
2
+ * The value of the `weightAdapter` gene. `null` (or absent) === BASE: the cheap
3
+ * tier runs with no LoRA adapter — the pre-gene default, byte-identical key.
4
+ * A string is an adapter id (e.g. "glm5.2-sft", "glm5.2-sft-dpo").
5
+ */
6
+ export type WeightAdapterGene = string | null;
7
+ /** The canonical "no adapter" sentinel. Absent gene === BASE === this. */
8
+ export declare const BASE_ADAPTER: WeightAdapterGene;
9
+ /**
10
+ * The adapter variants Darwin evolution may choose among. BASE is always first
11
+ * (the control). The tuned variants reference adapters the runner produced:
12
+ * SFT-only and SFT+DPO are the two recipes; a ratio variant blends adapter
13
+ * strength. ALL are inert until an actual adapter is trained (GPU job) — the
14
+ * gene only NAMES an adapter; it does not create one.
15
+ */
16
+ export declare const WEIGHT_ADAPTERS: WeightAdapterGene[];
17
+ /**
18
+ * Normalize a raw gene value. Absent / '' / 'base' / 'none' all coerce to BASE
19
+ * so an unset gene is indistinguishable from an explicit base choice — that's
20
+ * what keeps pre-gene genomes byte-identical.
21
+ */
22
+ export declare function normalizeWeightAdapter(v: unknown): WeightAdapterGene;
23
+ /**
24
+ * Stable key fragment for a genome's adapter choice. Empty string for BASE so
25
+ * the gene contributes NOTHING to a genome key unless an adapter is selected —
26
+ * the backward-compatibility invariant (a base-adapter genome keys identically
27
+ * to a pre-gene genome).
28
+ */
29
+ export declare function weightAdapterSuffix(gene: WeightAdapterGene): string;
30
+ /**
31
+ * Map an adapter gene to the CLI flag the cheap-tier solver forwards. BASE
32
+ * emits NO flag (runs the stock model). A tuned adapter emits `--lora-adapter
33
+ * <id>` (the solver loads the adapter onto the cheap base before solving).
34
+ */
35
+ export declare function weightAdapterFlags(gene: WeightAdapterGene): string[];
36
+ /** True iff the gene selects a real adapter (not base). */
37
+ export declare function usesAdapter(gene: WeightAdapterGene): boolean;
38
+ //# sourceMappingURL=genome.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"genome.d.ts","sourceRoot":"","sources":["../src/genome.ts"],"names":[],"mappings":"AAuBA;;;;GAIG;AACH,MAAM,MAAM,iBAAiB,GAAG,MAAM,GAAG,IAAI,CAAC;AAE9C,0EAA0E;AAC1E,eAAO,MAAM,YAAY,EAAE,iBAAwB,CAAC;AAEpD;;;;;;GAMG;AACH,eAAO,MAAM,eAAe,EAAE,iBAAiB,EAI9C,CAAC;AAEF;;;;GAIG;AACH,wBAAgB,sBAAsB,CAAC,CAAC,EAAE,OAAO,GAAG,iBAAiB,CAMpE;AAED;;;;;GAKG;AACH,wBAAgB,mBAAmB,CAAC,IAAI,EAAE,iBAAiB,GAAG,MAAM,CAGnE;AAED;;;;GAIG;AACH,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,iBAAiB,GAAG,MAAM,EAAE,CAGpE;AAED,2DAA2D;AAC3D,wBAAgB,WAAW,CAAC,IAAI,EAAE,iBAAiB,GAAG,OAAO,CAE5D"}
package/dist/genome.js ADDED
@@ -0,0 +1,75 @@
1
+ // SPDX-License-Identifier: MIT
2
+ //
3
+ // genome.ts — the `weightAdapter` genome gene.
4
+ //
5
+ // This is the bridge that lets Darwin evolution SELECT among LoRA adapters
6
+ // (the evolutionary prune-the-overfitter safety net). The Darwin genome stays
7
+ // gradient-FREE policy evolution; this gene adds a single reference to a tuned
8
+ // weight adapter so an evolved config can run the cheap tier with-or-without a
9
+ // distilled adapter, and let selection decide.
10
+ //
11
+ // SAFETY-NET RATIONALE: a LoRA tune can overfit (memorize the SFT set, regress
12
+ // on held-out). Rather than trust the tune blindly, we make the adapter a GENE:
13
+ // base (no adapter) competes against SFT-only / SFT+DPO / ratio variants under
14
+ // the same conformant fitness, and evolution prunes an adapter that doesn't
15
+ // actually lift held-out resolve. The default is ALWAYS base/no-adapter, so a
16
+ // genome that never opts into an adapter is byte-identical to a pre-gene genome.
17
+ //
18
+ // The CONCRETE wiring into darwin-mode's evolve-config genome lives in
19
+ // packages/darwin-mode/bench/swebench/evolve-config.mjs (the WEIGHT_ADAPTERS /
20
+ // normalizeWeightAdapter / weightAdapterFlags helpers + the mutate/crossover/
21
+ // seed integration). This module is the typed SPEC + reference implementation
22
+ // of the gene's semantics, so the cross-package contract is testable from here.
23
/** Sentinel meaning "no adapter": the value an absent gene normalizes to. */
export const BASE_ADAPTER = null;
/**
 * The adapter choices exposed to selection, control first:
 *   - BASE_ADAPTER — no adapter (the baseline every tuned variant competes with)
 *   - 'sft'        — SFT-distill only
 *   - 'sft-dpo'    — SFT followed by on-policy DPO
 *
 * An entry only NAMES an adapter; nothing in this module creates or trains one.
 */
export const WEIGHT_ADAPTERS = [BASE_ADAPTER, 'sft', 'sft-dpo'];
37
/**
 * Canonicalize a raw gene value.
 *
 * Anything that is not a string (including null/undefined) maps to
 * BASE_ADAPTER, as do the empty string and the case-insensitive words
 * 'base' and 'none' once surrounding whitespace is stripped. Any other string
 * is returned trimmed, with its original casing preserved.
 *
 * @param v raw gene value of unknown type
 * @returns BASE_ADAPTER or the trimmed adapter id
 */
export function normalizeWeightAdapter(v) {
    if (typeof v !== 'string')
        return BASE_ADAPTER;
    const trimmed = v.trim();
    // Only the comparison is case-folded; the returned id keeps its casing.
    const folded = trimmed.toLowerCase();
    const meansBase = folded === '' || folded === 'base' || folded === 'none';
    return meansBase ? BASE_ADAPTER : trimmed;
}
52
/**
 * Genome-key fragment for an adapter choice.
 *
 * A gene that normalizes to "no adapter" contributes the empty string, so it
 * adds nothing to the key; any other gene contributes `+w:<id>`, where `<id>`
 * is the normalized adapter id.
 *
 * @param gene raw or normalized adapter gene
 * @returns '' for base, otherwise the `+w:`-prefixed adapter id
 */
export function weightAdapterSuffix(gene) {
    const adapterId = normalizeWeightAdapter(gene);
    if (adapterId == null)
        return '';
    return `+w:${adapterId}`;
}
62
/**
 * Translate an adapter gene into CLI arguments.
 *
 * A gene that normalizes to "no adapter" yields an empty argument list; any
 * other gene yields the pair `['--lora-adapter', <id>]`, where `<id>` is the
 * normalized adapter id.
 *
 * @param gene raw or normalized adapter gene
 * @returns a fresh array of CLI arguments (possibly empty)
 */
export function weightAdapterFlags(gene) {
    const adapterId = normalizeWeightAdapter(gene);
    if (adapterId == null)
        return [];
    return ['--lora-adapter', adapterId];
}
71
/**
 * Report whether a gene selects a real adapter, i.e. whether it normalizes to
 * something other than the "no adapter" sentinel.
 */
export function usesAdapter(gene) {
    const adapterId = normalizeWeightAdapter(gene);
    return !(adapterId == null);
}
75
+ //# sourceMappingURL=genome.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"genome.js","sourceRoot":"","sources":["../src/genome.ts"],"names":[],"mappings":"AAAA,+BAA+B;AAC/B,EAAE;AACF,+CAA+C;AAC/C,EAAE;AACF,2EAA2E;AAC3E,8EAA8E;AAC9E,+EAA+E;AAC/E,+EAA+E;AAC/E,+CAA+C;AAC/C,EAAE;AACF,+EAA+E;AAC/E,gFAAgF;AAChF,+EAA+E;AAC/E,4EAA4E;AAC5E,8EAA8E;AAC9E,iFAAiF;AACjF,EAAE;AACF,uEAAuE;AACvE,+EAA+E;AAC/E,8EAA8E;AAC9E,8EAA8E;AAC9E,gFAAgF;AAShF,0EAA0E;AAC1E,MAAM,CAAC,MAAM,YAAY,GAAsB,IAAI,CAAC;AAEpD;;;;;;GAMG;AACH,MAAM,CAAC,MAAM,eAAe,GAAwB;IAClD,YAAY,EAAE,iDAAiD;IAC/D,KAAK,EAAE,mBAAmB;IAC1B,SAAS,EAAE,yBAAyB;CACrC,CAAC;AAEF;;;;GAIG;AACH,MAAM,UAAU,sBAAsB,CAAC,CAAU;IAC/C,IAAI,CAAC,IAAI,IAAI;QAAE,OAAO,YAAY,CAAC;IACnC,IAAI,OAAO,CAAC,KAAK,QAAQ;QAAE,OAAO,YAAY,CAAC;IAC/C,MAAM,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IACjC,IAAI,CAAC,KAAK,EAAE,IAAI,CAAC,KAAK,MAAM,IAAI,CAAC,KAAK,MAAM;QAAE,OAAO,YAAY,CAAC;IAClE,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC;AAClB,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,mBAAmB,CAAC,IAAuB;IACzD,MAAM,IAAI,GAAG,sBAAsB,CAAC,IAAI,CAAC,CAAC;IAC1C,OAAO,IAAI,IAAI,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,IAAI,EAAE,CAAC;AAC1C,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,kBAAkB,CAAC,IAAuB;IACxD,MAAM,IAAI,GAAG,sBAAsB,CAAC,IAAI,CAAC,CAAC;IAC1C,OAAO,IAAI,IAAI,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,gBAAgB,EAAE,IAAI,CAAC,CAAC;AACtD,CAAC;AAED,2DAA2D;AAC3D,MAAM,UAAU,WAAW,CAAC,IAAuB;IACjD,OAAO,sBAAsB,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC;AAC9C,CAAC"}
@@ -0,0 +1,11 @@
1
+ export type { ChatMessage, ToolCall, PolicyTier, DarwinTrajectory, SftRow, DpoRow, ExportOptions, ExportResult, ExportReport, } from './types.js';
2
+ export { exportTrainingData, assertTrainEvalDisjoint, estimateTokens, sftToJsonl, dpoToJsonl, } from './export.js';
3
+ export type { BaseModelSpec, TrainStage, LoraConfig, TrainConfig, TrainRunOptions, TrainRunResult, TrainingPlan, } from './train.js';
4
+ export { DEFAULT_LORA, defaultDetectGpu, assertTunableSize, sftConfig, dpoConfig, buildCommand, buildPlan, runTraining, twoStagePlan, adaptSftForRunner, adaptDpoForRunner, } from './train.js';
5
+ export type { WeightAdapterGene } from './genome.js';
6
+ export { BASE_ADAPTER, WEIGHT_ADAPTERS, normalizeWeightAdapter, weightAdapterSuffix, weightAdapterFlags, usesAdapter, } from './genome.js';
7
+ export type { CascadeOutcome, CascadeSummary, CostParetoDelta, } from './eval.js';
8
+ export { summarizeCascade, costParetoDelta } from './eval.js';
9
+ export type { RewardHackKind, RewardHackFinding } from './reward-hack.js';
10
+ export { detectRewardHack, isRewardHacked } from './reward-hack.js';
11
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAQA,YAAY,EACV,WAAW,EACX,QAAQ,EACR,UAAU,EACV,gBAAgB,EAChB,MAAM,EACN,MAAM,EACN,aAAa,EACb,YAAY,EACZ,YAAY,GACb,MAAM,YAAY,CAAC;AAEpB,OAAO,EACL,kBAAkB,EAClB,uBAAuB,EACvB,cAAc,EACd,UAAU,EACV,UAAU,GACX,MAAM,aAAa,CAAC;AAErB,YAAY,EACV,aAAa,EACb,UAAU,EACV,UAAU,EACV,WAAW,EACX,eAAe,EACf,cAAc,EACd,YAAY,GACb,MAAM,YAAY,CAAC;AAEpB,OAAO,EACL,YAAY,EACZ,gBAAgB,EAChB,iBAAiB,EACjB,SAAS,EACT,SAAS,EACT,YAAY,EACZ,SAAS,EACT,WAAW,EACX,YAAY,EACZ,iBAAiB,EACjB,iBAAiB,GAClB,MAAM,YAAY,CAAC;AAEpB,YAAY,EAAE,iBAAiB,EAAE,MAAM,aAAa,CAAC;AAErD,OAAO,EACL,YAAY,EACZ,eAAe,EACf,sBAAsB,EACtB,mBAAmB,EACnB,kBAAkB,EAClB,WAAW,GACZ,MAAM,aAAa,CAAC;AAErB,YAAY,EACV,cAAc,EACd,cAAc,EACd,eAAe,GAChB,MAAM,WAAW,CAAC;AAEnB,OAAO,EAAE,gBAAgB,EAAE,eAAe,EAAE,MAAM,WAAW,CAAC;AAE9D,YAAY,EAAE,cAAc,EAAE,iBAAiB,EAAE,MAAM,kBAAkB,CAAC;AAE1E,OAAO,EAAE,gBAAgB,EAAE,cAAc,EAAE,MAAM,kBAAkB,CAAC"}
package/dist/index.js ADDED
@@ -0,0 +1,13 @@
1
+ // SPDX-License-Identifier: MIT
2
+ //
3
+ // @metaharness/weight-eft — public API.
4
+ //
5
+ // Evolutionary fine-tuning: distill Darwin's archival success into the open
6
+ // cheap tier via LoRA so the cost-cascade escalates to a frontier model less
7
+ // often. Cost-Pareto axis, not the frontier ceiling. See ADR-198.
8
+ export { exportTrainingData, assertTrainEvalDisjoint, estimateTokens, sftToJsonl, dpoToJsonl, } from './export.js';
9
+ export { DEFAULT_LORA, defaultDetectGpu, assertTunableSize, sftConfig, dpoConfig, buildCommand, buildPlan, runTraining, twoStagePlan, adaptSftForRunner, adaptDpoForRunner, } from './train.js';
10
+ export { BASE_ADAPTER, WEIGHT_ADAPTERS, normalizeWeightAdapter, weightAdapterSuffix, weightAdapterFlags, usesAdapter, } from './genome.js';
11
+ export { summarizeCascade, costParetoDelta } from './eval.js';
12
+ export { detectRewardHack, isRewardHacked } from './reward-hack.js';
13
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,+BAA+B;AAC/B,EAAE;AACF,wCAAwC;AACxC,EAAE;AACF,4EAA4E;AAC5E,6EAA6E;AAC7E,kEAAkE;AAclE,OAAO,EACL,kBAAkB,EAClB,uBAAuB,EACvB,cAAc,EACd,UAAU,EACV,UAAU,GACX,MAAM,aAAa,CAAC;AAYrB,OAAO,EACL,YAAY,EACZ,gBAAgB,EAChB,iBAAiB,EACjB,SAAS,EACT,SAAS,EACT,YAAY,EACZ,SAAS,EACT,WAAW,EACX,YAAY,EACZ,iBAAiB,EACjB,iBAAiB,GAClB,MAAM,YAAY,CAAC;AAIpB,OAAO,EACL,YAAY,EACZ,eAAe,EACf,sBAAsB,EACtB,mBAAmB,EACnB,kBAAkB,EAClB,WAAW,GACZ,MAAM,aAAa,CAAC;AAQrB,OAAO,EAAE,gBAAgB,EAAE,eAAe,EAAE,MAAM,WAAW,CAAC;AAI9D,OAAO,EAAE,gBAAgB,EAAE,cAAc,EAAE,MAAM,kBAAkB,CAAC"}
@@ -0,0 +1,17 @@
1
+ import type { DarwinTrajectory } from './types.js';
2
+ export type RewardHackKind = 'gold-read' | 'verification-tamper' | 'sandbox-escape';
3
+ export interface RewardHackFinding {
4
+ kind: RewardHackKind;
5
+ /** The offending path/argument fragment. */
6
+ detail: string;
7
+ /** The tool that performed the offending action. */
8
+ tool: string;
9
+ }
10
+ /**
11
+ * Run the deterministic reward-hacking monitor over a single trajectory.
12
+ * Returns all findings (empty ⇒ clean). Pure function of the trajectory.
13
+ */
14
+ export declare function detectRewardHack(t: DarwinTrajectory): RewardHackFinding[];
15
+ /** True iff the trajectory shows ANY reward-hacking signal. */
16
+ export declare function isRewardHacked(t: DarwinTrajectory): boolean;
17
+ //# sourceMappingURL=reward-hack.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"reward-hack.d.ts","sourceRoot":"","sources":["../src/reward-hack.ts"],"names":[],"mappings":"AAoBA,OAAO,KAAK,EAAe,gBAAgB,EAAE,MAAM,YAAY,CAAC;AAwChE,MAAM,MAAM,cAAc,GAAG,WAAW,GAAG,qBAAqB,GAAG,gBAAgB,CAAC;AAEpF,MAAM,WAAW,iBAAiB;IAChC,IAAI,EAAE,cAAc,CAAC;IACrB,4CAA4C;IAC5C,MAAM,EAAE,MAAM,CAAC;IACf,oDAAoD;IACpD,IAAI,EAAE,MAAM,CAAC;CACd;AAcD;;;GAGG;AACH,wBAAgB,gBAAgB,CAAC,CAAC,EAAE,gBAAgB,GAAG,iBAAiB,EAAE,CA8BzE;AAED,+DAA+D;AAC/D,wBAAgB,cAAc,CAAC,CAAC,EAAE,gBAAgB,GAAG,OAAO,CAE3D"}
@@ -0,0 +1,105 @@
1
+ // SPDX-License-Identifier: MIT
2
+ //
3
+ // reward-hack.ts — deterministic reward-hacking monitor (Ornith-1.0 borrow).
4
+ //
5
+ // An archived "success" that secretly reward-hacked would teach the model to
6
+ // reward-hack. This is the TRAINING-DATA analog of the conformance firewall —
7
+ // SEPARATE from and IN ADDITION TO the train/eval-disjointness guard. We run a
8
+ // deterministic monitor over each gold-resolved trajectory and DROP any that:
9
+ // (a) read a withheld GOLD/TEST path (FAIL_TO_PASS / PASS_TO_PASS / gold_patch
10
+ // / the held-out test files),
11
+ // (b) modified the verification/test harness (the script that scores it),
12
+ // (c) touched a path OUTSIDE the sandboxed repo.
13
+ //
14
+ // Prior art: Ornith-1.0 (DeepReinforce) — self-scaffolding RL for agentic
15
+ // coding — uses a deterministic monitor that flags trajectories acting outside
16
+ // the sanctioned tool surface → zero reward + excluded. See ADR-198.
17
+ //
18
+ // The monitor reads ONLY the trajectory's own tool_calls + tool messages — it
19
+ // is a pure function of the trajectory (no filesystem, $0, reproducible).
20
+ /** Patterns whose READ signals reward-hacking (the withheld oracle). Matched against the raw arguments text of read tools, write tools, and any tool whose name contains file, path or patch. */
21
+ const GOLD_READ_PATTERNS = [
22
+ /gold_patch/i,
23
+ /\bFAIL_TO_PASS\b/,
24
+ /\bPASS_TO_PASS\b/,
25
+ /test_patch/i,
26
+ /\bexpected[_-]?(output|patch|result)\b/i,
27
+ /\.golden\b/i,
28
+ /run_evaluation/i, // reading the swebench evaluator itself
29
+ ];
30
+ /** Patterns whose WRITE/MODIFY signals tampering with the verification harness. Matched against the raw arguments text of write tools and any tool whose name contains patch, edit, write or replace. */
31
+ const VERIFICATION_WRITE_PATTERNS = [
32
+ /conftest\.py/i,
33
+ /run_tests?\.(sh|py)/i,
34
+ /run_evaluation/i,
35
+ /swebench[\/\\].*harness/i,
36
+ /\bMakefile\b/, // re-routing the test target
37
+ /tox\.ini/i,
38
+ /pytest\.ini/i,
39
+ /setup\.cfg/i, // can redefine test discovery
40
+ ];
41
+ /** Path prefixes that are OUTSIDE the sandboxed repo (escape signals). Checked against the arguments of EVERY tool call regardless of tool name; the absolute and home-directory prefixes only match at the start of the text or right after a quote or whitespace character. */
42
+ const OUT_OF_SANDBOX_PATTERNS = [
43
+ /(^|["'\s])\/etc\//,
44
+ /(^|["'\s])\/root\//,
45
+ /(^|["'\s])\/home\//,
46
+ /(^|["'\s])\/tmp\/(?!repo|work)/, // /tmp is fine for the work tree; flag other /tmp paths
47
+ /\.\.\/\.\.\//, // repeated parent traversal out of the repo
48
+ /(^|["'\s])~\//,
49
+ ];
50
+ /** Tool names we treat as a READ of a path. */
51
+ const READ_TOOLS = new Set(['read_file', 'cat', 'open_file', 'view', 'grep', 'search']);
52
+ /** Tool names we treat as a WRITE/MODIFY of a path. */
53
+ const WRITE_TOOLS = new Set(['write_file', 'apply_patch', 'edit_file', 'create_file', 'str_replace']);
54
/**
 * Extract the (tool, argumentsText) pairs from a trajectory's assistant turns.
 *
 * Only messages with `role === 'assistant'` and a truthy `tool_calls` list
 * contribute; pairs are returned in message order, then call order.
 *
 * The arguments are always returned as TEXT so callers can run regex matches
 * on them without type checks:
 *   - a string is passed through unchanged,
 *   - `null` / `undefined` becomes the empty string,
 *   - any other value (e.g. an already-parsed arguments object) is
 *     JSON-encoded, so the paths it carries stay visible to the monitor.
 *
 * @param {Array<object>} messages - the trajectory's chat messages.
 * @returns {Array<{tool: string, args: string}>} one entry per tool call.
 */
function toolActions(messages) {
    const actions = [];
    for (const message of messages) {
        if (message.role !== 'assistant' || !message.tool_calls)
            continue;
        for (const call of message.tool_calls) {
            const raw = call.function.arguments;
            let args;
            if (typeof raw === 'string') {
                args = raw;
            }
            else if (raw == null) {
                // Tool call without arguments: nothing to scan, but keep the action.
                args = '';
            }
            else {
                // Non-string payload: serialise it rather than letting a later
                // `.match()` throw. JSON.stringify yields undefined for values
                // it cannot encode (functions, symbols), hence the fallback.
                const encoded = JSON.stringify(raw);
                args = encoded === undefined ? String(raw) : encoded;
            }
            actions.push({ tool: call.function.name, args });
        }
    }
    return actions;
}
66
/**
 * Run the deterministic reward-hacking monitor over a single trajectory.
 * Returns all findings (empty ⇒ clean). Pure function of the trajectory.
 *
 * Every tool call found by `toolActions(t.messages)` is put through three
 * checks, each of which regex-matches the call's raw arguments text:
 *   (a) 'gold-read'           — GOLD_READ_PATTERNS, for read tools, write
 *                               tools, and tools named like file/path/patch;
 *   (b) 'verification-tamper' — VERIFICATION_WRITE_PATTERNS, for write tools
 *                               and tools named like patch/edit/write/replace;
 *   (c) 'sandbox-escape'      — OUT_OF_SANDBOX_PATTERNS, for every tool.
 * Each matching pattern contributes one finding, so a single call can yield
 * several findings.
 *
 * @param {object} t - trajectory; only `t.messages` is read.
 * @returns {Array<{kind: string, detail: string, tool: string}>} findings in
 *   call order, then check order (a), (b), (c), then pattern order.
 */
export function detectRewardHack(t) {
    const findings = [];
    for (const { tool, args } of toolActions(t.messages)) {
        const isRead = READ_TOOLS.has(tool);
        const isWrite = WRITE_TOOLS.has(tool);
        // (a) reading a withheld gold/test path — applies to ANY tool that takes a
        //     path argument (a read disguised as a grep still leaks the oracle).
        if (isRead || isWrite || /file|path|patch/i.test(tool)) {
            for (const pattern of GOLD_READ_PATTERNS) {
                const hit = args.match(pattern);
                if (hit)
                    findings.push({ kind: 'gold-read', detail: hit[0], tool });
            }
        }
        // (b) modifying the verification/test harness.
        if (isWrite || /patch|edit|write|replace/i.test(tool)) {
            for (const pattern of VERIFICATION_WRITE_PATTERNS) {
                const hit = args.match(pattern);
                if (hit)
                    findings.push({ kind: 'verification-tamper', detail: hit[0], tool });
            }
        }
        // (c) touching a path outside the sandboxed repo.
        for (const pattern of OUT_OF_SANDBOX_PATTERNS) {
            const hit = args.match(pattern);
            if (hit) {
                // The match may begin with the delimiter that anchors the prefix
                // (a quote in JSON-encoded arguments, or whitespace). Strip it so
                // `detail` holds only the offending path fragment.
                const detail = hit[0].replace(/^["'\s]+/, '');
                findings.push({ kind: 'sandbox-escape', detail, tool });
            }
        }
    }
    return findings;
}
101
/**
 * Convenience predicate over the reward-hacking monitor.
 *
 * @param {object} t - trajectory handed unchanged to `detectRewardHack`.
 * @returns {boolean} true when the monitor reports at least one finding.
 */
export function isRewardHacked(t) {
    const findings = detectRewardHack(t);
    return findings.length !== 0;
}
105
+ //# sourceMappingURL=reward-hack.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"reward-hack.js","sourceRoot":"","sources":["../src/reward-hack.ts"],"names":[],"mappings":"AAAA,+BAA+B;AAC/B,EAAE;AACF,6EAA6E;AAC7E,EAAE;AACF,6EAA6E;AAC7E,8EAA8E;AAC9E,+EAA+E;AAC/E,8EAA8E;AAC9E,iFAAiF;AACjF,oCAAoC;AACpC,4EAA4E;AAC5E,mDAAmD;AACnD,EAAE;AACF,0EAA0E;AAC1E,+EAA+E;AAC/E,qEAAqE;AACrE,EAAE;AACF,8EAA8E;AAC9E,0EAA0E;AAI1E,8EAA8E;AAC9E,MAAM,kBAAkB,GAAa;IACnC,aAAa;IACb,kBAAkB;IAClB,kBAAkB;IAClB,aAAa;IACb,yCAAyC;IACzC,aAAa;IACb,iBAAiB,EAAE,wCAAwC;CAC5D,CAAC;AAEF,gFAAgF;AAChF,MAAM,2BAA2B,GAAa;IAC5C,eAAe;IACf,sBAAsB;IACtB,iBAAiB;IACjB,0BAA0B;IAC1B,cAAc,EAAE,6BAA6B;IAC7C,WAAW;IACX,cAAc;IACd,aAAa,EAAE,8BAA8B;CAC9C,CAAC;AAEF,0EAA0E;AAC1E,MAAM,uBAAuB,GAAa;IACxC,mBAAmB;IACnB,oBAAoB;IACpB,oBAAoB;IACpB,gCAAgC,EAAE,wDAAwD;IAC1F,cAAc,EAAE,4CAA4C;IAC5D,eAAe;CAChB,CAAC;AAEF,+CAA+C;AAC/C,MAAM,UAAU,GAAG,IAAI,GAAG,CAAC,CAAC,WAAW,EAAE,KAAK,EAAE,WAAW,EAAE,MAAM,EAAE,MAAM,EAAE,QAAQ,CAAC,CAAC,CAAC;AACxF,uDAAuD;AACvD,MAAM,WAAW,GAAG,IAAI,GAAG,CAAC,CAAC,YAAY,EAAE,aAAa,EAAE,WAAW,EAAE,aAAa,EAAE,aAAa,CAAC,CAAC,CAAC;AAYtG,mFAAmF;AACnF,SAAS,WAAW,CAAC,QAAuB;IAC1C,MAAM,GAAG,GAA0C,EAAE,CAAC;IACtD,KAAK,MAAM,CAAC,IAAI,QAAQ,EAAE,CAAC;QACzB,IAAI,CAAC,CAAC,IAAI,KAAK,WAAW,IAAI,CAAC,CAAC,CAAC,UAAU;YAAE,SAAS;QACtD,KAAK,MAAM,EAAE,IAAI,CAAC,CAAC,UAAU,EAAE,CAAC;YAC9B,GAAG,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,EAAE,CAAC,QAAQ,CAAC,IAAI,EAAE,IAAI,EAAE,EAAE,CAAC,QAAQ,CAAC,SAAS,EAAE,CAAC,CAAC;QACpE,CAAC;IACH,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,gBAAgB,CAAC,CAAmB;IAClD,MAAM,QAAQ,GAAwB,EAAE,CAAC;IACzC,KAAK,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,WAAW,CAAC,CAAC,CAAC,QAAQ,CAAC,EAAE,CAAC;QACrD,MAAM,MAAM,GAAG,UAAU,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;QACpC,MAAM,OAAO,GAAG,WAAW,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;QAEtC,2EAA2E;QAC3E,yEAAyE;QACzE,IAAI,MAAM,IAAI,OAAO,IAAI,kBAAkB,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;YACvD,KAAK,MAAM,CAAC,IAAI,kBAAkB,EAAE,CAAC;gBACnC,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;gBACxB,IAAI,CAAC;oBAAE,QAAQ,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,WAAW,EAAE,MAAM,EAAE,CAAC,CAAC,
CAAC,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC;YAClE,CAAC;QACH,CAAC;QAED,+CAA+C;QAC/C,IAAI,OAAO,IAAI,2BAA2B,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;YACtD,KAAK,MAAM,CAAC,IAAI,2BAA2B,EAAE,CAAC;gBAC5C,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;gBACxB,IAAI,CAAC;oBAAE,QAAQ,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,qBAAqB,EAAE,MAAM,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC;YAC5E,CAAC;QACH,CAAC;QAED,kDAAkD;QAClD,KAAK,MAAM,CAAC,IAAI,uBAAuB,EAAE,CAAC;YACxC,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;YACxB,IAAI,CAAC;gBAAE,QAAQ,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,gBAAgB,EAAE,MAAM,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC;QAC9E,CAAC;IACH,CAAC;IACD,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED,+DAA+D;AAC/D,MAAM,UAAU,cAAc,CAAC,CAAmB;IAChD,OAAO,gBAAgB,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;AACxC,CAAC"}