@tangle-network/agent-eval 0.23.1 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148)
  1. package/CHANGELOG.md +145 -0
  2. package/README.md +212 -79
  3. package/dist/baseline-4R5deP0N.d.ts +108 -0
  4. package/dist/benchmarks/index.d.ts +3 -2
  5. package/dist/benchmarks/index.js +1 -1
  6. package/dist/builder-eval/index.d.ts +249 -0
  7. package/dist/builder-eval/index.js +391 -0
  8. package/dist/builder-eval/index.js.map +1 -0
  9. package/dist/{chunk-IOXMGMHQ.js → chunk-2A5XJB43.js} +142 -318
  10. package/dist/chunk-2A5XJB43.js.map +1 -0
  11. package/dist/chunk-47X6LRCE.js +76 -0
  12. package/dist/chunk-47X6LRCE.js.map +1 -0
  13. package/dist/{chunk-6M774GY6.js → chunk-4F5DQN55.js} +1 -1
  14. package/dist/chunk-4F5DQN55.js.map +1 -0
  15. package/dist/{chunk-KAO3Q65R.js → chunk-4S4BM3QQ.js} +15 -13
  16. package/dist/chunk-4S4BM3QQ.js.map +1 -0
  17. package/dist/chunk-5BKGXME7.js +65 -0
  18. package/dist/chunk-5BKGXME7.js.map +1 -0
  19. package/dist/{chunk-6KQG5HAH.js → chunk-5LBB5B3Z.js} +376 -72
  20. package/dist/chunk-5LBB5B3Z.js.map +1 -0
  21. package/dist/{chunk-42I2QC2L.js → chunk-6QDKWHLS.js} +18 -14
  22. package/dist/chunk-6QDKWHLS.js.map +1 -0
  23. package/dist/{chunk-VQQSPGSM.js → chunk-EDUKQ5AM.js} +247 -189
  24. package/dist/chunk-EDUKQ5AM.js.map +1 -0
  25. package/dist/chunk-I4MBDTY5.js +272 -0
  26. package/dist/chunk-I4MBDTY5.js.map +1 -0
  27. package/dist/chunk-JLZQWFV3.js +618 -0
  28. package/dist/chunk-JLZQWFV3.js.map +1 -0
  29. package/dist/chunk-K2TPS5LB.js +569 -0
  30. package/dist/chunk-K2TPS5LB.js.map +1 -0
  31. package/dist/chunk-KKHDIONI.js +414 -0
  32. package/dist/chunk-KKHDIONI.js.map +1 -0
  33. package/dist/chunk-KMPRBJK4.js +74 -0
  34. package/dist/chunk-KMPRBJK4.js.map +1 -0
  35. package/dist/{chunk-QUKKGHTZ.js → chunk-KTGTIOFD.js} +6 -3
  36. package/dist/chunk-KTGTIOFD.js.map +1 -0
  37. package/dist/chunk-LSH4MMOZ.js +838 -0
  38. package/dist/chunk-LSH4MMOZ.js.map +1 -0
  39. package/dist/chunk-NG236HPC.js +57 -0
  40. package/dist/chunk-NG236HPC.js.map +1 -0
  41. package/dist/{chunk-QBW3YBTR.js → chunk-NLMNWKVM.js} +14 -6
  42. package/dist/chunk-NLMNWKVM.js.map +1 -0
  43. package/dist/chunk-NU65VQ7M.js +99 -0
  44. package/dist/chunk-NU65VQ7M.js.map +1 -0
  45. package/dist/chunk-OWLAAMME.js +250 -0
  46. package/dist/chunk-OWLAAMME.js.map +1 -0
  47. package/dist/{chunk-SQQLHODJ.js → chunk-PC4UYEBM.js} +7 -4
  48. package/dist/chunk-PC4UYEBM.js.map +1 -0
  49. package/dist/{chunk-7EAUOUQS.js → chunk-RAF443UI.js} +213 -115
  50. package/dist/chunk-RAF443UI.js.map +1 -0
  51. package/dist/chunk-RZTMDUO7.js +49 -0
  52. package/dist/chunk-RZTMDUO7.js.map +1 -0
  53. package/dist/{chunk-EXGR4XEM.js → chunk-SESZDQPX.js} +23 -19
  54. package/dist/chunk-SESZDQPX.js.map +1 -0
  55. package/dist/{chunk-5IIQKMD5.js → chunk-TVVP3ZZQ.js} +14 -4
  56. package/dist/chunk-TVVP3ZZQ.js.map +1 -0
  57. package/dist/chunk-WWYCWKUM.js +196 -0
  58. package/dist/chunk-WWYCWKUM.js.map +1 -0
  59. package/dist/{chunk-AXHNWLIX.js → chunk-YRZ4M5GS.js} +2 -90
  60. package/dist/chunk-YRZ4M5GS.js.map +1 -0
  61. package/dist/chunk-ZN274SWR.js +613 -0
  62. package/dist/chunk-ZN274SWR.js.map +1 -0
  63. package/dist/cli.js +10 -6
  64. package/dist/cli.js.map +1 -1
  65. package/dist/{control-DvkH87qJ.d.ts → control-CBShYYA6.d.ts} +32 -33
  66. package/dist/control-runtime-BuJHoLg0.d.ts +180 -0
  67. package/dist/control.d.ts +8 -6
  68. package/dist/control.js +10 -7
  69. package/dist/{dataset-B9qvlm_o.d.ts → dataset-CiK_3LDr.d.ts} +5 -2
  70. package/dist/{emitter-B2XqDKFU.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
  71. package/dist/errors-BZ9sTdz7.d.ts +70 -0
  72. package/dist/failure-cluster-C2EGSDiT.d.ts +76 -0
  73. package/dist/feedback-trajectory-DfFdrraJ.d.ts +169 -0
  74. package/dist/governance/index.d.ts +5 -0
  75. package/dist/governance/index.js +18 -0
  76. package/dist/governance/index.js.map +1 -0
  77. package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} +1 -1
  78. package/dist/index-Oj9fAPPN.d.ts +270 -0
  79. package/dist/index.d.ts +2018 -3003
  80. package/dist/index.js +7443 -9102
  81. package/dist/index.js.map +1 -1
  82. package/dist/{integrity-Cr5YodSY.d.ts → integrity-DK2EBVZC.d.ts} +4 -3
  83. package/dist/knowledge/index.d.ts +102 -0
  84. package/dist/knowledge/index.js +18 -0
  85. package/dist/knowledge/index.js.map +1 -0
  86. package/dist/meta-eval/index.d.ts +99 -0
  87. package/dist/meta-eval/index.js +324 -0
  88. package/dist/meta-eval/index.js.map +1 -0
  89. package/dist/multi-layer-verifier-LkP3LVKj.d.ts +141 -0
  90. package/dist/openapi.json +491 -1
  91. package/dist/optimization.d.ts +11 -8
  92. package/dist/optimization.js +11 -9
  93. package/dist/outcome-store-D6KWmYvj.d.ts +63 -0
  94. package/dist/pipelines/index.d.ts +172 -0
  95. package/dist/pipelines/index.js +345 -0
  96. package/dist/pipelines/index.js.map +1 -0
  97. package/dist/prm/index.d.ts +99 -0
  98. package/dist/prm/index.js +222 -0
  99. package/dist/prm/index.js.map +1 -0
  100. package/dist/query-DODUYdPg.d.ts +30 -0
  101. package/dist/release-report-BNgMdqPF.d.ts +292 -0
  102. package/dist/replay-BL96gCEP.d.ts +226 -0
  103. package/dist/reporting.d.ts +10 -295
  104. package/dist/reporting.js +10 -6
  105. package/dist/{eval-campaign-Ds5QljIh.d.ts → researcher-BPT8x_NT.d.ts} +148 -146
  106. package/dist/rl.d.ts +1762 -8
  107. package/dist/rl.js +2035 -58
  108. package/dist/rl.js.map +1 -1
  109. package/dist/rubric-D5tjHNJQ.d.ts +72 -0
  110. package/dist/rubric-predictive-validity-C0uDYwG6.d.ts +105 -0
  111. package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} +4 -1
  112. package/dist/sequential-Dgz1n51-.d.ts +139 -0
  113. package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
  114. package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-C7VPYEj2.d.ts} +3 -76
  115. package/dist/telemetry/file.js +4 -1
  116. package/dist/telemetry/file.js.map +1 -1
  117. package/dist/telemetry/index.js +57 -57
  118. package/dist/telemetry/index.js.map +1 -1
  119. package/dist/test-graded-scenario-B2kWEdh9.d.ts +146 -0
  120. package/dist/traces.d.ts +142 -387
  121. package/dist/traces.js +1302 -40
  122. package/dist/traces.js.map +1 -1
  123. package/dist/trajectory-CnoBo-JY.d.ts +32 -0
  124. package/dist/wire/index.d.ts +369 -25
  125. package/dist/wire/index.js +22 -3
  126. package/package.json +44 -18
  127. package/dist/chunk-42I2QC2L.js.map +0 -1
  128. package/dist/chunk-5IIQKMD5.js.map +0 -1
  129. package/dist/chunk-6KQG5HAH.js.map +0 -1
  130. package/dist/chunk-6M774GY6.js.map +0 -1
  131. package/dist/chunk-7EAUOUQS.js.map +0 -1
  132. package/dist/chunk-AXHNWLIX.js.map +0 -1
  133. package/dist/chunk-EXGR4XEM.js.map +0 -1
  134. package/dist/chunk-IOXMGMHQ.js.map +0 -1
  135. package/dist/chunk-KAO3Q65R.js.map +0 -1
  136. package/dist/chunk-LZKIOBG2.js +0 -2026
  137. package/dist/chunk-LZKIOBG2.js.map +0 -1
  138. package/dist/chunk-QBW3YBTR.js.map +0 -1
  139. package/dist/chunk-QUKKGHTZ.js.map +0 -1
  140. package/dist/chunk-SQQLHODJ.js.map +0 -1
  141. package/dist/chunk-V5QSWN7L.js +0 -1310
  142. package/dist/chunk-V5QSWN7L.js.map +0 -1
  143. package/dist/chunk-VQQSPGSM.js.map +0 -1
  144. package/dist/chunk-XPHOZPOM.js +0 -1947
  145. package/dist/chunk-XPHOZPOM.js.map +0 -1
  146. package/dist/feedback-trajectory-c43WGtTX.d.ts +0 -346
  147. package/dist/index-ekBXweiQ.d.ts +0 -1894
  148. package/dist/sequential-DgU2mFsE.d.ts +0 -304
package/dist/rl.js CHANGED
@@ -1,64 +1,2041 @@
1
- import {
2
- PredictiveValidityResearcher,
3
- adversarialScenarioSearch,
4
- analyzeOptimizationResult,
5
- applyEloUpdate,
6
- bestOfN,
7
- buildPairwiseFromCampaign,
8
- compareAdaptationCurves,
9
- detectRewardHacking,
10
- doublyRobust,
11
- extractPreferences,
12
- extractStepRewards,
13
- extractVerifiableReward,
14
- extractVerifiableRewardsFromRecords,
15
- filterDeterministicallyRewarded,
16
- firstPassK,
17
- fitBradleyTerry,
18
- injectIrrelevantClause,
19
- inverseProbabilityWeighting,
20
- observationsFromRunRecords,
21
- offPolicyEstimateAll,
22
- paretoFrontier,
23
- prmTrainingPairs,
24
- renameVariables,
25
- runAdaptationCurve,
26
- runComputeCurve,
27
- runContaminationProbe,
28
- runRLCampaign,
29
- runwiseStepRewardSummary,
30
- selfConsistency,
31
- selfNormalizedImportanceWeighting,
32
- shuffleOrder,
33
- stepRewardsToJsonl,
34
- thompsonCurriculum,
35
- toAnthropicFormat,
36
- toDpoJsonl,
37
- toDpoRows,
38
- toGrpoJsonl,
39
- toGrpoRows,
40
- toPrmJsonl,
41
- toPrmRows,
42
- toSftJsonl,
43
- toSftRows,
44
- toTRLFormat,
45
- trialToRunRecord,
46
- trialsToRunRecords,
47
- varianceBasedCurriculum,
48
- variantAggregateToRunRecord,
49
- verificationReportToRunRecord
50
- } from "./chunk-LZKIOBG2.js";
51
1
  import {
52
2
  runEvalCampaign
53
- } from "./chunk-EXGR4XEM.js";
54
- import "./chunk-KAO3Q65R.js";
55
- import "./chunk-AXHNWLIX.js";
56
- import "./chunk-IOXMGMHQ.js";
57
- import "./chunk-QUKKGHTZ.js";
58
- import "./chunk-SQQLHODJ.js";
59
- import "./chunk-5IIQKMD5.js";
60
- import "./chunk-6M774GY6.js";
3
+ } from "./chunk-SESZDQPX.js";
4
+ import "./chunk-4S4BM3QQ.js";
5
+ import {
6
+ rubricPredictiveValidity
7
+ } from "./chunk-YRZ4M5GS.js";
8
+ import {
9
+ evaluateInterimReleaseConfidence
10
+ } from "./chunk-NU65VQ7M.js";
11
+ import {
12
+ benjaminiHochberg
13
+ } from "./chunk-2A5XJB43.js";
14
+ import {
15
+ wilcoxonSignedRank
16
+ } from "./chunk-I4MBDTY5.js";
17
+ import "./chunk-KTGTIOFD.js";
18
+ import "./chunk-PC4UYEBM.js";
19
+ import "./chunk-TVVP3ZZQ.js";
20
+ import "./chunk-4F5DQN55.js";
21
+ import {
22
+ ValidationError
23
+ } from "./chunk-NG236HPC.js";
61
24
  import "./chunk-PZ5AY32C.js";
25
+
26
+ // src/rl/compute-curves.ts
27
/**
 * Evaluate one candidate at each compute budget and summarise the resulting
 * score-versus-cost curve.
 *
 * Budgets are evaluated sequentially, in the order they are yielded by
 * `opts.budgets`; each call to `opts.runAtBudget` is awaited before the next.
 *
 * @param {object} opts
 * @param {string} opts.candidateId - Echoed back on the result.
 * @param {Iterable<{id: *, cost: number}>} opts.budgets - Budgets to evaluate.
 * @param {Function} opts.runAtBudget - `(budget) => {score, samples, std, metrics}` (may be async).
 * @returns {Promise<{candidateId: *, points: object[], logSlope: (number|null), best: object}>}
 *   `points` sorted by ascending cost, `logSlope` from `fitLogSlope` (null with
 *   fewer than two points) and `best`, the highest-scoring point (the
 *   earliest-evaluated one on ties).
 * @throws {ValidationError} When `opts.budgets` yields no entries. Previously
 *   this surfaced as a bare TypeError from `reduce` on an empty array.
 */
async function runComputeCurve(opts) {
  const points = [];
  for (const budget of opts.budgets) {
    const r = await opts.runAtBudget(budget);
    points.push({
      budgetId: budget.id,
      cost: budget.cost,
      score: r.score,
      samples: r.samples,
      std: r.std,
      metrics: r.metrics
    });
  }
  if (points.length === 0) {
    throw new ValidationError("runComputeCurve: budgets must contain at least one entry");
  }
  const sorted = [...points].sort((a, b) => a.cost - b.cost);
  const logSlope = sorted.length >= 2 ? fitLogSlope(sorted) : null;
  // Explicit seed keeps the fold well-defined and preserves first-wins tie-breaking.
  const best = points.reduce((a, b) => (b.score > a.score ? b : a), points[0]);
  return { candidateId: opts.candidateId, points: sorted, logSlope, best };
}
45
/**
 * Draw `opts.n` rollouts, score each, and report the highest-scoring one.
 *
 * Sampling and scoring are interleaved and sequential: rollout `i` is sampled
 * and scored before rollout `i + 1` is requested. On tied scores the earliest
 * rollout wins.
 *
 * @param {object} opts
 * @param {number} opts.n - Number of rollouts; must be > 0.
 * @param {Function} opts.sample - `(index) => rollout` (may be async).
 * @param {Function} opts.scoreFn - `(rollout) => number` (may be async).
 * @returns {Promise<{best: *, bestScore: number, scores: number[], meanScore: number, bestIndex: number}>}
 * @throws {ValidationError} When `opts.n <= 0`.
 */
async function bestOfN(opts) {
  if (opts.n <= 0) throw new ValidationError("bestOfN: n must be > 0");
  const rollouts = [];
  const scores = [];
  let draw = 0;
  while (draw < opts.n) {
    const rollout = await opts.sample(draw);
    rollouts.push(rollout);
    scores.push(await opts.scoreFn(rollout));
    draw += 1;
  }
  // Strict `>` means a later rollout only takes the lead when it is strictly better.
  const bestIndex = scores.reduce((lead, score, at) => (score > scores[lead] ? at : lead), 0);
  let total = 0;
  for (const score of scores) total += score;
  return {
    best: rollouts[bestIndex],
    bestScore: scores[bestIndex],
    scores,
    meanScore: total / scores.length,
    bestIndex
  };
}
65
/**
 * Majority vote over `opts.n` sampled rollouts.
 *
 * Each rollout is reduced to an answer key via `opts.answerKey`; keys are
 * compared as strings. Counts are tallied in a Map so that answers such as
 * "constructor" or "toString" cannot collide with inherited object
 * properties, and the first rollout seen for each key is remembered so the
 * representative lookup works for non-string keys without a second
 * `answerKey` call.
 *
 * @param {object} opts
 * @param {number} opts.n - Number of rollouts; must be > 0.
 * @param {Function} opts.sample - `(index) => rollout` (may be async); awaited sequentially.
 * @param {Function} opts.answerKey - `(rollout) => key`, called once per rollout.
 * @returns {Promise<{answer: string, agreement: number, histogram: Object<string, number>, representative: *, rollouts: Array}>}
 *   `agreement` is the winning count divided by `opts.n`. Ties are resolved by
 *   the property order of `histogram` as reported by `Object.entries`.
 * @throws {ValidationError} When `opts.n <= 0`.
 */
async function selfConsistency(opts) {
  if (opts.n <= 0) throw new ValidationError("selfConsistency: n must be > 0");
  const rollouts = [];
  const tallies = /* @__PURE__ */ new Map();
  for (let i = 0; i < opts.n; i++) {
    const r = await opts.sample(i);
    rollouts.push(r);
    const key = String(opts.answerKey(r));
    const tally = tallies.get(key);
    if (tally) tally.count += 1;
    else tallies.set(key, { count: 1, first: r });
  }
  // Object.fromEntries defines own data properties, so even "__proto__" is counted safely.
  const histogram = Object.fromEntries([...tallies].map(([key, tally]) => [key, tally.count]));
  let answer = "";
  let max = -1;
  for (const [k, v] of Object.entries(histogram)) {
    if (v > max) {
      max = v;
      answer = k;
    }
  }
  const representative = tallies.get(answer)?.first ?? rollouts[0];
  return {
    answer,
    agreement: max / opts.n,
    histogram,
    representative,
    rollouts
  };
}
92
/**
 * Return the cost/score Pareto frontier of `points`, sorted by ascending cost.
 *
 * A point is dropped when some other point is no more expensive and scores no
 * lower, and is strictly better on at least one of the two. Exact duplicates
 * therefore do not eliminate each other.
 *
 * @param {Array<{cost: number, score: number}>} points
 * @returns {Array<{cost: number, score: number}>} A new array of the surviving points.
 */
function paretoFrontier(points) {
  const dominates = (q, p) =>
    q.cost <= p.cost && q.score >= p.score && (q.cost < p.cost || q.score > p.score);
  const survivors = points.filter(
    (p) => !points.some((q) => q !== p && dominates(q, p))
  );
  survivors.sort((left, right) => left.cost - right.cost);
  return survivors;
}
102
/**
 * Least-squares slope of score against the natural log of cost.
 *
 * Costs are floored at 1e-12 before taking the logarithm. Returns 0 when every
 * point has the same log-cost (zero spread on the x axis).
 *
 * @param {Array<{cost: number, score: number}>} points
 * @returns {number} Fitted slope.
 */
function fitLogSlope(points) {
  const logCosts = [];
  const scores = [];
  for (const p of points) {
    logCosts.push(Math.log(Math.max(1e-12, p.cost)));
    scores.push(p.score);
  }
  const count = logCosts.length;
  const average = (values) => {
    let total = 0;
    for (const v of values) total += v;
    return total / count;
  };
  const meanX = average(logCosts);
  const meanY = average(scores);
  let covariance = 0;
  let spread = 0;
  logCosts.forEach((x, at) => {
    covariance += (x - meanX) * (scores[at] - meanY);
    spread += (x - meanX) ** 2;
  });
  return spread === 0 ? 0 : covariance / spread;
}
116
+
117
+ // src/rl/contamination.ts
118
/**
 * Compare scores on original scenarios against perturbed copies to flag
 * possible benchmark contamination.
 *
 * Scenarios are scored with `input.scoreFn`; pairs where either score is below
 * `opts.scoreFloor` are excluded from the statistics but still reported in
 * `perScenario` (with `qValue: NaN`). Contamination is suspected when the
 * paired test's p-value is below `opts.fdr` and the median delta
 * (perturbed − original) is at most `-opts.minMedianDrop`.
 *
 * @param {object} input
 * @param {Array} input.originals - Original scenarios.
 * @param {Array} [input.perturbed] - Pre-built perturbed scenarios, aligned with `originals`.
 * @param {{apply: Function}} [input.perturbation] - Used to build `perturbed` when it is not supplied.
 * @param {Function} input.scoreFn - `(scenario) => number` (may be async).
 * @param {Function} input.scenarioId - `(scenario) => id`.
 * @param {object} [opts]
 * @param {number} [opts.fdr=0.05]
 * @param {number} [opts.minMedianDrop=0.05]
 * @param {number} [opts.scoreFloor=0]
 * @returns {Promise<object>} `{perScenario, pairedTest, medianDelta, meanDelta, contaminationSuspected, reason, n}`.
 * @throws {ValidationError} When neither `perturbed` nor `perturbation` is
 *   given, or when the perturbed and original lengths differ.
 */
async function runContaminationProbe(input, opts = {}) {
  const fdr = opts.fdr ?? 0.05;
  const minMedianDrop = opts.minMedianDrop ?? 0.05;
  const floor = opts.scoreFloor ?? 0;
  if (!input.perturbed && !input.perturbation) {
    throw new ValidationError(
      "runContaminationProbe: must supply either `perturbed` or `perturbation`."
    );
  }
  const perturbed = input.perturbed ?? await Promise.all(input.originals.map((s) => input.perturbation.apply(s)));
  if (perturbed.length !== input.originals.length) {
    throw new ValidationError(
      `runContaminationProbe: perturbed length ${perturbed.length} \u2260 originals ${input.originals.length}`
    );
  }
  const origScores = await Promise.all(input.originals.map((s) => input.scoreFn(s)));
  const pertScores = await Promise.all(perturbed.map((s) => input.scoreFn(s)));
  const perScenario = input.originals.map((s, i) => ({
    scenarioId: input.scenarioId(s),
    originalScore: origScores[i],
    perturbedScore: pertScores[i],
    delta: pertScores[i] - origScores[i],
    qValue: NaN
  }));
  const valid = perScenario.filter((p) => p.originalScore >= floor && p.perturbedScore >= floor);
  if (valid.length < 4) {
    return {
      perScenario,
      pairedTest: { w: 0, p: 1 },
      medianDelta: 0,
      meanDelta: 0,
      contaminationSuspected: false,
      reason: `insufficient valid scenarios (n=${valid.length}, need \u2265 4)`,
      n: valid.length
    };
  }
  const pairedTest = wilcoxonSignedRank(
    valid.map((p) => p.originalScore),
    valid.map((p) => p.perturbedScore)
  );
  const deltas = valid.map((p) => p.delta);
  const sortedDeltas = [...deltas].sort((a, b) => a - b);
  // True median: average the two middle values when the count is even.
  const mid = Math.floor(sortedDeltas.length / 2);
  const median = sortedDeltas.length % 2 === 1 ? sortedDeltas[mid] : (sortedDeltas[mid - 1] + sortedDeltas[mid]) / 2;
  const meanDelta = deltas.reduce((s, d) => s + d, 0) / deltas.length;
  // Heuristic per-scenario pseudo p-value derived from |delta|, clamped to [1e-6, 1].
  const pseudoP = valid.map((p) => Math.min(1, Math.max(1e-6, 1 - Math.abs(p.delta) / 1)));
  const { qValues } = benjaminiHochberg(pseudoP, fdr);
  // `valid` holds the same objects as `perScenario`, so write q-values by
  // position. Looking rows up by scenarioId mis-assigned duplicate ids.
  valid.forEach((p, i) => {
    p.qValue = qValues[i];
  });
  const contaminationSuspected = pairedTest.p < fdr && median <= -minMedianDrop;
  let reason;
  if (contaminationSuspected) {
    // Report the drop as a positive magnitude so the inequality reads correctly.
    reason = `paired p=${pairedTest.p.toFixed(4)} < ${fdr} and median drop ${(-median).toFixed(4)} \u2265 ${minMedianDrop}`;
  } else if (pairedTest.p >= fdr) {
    reason = `no significant difference (paired p=${pairedTest.p.toFixed(4)})`;
  } else {
    reason = `significant but small effect (median delta ${median.toFixed(4)})`;
  }
  return {
    perScenario,
    pairedTest,
    medianDelta: median,
    meanDelta,
    contaminationSuspected,
    reason,
    n: valid.length
  };
}
180
/**
 * Build a perturbation that renames whole-word occurrences of the given
 * identifiers inside `scenario.prompt`.
 *
 * All identifiers are replaced in a single pass, so one rename can never be
 * re-matched by a later one (e.g. swapping `a` and `b` works). Replacements
 * are inserted literally; `$&`, `$1` and similar patterns in a replacement are
 * not interpreted. Empty identifiers are ignored, and when an identifier is
 * listed more than once the first rename wins.
 *
 * @param {string[]} identifiers - Names to rename.
 * @param {Function} [rename] - `(name, index) => replacement`; the default
 *   appends `_` plus a letter a–z chosen by `index % 26`.
 * @returns {{kind: string, apply: Function}} Perturbation whose `apply(scenario)`
 *   returns a shallow copy of the scenario with the rewritten prompt.
 */
function renameVariables(identifiers, rename = (n, i) => `${n}_${(i % 26 + 10).toString(36)}`) {
  return {
    kind: "rename_variables",
    apply(scenario) {
      const table = /* @__PURE__ */ new Map();
      identifiers.forEach((id, i) => {
        const replacement = rename(id, i);
        if (id !== "" && !table.has(id)) table.set(id, replacement);
      });
      // Nothing to rename: an empty alternation would match at every word boundary.
      if (table.size === 0) return { ...scenario, prompt: scenario.prompt };
      const alternation = [...table.keys()].map((id) => escapeRegex(id)).join("|");
      const re = new RegExp(`\\b(?:${alternation})\\b`, "g");
      // A function replacer inserts the text verbatim (no `$` pattern expansion).
      const prompt = scenario.prompt.replace(re, (match) => table.get(match));
      return { ...scenario, prompt };
    }
  };
}
194
/**
 * Build a perturbation that rewrites `scenario.prompt` through a caller-supplied
 * shuffling function driven by a seeded pseudo-random generator.
 *
 * The generator state lives in this closure, so successive `apply` calls on
 * the same perturbation continue one random stream rather than restarting it.
 *
 * @param {Function} shuffleSection - `(prompt, rng) => newPrompt`, where `rng()`
 *   returns a number in [0, 1).
 * @param {number} seed - Coerced to an unsigned 32-bit integer.
 * @returns {{kind: string, apply: Function}} Perturbation whose `apply(scenario)`
 *   returns a shallow copy of the scenario with the rewritten prompt.
 */
function shuffleOrder(shuffleSection, seed) {
  let state = seed >>> 0;
  const rng = () => {
    state = (state + 1831565813) >>> 0;
    let mixed = Math.imul(state ^ (state >>> 15), state | 1);
    mixed ^= mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61);
    const unsigned = (mixed ^ (mixed >>> 14)) >>> 0;
    return unsigned / 4294967296;
  };
  return {
    kind: "shuffle_order",
    apply(scenario) {
      return { ...scenario, prompt: shuffleSection(scenario.prompt, rng) };
    }
  };
}
211
/**
 * Build a perturbation that attaches an irrelevant clause to `scenario.prompt`,
 * separated from it by a single space.
 *
 * @param {string} clause - Text to inject.
 * @param {string} [position="prefix"] - `"prefix"` puts the clause first; any
 *   other value appends it after the prompt.
 * @returns {{kind: string, apply: Function}} Perturbation whose `apply(scenario)`
 *   returns a shallow copy of the scenario with the new prompt.
 */
function injectIrrelevantClause(clause, position = "prefix") {
  const apply = (scenario) => {
    let prompt;
    if (position === "prefix") {
      prompt = `${clause} ${scenario.prompt}`;
    } else {
      prompt = `${scenario.prompt} ${clause}`;
    }
    return { ...scenario, prompt };
  };
  return { kind: "inject_irrelevant_clause", apply };
}
220
/**
 * Backslash-escape regular-expression metacharacters in `s` so the result can
 * be embedded in a `RegExp` source and match the text literally.
 *
 * @param {string} s
 * @returns {string}
 */
function escapeRegex(s) {
  const metacharacters = /[.*+?^${}()|[\]\\]/g;
  // "$&" re-inserts the matched character after the added backslash.
  return s.replace(metacharacters, "\\$&");
}
223
+
224
+ // src/rl/off-policy.ts
225
/**
 * Inverse-probability-weighted (IPS) estimate of the target policy's value
 * from logged trajectories.
 *
 * Each trajectory contributes `w * r`, where `w = min(weightCap, targetProb /
 * behaviorProb)` and `r` is the reward passed through `clamp` with the
 * configured clip range. The estimate is the plain mean of those contributions.
 *
 * @param {Array<{runId: *, behaviorProb: number, targetProb: number, reward: number}>} trajectories
 * @param {object} [opts]
 * @param {number} [opts.weightCap=Infinity] - Upper bound on each importance weight.
 * @param {{low: number, high: number}} [opts.rewardClip] - Defaults to `{low: 0, high: 1}`.
 * @returns {{value: number, standardError: number, effectiveSampleSize: number, n: number, maxImportanceWeight: number}}
 *   `effectiveSampleSize` is `(Σw)² / Σw²` (0 when `Σw` is 0). An empty input
 *   yields `zeroEstimate()`.
 * @throws {ValidationError} When a trajectory's `behaviorProb` is not a number
 *   greater than 0 (this includes NaN and missing values).
 */
function inverseProbabilityWeighting(trajectories, opts = {}) {
  const cap = opts.weightCap ?? Infinity;
  const clip = opts.rewardClip ?? { low: 0, high: 1 };
  if (trajectories.length === 0) {
    return zeroEstimate();
  }
  const weights = [];
  const weightedRewards = [];
  let maxW = 0;
  for (const t of trajectories) {
    // `!(p > 0)` also rejects NaN/undefined, which a `p <= 0` test lets
    // through and which would silently turn the whole estimate into NaN.
    if (!(t.behaviorProb > 0)) {
      throw new ValidationError(
        `inverseProbabilityWeighting: behaviorProb must be > 0 (runId=${t.runId})`
      );
    }
    const w = Math.min(cap, t.targetProb / t.behaviorProb);
    const r = clamp(t.reward, clip.low, clip.high);
    weights.push(w);
    weightedRewards.push(w * r);
    if (w > maxW) maxW = w;
  }
  const n = weights.length;
  const value = weightedRewards.reduce((s, x) => s + x, 0) / n;
  // Sample variance of the contributions (n - 1 denominator, floored at 1).
  const variance = weightedRewards.reduce((s, x) => s + (x - value) ** 2, 0) / Math.max(1, n - 1);
  const sumW = weights.reduce((s, w) => s + w, 0);
  const sumW2 = weights.reduce((s, w) => s + w * w, 0);
  const effN = sumW === 0 ? 0 : sumW * sumW / sumW2;
  return {
    value,
    standardError: Math.sqrt(variance / n),
    effectiveSampleSize: effN,
    n,
    maxImportanceWeight: maxW
  };
}
260
/**
 * Self-normalised importance-weighted (SNIPS) estimate of the target policy's
 * value from logged trajectories.
 *
 * The estimate is `Σ wᵢ rᵢ / Σ wᵢ`, with `wᵢ = min(weightCap, targetProb /
 * behaviorProb)` and `rᵢ` the reward passed through `clamp`. The variance uses
 * the delta-method form `Σ wᵢ² (rᵢ − value)² / (Σ wᵢ)²`; the denominator is the
 * true `(Σ wᵢ)²` (not floored at 1), so the standard error is not understated
 * when the weights sum to less than one.
 *
 * @param {Array<{runId: *, behaviorProb: number, targetProb: number, reward: number}>} trajectories
 * @param {object} [opts]
 * @param {number} [opts.weightCap=Infinity] - Upper bound on each importance weight.
 * @param {{low: number, high: number}} [opts.rewardClip] - Defaults to `{low: 0, high: 1}`.
 * @returns {{value: number, standardError: number, effectiveSampleSize: number, n: number, maxImportanceWeight: number}}
 *   All-zero weights give `value`, `standardError` and `effectiveSampleSize`
 *   of 0. An empty input yields `zeroEstimate()`.
 * @throws {ValidationError} When a trajectory's `behaviorProb` is not a number
 *   greater than 0 (this includes NaN and missing values).
 */
function selfNormalizedImportanceWeighting(trajectories, opts = {}) {
  const cap = opts.weightCap ?? Infinity;
  const clip = opts.rewardClip ?? { low: 0, high: 1 };
  if (trajectories.length === 0) return zeroEstimate();
  const weights = [];
  const rewards = [];
  let maxW = 0;
  for (const t of trajectories) {
    // `!(p > 0)` also rejects NaN/undefined, which a `p <= 0` test lets through.
    if (!(t.behaviorProb > 0)) {
      throw new ValidationError(
        `selfNormalizedImportanceWeighting: behaviorProb must be > 0 (runId=${t.runId})`
      );
    }
    const w = Math.min(cap, t.targetProb / t.behaviorProb);
    weights.push(w);
    rewards.push(clamp(t.reward, clip.low, clip.high));
    if (w > maxW) maxW = w;
  }
  const sumW = weights.reduce((s, w) => s + w, 0);
  const sumWR = weights.reduce((s, w, i) => s + w * rewards[i], 0);
  const value = sumW === 0 ? 0 : sumWR / sumW;
  const sumW2 = weights.reduce((s, w) => s + w * w, 0);
  const effN = sumW === 0 ? 0 : sumW * sumW / sumW2;
  const phi = weights.map((w, i) => w * (rewards[i] - value));
  const sumPhi2 = phi.reduce((s, x) => s + x * x, 0);
  const variance = sumW === 0 ? 0 : sumPhi2 / (sumW * sumW);
  return {
    value,
    standardError: Math.sqrt(variance),
    effectiveSampleSize: effN,
    n: trajectories.length,
    maxImportanceWeight: maxW
  };
}
293
/**
 * Doubly-robust estimate of the target policy's value from logged trajectories.
 *
 * For a trajectory with a finite numeric `qHat`, the contribution is
 * `q + w * (r − q)`, where `q` is `qHat` clamped to the reward clip range;
 * without a usable `qHat` it falls back to the plain importance-weighted term
 * `w * r`. Here `w = min(weightCap, targetProb / behaviorProb)` and `r` is the
 * reward passed through `clamp`. The estimate is the mean contribution.
 *
 * @param {Array<{runId: *, behaviorProb: number, targetProb: number, reward: number, qHat: (number|undefined)}>} trajectories
 * @param {object} [opts]
 * @param {number} [opts.weightCap=Infinity] - Upper bound on each importance weight.
 * @param {{low: number, high: number}} [opts.rewardClip] - Defaults to `{low: 0, high: 1}`.
 * @returns {{value: number, standardError: number, effectiveSampleSize: number, n: number, maxImportanceWeight: number}}
 *   An empty input yields `zeroEstimate()`.
 * @throws {ValidationError} When a trajectory's `behaviorProb` is not a number
 *   greater than 0 (this includes NaN and missing values).
 */
function doublyRobust(trajectories, opts = {}) {
  const cap = opts.weightCap ?? Infinity;
  const clip = opts.rewardClip ?? { low: 0, high: 1 };
  if (trajectories.length === 0) return zeroEstimate();
  const contributions = [];
  let maxW = 0;
  let sumW = 0;
  let sumW2 = 0;
  for (const t of trajectories) {
    // `!(p > 0)` also rejects NaN/undefined, which a `p <= 0` test lets through.
    if (!(t.behaviorProb > 0)) {
      throw new ValidationError(`doublyRobust: behaviorProb must be > 0 (runId=${t.runId})`);
    }
    const w = Math.min(cap, t.targetProb / t.behaviorProb);
    const r = clamp(t.reward, clip.low, clip.high);
    const hasModel = typeof t.qHat === "number" && Number.isFinite(t.qHat);
    if (hasModel) {
      const q = clamp(t.qHat, clip.low, clip.high);
      contributions.push(q + w * (r - q));
    } else {
      contributions.push(w * r);
    }
    if (w > maxW) maxW = w;
    sumW += w;
    sumW2 += w * w;
  }
  const n = contributions.length;
  const value = contributions.reduce((s, x) => s + x, 0) / n;
  // Sample variance of the contributions (n - 1 denominator, floored at 1).
  const variance = contributions.reduce((s, x) => s + (x - value) ** 2, 0) / Math.max(1, n - 1);
  const effN = sumW === 0 ? 0 : sumW * sumW / sumW2;
  return {
    value,
    standardError: Math.sqrt(variance / n),
    effectiveSampleSize: effN,
    n,
    maxImportanceWeight: maxW
  };
}
329
/**
 * Run all three off-policy estimators on the same trajectories and options.
 *
 * @param {Array} trajectories - Passed unchanged to each estimator.
 * @param {object} [opts] - Passed unchanged to each estimator.
 * @returns {{ips: object, snips: object, dr: object}} Results of
 *   `inverseProbabilityWeighting`, `selfNormalizedImportanceWeighting` and
 *   `doublyRobust`, evaluated in that order.
 */
function offPolicyEstimateAll(trajectories, opts = {}) {
  const ips = inverseProbabilityWeighting(trajectories, opts);
  const snips = selfNormalizedImportanceWeighting(trajectories, opts);
  const dr = doublyRobust(trajectories, opts);
  return { ips, snips, dr };
}
336
/**
 * Estimate returned by the off-policy estimators for an empty trajectory list:
 * every field is zero.
 *
 * @returns {{value: number, standardError: number, effectiveSampleSize: number, n: number, maxImportanceWeight: number}}
 *   A fresh object on every call.
 */
function zeroEstimate() {
  const empty = {
    value: 0,
    standardError: 0,
    effectiveSampleSize: 0,
    n: 0,
    maxImportanceWeight: 0
  };
  return empty;
}
339
/**
 * Restrict `x` to the closed interval [lo, hi].
 *
 * Non-finite inputs (NaN, ±Infinity, non-numbers) map to `lo`.
 *
 * @param {number} x
 * @param {number} lo
 * @param {number} hi
 * @returns {number}
 */
function clamp(x, lo, hi) {
  return Number.isFinite(x) ? Math.max(lo, Math.min(hi, x)) : lo;
}
343
+
344
+ // src/rl/preferences.ts
345
// Split tag used by extractPreferences when the caller does not supply one.
var SPLIT_TAG_DEFAULT = "holdout";
/**
 * Default reward accessor: the run's holdout score, or its search score when
 * the holdout score is null/undefined. Returns null unless the chosen value is
 * a finite number.
 *
 * @param {{outcome: {holdoutScore: *, searchScore: *}}} run
 * @returns {(number|null)}
 */
var DEFAULT_REWARD = (run) => {
  const candidate = run.outcome.holdoutScore ?? run.outcome.searchScore;
  if (typeof candidate !== "number") return null;
  return Number.isFinite(candidate) ? candidate : null;
};
350
/**
 * Turn scored run records into chosen/rejected preference pairs.
 *
 * Only runs whose `splitTag` matches and for which `rewardOf` returns a
 * non-null score are considered. Pairing depends on `opts.strategy`:
 *
 * - `"paired-by-scenario-and-seed"` (default): group by scenario and seed,
 *   then pair every two runs in a group that come from different candidates.
 * - `"paired-by-scenario"`: per scenario, average each candidate's scores and
 *   pair every two candidates.
 * - any other value: per scenario, pair only the lowest- and highest-scoring
 *   runs (skipped, and counted as a singleton cell, when both belong to the
 *   same candidate).
 *
 * Pair construction and the margin test are delegated to `makePair`.
 *
 * @param {Array<object>} runs - Run records.
 * @param {object} [opts]
 * @param {string} [opts.strategy="paired-by-scenario-and-seed"]
 * @param {number} [opts.minMargin=0.05] - Forwarded to `makePair`.
 * @param {string} [opts.splitTag] - Defaults to `SPLIT_TAG_DEFAULT`.
 * @param {Function} [opts.rewardOf] - `(run) => number | null`; defaults to `DEFAULT_REWARD`.
 * @returns {{pairs: object[], cellsInspected: number, pairsBelowMargin: number, cellsSingleton: number, strategy: string}}
 */
function extractPreferences(runs, opts = {}) {
  const strategy = opts.strategy ?? "paired-by-scenario-and-seed";
  const minMargin = opts.minMargin ?? 0.05;
  const splitTag = opts.splitTag ?? SPLIT_TAG_DEFAULT;
  const rewardOf = opts.rewardOf ?? DEFAULT_REWARD;
  const scoredEntries = [];
  for (const run of runs) {
    if (run.splitTag !== splitTag) continue;
    const score = rewardOf(run);
    if (score === null) continue;
    scoredEntries.push({ run, score });
  }
  const pairs = [];
  let pairsBelowMargin = 0;
  let cellsSingleton = 0;
  let cellsInspected = 0;
  // Shared bookkeeping: admit the pair or count it as below the margin.
  const consider = (a, b, scenarioId) => {
    const result = makePair(a, b, scenarioId, minMargin);
    if (result.kind === "admit") pairs.push(result.pair);
    else pairsBelowMargin++;
  };
  if (strategy === "paired-by-scenario-and-seed") {
    const groups = /* @__PURE__ */ new Map();
    for (const e of scoredEntries) {
      const scenarioId = scenarioOf(e.run);
      const key = `${scenarioId}::${e.run.seed}`;
      let group = groups.get(key);
      if (!group) {
        // Keep the scenario id alongside the members; re-deriving it by
        // splitting the key would truncate ids that themselves contain "::".
        group = { scenarioId, members: [] };
        groups.set(key, group);
      }
      group.members.push(e);
    }
    for (const { scenarioId, members } of groups.values()) {
      cellsInspected++;
      if (members.length < 2) {
        cellsSingleton++;
        continue;
      }
      for (let i = 0; i < members.length; i++) {
        for (let j = i + 1; j < members.length; j++) {
          if (members[i].run.candidateId === members[j].run.candidateId) continue;
          consider(members[i], members[j], scenarioId);
        }
      }
    }
  } else if (strategy === "paired-by-scenario") {
    const byScenarioVariant = /* @__PURE__ */ new Map();
    for (const e of scoredEntries) {
      const sid = scenarioOf(e.run);
      let perScenario = byScenarioVariant.get(sid);
      if (!perScenario) {
        perScenario = /* @__PURE__ */ new Map();
        byScenarioVariant.set(sid, perScenario);
      }
      const cur = perScenario.get(e.run.candidateId);
      if (cur) {
        cur.sum += e.score;
        cur.n++;
      } else {
        // The first run seen for a candidate stands in for the aggregate.
        perScenario.set(e.run.candidateId, { run: e.run, sum: e.score, n: 1 });
      }
    }
    for (const [sid, perVariant] of byScenarioVariant.entries()) {
      cellsInspected++;
      const arr = [...perVariant.entries()].map(([vid, agg]) => ({
        run: agg.run,
        score: agg.sum / agg.n,
        variantId: vid
      }));
      if (arr.length < 2) {
        cellsSingleton++;
        continue;
      }
      for (let i = 0; i < arr.length; i++) {
        for (let j = i + 1; j < arr.length; j++) {
          consider(arr[i], arr[j], sid);
        }
      }
    }
  } else {
    const byScenario = /* @__PURE__ */ new Map();
    for (const e of scoredEntries) {
      const sid = scenarioOf(e.run);
      const arr = byScenario.get(sid) ?? [];
      arr.push(e);
      byScenario.set(sid, arr);
    }
    for (const [sid, arr] of byScenario.entries()) {
      cellsInspected++;
      if (arr.length < 2) {
        cellsSingleton++;
        continue;
      }
      const sorted = [...arr].sort((a, b) => a.score - b.score);
      const top = sorted[sorted.length - 1];
      const bot = sorted[0];
      if (top.run.candidateId === bot.run.candidateId) {
        cellsSingleton++;
        continue;
      }
      consider(bot, top, sid);
    }
  }
  return { pairs, cellsInspected, pairsBelowMargin, cellsSingleton, strategy };
}
454
/**
 * Project preference triples into `{prompt, chosen, rejected}` rows.
 *
 * `chosen` and `rejected` carry the prompt hashes from the triple's `meta`;
 * the caller substitutes the model output via the runId map.
 *
 * @param {Array<{meta: {chosenPromptHash: *, rejectedPromptHash: *}}>} triples
 * @param {Function} promptOf - Called with the chosen prompt hash to obtain `prompt`.
 * @returns {Array<{prompt: *, chosen: *, rejected: *}>}
 */
function toTRLFormat(triples, promptOf) {
  return triples.map(({ meta }) => {
    const prompt = promptOf(meta.chosenPromptHash);
    return {
      prompt,
      chosen: meta.chosenPromptHash,
      rejected: meta.rejectedPromptHash
    };
  });
}
462
/**
 * Project preference triples onto the four fields
 * `{scenarioId, chosenRunId, rejectedRunId, margin}`, where `margin` is the
 * triple's `marginScore`.
 *
 * @param {Array<{scenarioId: *, chosenRunId: *, rejectedRunId: *, marginScore: number}>} triples
 * @returns {Array<{scenarioId: *, chosenRunId: *, rejectedRunId: *, margin: number}>}
 */
function toAnthropicFormat(triples) {
  const project = ({ scenarioId, chosenRunId, rejectedRunId, marginScore }) => ({
    scenarioId,
    chosenRunId,
    rejectedRunId,
    margin: marginScore
  });
  return triples.map((triple) => project(triple));
}
470
/**
 * Build a preference pair from two scored entries, or reject it when their
 * scores differ by less than `minMargin`.
 *
 * The higher-scoring entry becomes `chosen`; on an exact tie `b` is chosen.
 * The pair's `seed` is set only when both runs share the same seed.
 *
 * @param {{run: object, score: number}} a
 * @param {{run: object, score: number}} b
 * @param {*} scenarioId - Stored on the pair as-is.
 * @param {number} minMargin - Minimum absolute score difference to admit.
 * @returns {({kind: "reject"}|{kind: "admit", pair: object})}
 */
function makePair(a, b, scenarioId, minMargin) {
  if (Math.abs(a.score - b.score) < minMargin) return { kind: "reject" };
  const aWins = a.score > b.score;
  const chosen = aWins ? a : b;
  const rejected = aWins ? b : a;
  const sharedSeed = chosen.run.seed === rejected.run.seed ? chosen.run.seed : void 0;
  const meta = {
    chosenPromptHash: chosen.run.promptHash,
    rejectedPromptHash: rejected.run.promptHash,
    chosenConfigHash: chosen.run.configHash,
    rejectedConfigHash: rejected.run.configHash,
    chosenModel: chosen.run.model,
    rejectedModel: rejected.run.model
  };
  const pair = {
    scenarioId,
    chosenRunId: chosen.run.runId,
    rejectedRunId: rejected.run.runId,
    chosenVariantId: chosen.run.candidateId,
    rejectedVariantId: rejected.run.candidateId,
    marginScore: chosen.score - rejected.score,
    scores: { chosen: chosen.score, rejected: rejected.score },
    seed: sharedSeed,
    meta
  };
  return { kind: "admit", pair };
}
496
/**
 * Resolve the scenario identifier for a run record.
 *
 * Tried in order: a non-empty string `run.scenarioId`; a finite numeric
 * `run.outcome.raw.scenario_id` (stringified); a non-empty string
 * `run.outcome.raw.scenario_id`; and finally `run.experimentId`.
 *
 * An empty raw string is treated the same way as an empty `run.scenarioId`
 * (skipped) instead of producing the scenario key "".
 *
 * @param {{scenarioId: *, outcome: {raw: object}, experimentId: *}} run
 * @returns {*} The scenario id, or `run.experimentId` as the fallback.
 */
function scenarioOf(run) {
  if (typeof run.scenarioId === "string" && run.scenarioId.length > 0) return run.scenarioId;
  const fromRaw = run.outcome.raw.scenario_id;
  if (typeof fromRaw === "number" && Number.isFinite(fromRaw)) return String(fromRaw);
  if (typeof fromRaw === "string" && fromRaw.length > 0) return fromRaw;
  return run.experimentId;
}
503
+
504
+ // src/rl/run-record-adapters.ts
505
/**
 * Convert one optimizer trial into a run record.
 *
 * - `promptHash` / `configHash` on `ctx` may be plain values or functions of
 *   the trial.
 * - A missing or non-finite `trial.cost` falls back to `ctx.defaultCostUsd`
 *   (or 0) and is flagged with `raw.cost_unknown = 1`.
 * - A non-finite `trial.score` is recorded as 0.
 * - The score is stored under `holdoutScore` when the split tag is "holdout",
 *   otherwise under `searchScore`.
 * - Token usage is always recorded as zero input/output.
 *
 * @param {object} trial - Trial with `variantId`, `scenarioId`, `rep`, `score`,
 *   `ok`, and optional `error`, `cost`, `durationMs`, `metrics`.
 * @param {object} ctx - Shared context: `experimentId`, `model`, `promptHash`,
 *   `configHash`, `commitSha`, optional `splitTag` (default "search") and
 *   `defaultCostUsd`.
 * @param {object} [opts]
 * @param {*} [opts.runId] - Overrides the id produced by `defaultRunId`.
 * @param {Function} [opts.experimentIdPerTrial] - `(trial) => experimentId`;
 *   a null/undefined result falls back to `ctx.experimentId`.
 * @returns {object} The run record.
 */
function trialToRunRecord(trial, ctx, opts = {}) {
  const splitTag = ctx.splitTag ?? "search";
  let promptHash = ctx.promptHash;
  if (typeof promptHash === "function") promptHash = ctx.promptHash(trial);
  let configHash = ctx.configHash;
  if (typeof configHash === "function") configHash = ctx.configHash(trial);
  const runId = opts.runId ?? defaultRunId(ctx, trial);
  const experimentId = opts.experimentIdPerTrial?.(trial) ?? ctx.experimentId;
  const hasCost = typeof trial.cost === "number" && Number.isFinite(trial.cost);
  const raw = { ...(trial.metrics ?? {}) };
  if (!hasCost) raw.cost_unknown = 1;
  if (typeof trial.durationMs === "number") raw.duration_ms = trial.durationMs;
  raw.rep = trial.rep;
  const score = Number.isFinite(trial.score) ? trial.score : 0;
  const outcome = { raw };
  outcome[splitTag === "holdout" ? "holdoutScore" : "searchScore"] = score;
  let failureMode;
  if (!trial.ok) {
    failureMode = trial.error ? "optimizer_trial_error" : "optimizer_trial_failed";
  }
  return {
    runId,
    experimentId,
    candidateId: trial.variantId,
    seed: trial.rep,
    model: ctx.model,
    promptHash,
    configHash,
    commitSha: ctx.commitSha,
    wallMs: trial.durationMs ?? 0,
    costUsd: hasCost ? trial.cost : ctx.defaultCostUsd ?? 0,
    tokenUsage: { input: 0, output: 0 },
    outcome,
    failureMode,
    splitTag,
    scenarioId: trial.scenarioId
  };
}
539
/**
 * Convert a list of optimizer trials into run records using a shared context.
 * Each trial goes through `trialToRunRecord` with default options.
 *
 * @param {Array<object>} trials
 * @param {object} ctx - Forwarded to `trialToRunRecord` for every trial.
 * @returns {Array<object>} One run record per trial, in input order.
 */
function trialsToRunRecords(trials, ctx) {
  const convert = (trial) => trialToRunRecord(trial, ctx);
  return trials.map(convert);
}
542
/**
 * Convert a multi-layer verification report into a run record.
 *
 * - The report's counts, duration and blended score are copied into
 *   `outcome.raw`, followed by per-layer entries: `layer.<name>` (numeric
 *   score, when present), `layer_<name>_pass` (1 or 0) and
 *   `layer.<name>.<key>` for each finite numeric diagnostic.
 * - The blended score is stored under `holdoutScore` when the split tag is
 *   "holdout", otherwise under `searchScore`.
 * - `failureMode` is derived, via `failureModeFromLayer`, from the first layer
 *   whose status is "fail" or "error".
 * - When `ctx.promptHash` / `ctx.configHash` is a function it is not called; a
 *   64-character placeholder ("p…" / "c…") is recorded instead.
 *
 * @param {object} report - Report with `startedAt`, `passCount`, `failCount`,
 *   `errorCount`, `skippedCount`, `durationMs`, `blendedScore`, `layers`.
 * @param {object} ctx - `candidateId`, `experimentId`, `model`, `promptHash`,
 *   `configHash`, `commitSha`, `scenarioId`, optional `splitTag` (default
 *   "search") and `defaultCostUsd`.
 * @param {object} [opts]
 * @param {*} [opts.runId] - Overrides the generated `run-<candidate>-<experiment>-<startedAt>` id.
 * @returns {object} The run record (seed 0, zero token usage).
 */
function verificationReportToRunRecord(report, ctx, opts = {}) {
  const splitTag = ctx.splitTag ?? "search";
  const runId = opts.runId ?? `run-${ctx.candidateId}-${ctx.experimentId}-${report.startedAt}`;
  const promptHash = typeof ctx.promptHash === "function" ? "p".repeat(64) : ctx.promptHash;
  const configHash = typeof ctx.configHash === "function" ? "c".repeat(64) : ctx.configHash;
  const raw = {
    pass_count: report.passCount,
    fail_count: report.failCount,
    error_count: report.errorCount,
    skipped_count: report.skippedCount,
    duration_ms: report.durationMs,
    blended_score: report.blendedScore
  };
  report.layers.forEach((entry) => {
    const name = entry.layer;
    if (typeof entry.score === "number") raw[`layer.${name}`] = entry.score;
    raw[`layer_${name}_pass`] = entry.status === "pass" ? 1 : 0;
    if (!entry.diagnostics) return;
    for (const [key, value] of Object.entries(entry.diagnostics)) {
      if (typeof value === "number" && Number.isFinite(value)) raw[`layer.${name}.${key}`] = value;
    }
  });
  const firstFail = report.layers.find((entry) => entry.status === "fail" || entry.status === "error");
  const outcome = { raw };
  outcome[splitTag === "holdout" ? "holdoutScore" : "searchScore"] = report.blendedScore;
  return {
    runId,
    experimentId: ctx.experimentId,
    candidateId: ctx.candidateId,
    seed: 0,
    model: ctx.model,
    promptHash,
    configHash,
    commitSha: ctx.commitSha,
    wallMs: report.durationMs,
    costUsd: ctx.defaultCostUsd ?? 0,
    tokenUsage: { input: 0, output: 0 },
    outcome,
    failureMode: firstFail ? failureModeFromLayer(firstFail) : void 0,
    splitTag,
    scenarioId: ctx.scenarioId
  };
}
586
/**
 * Convert a per-variant aggregate into a single run record.
 *
 * - `outcome.raw` holds the aggregate's metrics plus `ok_rate`, `duration_ms`
 *   and `n_scenarios` (these three override same-named metrics).
 * - The mean score is stored under `holdoutScore` when the split tag is
 *   "holdout", otherwise under `searchScore`.
 * - When `ctx.promptHash` / `ctx.configHash` is a function it is not called; a
 *   64-character placeholder ("p…" / "c…") is recorded instead.
 * - The record carries no `scenarioId` or `failureMode`.
 *
 * @param {object} agg - Aggregate with `variantId`, `metrics`, `okRate`,
 *   `meanDurationMs`, `scenarios`, `meanScore`, `meanCost`.
 * @param {object} ctx - `experimentId`, `model`, `promptHash`, `configHash`,
 *   `commitSha`, optional `splitTag` (default "search").
 * @param {object} [opts]
 * @param {*} [opts.runId] - Overrides the generated `agg-<variant>-<experiment>` id.
 * @returns {object} The run record (seed 0, zero token usage).
 */
function variantAggregateToRunRecord(agg, ctx, opts = {}) {
  const splitTag = ctx.splitTag ?? "search";
  const runId = opts.runId ?? `agg-${agg.variantId}-${ctx.experimentId}`;
  const promptHash = typeof ctx.promptHash === "function" ? "p".repeat(64) : ctx.promptHash;
  const configHash = typeof ctx.configHash === "function" ? "c".repeat(64) : ctx.configHash;
  const raw = { ...agg.metrics };
  raw.ok_rate = agg.okRate;
  raw.duration_ms = agg.meanDurationMs;
  raw.n_scenarios = agg.scenarios.length;
  const outcome = { raw };
  outcome[splitTag === "holdout" ? "holdoutScore" : "searchScore"] = agg.meanScore;
  return {
    runId,
    experimentId: ctx.experimentId,
    candidateId: agg.variantId,
    seed: 0,
    model: ctx.model,
    promptHash,
    configHash,
    commitSha: ctx.commitSha,
    wallMs: agg.meanDurationMs,
    costUsd: agg.meanCost,
    tokenUsage: { input: 0, output: 0 },
    outcome,
    splitTag
  };
}
616
/**
 * Build the default run id for one trial.
 *
 * @param {object} ctx - Context exposing `experimentId`.
 * @param {object} t - Trial exposing `variantId`, `scenarioId` and `rep`.
 * @returns {string} `run-<experimentId>-<variantId>-<scenarioId>-<rep>`.
 */
function defaultRunId(ctx, t) {
  const { experimentId } = ctx;
  const { variantId, scenarioId, rep } = t;
  return `run-${experimentId}-${variantId}-${scenarioId}-${rep}`;
}
619
/**
 * Derive a failure-mode label from a layer result.
 *
 * Every status ("error", "fail", "timeout" or anything else) maps to the same
 * `layer_<layer>_<status>` shape, so a single template covers all cases.
 *
 * @param {object} layer - Layer result exposing `layer` (its name) and `status`.
 * @returns {string} The failure-mode label.
 */
function failureModeFromLayer(layer) {
  const { layer: layerName, status } = layer;
  return `layer_${layerName}_${status}`;
}
625
+
626
+ // src/rl/tournament.ts
627
/**
 * Fit Bradley-Terry strengths to a list of pairwise outcomes using the
 * iterative minorization-maximization update.
 *
 * Each outcome contributes `weight` (default 1) comparisons between `winner`
 * and `loser`; a draw splits the win credit evenly. `smoothing` is added to
 * every candidate's win total so a candidate with no wins keeps a positive
 * strength. Strengths are renormalized to geometric mean 1 on every pass.
 *
 * @param {Array<{winner: string, loser: string, draw?: boolean, weight?: number}>} outcomes
 * @param {{tolerance?: number, maxIterations?: number, smoothing?: number}} [opts]
 *   Defaults: tolerance 1e-6, maxIterations 256, smoothing 0.1.
 * @returns {{ratings: Array<object>, iterations: number, finalDelta: number, converged: boolean}}
 *   `ratings` is sorted by descending strength. `iterations` is the number of
 *   update passes actually executed (including the pass on which convergence
 *   was detected).
 */
function fitBradleyTerry(outcomes, opts = {}) {
  const tol = opts.tolerance ?? 1e-6;
  const maxIter = opts.maxIterations ?? 256;
  const smoothing = opts.smoothing ?? 0.1;
  const candidates = /* @__PURE__ */ new Set();
  for (const o of outcomes) {
    candidates.add(o.winner);
    candidates.add(o.loser);
  }
  const ids = [...candidates].sort();
  const idx = new Map(ids.map((id, i) => [id, i]));
  const n = ids.length;
  if (n === 0) return { ratings: [], iterations: 0, finalDelta: 0, converged: true };
  if (n === 1) {
    return {
      ratings: [{ candidateId: ids[0], strength: 1, logStrength: 0, n: 0, wins: 0 }],
      iterations: 0,
      finalDelta: 0,
      converged: true
    };
  }
  // W[i][j]: weighted wins of i over j. N[i][j]: weighted comparisons of i and j.
  const W = Array.from({ length: n }, () => new Array(n).fill(0));
  const N = Array.from({ length: n }, () => new Array(n).fill(0));
  for (const o of outcomes) {
    const i = idx.get(o.winner);
    const j = idx.get(o.loser);
    const w = o.weight ?? 1;
    if (o.draw) {
      W[i][j] += 0.5 * w;
      W[j][i] += 0.5 * w;
    } else {
      W[i][j] += w;
    }
    N[i][j] += w;
    N[j][i] += w;
  }
  const winsTotal = new Array(n).fill(0);
  const compsTotal = new Array(n).fill(0);
  for (let i = 0; i < n; i++) {
    for (let j = 0; j < n; j++) {
      winsTotal[i] += W[i][j];
      compsTotal[i] += N[i][j];
    }
    winsTotal[i] += smoothing;
  }
  let theta = new Array(n).fill(1);
  let iterations = 0;
  let delta = Infinity;
  while (iterations < maxIter) {
    const newTheta = new Array(n);
    for (let i = 0; i < n; i++) {
      let denom = 0;
      for (let j = 0; j < n; j++) {
        if (j === i) continue;
        if (N[i][j] === 0) continue;
        denom += N[i][j] / (theta[i] + theta[j]);
      }
      // A candidate with no comparisons against others keeps its strength.
      newTheta[i] = denom === 0 ? theta[i] : winsTotal[i] / denom;
    }
    // Renormalize to geometric mean 1 so the scale cannot drift.
    let logSum = 0;
    for (let i = 0; i < n; i++) logSum += Math.log(Math.max(1e-300, newTheta[i]));
    const norm = Math.exp(logSum / n);
    for (let i = 0; i < n; i++) newTheta[i] = newTheta[i] / norm;
    delta = 0;
    for (let i = 0; i < n; i++) {
      const d = Math.abs(newTheta[i] - theta[i]) / Math.max(1e-12, theta[i]);
      if (d > delta) delta = d;
    }
    theta = newTheta;
    // Count the pass before testing for convergence, so the reported number
    // is the same kind of quantity whether or not the loop exits early.
    iterations++;
    if (delta < tol) break;
  }
  const logs = theta.map((t) => Math.log(Math.max(1e-300, t)));
  // Fold instead of spreading into Math.min so very large candidate sets
  // cannot hit the engine's argument-count limit.
  const minLog = logs.reduce((m, v) => Math.min(m, v), Infinity);
  const ratings = ids.map((id, i) => ({
    candidateId: id,
    strength: theta[i],
    logStrength: logs[i] - minLog,
    n: compsTotal[i],
    wins: winsTotal[i] - smoothing
  }));
  return {
    ratings: ratings.sort((a, b) => b.strength - a.strength),
    iterations,
    finalDelta: delta,
    converged: delta < tol
  };
}
713
/**
 * Apply one Elo update to `ratings` in place for a single pairwise outcome.
 *
 * Candidates missing from the map start at `opts.defaultRating` (1500). The
 * step size is `opts.kFactor` (32) scaled by the outcome's `weight` (1). A draw
 * scores 0.5 for both sides.
 *
 * @param {Map<string, number>} ratings - Mutable rating table.
 * @param {{winner: string, loser: string, draw?: boolean, weight?: number}} outcome
 * @param {{defaultRating?: number, kFactor?: number}} [opts]
 * @returns {{winnerDelta: number, loserDelta: number}} The applied rating changes.
 */
function applyEloUpdate(ratings, outcome, opts = {}) {
  const { winner, loser } = outcome;
  const fallbackRating = opts.defaultRating ?? 1500;
  const k = opts.kFactor ?? 32;
  const w = outcome.weight ?? 1;
  const winnerRating = ratings.get(winner) ?? fallbackRating;
  const loserRating = ratings.get(loser) ?? fallbackRating;
  // Logistic expectation of the winner's score on the 400-point Elo scale.
  const expectedW = 1 / (1 + 10 ** ((loserRating - winnerRating) / 400));
  let scoreW = 1;
  let scoreL = 0;
  if (outcome.draw) {
    scoreW = 0.5;
    scoreL = 0.5;
  }
  const winnerDelta = k * w * (scoreW - expectedW);
  const loserDelta = k * w * (scoreL - (1 - expectedW));
  ratings.set(winner, winnerRating + winnerDelta);
  ratings.set(loser, loserRating + loserDelta);
  return { winnerDelta, loserDelta };
}
728
/**
 * Turn scored campaign runs into pairwise outcomes.
 *
 * Runs sharing a `matchKey` are compared pairwise. Two runs of the same
 * candidate are never compared. A score gap at or below `input.drawMargin`
 * (default 0) yields a draw with weight 1; otherwise the higher score wins and
 * the gap becomes the outcome weight.
 *
 * Runs whose score is not a finite number are ignored: comparing them would
 * emit an outcome with a NaN/Infinity weight, which makes any downstream
 * rating fit non-finite.
 *
 * @param {{runs: Iterable<{matchKey: string, candidateId: string, score: number}>, drawMargin?: number}} input
 * @returns {Array<{winner: string, loser: string, draw?: boolean, weight: number}>}
 */
function buildPairwiseFromCampaign(input) {
  const drawMargin = input.drawMargin ?? 0;
  const byKey = /* @__PURE__ */ new Map();
  for (const r of input.runs) {
    if (typeof r.score !== "number" || !Number.isFinite(r.score)) continue;
    const arr = byKey.get(r.matchKey) ?? [];
    arr.push({ candidateId: r.candidateId, score: r.score });
    byKey.set(r.matchKey, arr);
  }
  const outcomes = [];
  for (const arr of byKey.values()) {
    for (let i = 0; i < arr.length; i++) {
      for (let j = i + 1; j < arr.length; j++) {
        const a = arr[i];
        const b = arr[j];
        if (a.candidateId === b.candidateId) continue;
        const margin = Math.abs(a.score - b.score);
        if (margin <= drawMargin) {
          outcomes.push({ winner: a.candidateId, loser: b.candidateId, draw: true, weight: 1 });
        } else {
          const [winner, loser] = a.score > b.score ? [a, b] : [b, a];
          outcomes.push({ winner: winner.candidateId, loser: loser.candidateId, weight: margin });
        }
      }
    }
  }
  return outcomes;
}
755
+
756
+ // src/rl/verifiable-reward.ts
757
/**
 * Layer names treated as deterministic when the caller does not supply its
 * own `deterministicLayers` list.
 */
var DEFAULT_DETERMINISTIC_LAYERS = /* @__PURE__ */ new Set([
  "install", "typecheck", "build", "lint",
  "test", "compile", "schema", "sandbox",
  "unit_tests", "integration_tests"
]);
769
/**
 * Map a layer name to a reward source label by case-insensitive substring
 * match. Rules are tried in order, so a name containing "test" is labelled
 * "test" even if it also contains "build". Unmatched names are "composite".
 *
 * @param {string} name - Layer name.
 * @returns {string} One of "test", "compile", "schema", "sandbox", "judge", "composite".
 */
var DEFAULT_SOURCE_FOR = (name) => {
  const needle = name.toLowerCase();
  const rules = [
    [["test"], "test"],
    [["compile", "build", "typecheck", "lint"], "compile"],
    [["schema"], "schema"],
    [["sandbox"], "sandbox"],
    [["judge", "semantic"], "judge"]
  ];
  for (const [fragments, source] of rules) {
    if (fragments.some((fragment) => needle.includes(fragment))) return source;
  }
  return "composite";
};
779
/**
 * Derive a single reward from a layered report.
 *
 * Resolution order:
 *  1. Exactly one deterministic layer with a finite score: that score.
 *  2. Several such layers: their weighted mean, using `detail.weight`
 *     (default 1) per layer.
 *  3. Otherwise, when `fallbackToJudge` is enabled (the default): the first
 *     scored layer whose source label is "judge", else the first scored layer
 *     of any kind. The result is labelled probabilistic, with confidence taken
 *     from the layer's `detail.confidence` when that is a number and from
 *     `judgeConfidenceFloor` (default 0.7) otherwise.
 *
 * @param {{layers: Array<object>}} report
 * @param {object} [opts] - `deterministicLayers`, `sourceFor`, `fallbackToJudge`,
 *   `judgeConfidenceFloor`.
 * @returns {object|null} The reward, or null when nothing usable was found.
 */
function extractVerifiableReward(report, opts = {}) {
  const deterministicSet = new Set(opts.deterministicLayers ?? [...DEFAULT_DETERMINISTIC_LAYERS]);
  const sourceFor = opts.sourceFor ?? DEFAULT_SOURCE_FOR;
  const fallbackToJudge = opts.fallbackToJudge ?? true;
  const judgeFloor = opts.judgeConfidenceFloor ?? 0.7;
  const hasFiniteScore = (l) => typeof l.score === "number" && Number.isFinite(l.score);

  const deterministic = report.layers.filter(
    (l) => deterministicSet.has(l.layer) && hasFiniteScore(l)
  );

  if (deterministic.length === 1) {
    const [only] = deterministic;
    return {
      value: clamp01(only.score),
      source: sourceFor(only.layer),
      determinism: "deterministic",
      confidence: 1,
      origin: only.layer,
      breakdown: layerBreakdown(only)
    };
  }

  if (deterministic.length > 1) {
    const breakdown = {};
    let weightedSum = 0;
    let totalWeight = 0;
    deterministic.forEach((l) => {
      const w = l.detail?.weight ?? 1;
      weightedSum += w * (l.score ?? 0);
      totalWeight += w;
      breakdown[l.layer] = l.score;
    });
    return {
      value: totalWeight === 0 ? 0 : clamp01(weightedSum / totalWeight),
      source: "composite",
      determinism: "deterministic",
      confidence: 1,
      origin: deterministic.map((l) => l.layer).join("+"),
      breakdown
    };
  }

  if (!fallbackToJudge) return null;

  // Prefer a layer labelled "judge"; fall back to whichever layer is scored first.
  const judge =
    report.layers.find((l) => hasFiniteScore(l) && sourceFor(l.layer) === "judge") ??
    report.layers.find(hasFiniteScore);
  if (!judge) return null;

  const reportedConfidence = judge.detail?.confidence;
  return {
    value: clamp01(judge.score),
    source: "judge",
    determinism: "probabilistic",
    confidence: typeof reportedConfidence === "number" ? reportedConfidence : judgeFloor,
    origin: judge.layer,
    breakdown: layerBreakdown(judge)
  };
}
832
/**
 * Derive one reward per run record from the flattened `outcome.raw` metrics.
 *
 * Only keys of the exact form `layer.<name>` (no further dot) with a finite
 * numeric value are treated as layer scores. One deterministic layer yields
 * its score; several yield their unweighted mean plus a per-layer breakdown.
 * With no deterministic layer the run's holdout score (or search score) is
 * used as a probabilistic "judge" reward, unless `fallbackToJudge` is false.
 *
 * @param {Array<object>} runs - Run records exposing `runId` and `outcome`.
 * @param {object} [opts] - `deterministicLayers`, `sourceFor`, `fallbackToJudge`,
 *   `judgeConfidenceFloor` (default 0.7).
 * @returns {Array<{runId: string, reward: object|null}>} One entry per run, in order.
 */
function extractVerifiableRewardsFromRecords(runs, opts = {}) {
  const sourceFor = opts.sourceFor ?? DEFAULT_SOURCE_FOR;
  const deterministicSet = new Set(opts.deterministicLayers ?? [...DEFAULT_DETERMINISTIC_LAYERS]);
  const fallbackToJudge = opts.fallbackToJudge ?? true;
  const judgeFloor = opts.judgeConfidenceFloor ?? 0.7;
  const layerPrefix = "layer.";

  const rewardFor = (run) => {
    const det = [];
    for (const [key, value] of Object.entries(run.outcome.raw)) {
      // Searching for "." from index 6 (just past the prefix) rejects nested
      // keys such as `layer.<name>.<diagnostic>`.
      const isLayerScoreKey = key.startsWith(layerPrefix) && !key.includes(".", 6);
      if (!isLayerScoreKey) continue;
      if (typeof value !== "number" || !Number.isFinite(value)) continue;
      const name = key.slice(layerPrefix.length);
      if (deterministicSet.has(name)) det.push({ name, score: value });
    }

    if (det.length === 1) {
      const [only] = det;
      return {
        value: clamp01(only.score),
        source: sourceFor(only.name),
        determinism: "deterministic",
        confidence: 1,
        origin: only.name
      };
    }

    if (det.length > 1) {
      let total = 0;
      const breakdown = {};
      for (const { name, score } of det) {
        total += score;
        breakdown[name] = score;
      }
      return {
        value: clamp01(total / det.length),
        source: "composite",
        determinism: "deterministic",
        confidence: 1,
        origin: det.map((l) => l.name).join("+"),
        breakdown
      };
    }

    if (!fallbackToJudge) return null;
    const primary = run.outcome.holdoutScore ?? run.outcome.searchScore;
    if (typeof primary !== "number" || !Number.isFinite(primary)) return null;
    return {
      value: clamp01(primary),
      source: "judge",
      determinism: "probabilistic",
      confidence: judgeFloor,
      origin: "run.outcome.score"
    };
  };

  return runs.map((run) => ({ runId: run.runId, reward: rewardFor(run) }));
}
892
/**
 * Keep only the runs that have a deterministic reward.
 *
 * Rewards are extracted with the judge fallback forced off, regardless of
 * what `opts.fallbackToJudge` says.
 *
 * @param {Array<object>} runs - Run records.
 * @param {object} [opts] - Options forwarded to `extractVerifiableRewardsFromRecords`.
 * @returns {Array<{run: object, reward: object}>} Matching runs paired with their reward.
 */
function filterDeterministicallyRewarded(runs, opts = {}) {
  const extracted = extractVerifiableRewardsFromRecords(runs, { ...opts, fallbackToJudge: false });
  const kept = [];
  for (const [position, entry] of extracted.entries()) {
    const { reward } = entry;
    if (!reward || reward.determinism !== "deterministic") continue;
    kept.push({ run: runs[position], reward });
  }
  return kept;
}
903
/**
 * Clamp a value into [0, 1].
 *
 * @param {*} x - Candidate value.
 * @returns {number} `x` limited to [0, 1]; 0 when `x` is not a finite number.
 */
function clamp01(x) {
  return Number.isFinite(x) ? Math.min(1, Math.max(0, x)) : 0;
}
907
/**
 * Collect the finite numeric entries of a layer's `diagnostics`.
 *
 * @param {object} l - Layer result; `diagnostics` is optional.
 * @returns {object} A new object holding only the finite numeric diagnostics.
 */
function layerBreakdown(l) {
  const numeric = {};
  if (!l.diagnostics) return numeric;
  Object.entries(l.diagnostics).forEach(([key, value]) => {
    const isFiniteNumber = typeof value === "number" && Number.isFinite(value);
    if (isFiniteNumber) numeric[key] = value;
  });
  return numeric;
}
916
+
917
+ // src/rl/active-curriculum.ts
918
+ function varianceBasedCurriculum(observations, candidateCells, opts) {
919
+ const variancePrior = opts.variancePrior ?? 0.05;
920
+ const floor = opts.floorPerCell ?? 1;
921
+ const budget = opts.budget;
922
+ const grouped = /* @__PURE__ */ new Map();
923
+ for (const o of observations) {
924
+ const k = `${o.variantId}::${o.scenarioId}`;
925
+ const arr = grouped.get(k) ?? [];
926
+ arr.push(o.score);
927
+ grouped.set(k, arr);
928
+ }
929
+ const cellStats = candidateCells.map((c) => {
930
+ const k = `${c.variantId}::${c.scenarioId}`;
931
+ const samples = grouped.get(k) ?? [];
932
+ const n = samples.length;
933
+ const mean2 = n === 0 ? 0.5 : samples.reduce((s, v) => s + v, 0) / n;
934
+ const variance = n < 2 ? variancePrior : samples.reduce((s, v) => s + (v - mean2) ** 2, 0) / (n - 1) + variancePrior;
935
+ const weight = Math.sqrt(variance) + 1 / Math.sqrt(Math.max(1, n));
936
+ return { variantId: c.variantId, scenarioId: c.scenarioId, n, mean: mean2, variance, weight };
937
+ });
938
+ const floorTotal = floor * cellStats.length;
939
+ if (floorTotal >= budget) {
940
+ const each = Math.max(1, Math.floor(budget / Math.max(1, cellStats.length)));
941
+ return cellStats.map((c) => ({
942
+ variantId: c.variantId,
943
+ scenarioId: c.scenarioId,
944
+ count: each,
945
+ reason: `floor allocation (budget tight; n=${c.n})`
946
+ }));
947
+ }
948
+ const remaining = budget - floorTotal;
949
+ const totalWeight = cellStats.reduce((s, c) => s + c.weight, 0);
950
+ return cellStats.map((c) => {
951
+ const proportional = totalWeight === 0 ? 0 : Math.round(c.weight / totalWeight * remaining);
952
+ return {
953
+ variantId: c.variantId,
954
+ scenarioId: c.scenarioId,
955
+ count: floor + proportional,
956
+ reason: `variance ${c.variance.toFixed(3)} (n=${c.n}, mean=${c.mean.toFixed(3)})`
957
+ };
958
+ });
959
+ }
960
/**
 * Allocate a run budget with a Thompson-style draw per candidate cell.
 *
 * Observations are tallied as passes/failures per cell (using `pass` when
 * present, else `score >= decisionThreshold`). Each cell then draws one sample
 * from Beta(priorAlpha + passes, priorBeta + failures) and is weighted by a
 * Gaussian kernel of that sample's distance from the threshold, measured in
 * posterior standard deviations (floored at 0.05). The budget is split in
 * proportion to the weights, each share rounded independently.
 *
 * @param {Iterable<{variantId: string, scenarioId: string, score: number, pass?: boolean}>} observations
 * @param {Array<{variantId: string, scenarioId: string}>} candidateCells
 * @param {{budget: number, decisionThreshold?: number, priorAlpha?: number, priorBeta?: number, seed?: number}} opts
 *   Defaults: decisionThreshold 0.5, priorAlpha 1, priorBeta 1.
 * @returns {Array<{variantId: string, scenarioId: string, count: number, reason: string}>}
 */
function thompsonCurriculum(observations, candidateCells, opts) {
  const threshold = opts.decisionThreshold ?? 0.5;
  const alpha0 = opts.priorAlpha ?? 1;
  const beta0 = opts.priorBeta ?? 1;
  const rng = makeRng(opts.seed);
  const keyOf = (cell) => `${cell.variantId}::${cell.scenarioId}`;

  const tallies = /* @__PURE__ */ new Map();
  for (const obs of observations) {
    const key = keyOf(obs);
    const tally = tallies.get(key) ?? { passes: 0, failures: 0 };
    const passed = obs.pass ?? (obs.score >= threshold);
    if (passed) tally.passes += 1;
    else tally.failures += 1;
    tallies.set(key, tally);
  }

  const posteriors = candidateCells.map((cell) => {
    const { passes, failures } = tallies.get(keyOf(cell)) ?? { passes: 0, failures: 0 };
    const a = alpha0 + passes;
    const b = beta0 + failures;
    const sampled = sampleBeta(a, b, rng);
    const distance = Math.abs(sampled - threshold);
    // Variance of a Beta(a, b) distribution.
    const variance = a * b / ((a + b) ** 2 * (a + b + 1));
    const sigma = Math.max(0.05, Math.sqrt(variance));
    const weight = Math.exp(-((distance / sigma) ** 2));
    return {
      variantId: cell.variantId,
      scenarioId: cell.scenarioId,
      n: passes + failures,
      sampled,
      sigma,
      weight,
      a,
      b
    };
  });

  let totalWeight = 0;
  for (const posterior of posteriors) totalWeight += posterior.weight;

  return posteriors.map((posterior) => {
    let proportional = 0;
    if (totalWeight !== 0) proportional = Math.round(posterior.weight / totalWeight * opts.budget);
    return {
      variantId: posterior.variantId,
      scenarioId: posterior.scenarioId,
      count: Math.max(0, proportional),
      reason: `Beta(${posterior.a.toFixed(1)},${posterior.b.toFixed(1)}) sample=${posterior.sampled.toFixed(3)} (target ${threshold})`
    };
  });
}
1006
/**
 * Convert run records into curriculum observations.
 *
 * Runs without a `scenarioId`, or without a finite score, are dropped. With
 * `useHoldout` (default true) the holdout score is preferred and the search
 * score is the fallback; with it off the preference is reversed.
 *
 * @param {Iterable<object>} runs - Run records exposing `candidateId`,
 *   `scenarioId` and `outcome`.
 * @param {{passThreshold?: number, useHoldout?: boolean}} [opts]
 *   Defaults: passThreshold 0.5, useHoldout true.
 * @returns {Array<{variantId: string, scenarioId: string, score: number, pass: boolean}>}
 */
function observationsFromRunRecords(runs, opts = {}) {
  const threshold = opts.passThreshold ?? 0.5;
  const preferHoldout = opts.useHoldout ?? true;
  const pickScore = ({ holdoutScore, searchScore }) =>
    preferHoldout ? holdoutScore ?? searchScore : searchScore ?? holdoutScore;

  const observations = [];
  for (const run of runs) {
    if (!run.scenarioId) continue;
    const score = pickScore(run.outcome);
    const usable = typeof score === "number" && Number.isFinite(score);
    if (usable) {
      observations.push({
        variantId: run.candidateId,
        scenarioId: run.scenarioId,
        score,
        pass: score >= threshold
      });
    }
  }
  return observations;
}
1023
/**
 * Create a random source returning numbers in [0, 1).
 *
 * @param {number} [seed] - When omitted, `Math.random` itself is returned;
 *   otherwise a deterministic 32-bit generator seeded with `seed >>> 0`.
 * @returns {() => number} The random source.
 */
function makeRng(seed) {
  if (seed === void 0) return Math.random;
  let state = seed >>> 0;
  return () => {
    // Advance the 32-bit counter, then scramble it with two multiply-xor rounds.
    state = (state + 0x6d2b79f5) >>> 0;
    const mixed = Math.imul(state ^ (state >>> 15), state | 1);
    const scrambled = mixed ^ (mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61));
    return ((scrambled ^ (scrambled >>> 14)) >>> 0) / 0x100000000;
  };
}
1034
/**
 * Draw one Beta(alpha, beta) sample as a ratio of two Gamma draws.
 *
 * Both shape parameters are raised to at least 1 before sampling, so shapes
 * below 1 are sampled as if they were 1.
 *
 * @param {number} alpha - First shape parameter.
 * @param {number} beta - Second shape parameter.
 * @param {() => number} rng - Uniform [0, 1) source.
 * @returns {number} A sample in (0, 1).
 */
function sampleBeta(alpha, beta, rng) {
  const gammaAlpha = sampleGamma(Math.max(1, alpha), rng);
  const gammaBeta = sampleGamma(Math.max(1, beta), rng);
  return gammaAlpha / (gammaAlpha + gammaBeta);
}
1041
/**
 * Draw one Gamma(shape, 1) sample with the Marsaglia-Tsang squeeze method.
 *
 * The squeeze method itself needs `shape >= 1`. For `0 < shape < 1` the draw
 * is taken at `shape + 1` and scaled by `U^(1/shape)`, the standard boosting
 * identity. Draws for `shape >= 1` consume the random source exactly as
 * before.
 *
 * @param {number} shape - Shape parameter; must be greater than 0.
 * @param {() => number} rng - Uniform [0, 1) source.
 * @returns {number} A non-negative sample.
 * @throws {RangeError} When `shape` is NaN or not greater than 0. Without this
 *   guard such a shape makes every acceptance test compare against NaN, so the
 *   rejection loop never terminates.
 */
function sampleGamma(shape, rng) {
  if (!(shape > 0)) {
    throw new RangeError(`sampleGamma: shape must be greater than 0 (got ${shape})`);
  }
  if (shape < 1) {
    const boosted = sampleGamma(shape + 1, rng);
    // Guard against a zero draw so the power stays well-defined.
    const u = rng() || 1e-12;
    return boosted * Math.pow(u, 1 / shape);
  }
  const d = shape - 1 / 3;
  const c = 1 / Math.sqrt(9 * d);
  while (true) {
    let x;
    let v;
    do {
      // Box-Muller standard normal; zero draws are nudged so log() stays finite.
      const u1 = rng() || 1e-12;
      const u2 = rng() || 1e-12;
      x = Math.sqrt(-2 * Math.log(u1)) * Math.cos(2 * Math.PI * u2);
      v = 1 + c * x;
    } while (v <= 0);
    v = v * v * v;
    const u = rng();
    // Cheap squeeze test first, exact log test second.
    if (u < 1 - 0.0331 * x ** 4) return d * v;
    if (Math.log(u) < 0.5 * x * x + d * (1 - v + Math.log(v))) return d * v;
  }
}
1059
+
1060
+ // src/rl/adaptation-eval.ts
1061
/**
 * Measure how scores change as the adaptation budget `k` grows.
 *
 * For every `k` (ascending) each scenario is run `reps` times through
 * `opts.runner.run({ scenario, k, rep })`, strictly sequentially. Scenarios
 * without a `scenarioId` get the positional fallback `scenario-<index>`.
 *
 * @param {object} opts
 * @param {Iterable<object>} opts.scenarios - Scenarios to evaluate.
 * @param {{run: Function}} opts.runner - Returns (or resolves to) a numeric score.
 * @param {number[]} [opts.ks] - Budgets to probe; default [0, 1, 2, 4, 8, 16].
 * @param {number} [opts.reps] - Repetitions per scenario and k; default 3.
 * @param {number} [opts.passThreshold] - Score counted as a pass; default 0.5.
 * @returns {Promise<{points: Array<object>, firstPassK: number|null, adaptationArea: number}>}
 *   `firstPassK` is the smallest k whose pass rate reaches `passThreshold`;
 *   `adaptationArea` is the trapezoidal area under the mean-score curve
 *   divided by the largest k (0 when the largest k is 0).
 */
async function runAdaptationCurve(opts) {
  const ks = opts.ks ?? [0, 1, 2, 4, 8, 16];
  const reps = opts.reps ?? 3;
  const passThreshold = opts.passThreshold ?? 0.5;
  const sortedKs = [...ks].sort((a, b) => a - b);
  const points = [];
  for (const k of sortedKs) {
    const perScenario = [];
    const allScores = [];
    let totalPasses = 0;
    // A running index keeps fallback ids unique even when the same scenario
    // object is listed more than once, and avoids a linear search per scenario.
    let scenarioIndex = 0;
    for (const scenario of opts.scenarios) {
      const sid = scenario.scenarioId ?? `scenario-${scenarioIndex}`;
      scenarioIndex++;
      const scores = [];
      let passes = 0;
      for (let r = 0; r < reps; r++) {
        const score = await opts.runner.run({ scenario, k, rep: r });
        scores.push(score);
        allScores.push(score);
        if (score >= passThreshold) {
          passes++;
          totalPasses++;
        }
      }
      // With zero repetitions report 0 rather than NaN, matching the guard
      // used for the overall mean below.
      const meanS = scores.length === 0 ? 0 : scores.reduce((s, v) => s + v, 0) / scores.length;
      perScenario.push({ scenarioId: sid, meanScore: meanS, passes, total: scores.length });
    }
    const meanScore = allScores.reduce((s, v) => s + v, 0) / Math.max(1, allScores.length);
    const variance = allScores.length < 2 ? 0 : allScores.reduce((s, v) => s + (v - meanScore) ** 2, 0) / (allScores.length - 1);
    points.push({
      k,
      meanScore,
      passRate: totalPasses / Math.max(1, allScores.length),
      std: Math.sqrt(variance),
      n: allScores.length,
      perScenario
    });
  }
  const firstPassingK = points.find((p) => p.passRate >= passThreshold)?.k ?? null;
  const maxK = sortedKs[sortedKs.length - 1] ?? 1;
  let area = 0;
  for (let i = 1; i < points.length; i++) {
    const x1 = points[i - 1].k;
    const x2 = points[i].k;
    const y1 = points[i - 1].meanScore;
    const y2 = points[i].meanScore;
    area += (y1 + y2) / 2 * (x2 - x1);
  }
  const adaptationArea = maxK === 0 ? 0 : area / maxK;
  return { points, firstPassK: firstPassingK, adaptationArea };
}
1111
/**
 * Compare two adaptation curves point by point.
 *
 * Only values of k present in both curves are compared. For each, the
 * difference of mean scores is reported together with bootstrap confidence
 * intervals over the per-scenario means of each curve.
 *
 * The verdict is "a_better" / "b_better" only when the mean per-k delta and
 * the area delta agree in sign and are not both below 0.02 in magnitude;
 * every other case is "similar".
 *
 * @param {object} a - Curve as returned by `runAdaptationCurve`.
 * @param {object} b - Curve to compare against.
 * @param {{confidence?: number, bootstrapResamples?: number, seed?: number}} [opts]
 *   Defaults: confidence 0.95, bootstrapResamples 500.
 * @returns {{perK: Array<object>, areaDelta: number, firstPassKDelta: number|null, verdict: string, rationale: string}}
 */
function compareAdaptationCurves(a, b, opts = {}) {
  const conf = opts.confidence ?? 0.95;
  const resamples = opts.bootstrapResamples ?? 500;
  const rng = makeRng2(opts.seed);
  const scenarioMeans = (point) => point.perScenario.map((s) => s.meanScore);

  const perK = [];
  for (const pointA of a.points) {
    const pointB = b.points.find((p) => p.k === pointA.k);
    if (pointB) {
      // Curve a is resampled before curve b; the order matters for seeded runs.
      const ciA = bootstrapMeanCi(scenarioMeans(pointA), resamples, conf, rng);
      const ciB = bootstrapMeanCi(scenarioMeans(pointB), resamples, conf, rng);
      perK.push({
        k: pointA.k,
        deltaMean: pointA.meanScore - pointB.meanScore,
        aLow: ciA.low,
        aHigh: ciA.high,
        bLow: ciB.low,
        bHigh: ciB.high
      });
    }
  }

  const areaDelta = a.adaptationArea - b.adaptationArea;
  // Positive when curve a reaches its first pass at a smaller k than curve b.
  let firstPassKDelta = null;
  if (a.firstPassK !== null && b.firstPassK !== null) {
    firstPassKDelta = b.firstPassK - a.firstPassK;
  }

  let deltaSum = 0;
  for (const entry of perK) deltaSum += entry.deltaMean;
  const meanDelta = deltaSum / Math.max(1, perK.length);

  let verdict = "similar";
  const negligible = Math.abs(meanDelta) < 0.02 && Math.abs(areaDelta) < 0.02;
  if (!negligible) {
    if (meanDelta > 0 && areaDelta > 0) verdict = "a_better";
    else if (meanDelta < 0 && areaDelta < 0) verdict = "b_better";
  }

  let rationale = `mean per-k delta=${meanDelta.toFixed(3)}, area delta=${areaDelta.toFixed(3)}`;
  if (firstPassKDelta !== null) rationale += `, first-pass-k delta=${firstPassKDelta}`;

  return { perK, areaDelta, firstPassKDelta, verdict, rationale };
}
1143
/**
 * Find the k of the first curve point whose pass rate reaches the threshold.
 *
 * @param {{points: Array<{k: number, passRate: number}>}} curve
 * @param {number} [threshold=0.5] - Minimum pass rate.
 * @returns {number|null} The matching k, or null when no point qualifies.
 */
function firstPassK(curve, threshold = 0.5) {
  for (const point of curve.points) {
    if (point.passRate >= threshold) return point.k ?? null;
  }
  return null;
}
1146
/**
 * Create a random source returning numbers in [0, 1).
 *
 * @param {number} [seed] - When omitted, `Math.random` itself is returned;
 *   otherwise a deterministic 32-bit generator seeded with `seed >>> 0`.
 * @returns {() => number} The random source.
 */
function makeRng2(seed) {
  if (seed === void 0) return Math.random;
  let counter = seed >>> 0;
  return () => {
    counter = (counter + 0x6d2b79f5) >>> 0;
    const first = Math.imul(counter ^ (counter >>> 15), counter | 1);
    const second = first ^ (first + Math.imul(first ^ (first >>> 7), first | 61));
    const unsigned = (second ^ (second >>> 14)) >>> 0;
    // Scale the 32-bit result into [0, 1).
    return unsigned / 0x100000000;
  };
}
1157
/**
 * Percentile-bootstrap confidence interval for the mean of `xs`.
 *
 * With fewer than two values no resampling happens: both bounds are the single
 * value, or 0 for an empty input.
 *
 * @param {number[]} xs - Sample values.
 * @param {number} resamples - Number of bootstrap resamples.
 * @param {number} confidence - Coverage, e.g. 0.95.
 * @param {() => number} rng - Uniform [0, 1) source.
 * @returns {{low: number, high: number}} Interval bounds.
 */
function bootstrapMeanCi(xs, resamples, confidence, rng) {
  const size = xs.length;
  if (size < 2) {
    const only = xs[0] ?? 0;
    return { low: only, high: only };
  }
  const resampledMean = () => {
    let total = 0;
    for (let draw = 0; draw < size; draw++) total += xs[Math.floor(rng() * size)];
    return total / size;
  };
  const means = new Array(resamples);
  for (let round = 0; round < resamples; round++) means[round] = resampledMean();
  means.sort((left, right) => left - right);

  const alpha = 1 - confidence;
  const lowIndex = Math.floor(alpha / 2 * resamples);
  // Clamp so the upper percentile never indexes past the last resample.
  const highIndex = Math.min(resamples - 1, Math.ceil((1 - alpha / 2) * resamples) - 1);
  return { low: means[lowIndex], high: means[highIndex] };
}
1172
+
1173
+ // src/rl/adversarial.ts
1174
/**
 * Search for failing scenarios by repeatedly mutating scored seeds.
 *
 * Generation 0 scores the seeds. Each later generation applies every mutation
 * to every scenario of the previous generation and scores up to
 * `childrenPerParent` of the produced children. Scenarios are deduplicated by
 * `opts.mutateScenarioId`, and scoring stops once `budget` calls were made.
 *
 * @param {object} opts
 * @param {Iterable<object>} opts.seeds - Starting scenarios.
 * @param {Iterable<{id: string, mutate: Function}>} opts.mutations - Mutation
 *   strategies; `mutate(scenario, rng)` yields one child or an array of them.
 * @param {Function} opts.mutateScenarioId - Maps a scenario to its identity.
 * @param {Function} opts.scoreFn - Scores a scenario (may be async).
 * @param {number} [opts.failureThreshold] - Scores below this are failures; default 0.5.
 * @param {number} [opts.rounds] - Mutation generations; default 3.
 * @param {number} [opts.childrenPerParent] - Children scored per mutation call; default 4.
 * @param {number} [opts.budget] - Maximum scoring calls; default unlimited.
 * @param {number} [opts.seed] - Seed for the mutation random source; default 1.
 * @returns {Promise<{scenarios: Array<object>, failures: Array<object>, byGeneration: Array<object>, scoreCalls: number}>}
 *   `failures` is sorted by ascending score.
 */
async function adversarialScenarioSearch(opts) {
  const failureThreshold = opts.failureThreshold ?? 0.5;
  const rounds = opts.rounds ?? 3;
  const children = opts.childrenPerParent ?? 4;
  const budget = opts.budget ?? Number.POSITIVE_INFINITY;
  const rng = mulberry32(opts.seed ?? 1);

  const scenarios = [];
  const seen = /* @__PURE__ */ new Set();
  let scoreCalls = 0;
  const budgetSpent = () => scoreCalls >= budget;
  const isFailure = (entry) => entry.score !== null && entry.score < failureThreshold;
  // Score one scenario and append it to the result list.
  const scoreAndRecord = async (scenario, id, generation, parentId, mutationStrategy) => {
    const score = await opts.scoreFn(scenario);
    scoreCalls++;
    scenarios.push({ id, generation, parentId, scenario, score, mutationStrategy });
  };

  for (const seedScenario of opts.seeds) {
    const id = opts.mutateScenarioId(seedScenario);
    if (seen.has(id)) continue;
    // The id is marked as seen before the budget check, as an unscored seed
    // is still considered visited.
    seen.add(id);
    if (budgetSpent()) break;
    await scoreAndRecord(seedScenario, id, 0, null, null);
  }

  for (let generation = 1; generation <= rounds; generation++) {
    if (budgetSpent()) break;
    const parents = scenarios.filter((entry) => entry.generation === generation - 1);
    for (const parent of parents) {
      for (const mutation of opts.mutations) {
        if (budgetSpent()) break;
        const produced = await mutation.mutate(parent.scenario, rng);
        const offspring = Array.isArray(produced) ? produced : [produced];
        const limit = Math.min(children, offspring.length);
        for (let position = 0; position < limit; position++) {
          if (budgetSpent()) break;
          const child = offspring[position];
          const childId = opts.mutateScenarioId(child);
          if (seen.has(childId)) continue;
          seen.add(childId);
          await scoreAndRecord(child, childId, generation, parent.id, mutation.id);
        }
      }
    }
  }

  const failures = scenarios
    .filter(isFailure)
    .sort((left, right) => (left.score ?? 0) - (right.score ?? 0));

  const byGeneration = [];
  const lastGeneration = scenarios.reduce((max, entry) => Math.max(max, entry.generation), 0);
  for (let generation = 0; generation <= lastGeneration; generation++) {
    const members = scenarios.filter((entry) => entry.generation === generation);
    if (members.length === 0) continue;
    let scoreSum = 0;
    for (const member of members) scoreSum += member.score ?? 0;
    byGeneration.push({
      generation,
      total: members.length,
      failures: members.filter(isFailure).length,
      meanScore: scoreSum / members.length
    });
  }

  return { scenarios, failures, byGeneration, scoreCalls };
}
1240
/**
 * Create a deterministic 32-bit random source returning numbers in [0, 1).
 *
 * @param {number} seed - Coerced to an unsigned 32-bit integer.
 * @returns {() => number} The random source.
 */
function mulberry32(seed) {
  let state = seed >>> 0;
  const next = () => {
    state = (state + 0x6d2b79f5) >>> 0;
    let mixed = Math.imul(state ^ (state >>> 15), state | 1);
    mixed = mixed ^ (mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61));
    return ((mixed ^ (mixed >>> 14)) >>> 0) / 0x100000000;
  };
  return next;
}
1250
+
1251
+ // src/rl/exporters.ts
1252
/**
 * Materialize preference triples into DPO training rows.
 *
 * Triples are processed one at a time; within a triple the prompt and both
 * completions are looked up concurrently. The prompt is taken from the chosen
 * run.
 *
 * @param {Iterable<object>} triples - Preference triples exposing run ids,
 *   variant ids, `scenarioId`, `marginScore` and `meta`.
 * @param {{promptOf: Function, completionOf: Function}} lookups - Sync or async
 *   resolvers keyed by run id.
 * @returns {Promise<Array<object>>} One row per triple, in input order.
 */
async function toDpoRows(triples, lookups) {
  const rows = [];
  for (const triple of triples) {
    const { chosenRunId, rejectedRunId } = triple;
    const [prompt, chosen, rejected] = await Promise.all([
      lookups.promptOf(chosenRunId),
      lookups.completionOf(chosenRunId),
      lookups.completionOf(rejectedRunId)
    ]);
    const meta = {
      scenarioId: triple.scenarioId,
      chosenVariantId: triple.chosenVariantId,
      rejectedVariantId: triple.rejectedVariantId,
      chosenRunId,
      rejectedRunId,
      chosenModel: triple.meta.chosenModel,
      rejectedModel: triple.meta.rejectedModel
    };
    rows.push({ prompt, chosen, rejected, margin: triple.marginScore, meta });
  }
  return rows;
}
1278
/**
 * Serialize DPO rows as JSON Lines.
 *
 * @param {Array<object>} rows - Rows to serialize.
 * @returns {string} One JSON document per line with a trailing newline, or an
 *   empty string when there are no rows.
 */
function toDpoJsonl(rows) {
  const lines = rows.map((row) => JSON.stringify(row));
  const terminator = rows.length > 0 ? "\n" : "";
  return `${lines.join("\n")}${terminator}`;
}
1281
/**
 * Group runs by scenario into GRPO training rows.
 *
 * Runs are grouped by `scenarioId` (falling back to `experimentId`). Each
 * group's prompt comes from its first run. Runs whose reward is `null` are
 * left out, and a group with no rewarded run produces no row.
 *
 * @param {Iterable<object>} runs - Run records.
 * @param {{promptOf: Function, completionOf: Function, rewardOf?: Function}} lookups
 *   Sync or async resolvers keyed by run id; `rewardOf(run)` defaults to the
 *   run's holdout/search score.
 * @returns {Promise<Array<object>>} One row per rewarded group, in first-seen order.
 */
async function toGrpoRows(runs, lookups) {
  const rewardOf = lookups.rewardOf ?? defaultReward;

  const groups = /* @__PURE__ */ new Map();
  for (const run of runs) {
    const groupKey = run.scenarioId ?? run.experimentId;
    const members = groups.get(groupKey) ?? [];
    members.push(run);
    groups.set(groupKey, members);
  }

  const rows = [];
  for (const [scenarioId, members] of groups) {
    if (members.length === 0) continue;
    const prompt = await lookups.promptOf(members[0].runId);
    const completions = [];
    const rewards = [];
    const runIds = [];
    for (const member of members) {
      const reward = rewardOf(member);
      if (reward !== null) {
        completions.push(await lookups.completionOf(member.runId));
        rewards.push(reward);
        runIds.push(member.runId);
      }
    }
    if (completions.length === 0) continue;
    let rewardSum = 0;
    for (const value of rewards) rewardSum += value;
    rows.push({
      prompt,
      completions,
      rewards,
      runIds,
      meta: {
        scenarioId,
        n: completions.length,
        meanReward: rewardSum / rewards.length
      }
    });
  }
  return rows;
}
1320
/**
 * Serialize GRPO rows as JSON Lines.
 *
 * @param {Array<object>} rows - Rows to serialize.
 * @returns {string} One JSON document per line with a trailing newline, or an
 *   empty string when there are no rows.
 */
function toGrpoJsonl(rows) {
  const body = rows.map((row) => JSON.stringify(row)).join("\n");
  if (rows.length > 0) return body + "\n";
  return body + "";
}
1323
/**
 * Build chat-format SFT rows from run records.
 *
 * Runs rejected by `lookups.include` are skipped. Each row holds an optional
 * system message (only when `lookups.systemOf` returns something truthy),
 * followed by the user prompt and the assistant completion.
 *
 * @param {Iterable<object>} runs - Run records.
 * @param {{promptOf: Function, completionOf: Function, systemOf?: Function, include?: Function}} lookups
 *   `promptOf` / `completionOf` are keyed by run id and may be async;
 *   `systemOf` and `include` receive the run itself.
 * @returns {Promise<Array<object>>} One row per included run, in input order.
 */
async function toSftRows(runs, lookups) {
  const include = lookups.include ?? (() => true);
  const rows = [];
  for (const run of runs) {
    if (!include(run)) continue;
    const system = lookups.systemOf?.(run);
    const [prompt, completion] = await Promise.all([
      lookups.promptOf(run.runId),
      lookups.completionOf(run.runId)
    ]);
    const conversation = [
      { role: "user", content: prompt },
      { role: "assistant", content: completion }
    ];
    const messages = system ? [{ role: "system", content: system }, ...conversation] : conversation;
    rows.push({
      messages,
      meta: {
        runId: run.runId,
        candidateId: run.candidateId,
        scenarioId: run.scenarioId,
        score: run.outcome.holdoutScore ?? run.outcome.searchScore,
        model: run.model
      }
    });
  }
  return rows;
}
1350
/**
 * Serialize SFT rows as JSON Lines.
 *
 * @param {Array<object>} rows - Rows to serialize.
 * @returns {string} One JSON document per line with a trailing newline, or an
 *   empty string when there are no rows.
 */
function toSftJsonl(rows) {
  const encoded = rows.map((row) => JSON.stringify(row));
  const joined = encoded.join("\n");
  return rows.length > 0 ? `${joined}\n` : joined;
}
1353
/**
 * Materialize step-level preference triples into PRM training rows.
 *
 * All lookups are awaited one after another. The shared prefix comes from
 * `lookups.prefixOf` when provided (otherwise it is empty). The chosen step is
 * read from the prefix run and the rejected step from the rejected run.
 *
 * @param {Iterable<object>} triples - Triples exposing `prefixRunId`,
 *   `prefixStepIndex`, `chosenSpanId`, `rejectedRunId`, `rejectedSpanId`,
 *   `chosenReward`, `rejectedReward` and `marginScore`.
 * @param {{promptOf: Function, stepTextOf: Function, prefixOf?: Function}} lookups
 *   Sync or async resolvers.
 * @returns {Promise<Array<object>>} One row per triple, in input order.
 */
async function toPrmRows(triples, lookups) {
  const rows = [];
  for (const triple of triples) {
    const { prefixRunId, prefixStepIndex, rejectedRunId } = triple;
    const prompt = await lookups.promptOf(prefixRunId);
    let prefixSpanIds = [];
    if (lookups.prefixOf) {
      prefixSpanIds = await lookups.prefixOf(prefixRunId, prefixStepIndex);
    }
    const prefixStepText = [];
    for (const spanId of prefixSpanIds) {
      const text = await lookups.stepTextOf(prefixRunId, spanId);
      prefixStepText.push(text);
    }
    const chosenStep = await lookups.stepTextOf(prefixRunId, triple.chosenSpanId);
    const rejectedStep = await lookups.stepTextOf(rejectedRunId, triple.rejectedSpanId);
    rows.push({
      prompt,
      prefixSpanIds,
      prefixStepText,
      chosenStep,
      rejectedStep,
      chosenReward: triple.chosenReward,
      rejectedReward: triple.rejectedReward,
      marginScore: triple.marginScore,
      meta: { prefixRunId, rejectedRunId, prefixStepIndex }
    });
  }
  return rows;
}
1384
/**
 * Serialize PRM rows as JSON Lines.
 *
 * @param {Array<object>} rows - Rows to serialize.
 * @returns {string} One JSON document per line with a trailing newline, or an
 *   empty string when there are no rows.
 */
function toPrmJsonl(rows) {
  const serialized = rows.map((row) => JSON.stringify(row)).join("\n");
  const suffix = rows.length > 0 ? "\n" : "";
  return serialized + suffix;
}
1387
/**
 * Serialize per-step rewards as JSON Lines.
 *
 * Only `runId`, `spanId`, `stepIndex`, `reward`, `determinism` and `weight`
 * are written, in that order; a missing `weight` is written as 1.
 *
 * @param {Array<object>} stepRewards - Step reward records.
 * @returns {string} One JSON document per line with a trailing newline, or an
 *   empty string when there are no records.
 */
function stepRewardsToJsonl(stepRewards) {
  const lines = stepRewards.map(({ runId, spanId, stepIndex, reward, determinism, weight }) =>
    JSON.stringify({ runId, spanId, stepIndex, reward, determinism, weight: weight ?? 1 })
  );
  const terminator = lines.length > 0 ? "\n" : "";
  return lines.join("\n") + terminator;
}
1398
/**
 * Default reward of a run: its holdout score, else its search score.
 *
 * @param {object} run - Run record exposing `outcome`.
 * @returns {number|null} The score, or null when it is missing or not a finite number.
 */
function defaultReward(run) {
  const { holdoutScore, searchScore } = run.outcome;
  const score = holdoutScore ?? searchScore;
  // Number.isFinite is false for every non-number, so no separate typeof check is needed.
  return Number.isFinite(score) ? score : null;
}
1402
+
1403
+ // src/rl/reward-hacking.ts
1404
/**
 * Default proxy reward of a run: its holdout score, else its search score.
 *
 * @param {object} r - Run record exposing `outcome`.
 * @returns {number|null} The score, or null when it is missing or not a finite number.
 */
var DEFAULT_PROXY = (r) => {
  const candidate = r.outcome.holdoutScore ?? r.outcome.searchScore;
  if (typeof candidate !== "number") return null;
  if (!Number.isFinite(candidate)) return null;
  return candidate;
};
1408
+ function detectRewardHacking(input) {
1409
+ const proxyOf = input.proxyOf ?? DEFAULT_PROXY;
1410
+ const truthOf = input.truthOf;
1411
+ const sus = input.thresholds?.suspect ?? 0.3;
1412
+ const gam = input.thresholds?.gaming ?? 0.6;
1413
+ const runs = input.runs.filter((r) => proxyOf(r) !== null);
1414
+ const n = runs.length;
1415
+ if (n < 4) {
1416
+ return {
1417
+ findings: [],
1418
+ verdict: "clean",
1419
+ n,
1420
+ rationale: [`fewer than 4 runs with proxy reward (n=${n}); insufficient evidence`]
1421
+ };
1422
+ }
1423
+ const windowSize = Math.max(1, input.windowSize ?? Math.min(50, Math.floor(n / 2)));
1424
+ const before = runs.slice(0, n - windowSize);
1425
+ const after = runs.slice(n - windowSize);
1426
+ const findings = [];
1427
+ if (truthOf) {
1428
+ const beforeProxy = before.map(proxyOf).filter((v) => typeof v === "number");
1429
+ const afterProxy = after.map(proxyOf).filter((v) => typeof v === "number");
1430
+ const beforeTruth = before.map(truthOf).filter((v) => typeof v === "number");
1431
+ const afterTruth = after.map(truthOf).filter((v) => typeof v === "number");
1432
+ if (beforeProxy.length >= 2 && afterProxy.length >= 2 && beforeTruth.length >= 2 && afterTruth.length >= 2) {
1433
+ const proxyDelta = mean(afterProxy) - mean(beforeProxy);
1434
+ const truthDelta = mean(afterTruth) - mean(beforeTruth);
1435
+ const gap = Math.max(0, proxyDelta - truthDelta);
1436
+ const severity = clamp012(gap * 5);
1437
+ findings.push({
1438
+ signal: "reward_divergence",
1439
+ severity,
1440
+ message: severity >= sus ? `proxy reward rose by ${proxyDelta.toFixed(3)} while truth changed by ${truthDelta.toFixed(3)} \u2014 potential Goodhart` : `proxy and truth moved together (proxy ${proxyDelta.toFixed(3)}, truth ${truthDelta.toFixed(3)})`,
1441
+ detail: {
1442
+ proxyDelta,
1443
+ truthDelta,
1444
+ gap,
1445
+ beforeN: beforeProxy.length,
1446
+ afterN: afterProxy.length
1447
+ }
1448
+ });
1449
+ }
1450
+ }
1451
+ {
1452
+ const beforeP = before.map(proxyOf).filter((v) => typeof v === "number");
1453
+ const afterP = after.map(proxyOf).filter((v) => typeof v === "number");
1454
+ if (beforeP.length >= 4 && afterP.length >= 4) {
1455
+ const ks = ksStatistic(beforeP, afterP);
1456
+ const severity = clamp012(ks - 0.2);
1457
+ findings.push({
1458
+ signal: "distribution_shift",
1459
+ severity,
1460
+ message: severity >= sus ? `KS=${ks.toFixed(3)} between before/after windows \u2014 distributional shift large` : `KS=${ks.toFixed(3)} between before/after windows \u2014 within-distribution drift`,
1461
+ detail: { ks, beforeN: beforeP.length, afterN: afterP.length }
1462
+ });
1463
+ }
1464
+ }
1465
+ {
1466
+ const secondaryOf = input.secondaryRewardOf ?? defaultSecondary(input.verifiableRewardOptions);
1467
+ const aligned = runs.map((r) => ({ p: proxyOf(r), s: secondaryOf(r) })).filter(
1468
+ (x) => typeof x.p === "number" && typeof x.s === "number"
1469
+ );
1470
+ if (aligned.length >= 4) {
1471
+ const ps = aligned.map((x) => x.p);
1472
+ const ss = aligned.map((x) => x.s);
1473
+ const r = pearsonR(ps, ss);
1474
+ const severity = clamp012(0.5 - Math.max(0, r));
1475
+ findings.push({
1476
+ signal: "reward_disagreement",
1477
+ severity,
1478
+ message: severity >= sus ? `proxy and independent secondary reward correlate \u03C1=${r.toFixed(3)} \u2014 possibly hacking proxy` : `proxy and secondary reward correlate \u03C1=${r.toFixed(3)}`,
1479
+ detail: { pearson: r, n: aligned.length }
1480
+ });
1481
+ }
1482
+ }
1483
+ {
1484
+ const detRuns = filterDeterministicallyRewarded(runs, input.verifiableRewardOptions ?? {});
1485
+ if (detRuns.length >= 4) {
1486
+ const detBefore = detRuns.slice(0, Math.floor(detRuns.length / 2));
1487
+ const detAfter = detRuns.slice(Math.floor(detRuns.length / 2));
1488
+ const detDelta = mean(detAfter.map((r) => r.reward.value)) - mean(detBefore.map((r) => r.reward.value));
1489
+ const proxyDelta = mean(after.map(proxyOf).filter((v) => typeof v === "number")) - mean(before.map(proxyOf).filter((v) => typeof v === "number"));
1490
+ const driftGap = Math.max(0, proxyDelta - detDelta);
1491
+ const severity = clamp012(driftGap * 5);
1492
+ findings.push({
1493
+ signal: "judge_drift",
1494
+ severity,
1495
+ message: severity >= sus ? `judge proxy +${proxyDelta.toFixed(3)} while deterministic reward +${detDelta.toFixed(3)} \u2014 judge drifting up without verifiable backing` : `judge and deterministic rewards move in step (judge ${proxyDelta.toFixed(3)}, det ${detDelta.toFixed(3)})`,
1496
+ detail: { proxyDelta, detDelta, driftGap, n: detRuns.length }
1497
+ });
1498
+ }
1499
+ }
1500
+ const maxSev = findings.reduce((m, f) => Math.max(m, f.severity), 0);
1501
+ const verdict = maxSev >= gam ? "gaming" : maxSev >= sus ? "suspect" : "clean";
1502
+ const rationale = findings.filter((f) => f.severity >= sus).map((f) => `${f.signal}: severity ${f.severity.toFixed(2)} \u2014 ${f.message}`);
1503
+ if (rationale.length === 0) rationale.push("no signals fired above suspect threshold");
1504
+ return { findings, verdict, rationale, n };
1505
+ }
1506
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} xs - Values to average.
 * @returns {number} Sum of `xs` divided by its length, or 0 when `xs` is empty.
 */
function mean(xs) {
  const count = xs.length;
  if (count === 0) return 0;
  let total = 0;
  xs.forEach((value) => {
    total += value;
  });
  return total / count;
}
1510
/**
 * Clamp a value into the closed interval [0, 1].
 *
 * @param {number} x - Candidate value.
 * @returns {number} 0 for anything `Number.isFinite` rejects (NaN, ±Infinity,
 *   non-numbers), otherwise `x` limited to the range 0..1.
 */
function clamp012(x) {
  if (!Number.isFinite(x)) return 0;
  // `<=` (not `<`) so that -0 normalises to +0.
  if (x <= 0) return 0;
  if (x >= 1) return 1;
  return x;
}
1514
/**
 * Pearson correlation coefficient between two equally long numeric series.
 *
 * @param {number[]} a - First series.
 * @param {number[]} b - Second series, index-aligned with `a`.
 * @returns {number} The correlation, or 0 when the lengths differ, fewer than
 *   two points are supplied, or either series has zero variance.
 */
function pearsonR(a, b) {
  const n = a.length;
  if (n !== b.length || n < 2) return 0;

  const centerA = mean(a);
  const centerB = mean(b);

  let crossSum = 0;
  let squaresA = 0;
  let squaresB = 0;
  for (let idx = 0; idx < n; idx += 1) {
    const devA = a[idx] - centerA;
    const devB = b[idx] - centerB;
    crossSum += devA * devB;
    squaresA += devA * devA;
    squaresB += devB * devB;
  }

  // A constant series has no defined correlation; report 0 instead of dividing by zero.
  if (squaresA === 0 || squaresB === 0) return 0;
  return crossSum / Math.sqrt(squaresA * squaresB);
}
1529
/**
 * Two-sample Kolmogorov–Smirnov statistic: the largest absolute gap between
 * the empirical CDFs of `a` and `b`.
 *
 * Both samples are sorted once and then swept with two cursors, so each
 * distinct value is visited a single time instead of re-filtering both
 * samples per value.
 *
 * @param {number[]} a - First sample.
 * @param {number[]} b - Second sample.
 * @returns {number} A value in [0, 1]. Returns 0 when either sample has no
 *   usable observations; an empty sample has no ECDF, and dividing by its
 *   length would otherwise yield NaN.
 */
function ksStatistic(a, b) {
  // NaN never compares <= anything, so it cannot take part in an ECDF and
  // would stall the sweep below; drop it up front.
  const isUsable = (x) => !Number.isNaN(x);
  const sortedA = a.filter(isUsable).sort((x, y) => x - y);
  const sortedB = b.filter(isUsable).sort((x, y) => x - y);
  const sizeA = sortedA.length;
  const sizeB = sortedB.length;
  if (sizeA === 0 || sizeB === 0) return 0;

  let i = 0;
  let j = 0;
  let max = 0;
  while (i < sizeA || j < sizeB) {
    // Smallest value not yet consumed from either sample.
    let v;
    if (i >= sizeA) {
      v = sortedB[j];
    } else if (j >= sizeB) {
      v = sortedA[i];
    } else {
      v = Math.min(sortedA[i], sortedB[j]);
    }
    // Advance both cursors past every observation <= v; i and j are then the
    // counts of observations <= v, i.e. the ECDF numerators.
    while (i < sizeA && sortedA[i] <= v) i += 1;
    while (j < sizeB && sortedB[j] <= v) j += 1;
    max = Math.max(max, Math.abs(i / sizeA - j / sizeB));
  }
  return max;
}
1541
/**
 * Build the fallback secondary-reward accessor.
 *
 * The returned function passes a single run through
 * `filterDeterministicallyRewarded` and reads `reward.value` from the result.
 *
 * @param {object} [verifiableOpts] - Options forwarded to
 *   `filterDeterministicallyRewarded`; an empty object is used when nullish.
 * @returns {(run: object) => (number|null)} Accessor yielding the filtered
 *   run's `reward.value`, or `null` unless exactly one run comes back.
 */
function defaultSecondary(verifiableOpts) {
  return function secondaryRewardOf(run) {
    const kept = filterDeterministicallyRewarded([run], verifiableOpts ?? {});
    if (kept.length !== 1) return null;
    return kept[0].reward.value;
  };
}
1547
+
1548
+ // src/rl/auto-research.ts
1549
/**
 * Analyse a finished optimization result.
 *
 * Converts the result's trials into run records, then derives reward signals,
 * preference pairs, an optional sequential (interim) confidence verdict, a
 * reward-hacking report, optional rubric predictive validity, and optional
 * trainer export rows.
 *
 * @param {object} opts
 * @param {object} opts.result - Optimization result; trials are pulled via `extractTrials`.
 * @param {object} opts.ctx - Context passed to `trialsToRunRecords`; `ctx.splitTag`
 *   is the fallback preference split tag.
 * @param {object} [opts.verifiableReward] - Verifiable-reward options.
 * @param {object} [opts.preferences] - Overrides for strategy / minMargin / splitTag / rewardOf.
 * @param {string} [opts.comparator] - Baseline candidate id; enables the sequential check.
 * @param {object} [opts.sequential] - `alpha` / `bound` / `rope` for the sequential check.
 * @param {object} [opts.outcomes] - `{ store, metrics }`; enables predictive validity.
 * @param {object} [opts.trainerExport] - `dpo` and/or `grpo` export options.
 * @returns {Promise<object>} `{ runs, rewardSignals, preferences, interimConfidence,
 *   rewardHacking, predictiveValidity, trainerRows, summary }`.
 */
async function analyzeOptimizationResult(opts) {
  const runs = trialsToRunRecords(extractTrials(opts.result), opts.ctx);
  const rewardSignals = extractVerifiableRewardsFromRecords(runs, opts.verifiableReward ?? {});

  const prefOpts = opts.preferences;
  const preferences = extractPreferences(runs, {
    strategy: prefOpts?.strategy ?? "paired-by-scenario-and-seed",
    minMargin: prefOpts?.minMargin ?? 0.05,
    splitTag: prefOpts?.splitTag ?? opts.ctx.splitTag ?? "search",
    rewardOf: prefOpts?.rewardOf
  });

  // The sequential check only runs when a comparator is named and at least
  // one candidate has a paired delta against it.
  let interimConfidence = null;
  if (opts.comparator) {
    const deltaSeries = collectPairedDeltaSeries(runs, opts.comparator);
    const hasAnyDelta = deltaSeries.some((series) => series.deltas.length > 0);
    if (hasAnyDelta) {
      const sequential = opts.sequential;
      interimConfidence = evaluateInterimReleaseConfidence({
        deltaSeries,
        alpha: sequential?.alpha,
        bound: sequential?.bound,
        rope: sequential?.rope
      });
    }
  }

  const rewardHacking = detectRewardHacking({
    runs,
    verifiableRewardOptions: opts.verifiableReward
  });

  const predictiveValidity = opts.outcomes
    ? await rubricPredictiveValidity({
        runs,
        outcomes: opts.outcomes.store,
        outcomeMetrics: opts.outcomes.metrics
      })
    : null;

  const trainerRows = {};
  const exportOpts = opts.trainerExport;
  if (exportOpts?.dpo) {
    trainerRows.dpo = await toDpoRows(preferences.pairs, exportOpts.dpo);
  }
  if (exportOpts?.grpo) {
    trainerRows.grpo = await toGrpoRows(runs, exportOpts.grpo);
  }

  return {
    runs,
    rewardSignals,
    preferences,
    interimConfidence,
    rewardHacking,
    predictiveValidity,
    trainerRows,
    summary: buildSummary({
      runs,
      preferences,
      interimConfidence,
      rewardHacking,
      predictiveValidity
    })
  };
}
1608
/**
 * Collect every trial from an optimization result.
 *
 * @param {object} result - Either a wrapper with an `evolution` property or
 *   the evolution object itself.
 * @returns {Array} Whatever `collectFromEvolution` returns for the resolved evolution.
 */
function extractTrials(result) {
  const evolution = "evolution" in result ? result.evolution : result;
  return collectFromEvolution(evolution);
}
1614
/**
 * Flatten the trials of every generation into a single list.
 *
 * @param {{ generations: Iterable<{ trials?: Iterable<any> }> }} evolution
 * @returns {Array} Trials in generation order; generations whose `trials` is
 *   null or undefined contribute nothing.
 */
function collectFromEvolution(evolution) {
  const collected = [];
  for (const generation of evolution.generations) {
    const generationTrials = generation.trials;
    if (generationTrials == null) continue;
    for (const trial of generationTrials) {
      collected.push(trial);
    }
  }
  return collected;
}
1621
/**
 * Pair every non-comparator run with the comparator run that shares its
 * scenario (or experiment) id and seed, and collect the score differences.
 *
 * A run's score is `outcome.holdoutScore`, falling back to
 * `outcome.searchScore`; runs without a finite score are ignored.
 *
 * @param {object[]} runs - Run records.
 * @param {string} comparator - Candidate id of the baseline.
 * @returns {{ candidateId: string, deltas: number[] }[]} One entry per
 *   candidate that has at least one paired run; each delta is
 *   `candidate score - comparator score`.
 */
function collectPairedDeltaSeries(runs, comparator) {
  const pairingKey = (run) => `${run.scenarioId ?? run.experimentId}::${run.seed}`;
  const finiteScoreOf = (run) => {
    const value = run.outcome.holdoutScore ?? run.outcome.searchScore;
    return typeof value === "number" && Number.isFinite(value) ? value : null;
  };

  // Pass 1: index comparator scores by scenario+seed (a later run overwrites an earlier one).
  const comparatorScores = new Map();
  for (const run of runs) {
    if (run.candidateId !== comparator) continue;
    const score = finiteScoreOf(run);
    if (score === null) continue;
    comparatorScores.set(pairingKey(run), score);
  }

  // Pass 2: diff every other candidate against its matching comparator score.
  const deltasByCandidate = new Map();
  for (const run of runs) {
    if (run.candidateId === comparator) continue;
    const score = finiteScoreOf(run);
    if (score === null) continue;
    const reference = comparatorScores.get(pairingKey(run));
    if (typeof reference !== "number") continue;
    const deltas = deltasByCandidate.get(run.candidateId) ?? [];
    deltas.push(score - reference);
    deltasByCandidate.set(run.candidateId, deltas);
  }

  return Array.from(deltasByCandidate, ([candidateId, deltas]) => ({ candidateId, deltas }));
}
1644
/**
 * Render a one-line, " | "-separated summary of an analysis.
 *
 * @param {object} args
 * @param {Array} args.runs - Analysed runs (only the count is used).
 * @param {{ pairs: Array, strategy: string }} args.preferences
 * @param {{ verdict: string }} args.rewardHacking
 * @param {?{ recommendation: { decision: string, candidateId?: string } }} args.interimConfidence
 * @param {?{ ranked: Array<{ rubric: string, spearman: number }> }} args.predictiveValidity
 * @returns {string} Summary line; the sequential and top-rubric segments
 *   appear only when their inputs are present.
 */
function buildSummary(args) {
  const { runs, preferences, rewardHacking, interimConfidence, predictiveValidity } = args;

  const segments = [];
  segments.push(`${runs.length} runs analysed`);
  segments.push(`${preferences.pairs.length} preference pairs (${preferences.strategy})`);
  segments.push(`reward-hacking verdict: ${rewardHacking.verdict}`);

  if (interimConfidence) {
    const recommendation = interimConfidence.recommendation;
    const candidateSuffix = recommendation.candidateId ? ` ${recommendation.candidateId}` : "";
    segments.push(`sequential: ${recommendation.decision}${candidateSuffix}`);
  }

  const topRubric = predictiveValidity?.ranked[0];
  if (topRubric) {
    segments.push(`top-rubric: ${topRubric.rubric} \u03C1=${topRubric.spearman.toFixed(2)}`);
  }

  return segments.join(" | ");
}
1661
+
1662
+ // src/rl/predictive-validity-researcher.ts
1663
/**
 * Researcher that turns rubric predictive-validity reports into rubric
 * re-weighting proposals. It does not execute plans itself; see
 * `evaluateChange`.
 */
var PredictiveValidityResearcher = class {
  opts;
  lastReport = null;

  /**
   * @param {object} opts - Options read by this class: `failureThreshold`,
   *   `decorativeThreshold`, `outcomes`, `outcomeMetrics`, `rubrics`, `onReport`.
   */
  constructor(opts) {
    this.opts = opts;
  }

  /**
   * Group low-scoring runs by candidate.
   *
   * A run fails when its score (`outcome.holdoutScore`, falling back to
   * `outcome.searchScore`) is a number below `opts.failureThreshold`
   * (default 0.5).
   *
   * @param {object[]} runs - Run records.
   * @returns {Promise<object[]>} One failure entry per candidate, carrying up
   *   to 8 run ids and the failing-run count.
   */
  async inspectFailures(runs) {
    const threshold = this.opts.failureThreshold ?? 0.5;
    const scoreOf = (run) => run.outcome.holdoutScore ?? run.outcome.searchScore;

    const failingRuns = runs.filter((run) => {
      const score = scoreOf(run);
      return typeof score === "number" && score < threshold;
    });

    const byCandidate = new Map();
    for (const run of failingRuns) {
      const bucket = byCandidate.get(run.candidateId) ?? [];
      bucket.push(run);
      byCandidate.set(run.candidateId, bucket);
    }

    const failures = [];
    for (const [candidateId, group] of byCandidate) {
      let total = 0;
      for (const run of group) {
        total += scoreOf(run) ?? 0;
      }
      const meanScore = total / group.length;
      failures.push({
        code: `low-score-${candidateId}`,
        description: `${candidateId} scored < ${threshold} on ${group.length} run(s) (mean ${meanScore.toFixed(3)})`,
        evidence: {
          runIds: group.slice(0, 8).map((run) => run.runId),
          samples: group.length
        }
      });
    }
    return failures;
  }

  /**
   * Propose rubric weight changes from the cached report.
   *
   * With no failures nothing is proposed. With no cached report a single
   * "collect more outcomes" directive is returned. Otherwise every rubric
   * that is not load-bearing and whose |Spearman| is below
   * `opts.decorativeThreshold` (default 0.4) gets a down-weight proposal, and
   * the first-ranked rubric gets an up-weight proposal when it is load-bearing.
   *
   * @param {object[]} failures - Output of `inspectFailures`.
   * @returns {Promise<object[]>} Proposed changes.
   */
  async proposeChange(failures) {
    if (failures.length === 0) return [];

    const report = this.lastReport;
    if (report === null) {
      return [
        {
          kind: "threshold",
          payload: { directive: "researcher.collect-more-outcomes" },
          rationale: "predictive-validity researcher has no prior report; cannot recommend rubric reweighting until at least one report exists"
        }
      ];
    }

    const decorativeThreshold = this.opts.decorativeThreshold ?? 0.4;
    const changes = [];

    for (const ranking of report.ranked) {
      const isLoadBearing = ranking.verdict === "load_bearing";
      const isCorrelatedEnough = Math.abs(ranking.spearman) >= decorativeThreshold;
      if (isLoadBearing || isCorrelatedEnough) continue;
      changes.push({
        kind: "reviewer_prompt",
        payload: {
          rubric: ranking.rubric,
          action: "down-weight",
          spearman: ranking.spearman,
          bestOutcome: ranking.bestOutcome
        },
        rationale: `predictive-validity Spearman=${ranking.spearman.toFixed(3)} vs ${ranking.bestOutcome} (decorative); recommend down-weighting`,
        expectedDelta: -Math.max(0, 0.05 - Math.abs(ranking.spearman))
      });
    }

    // Only the first-ranked rubric is considered for up-weighting.
    for (const leader of report.ranked.slice(0, 1)) {
      if (leader.verdict === "load_bearing") {
        changes.push({
          kind: "reviewer_prompt",
          payload: {
            rubric: leader.rubric,
            action: "up-weight",
            spearman: leader.spearman,
            bestOutcome: leader.bestOutcome
          },
          rationale: `predictive-validity Spearman=${leader.spearman.toFixed(3)} vs ${leader.bestOutcome} (load-bearing); recommend up-weighting`,
          expectedDelta: Math.max(0, Math.abs(leader.spearman) - 0.5) * 0.1
        });
      }
    }

    return changes;
  }

  /**
   * Append proposed changes to a baseline without mutating it.
   *
   * @param {object[]} changes - Changes to add.
   * @param {{ changes: object[] }} baseline - Existing baseline.
   * @returns {Promise<object>} Copy of `baseline` whose `changes` is the
   *   concatenation of the old and new changes.
   */
  async applyChange(changes, baseline) {
    const mergedChanges = [...baseline.changes, ...changes];
    return { ...baseline, changes: mergedChanges };
  }

  /**
   * Return a placeholder evaluation: no runs, and a non-promoting gate
   * decision whose reason tells the caller to run the sweep themselves.
   *
   * @param {{ proposedCandidateId: string, baselineCandidateId: string }} plan
   * @returns {Promise<{ plan: object, runs: Array, gateDecision: object }>}
   */
  async evaluateChange(plan) {
    const zeroEvidence = {
      productiveRuns: 0,
      medianPairedDelta: 0,
      pairedCI: { low: 0, high: 0 },
      pairedPValue: 1,
      searchScore: 0,
      holdoutScore: 0,
      overfitGap: 0,
      baselineOverfitGap: 0
    };
    const gateDecision = {
      promote: false,
      candidateId: plan.proposedCandidateId,
      baselineId: plan.baselineCandidateId,
      evidence: zeroEvidence,
      reason: "predictive-validity researcher does not execute plans; the caller is expected to run the sweep and call rubricPredictiveValidity directly with the resulting RunRecord[].",
      rejectionCode: "few_runs"
    };
    return { plan, runs: [], gateDecision };
  }

  /**
   * Run the predictive-validity check explicitly against a fresh RunRecord
   * set. Updates the researcher's cached report so subsequent
   * `proposeChange` calls have evidence to draw from. `opts.onReport`, when
   * set, is awaited before the cache is updated.
   *
   * @param {object[]} runs - Run records to analyse.
   * @returns {Promise<object>} The fresh report.
   */
  async runValidityCheck(runs) {
    const { outcomes, outcomeMetrics, rubrics } = this.opts;
    const report = await rubricPredictiveValidity({ runs, outcomes, outcomeMetrics, rubrics });
    if (this.opts.onReport) await this.opts.onReport(report);
    this.lastReport = report;
    return report;
  }

  /**
   * Force-feed a predictive-validity report into the researcher state —
   * useful when the consumer ran the report out-of-band and wants the
   * researcher's later proposals informed by it.
   *
   * @param {object} report - Report to cache.
   */
  setReport(report) {
    this.lastReport = report;
  }

  /**
   * @returns {?object} The cached report, or `null` if none has been stored.
   */
  getLastReport() {
    return this.lastReport;
  }
};
1801
+
1802
+ // src/rl/process-reward.ts
1803
/**
 * Score the spans of one run, in start-time order, into step rewards.
 *
 * For each span (after the optional `opts.preFilter`) the scorers are tried in
 * order; the first applicable scorer that returns a truthy result wins. Spans
 * that no scorer scores are dropped and do not consume a step index.
 *
 * @param {{ spans(query: { runId: string }): Promise<Iterable<object>> }} store
 * @param {string} runId - Run whose spans are loaded.
 * @param {object} opts
 * @param {Array<{ appliesTo: string[], score(span: object): any }>} opts.scorers
 * @param {(span: object) => boolean} [opts.preFilter] - Spans for which this
 *   returns falsy are skipped before scoring.
 * @returns {Promise<object[]>} Step rewards with consecutive `stepIndex` values.
 */
async function extractStepRewards(store, runId, opts) {
  const loaded = await store.spans({ runId });
  const chronological = [...loaded].sort((left, right) => left.startedAt - right.startedAt);

  const stepRewards = [];
  for (const span of chronological) {
    const rejected = opts.preFilter && !opts.preFilter(span);
    if (rejected) continue;

    let winner = null;
    for (const scorer of opts.scorers) {
      if (scorer.appliesTo.includes(span.kind)) {
        const result = await scorer.score(span);
        if (result) {
          winner = result;
          break;
        }
      }
    }
    if (!winner) continue;

    const { spanId, kind, name } = span;
    stepRewards.push({
      spanId,
      runId,
      // Index among scored steps only: equals the number already emitted.
      stepIndex: stepRewards.length,
      kind,
      name,
      reward: winner.reward,
      determinism: winner.determinism,
      rationale: winner.rationale,
      weight: winner.weight
    });
  }
  return stepRewards;
}
1834
/**
 * Summarise the step rewards of a single run.
 *
 * @param {Array<{ runId: string, reward: number, weight?: number }>} stepRewards
 *   Steps in order; a missing `weight` counts as 1.
 * @returns {{ runId: string, totalSteps: number, meanReward: number,
 *   sumWeightedReward: number, failureFraction: number, worstStepDelta: number,
 *   worstStepIndex: (number|null) }}
 *   `meanReward` is the weight-weighted mean, `failureFraction` the share of
 *   steps with reward below 0.5, and `worstStepDelta` / `worstStepIndex` the
 *   steepest step-over-step drop and the array position where it lands
 *   (0 / `null` when the reward never decreases). An empty input yields an
 *   all-zero summary with an empty `runId`.
 */
function runwiseStepRewardSummary(stepRewards) {
  const totalSteps = stepRewards.length;
  if (totalSteps === 0) {
    return {
      runId: "",
      totalSteps: 0,
      meanReward: 0,
      sumWeightedReward: 0,
      failureFraction: 0,
      worstStepDelta: 0,
      worstStepIndex: null
    };
  }

  let weightTotal = 0;
  let weightedRewardTotal = 0;
  let failureCount = 0;
  let steepestDrop = 0;
  let steepestDropAt = null;

  for (let position = 0; position < totalSteps; position += 1) {
    const step = stepRewards[position];
    const weight = step.weight ?? 1;
    weightTotal += weight;
    weightedRewardTotal += weight * step.reward;
    if (step.reward < 0.5) failureCount += 1;

    // Compare against the immediately preceding step; the first step has none.
    if (position === 0) continue;
    const change = step.reward - stepRewards[position - 1].reward;
    if (change < steepestDrop) {
      steepestDrop = change;
      steepestDropAt = position;
    }
  }

  return {
    runId: stepRewards[0].runId,
    totalSteps,
    meanReward: weightTotal === 0 ? 0 : weightedRewardTotal / weightTotal,
    sumWeightedReward: weightedRewardTotal,
    failureFraction: failureCount / totalSteps,
    worstStepDelta: steepestDrop,
    worstStepIndex: steepestDropAt
  };
}
1880
/**
 * Build process-reward-model training triples from pairs of runs.
 *
 * Every unordered pair of runs is scanned for the first step at which they
 * diverge: either the step `kind`/`name` differs, or the rewards differ by at
 * least `minMargin`. A triple is emitted when that divergence comes after at
 * least `minPrefixLength` matching steps and the reward gap at the divergent
 * step is at least `minMargin`.
 *
 * @param {Map<string, Array<{ spanId: string, kind: string, name: string, reward: number }>>} stepRewardsByRun
 *   Step rewards keyed by run id.
 * @param {object} [opts]
 * @param {number} [opts.minMargin=0.2] - Minimum reward gap.
 * @param {number} [opts.minPrefixLength=1] - Minimum number of steps before the divergence.
 * @returns {object[]} Triples naming the chosen (higher-reward) and rejected
 *   step; `prefixStepIndex` is the index just before the divergence.
 */
function prmTrainingPairs(stepRewardsByRun, opts = {}) {
  const minMargin = opts.minMargin ?? 0.2;
  const minPrefix = opts.minPrefixLength ?? 1;

  // Index of the first diverging step within the shared length, or -1.
  const firstDivergence = (left, right, sharedLength) => {
    for (let k = 0; k < sharedLength; k++) {
      const l = left[k];
      const r = right[k];
      if (l.kind !== r.kind || l.name !== r.name) return k;
      if (Math.abs(l.reward - r.reward) >= minMargin) return k;
    }
    return -1;
  };

  const trajectories = [...stepRewardsByRun.entries()].map(([runId, steps]) => ({ runId, steps }));
  const triples = [];

  for (let i = 0; i < trajectories.length; i++) {
    const first = trajectories[i];
    for (let j = i + 1; j < trajectories.length; j++) {
      const second = trajectories[j];

      const sharedLength = Math.min(first.steps.length, second.steps.length);
      if (sharedLength < minPrefix + 1) continue;

      const at = firstDivergence(first.steps, second.steps, sharedLength);
      if (at < 0 || at < minPrefix) continue;

      const firstStep = first.steps[at];
      const secondStep = second.steps[at];
      // A structural divergence can still have a negligible reward gap; skip those.
      if (Math.abs(firstStep.reward - secondStep.reward) < minMargin) continue;

      const firstWins = firstStep.reward > secondStep.reward;
      const chosenStep = firstWins ? firstStep : secondStep;
      const rejectedStep = firstWins ? secondStep : firstStep;

      triples.push({
        prefixRunId: firstWins ? first.runId : second.runId,
        prefixStepIndex: at - 1,
        chosenSpanId: chosenStep.spanId,
        chosenReward: chosenStep.reward,
        rejectedSpanId: rejectedStep.spanId,
        rejectedReward: rejectedStep.reward,
        rejectedRunId: firstWins ? second.runId : first.runId,
        marginScore: chosenStep.reward - rejectedStep.reward
      });
    }
  }
  return triples;
}
1926
+
1927
+ // src/rl/rl-campaign.ts
1928
/**
 * Run an eval campaign and derive RL artefacts from its runs.
 *
 * After `runEvalCampaign(opts)` completes, the successful runs are turned into
 * verifiable reward signals, preference pairs, an optional sequential
 * (interim) confidence verdict, a reward-hacking report, optional rubric
 * predictive validity, and optional trainer export rows.
 *
 * @param {object} opts - Campaign options, forwarded whole to `runEvalCampaign`.
 * @param {object} [opts.verifiableReward] - Verifiable-reward options.
 * @param {object} [opts.preferences] - Overrides for strategy / minMargin / splitTag / rewardOf.
 * @param {string} [opts.splitTag] - Fallback preference split tag (then "holdout").
 * @param {object} [opts.report] - `comparator` enables the sequential check;
 *   `rope` is the fallback for `sequential.rope`.
 * @param {object} [opts.sequential] - `alpha` / `bound` / `rope` for the sequential check.
 * @param {object} [opts.outcomeStore] - With a non-empty `outcomeMetrics`, enables predictive validity.
 * @param {string[]} [opts.outcomeMetrics]
 * @param {object} [opts.trainerExport] - `dpo`, `grpo` and/or `sft` export options.
 * @returns {Promise<object>} `{ campaign, rewardSignals, preferences, interimConfidence,
 *   rewardHacking, predictiveValidity, trainerRows, summary, kind }`.
 */
async function runRLCampaign(opts) {
  const campaign = await runEvalCampaign(opts);
  const campaignRuns = campaign.runs;

  const rewardSignals = extractVerifiableRewardsFromRecords(campaignRuns, opts.verifiableReward ?? {});

  const prefOpts = opts.preferences;
  const preferences = extractPreferences(campaignRuns, {
    strategy: prefOpts?.strategy ?? "paired-by-scenario-and-seed",
    minMargin: prefOpts?.minMargin ?? 0.05,
    splitTag: prefOpts?.splitTag ?? opts.splitTag ?? "holdout",
    rewardOf: prefOpts?.rewardOf
  });

  // The sequential check only runs when the report names a comparator and at
  // least one candidate has a paired delta against it.
  let interimConfidence = null;
  if (opts.report?.comparator) {
    const deltaSeries = collectPairedDeltaSeries2(campaignRuns, opts.report.comparator);
    const hasAnyDelta = deltaSeries.some((series) => series.deltas.length > 0);
    if (hasAnyDelta) {
      const sequential = opts.sequential;
      interimConfidence = evaluateInterimReleaseConfidence({
        deltaSeries,
        alpha: sequential?.alpha,
        bound: sequential?.bound,
        rope: sequential?.rope ?? opts.report?.rope
      });
    }
  }

  const rewardHacking = detectRewardHacking({
    runs: campaignRuns,
    verifiableRewardOptions: opts.verifiableReward
  });

  const canCheckValidity = opts.outcomeStore && opts.outcomeMetrics && opts.outcomeMetrics.length > 0;
  const predictiveValidity = canCheckValidity
    ? await rubricPredictiveValidity({
        runs: campaignRuns,
        outcomes: opts.outcomeStore,
        outcomeMetrics: opts.outcomeMetrics
      })
    : null;

  const trainerRows = {};
  const exportOpts = opts.trainerExport;
  if (exportOpts?.dpo) {
    trainerRows.dpo = await toDpoRows(preferences.pairs, exportOpts.dpo);
  }
  if (exportOpts?.grpo) {
    trainerRows.grpo = await toGrpoRows(campaignRuns, exportOpts.grpo);
  }
  if (exportOpts?.sft) {
    trainerRows.sft = await toSftRows(campaignRuns, exportOpts.sft);
  }

  return {
    campaign,
    rewardSignals,
    preferences,
    interimConfidence,
    rewardHacking,
    predictiveValidity,
    trainerRows,
    summary: buildSummary2({
      campaign,
      preferences,
      interimConfidence,
      rewardHacking,
      predictiveValidity
    }),
    kind: "agent-eval-rl-campaign"
  };
}
1994
/**
 * Pair every non-comparator run with the comparator run that shares its
 * scenario (or experiment) id and seed, and collect the score differences.
 *
 * A run's score is `outcome.holdoutScore`, falling back to
 * `outcome.searchScore`; runs without a finite score are ignored.
 *
 * @param {object[]} runs - Run records.
 * @param {string} comparator - Candidate id of the baseline.
 * @returns {{ candidateId: string, deltas: number[] }[]} One entry per
 *   candidate that has at least one paired run; each delta is
 *   `candidate score - comparator score`.
 */
function collectPairedDeltaSeries2(runs, comparator) {
  // Returns [pairing key, score] for a run, or null when the score is unusable.
  const describe = (run) => {
    const scenario = run.scenarioId ?? run.experimentId;
    const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
    if (typeof score !== "number" || !Number.isFinite(score)) return null;
    return [`${scenario}::${run.seed}`, score];
  };

  const baselineByKey = new Map();
  const challengers = [];
  for (const run of runs) {
    if (run.candidateId === comparator) {
      const described = describe(run);
      if (described) baselineByKey.set(described[0], described[1]);
    } else {
      challengers.push(run);
    }
  }

  // Challengers are diffed only after every baseline score is indexed, so a
  // challenger listed before its baseline run still pairs up.
  const series = new Map();
  for (const run of challengers) {
    const described = describe(run);
    if (!described) continue;
    const [key, score] = described;
    const baseScore = baselineByKey.get(key);
    if (typeof baseScore !== "number") continue;
    if (!series.has(run.candidateId)) series.set(run.candidateId, []);
    series.get(run.candidateId).push(score - baseScore);
  }

  return [...series].map(([candidateId, deltas]) => ({ candidateId, deltas }));
}
2017
/**
 * Render a one-line, " | "-separated summary of an RL campaign.
 *
 * @param {object} args
 * @param {{ campaignId: string, runs: Array, failedRuns: Array, campaignFingerprint: string }} args.campaign
 * @param {{ pairs: Array, strategy: string, pairsBelowMargin: number }} args.preferences
 * @param {{ verdict: string, findings: Array }} args.rewardHacking
 * @param {?{ recommendation: { decision: string, candidateId?: string } }} args.interimConfidence
 * @param {?{ ranked: Array<{ rubric: string, spearman: number, verdict: string }> }} args.predictiveValidity
 * @returns {string} Summary line. The sequential segment appears only with an
 *   interim verdict; the top-rubric segment appears whenever a
 *   predictive-validity report exists, with placeholders if it ranked nothing.
 */
function buildSummary2(args) {
  const { campaign, preferences, interimConfidence, rewardHacking, predictiveValidity } = args;

  // Only the first 12 characters of the fingerprint are shown.
  const fingerprintPrefix = campaign.campaignFingerprint.slice(0, 12);
  const segments = [
    `${campaign.campaignId}: ${campaign.runs.length} successful runs / ${campaign.failedRuns.length} failed (fingerprint ${fingerprintPrefix}\u2026)`,
    `preferences: ${preferences.pairs.length} (${preferences.strategy}, ${preferences.pairsBelowMargin} below margin)`
  ];

  if (interimConfidence) {
    const recommendation = interimConfidence.recommendation;
    const candidateSuffix = recommendation.candidateId ? ` ${recommendation.candidateId}` : "";
    segments.push(`sequential verdict: ${recommendation.decision}${candidateSuffix}`);
  }

  segments.push(`reward-hacking: ${rewardHacking.verdict} (${rewardHacking.findings.length} signals checked)`);

  if (predictiveValidity) {
    const leader = predictiveValidity.ranked[0];
    const rubricName = leader?.rubric ?? "none";
    const correlation = (leader?.spearman ?? 0).toFixed(2);
    const verdictLabel = leader?.verdict ?? "no data";
    segments.push(`top-rubric: ${rubricName} \u03C1=${correlation} (${verdictLabel})`);
  }

  return segments.join(" | ");
}
62
2039
  export {
63
2040
  PredictiveValidityResearcher,
64
2041
  adversarialScenarioSearch,