@tangle-network/agent-eval 0.23.1 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148) hide show
  1. package/CHANGELOG.md +80 -0
  2. package/README.md +141 -79
  3. package/dist/baseline-4R5deP0N.d.ts +108 -0
  4. package/dist/benchmarks/index.d.ts +3 -2
  5. package/dist/benchmarks/index.js +1 -1
  6. package/dist/builder-eval/index.d.ts +249 -0
  7. package/dist/builder-eval/index.js +391 -0
  8. package/dist/builder-eval/index.js.map +1 -0
  9. package/dist/{chunk-IOXMGMHQ.js → chunk-2A5XJB43.js} +142 -318
  10. package/dist/chunk-2A5XJB43.js.map +1 -0
  11. package/dist/chunk-47X6LRCE.js +76 -0
  12. package/dist/chunk-47X6LRCE.js.map +1 -0
  13. package/dist/{chunk-6M774GY6.js → chunk-4F5DQN55.js} +1 -1
  14. package/dist/chunk-4F5DQN55.js.map +1 -0
  15. package/dist/{chunk-KAO3Q65R.js → chunk-4S4BM3QQ.js} +15 -13
  16. package/dist/chunk-4S4BM3QQ.js.map +1 -0
  17. package/dist/chunk-5BKGXME7.js +65 -0
  18. package/dist/chunk-5BKGXME7.js.map +1 -0
  19. package/dist/{chunk-42I2QC2L.js → chunk-6QDKWHLS.js} +18 -14
  20. package/dist/chunk-6QDKWHLS.js.map +1 -0
  21. package/dist/chunk-I4MBDTY5.js +272 -0
  22. package/dist/chunk-I4MBDTY5.js.map +1 -0
  23. package/dist/chunk-K2TPS5LB.js +569 -0
  24. package/dist/chunk-K2TPS5LB.js.map +1 -0
  25. package/dist/chunk-KKHDIONI.js +414 -0
  26. package/dist/chunk-KKHDIONI.js.map +1 -0
  27. package/dist/chunk-KMPRBJK4.js +74 -0
  28. package/dist/chunk-KMPRBJK4.js.map +1 -0
  29. package/dist/{chunk-QUKKGHTZ.js → chunk-KTGTIOFD.js} +6 -3
  30. package/dist/chunk-KTGTIOFD.js.map +1 -0
  31. package/dist/chunk-LSH4MMOZ.js +838 -0
  32. package/dist/chunk-LSH4MMOZ.js.map +1 -0
  33. package/dist/chunk-NG236HPC.js +57 -0
  34. package/dist/chunk-NG236HPC.js.map +1 -0
  35. package/dist/{chunk-QBW3YBTR.js → chunk-NLMNWKVM.js} +14 -6
  36. package/dist/chunk-NLMNWKVM.js.map +1 -0
  37. package/dist/chunk-NU65VQ7M.js +99 -0
  38. package/dist/chunk-NU65VQ7M.js.map +1 -0
  39. package/dist/chunk-OHEPNJQN.js +554 -0
  40. package/dist/chunk-OHEPNJQN.js.map +1 -0
  41. package/dist/chunk-OWLAAMME.js +250 -0
  42. package/dist/chunk-OWLAAMME.js.map +1 -0
  43. package/dist/{chunk-SQQLHODJ.js → chunk-PC4UYEBM.js} +7 -4
  44. package/dist/chunk-PC4UYEBM.js.map +1 -0
  45. package/dist/{chunk-7EAUOUQS.js → chunk-RAF443UI.js} +213 -115
  46. package/dist/chunk-RAF443UI.js.map +1 -0
  47. package/dist/chunk-RZTMDUO7.js +49 -0
  48. package/dist/chunk-RZTMDUO7.js.map +1 -0
  49. package/dist/{chunk-EXGR4XEM.js → chunk-SESZDQPX.js} +23 -19
  50. package/dist/chunk-SESZDQPX.js.map +1 -0
  51. package/dist/{chunk-6KQG5HAH.js → chunk-SY6WAAAD.js} +84 -71
  52. package/dist/chunk-SY6WAAAD.js.map +1 -0
  53. package/dist/{chunk-5IIQKMD5.js → chunk-TVVP3ZZQ.js} +14 -4
  54. package/dist/chunk-TVVP3ZZQ.js.map +1 -0
  55. package/dist/{chunk-VQQSPGSM.js → chunk-VRJVTXRV.js} +169 -111
  56. package/dist/chunk-VRJVTXRV.js.map +1 -0
  57. package/dist/chunk-WWYCWKUM.js +196 -0
  58. package/dist/chunk-WWYCWKUM.js.map +1 -0
  59. package/dist/{chunk-AXHNWLIX.js → chunk-YRZ4M5GS.js} +2 -90
  60. package/dist/chunk-YRZ4M5GS.js.map +1 -0
  61. package/dist/chunk-ZN274SWR.js +613 -0
  62. package/dist/chunk-ZN274SWR.js.map +1 -0
  63. package/dist/cli.js +10 -6
  64. package/dist/cli.js.map +1 -1
  65. package/dist/{control-DvkH87qJ.d.ts → control-CBShYYA6.d.ts} +32 -33
  66. package/dist/control-runtime-BuJHoLg0.d.ts +180 -0
  67. package/dist/control.d.ts +8 -6
  68. package/dist/control.js +10 -7
  69. package/dist/{dataset-B9qvlm_o.d.ts → dataset-CiK_3LDr.d.ts} +5 -2
  70. package/dist/{emitter-B2XqDKFU.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
  71. package/dist/errors-BZ9sTdz7.d.ts +70 -0
  72. package/dist/failure-cluster-C2EGSDiT.d.ts +76 -0
  73. package/dist/feedback-trajectory-DfFdrraJ.d.ts +169 -0
  74. package/dist/governance/index.d.ts +5 -0
  75. package/dist/governance/index.js +18 -0
  76. package/dist/governance/index.js.map +1 -0
  77. package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} +1 -1
  78. package/dist/index-Oj9fAPPN.d.ts +270 -0
  79. package/dist/index.d.ts +1866 -3151
  80. package/dist/index.js +5457 -7809
  81. package/dist/index.js.map +1 -1
  82. package/dist/{integrity-Cr5YodSY.d.ts → integrity-DK2EBVZC.d.ts} +4 -3
  83. package/dist/knowledge/index.d.ts +102 -0
  84. package/dist/knowledge/index.js +18 -0
  85. package/dist/knowledge/index.js.map +1 -0
  86. package/dist/meta-eval/index.d.ts +99 -0
  87. package/dist/meta-eval/index.js +324 -0
  88. package/dist/meta-eval/index.js.map +1 -0
  89. package/dist/multi-layer-verifier-LkP3LVKj.d.ts +141 -0
  90. package/dist/openapi.json +1 -1
  91. package/dist/optimization.d.ts +11 -8
  92. package/dist/optimization.js +11 -9
  93. package/dist/outcome-store-D6KWmYvj.d.ts +63 -0
  94. package/dist/pipelines/index.d.ts +172 -0
  95. package/dist/pipelines/index.js +409 -0
  96. package/dist/pipelines/index.js.map +1 -0
  97. package/dist/prm/index.d.ts +99 -0
  98. package/dist/prm/index.js +222 -0
  99. package/dist/prm/index.js.map +1 -0
  100. package/dist/query-DODUYdPg.d.ts +30 -0
  101. package/dist/release-report-TDPn1cxq.d.ts +292 -0
  102. package/dist/replay-BL96gCEP.d.ts +226 -0
  103. package/dist/reporting.d.ts +10 -295
  104. package/dist/reporting.js +10 -6
  105. package/dist/{eval-campaign-Ds5QljIh.d.ts → researcher-CUOiGcGv.d.ts} +148 -146
  106. package/dist/rl.d.ts +1762 -8
  107. package/dist/rl.js +2035 -58
  108. package/dist/rl.js.map +1 -1
  109. package/dist/rubric-D5tjHNJQ.d.ts +72 -0
  110. package/dist/rubric-predictive-validity-C0uDYwG6.d.ts +105 -0
  111. package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} +4 -1
  112. package/dist/sequential-Dgz1n51-.d.ts +139 -0
  113. package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
  114. package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-BXGs_9V0.d.ts} +3 -76
  115. package/dist/telemetry/file.js +4 -1
  116. package/dist/telemetry/file.js.map +1 -1
  117. package/dist/telemetry/index.js +57 -57
  118. package/dist/telemetry/index.js.map +1 -1
  119. package/dist/test-graded-scenario-B2kWEdh9.d.ts +146 -0
  120. package/dist/traces.d.ts +142 -387
  121. package/dist/traces.js +1302 -40
  122. package/dist/traces.js.map +1 -1
  123. package/dist/trajectory-CnoBo-JY.d.ts +32 -0
  124. package/dist/wire/index.d.ts +22 -22
  125. package/dist/wire/index.js +4 -3
  126. package/package.json +44 -18
  127. package/dist/chunk-42I2QC2L.js.map +0 -1
  128. package/dist/chunk-5IIQKMD5.js.map +0 -1
  129. package/dist/chunk-6KQG5HAH.js.map +0 -1
  130. package/dist/chunk-6M774GY6.js.map +0 -1
  131. package/dist/chunk-7EAUOUQS.js.map +0 -1
  132. package/dist/chunk-AXHNWLIX.js.map +0 -1
  133. package/dist/chunk-EXGR4XEM.js.map +0 -1
  134. package/dist/chunk-IOXMGMHQ.js.map +0 -1
  135. package/dist/chunk-KAO3Q65R.js.map +0 -1
  136. package/dist/chunk-LZKIOBG2.js +0 -2026
  137. package/dist/chunk-LZKIOBG2.js.map +0 -1
  138. package/dist/chunk-QBW3YBTR.js.map +0 -1
  139. package/dist/chunk-QUKKGHTZ.js.map +0 -1
  140. package/dist/chunk-SQQLHODJ.js.map +0 -1
  141. package/dist/chunk-V5QSWN7L.js +0 -1310
  142. package/dist/chunk-V5QSWN7L.js.map +0 -1
  143. package/dist/chunk-VQQSPGSM.js.map +0 -1
  144. package/dist/chunk-XPHOZPOM.js +0 -1947
  145. package/dist/chunk-XPHOZPOM.js.map +0 -1
  146. package/dist/feedback-trajectory-c43WGtTX.d.ts +0 -346
  147. package/dist/index-ekBXweiQ.d.ts +0 -1894
  148. package/dist/sequential-DgU2mFsE.d.ts +0 -304
@@ -1,2026 +0,0 @@
1
- import {
2
- runEvalCampaign
3
- } from "./chunk-EXGR4XEM.js";
4
- import {
5
- evaluateInterimReleaseConfidence,
6
- rubricPredictiveValidity
7
- } from "./chunk-AXHNWLIX.js";
8
- import {
9
- benjaminiHochberg,
10
- wilcoxonSignedRank
11
- } from "./chunk-IOXMGMHQ.js";
12
-
13
- // src/rl/run-record-adapters.ts
14
- function trialToRunRecord(trial, ctx, opts = {}) {
15
- const splitTag = ctx.splitTag ?? "search";
16
- const promptHash = typeof ctx.promptHash === "function" ? ctx.promptHash(trial) : ctx.promptHash;
17
- const configHash = typeof ctx.configHash === "function" ? ctx.configHash(trial) : ctx.configHash;
18
- const runId = opts.runId ?? defaultRunId(ctx, trial);
19
- const experimentId = opts.experimentIdPerTrial?.(trial) ?? ctx.experimentId;
20
- const costRecorded = typeof trial.cost === "number" && Number.isFinite(trial.cost);
21
- const costUsd = costRecorded ? trial.cost : ctx.defaultCostUsd ?? 0;
22
- const raw = { ...trial.metrics ?? {} };
23
- if (!costRecorded) raw.cost_unknown = 1;
24
- if (typeof trial.durationMs === "number") raw.duration_ms = trial.durationMs;
25
- raw.rep = trial.rep;
26
- const score = Number.isFinite(trial.score) ? trial.score : 0;
27
- const outcome = { raw };
28
- if (splitTag === "holdout") outcome.holdoutScore = score;
29
- else outcome.searchScore = score;
30
- return {
31
- runId,
32
- experimentId,
33
- candidateId: trial.variantId,
34
- seed: trial.rep,
35
- model: ctx.model,
36
- promptHash,
37
- configHash,
38
- commitSha: ctx.commitSha,
39
- wallMs: trial.durationMs ?? 0,
40
- costUsd,
41
- tokenUsage: { input: 0, output: 0 },
42
- outcome,
43
- failureMode: trial.ok ? void 0 : trial.error ? "optimizer_trial_error" : "optimizer_trial_failed",
44
- splitTag,
45
- scenarioId: trial.scenarioId
46
- };
47
- }
48
- function trialsToRunRecords(trials, ctx) {
49
- return trials.map((t) => trialToRunRecord(t, ctx));
50
- }
51
- function verificationReportToRunRecord(report, ctx, opts = {}) {
52
- const splitTag = ctx.splitTag ?? "search";
53
- const runId = opts.runId ?? `run-${ctx.candidateId}-${ctx.experimentId}-${report.startedAt}`;
54
- const promptHash = typeof ctx.promptHash === "function" ? "p".repeat(64) : ctx.promptHash;
55
- const configHash = typeof ctx.configHash === "function" ? "c".repeat(64) : ctx.configHash;
56
- const raw = {
57
- pass_count: report.passCount,
58
- fail_count: report.failCount,
59
- error_count: report.errorCount,
60
- skipped_count: report.skippedCount,
61
- duration_ms: report.durationMs,
62
- blended_score: report.blendedScore
63
- };
64
- for (const layer of report.layers) {
65
- if (typeof layer.score === "number") raw[`layer.${layer.layer}`] = layer.score;
66
- raw[`layer_${layer.layer}_pass`] = layer.status === "pass" ? 1 : 0;
67
- if (layer.diagnostics) {
68
- for (const [k, v] of Object.entries(layer.diagnostics)) {
69
- if (typeof v === "number" && Number.isFinite(v)) raw[`layer.${layer.layer}.${k}`] = v;
70
- }
71
- }
72
- }
73
- const firstFail = report.layers.find((l) => l.status === "fail" || l.status === "error");
74
- const outcome = { raw };
75
- if (splitTag === "holdout") outcome.holdoutScore = report.blendedScore;
76
- else outcome.searchScore = report.blendedScore;
77
- return {
78
- runId,
79
- experimentId: ctx.experimentId,
80
- candidateId: ctx.candidateId,
81
- seed: 0,
82
- model: ctx.model,
83
- promptHash,
84
- configHash,
85
- commitSha: ctx.commitSha,
86
- wallMs: report.durationMs,
87
- costUsd: ctx.defaultCostUsd ?? 0,
88
- tokenUsage: { input: 0, output: 0 },
89
- outcome,
90
- failureMode: firstFail ? failureModeFromLayer(firstFail) : void 0,
91
- splitTag,
92
- scenarioId: ctx.scenarioId
93
- };
94
- }
95
- function variantAggregateToRunRecord(agg, ctx, opts = {}) {
96
- const splitTag = ctx.splitTag ?? "search";
97
- const runId = opts.runId ?? `agg-${agg.variantId}-${ctx.experimentId}`;
98
- const promptHash = typeof ctx.promptHash === "function" ? "p".repeat(64) : ctx.promptHash;
99
- const configHash = typeof ctx.configHash === "function" ? "c".repeat(64) : ctx.configHash;
100
- const raw = {
101
- ...agg.metrics,
102
- ok_rate: agg.okRate,
103
- duration_ms: agg.meanDurationMs,
104
- n_scenarios: agg.scenarios.length
105
- };
106
- const outcome = { raw };
107
- if (splitTag === "holdout") outcome.holdoutScore = agg.meanScore;
108
- else outcome.searchScore = agg.meanScore;
109
- return {
110
- runId,
111
- experimentId: ctx.experimentId,
112
- candidateId: agg.variantId,
113
- seed: 0,
114
- model: ctx.model,
115
- promptHash,
116
- configHash,
117
- commitSha: ctx.commitSha,
118
- wallMs: agg.meanDurationMs,
119
- costUsd: agg.meanCost,
120
- tokenUsage: { input: 0, output: 0 },
121
- outcome,
122
- splitTag
123
- };
124
- }
125
- function defaultRunId(ctx, t) {
126
- return `run-${ctx.experimentId}-${t.variantId}-${t.scenarioId}-${t.rep}`;
127
- }
128
- function failureModeFromLayer(layer) {
129
- if (layer.status === "error") return `layer_${layer.layer}_error`;
130
- if (layer.status === "fail") return `layer_${layer.layer}_fail`;
131
- if (layer.status === "timeout") return `layer_${layer.layer}_timeout`;
132
- return `layer_${layer.layer}_${layer.status}`;
133
- }
134
-
135
- // src/rl/verifiable-reward.ts
136
- var DEFAULT_DETERMINISTIC_LAYERS = /* @__PURE__ */ new Set([
137
- "install",
138
- "typecheck",
139
- "build",
140
- "lint",
141
- "test",
142
- "compile",
143
- "schema",
144
- "sandbox",
145
- "unit_tests",
146
- "integration_tests"
147
- ]);
148
- var DEFAULT_SOURCE_FOR = (name) => {
149
- const lower = name.toLowerCase();
150
- if (lower.includes("test")) return "test";
151
- if (lower.includes("compile") || lower.includes("build") || lower.includes("typecheck") || lower.includes("lint")) return "compile";
152
- if (lower.includes("schema")) return "schema";
153
- if (lower.includes("sandbox")) return "sandbox";
154
- if (lower.includes("judge") || lower.includes("semantic")) return "judge";
155
- return "composite";
156
- };
157
- function extractVerifiableReward(report, opts = {}) {
158
- const deterministicSet = new Set(opts.deterministicLayers ?? [...DEFAULT_DETERMINISTIC_LAYERS]);
159
- const sourceFor = opts.sourceFor ?? DEFAULT_SOURCE_FOR;
160
- const fallbackToJudge = opts.fallbackToJudge ?? true;
161
- const judgeFloor = opts.judgeConfidenceFloor ?? 0.7;
162
- const deterministic = report.layers.filter(
163
- (l) => deterministicSet.has(l.layer) && typeof l.score === "number" && Number.isFinite(l.score)
164
- );
165
- if (deterministic.length === 1) {
166
- const layer = deterministic[0];
167
- return {
168
- value: clamp01(layer.score),
169
- source: sourceFor(layer.layer),
170
- determinism: "deterministic",
171
- confidence: 1,
172
- origin: layer.layer,
173
- breakdown: layerBreakdown(layer)
174
- };
175
- }
176
- if (deterministic.length > 1) {
177
- let num = 0;
178
- let denom = 0;
179
- const breakdown = {};
180
- for (const l of deterministic) {
181
- const w = l.detail?.weight ?? 1;
182
- num += w * (l.score ?? 0);
183
- denom += w;
184
- breakdown[l.layer] = l.score;
185
- }
186
- return {
187
- value: denom === 0 ? 0 : clamp01(num / denom),
188
- source: "composite",
189
- determinism: "deterministic",
190
- confidence: 1,
191
- origin: deterministic.map((l) => l.layer).join("+"),
192
- breakdown
193
- };
194
- }
195
- if (!fallbackToJudge) return null;
196
- const judge = report.layers.find(
197
- (l) => typeof l.score === "number" && Number.isFinite(l.score) && sourceFor(l.layer) === "judge"
198
- ) ?? report.layers.find((l) => typeof l.score === "number" && Number.isFinite(l.score));
199
- if (!judge) return null;
200
- const confFromDetail = judge.detail?.confidence;
201
- return {
202
- value: clamp01(judge.score),
203
- source: "judge",
204
- determinism: "probabilistic",
205
- confidence: typeof confFromDetail === "number" ? confFromDetail : judgeFloor,
206
- origin: judge.layer,
207
- breakdown: layerBreakdown(judge)
208
- };
209
- }
210
- function extractVerifiableRewardsFromRecords(runs, opts = {}) {
211
- const sourceFor = opts.sourceFor ?? DEFAULT_SOURCE_FOR;
212
- const deterministicSet = new Set(opts.deterministicLayers ?? [...DEFAULT_DETERMINISTIC_LAYERS]);
213
- const fallbackToJudge = opts.fallbackToJudge ?? true;
214
- const judgeFloor = opts.judgeConfidenceFloor ?? 0.7;
215
- return runs.map((run) => {
216
- const layerScores = [];
217
- for (const [k, v] of Object.entries(run.outcome.raw)) {
218
- if (k.startsWith("layer.") && !k.includes(".", 6) && typeof v === "number" && Number.isFinite(v)) {
219
- layerScores.push({ name: k.slice("layer.".length), score: v });
220
- }
221
- }
222
- const det = layerScores.filter((l) => deterministicSet.has(l.name));
223
- if (det.length === 1) {
224
- const layer = det[0];
225
- return {
226
- runId: run.runId,
227
- reward: {
228
- value: clamp01(layer.score),
229
- source: sourceFor(layer.name),
230
- determinism: "deterministic",
231
- confidence: 1,
232
- origin: layer.name
233
- }
234
- };
235
- }
236
- if (det.length > 1) {
237
- const value = det.reduce((s, l) => s + l.score, 0) / det.length;
238
- const breakdown = Object.fromEntries(det.map((l) => [l.name, l.score]));
239
- return {
240
- runId: run.runId,
241
- reward: {
242
- value: clamp01(value),
243
- source: "composite",
244
- determinism: "deterministic",
245
- confidence: 1,
246
- origin: det.map((l) => l.name).join("+"),
247
- breakdown
248
- }
249
- };
250
- }
251
- if (!fallbackToJudge) return { runId: run.runId, reward: null };
252
- const primary = run.outcome.holdoutScore ?? run.outcome.searchScore;
253
- if (typeof primary !== "number" || !Number.isFinite(primary)) {
254
- return { runId: run.runId, reward: null };
255
- }
256
- return {
257
- runId: run.runId,
258
- reward: {
259
- value: clamp01(primary),
260
- source: "judge",
261
- determinism: "probabilistic",
262
- confidence: judgeFloor,
263
- origin: "run.outcome.score"
264
- }
265
- };
266
- });
267
- }
268
- function filterDeterministicallyRewarded(runs, opts = {}) {
269
- const rewarded = extractVerifiableRewardsFromRecords(runs, { ...opts, fallbackToJudge: false });
270
- const out = [];
271
- for (let i = 0; i < runs.length; i++) {
272
- const r = rewarded[i];
273
- if (r.reward && r.reward.determinism === "deterministic") {
274
- out.push({ run: runs[i], reward: r.reward });
275
- }
276
- }
277
- return out;
278
- }
279
- function clamp01(x) {
280
- if (!Number.isFinite(x)) return 0;
281
- return Math.max(0, Math.min(1, x));
282
- }
283
- function layerBreakdown(l) {
284
- const out = {};
285
- if (l.diagnostics) {
286
- for (const [k, v] of Object.entries(l.diagnostics)) {
287
- if (typeof v === "number" && Number.isFinite(v)) out[k] = v;
288
- }
289
- }
290
- return out;
291
- }
292
-
293
- // src/rl/preferences.ts
294
- var SPLIT_TAG_DEFAULT = "holdout";
295
- var DEFAULT_REWARD = (run) => {
296
- const v = run.outcome.holdoutScore ?? run.outcome.searchScore;
297
- return typeof v === "number" && Number.isFinite(v) ? v : null;
298
- };
299
- function extractPreferences(runs, opts = {}) {
300
- const strategy = opts.strategy ?? "paired-by-scenario-and-seed";
301
- const minMargin = opts.minMargin ?? 0.05;
302
- const splitTag = opts.splitTag ?? SPLIT_TAG_DEFAULT;
303
- const rewardOf = opts.rewardOf ?? DEFAULT_REWARD;
304
- const filtered = runs.filter((r) => r.splitTag === splitTag);
305
- const scoredEntries = [];
306
- for (const run of filtered) {
307
- const s = rewardOf(run);
308
- if (s === null) continue;
309
- scoredEntries.push({ run, score: s });
310
- }
311
- const pairs = [];
312
- let pairsBelowMargin = 0;
313
- let cellsSingleton = 0;
314
- let cellsInspected = 0;
315
- if (strategy === "paired-by-scenario-and-seed") {
316
- const groups = /* @__PURE__ */ new Map();
317
- for (const e of scoredEntries) {
318
- const sid = scenarioOf(e.run);
319
- const key = `${sid}::${e.run.seed}`;
320
- const arr = groups.get(key) ?? [];
321
- arr.push(e);
322
- groups.set(key, arr);
323
- }
324
- for (const [key, members] of groups.entries()) {
325
- cellsInspected++;
326
- if (members.length < 2) {
327
- cellsSingleton++;
328
- continue;
329
- }
330
- for (let i = 0; i < members.length; i++) {
331
- for (let j = i + 1; j < members.length; j++) {
332
- const a = members[i];
333
- const b = members[j];
334
- if (a.run.candidateId === b.run.candidateId) continue;
335
- const result = makePair(a, b, key.split("::")[0], minMargin);
336
- if (result.kind === "admit") pairs.push(result.pair);
337
- else pairsBelowMargin++;
338
- }
339
- }
340
- }
341
- } else if (strategy === "paired-by-scenario") {
342
- const byScenarioVariant = /* @__PURE__ */ new Map();
343
- for (const e of scoredEntries) {
344
- const sid = scenarioOf(e.run);
345
- let perScenario = byScenarioVariant.get(sid);
346
- if (!perScenario) {
347
- perScenario = /* @__PURE__ */ new Map();
348
- byScenarioVariant.set(sid, perScenario);
349
- }
350
- const cur = perScenario.get(e.run.candidateId);
351
- if (cur) {
352
- cur.sum += e.score;
353
- cur.n++;
354
- } else perScenario.set(e.run.candidateId, { run: e.run, sum: e.score, n: 1 });
355
- }
356
- for (const [sid, perVariant] of byScenarioVariant.entries()) {
357
- cellsInspected++;
358
- const arr = [...perVariant.entries()].map(([vid, agg]) => ({
359
- run: agg.run,
360
- score: agg.sum / agg.n,
361
- variantId: vid
362
- }));
363
- if (arr.length < 2) {
364
- cellsSingleton++;
365
- continue;
366
- }
367
- for (let i = 0; i < arr.length; i++) {
368
- for (let j = i + 1; j < arr.length; j++) {
369
- const result = makePair(arr[i], arr[j], sid, minMargin);
370
- if (result.kind === "admit") pairs.push(result.pair);
371
- else pairsBelowMargin++;
372
- }
373
- }
374
- }
375
- } else {
376
- const byScenario = /* @__PURE__ */ new Map();
377
- for (const e of scoredEntries) {
378
- const sid = scenarioOf(e.run);
379
- const arr = byScenario.get(sid) ?? [];
380
- arr.push(e);
381
- byScenario.set(sid, arr);
382
- }
383
- for (const [sid, arr] of byScenario.entries()) {
384
- cellsInspected++;
385
- if (arr.length < 2) {
386
- cellsSingleton++;
387
- continue;
388
- }
389
- const sorted = [...arr].sort((a, b) => a.score - b.score);
390
- const top = sorted[sorted.length - 1];
391
- const bot = sorted[0];
392
- if (top.run.candidateId === bot.run.candidateId) {
393
- cellsSingleton++;
394
- continue;
395
- }
396
- const result = makePair(bot, top, sid, minMargin);
397
- if (result.kind === "admit") pairs.push(result.pair);
398
- else pairsBelowMargin++;
399
- }
400
- }
401
- return { pairs, cellsInspected, pairsBelowMargin, cellsSingleton, strategy };
402
- }
403
- function toTRLFormat(triples, promptOf) {
404
- return triples.map((t) => ({
405
- prompt: promptOf(t.meta.chosenPromptHash),
406
- chosen: t.meta.chosenPromptHash,
407
- // caller substitutes the model output via the runId map
408
- rejected: t.meta.rejectedPromptHash
409
- }));
410
- }
411
- function toAnthropicFormat(triples) {
412
- return triples.map((t) => ({
413
- scenarioId: t.scenarioId,
414
- chosenRunId: t.chosenRunId,
415
- rejectedRunId: t.rejectedRunId,
416
- margin: t.marginScore
417
- }));
418
- }
419
- function makePair(a, b, scenarioId, minMargin) {
420
- const margin = Math.abs(a.score - b.score);
421
- if (margin < minMargin) return { kind: "reject" };
422
- const [chosen, rejected] = a.score > b.score ? [a, b] : [b, a];
423
- return {
424
- kind: "admit",
425
- pair: {
426
- scenarioId,
427
- chosenRunId: chosen.run.runId,
428
- rejectedRunId: rejected.run.runId,
429
- chosenVariantId: chosen.run.candidateId,
430
- rejectedVariantId: rejected.run.candidateId,
431
- marginScore: chosen.score - rejected.score,
432
- scores: { chosen: chosen.score, rejected: rejected.score },
433
- seed: chosen.run.seed === rejected.run.seed ? chosen.run.seed : void 0,
434
- meta: {
435
- chosenPromptHash: chosen.run.promptHash,
436
- rejectedPromptHash: rejected.run.promptHash,
437
- chosenConfigHash: chosen.run.configHash,
438
- rejectedConfigHash: rejected.run.configHash,
439
- chosenModel: chosen.run.model,
440
- rejectedModel: rejected.run.model
441
- }
442
- }
443
- };
444
- }
445
- function scenarioOf(run) {
446
- if (typeof run.scenarioId === "string" && run.scenarioId.length > 0) return run.scenarioId;
447
- const fromRaw = run.outcome.raw.scenario_id;
448
- if (typeof fromRaw === "number" && Number.isFinite(fromRaw)) return String(fromRaw);
449
- if (typeof fromRaw === "string") return fromRaw;
450
- return run.experimentId;
451
- }
452
-
453
- // src/rl/off-policy.ts
454
- function inverseProbabilityWeighting(trajectories, opts = {}) {
455
- const cap = opts.weightCap ?? Infinity;
456
- const clip = opts.rewardClip ?? { low: 0, high: 1 };
457
- if (trajectories.length === 0) {
458
- return zeroEstimate();
459
- }
460
- const weights = [];
461
- const weightedRewards = [];
462
- let maxW = 0;
463
- for (const t of trajectories) {
464
- if (t.behaviorProb <= 0) {
465
- throw new Error(`inverseProbabilityWeighting: behaviorProb must be > 0 (runId=${t.runId})`);
466
- }
467
- const w = Math.min(cap, t.targetProb / t.behaviorProb);
468
- const r = clamp(t.reward, clip.low, clip.high);
469
- weights.push(w);
470
- weightedRewards.push(w * r);
471
- if (w > maxW) maxW = w;
472
- }
473
- const n = weights.length;
474
- const value = weightedRewards.reduce((s, x) => s + x, 0) / n;
475
- const variance = weightedRewards.reduce((s, x) => s + (x - value) ** 2, 0) / Math.max(1, n - 1);
476
- const sumW = weights.reduce((s, w) => s + w, 0);
477
- const sumW2 = weights.reduce((s, w) => s + w * w, 0);
478
- const effN = sumW === 0 ? 0 : sumW * sumW / sumW2;
479
- return {
480
- value,
481
- standardError: Math.sqrt(variance / n),
482
- effectiveSampleSize: effN,
483
- n,
484
- maxImportanceWeight: maxW
485
- };
486
- }
487
- function selfNormalizedImportanceWeighting(trajectories, opts = {}) {
488
- const cap = opts.weightCap ?? Infinity;
489
- const clip = opts.rewardClip ?? { low: 0, high: 1 };
490
- if (trajectories.length === 0) return zeroEstimate();
491
- const weights = [];
492
- const rewards = [];
493
- let maxW = 0;
494
- for (const t of trajectories) {
495
- if (t.behaviorProb <= 0) {
496
- throw new Error(`selfNormalizedImportanceWeighting: behaviorProb must be > 0 (runId=${t.runId})`);
497
- }
498
- const w = Math.min(cap, t.targetProb / t.behaviorProb);
499
- weights.push(w);
500
- rewards.push(clamp(t.reward, clip.low, clip.high));
501
- if (w > maxW) maxW = w;
502
- }
503
- const sumW = weights.reduce((s, w) => s + w, 0);
504
- const sumWR = weights.reduce((s, w, i) => s + w * rewards[i], 0);
505
- const value = sumW === 0 ? 0 : sumWR / sumW;
506
- const sumW2 = weights.reduce((s, w) => s + w * w, 0);
507
- const effN = sumW === 0 ? 0 : sumW * sumW / sumW2;
508
- const phi = weights.map((w, i) => w * (rewards[i] - value));
509
- const variance = phi.reduce((s, x) => s + x * x, 0) / Math.max(1, sumW * sumW);
510
- return {
511
- value,
512
- standardError: Math.sqrt(variance),
513
- effectiveSampleSize: effN,
514
- n: trajectories.length,
515
- maxImportanceWeight: maxW
516
- };
517
- }
518
- function doublyRobust(trajectories, opts = {}) {
519
- const cap = opts.weightCap ?? Infinity;
520
- const clip = opts.rewardClip ?? { low: 0, high: 1 };
521
- if (trajectories.length === 0) return zeroEstimate();
522
- const contributions = [];
523
- let maxW = 0;
524
- let sumW = 0;
525
- let sumW2 = 0;
526
- for (const t of trajectories) {
527
- if (t.behaviorProb <= 0) {
528
- throw new Error(`doublyRobust: behaviorProb must be > 0 (runId=${t.runId})`);
529
- }
530
- const w = Math.min(cap, t.targetProb / t.behaviorProb);
531
- const r = clamp(t.reward, clip.low, clip.high);
532
- const q = typeof t.qHat === "number" && Number.isFinite(t.qHat) ? clamp(t.qHat, clip.low, clip.high) : null;
533
- if (q === null) {
534
- contributions.push(w * r);
535
- } else {
536
- contributions.push(q + w * (r - q));
537
- }
538
- if (w > maxW) maxW = w;
539
- sumW += w;
540
- sumW2 += w * w;
541
- }
542
- const n = contributions.length;
543
- const value = contributions.reduce((s, x) => s + x, 0) / n;
544
- const variance = contributions.reduce((s, x) => s + (x - value) ** 2, 0) / Math.max(1, n - 1);
545
- const effN = sumW === 0 ? 0 : sumW * sumW / sumW2;
546
- return {
547
- value,
548
- standardError: Math.sqrt(variance / n),
549
- effectiveSampleSize: effN,
550
- n,
551
- maxImportanceWeight: maxW
552
- };
553
- }
554
- function offPolicyEstimateAll(trajectories, opts = {}) {
555
- return {
556
- ips: inverseProbabilityWeighting(trajectories, opts),
557
- snips: selfNormalizedImportanceWeighting(trajectories, opts),
558
- dr: doublyRobust(trajectories, opts)
559
- };
560
- }
561
- function zeroEstimate() {
562
- return { value: 0, standardError: 0, effectiveSampleSize: 0, n: 0, maxImportanceWeight: 0 };
563
- }
564
- function clamp(x, lo, hi) {
565
- if (!Number.isFinite(x)) return lo;
566
- return Math.max(lo, Math.min(hi, x));
567
- }
568
-
569
- // src/rl/process-reward.ts
570
- async function extractStepRewards(store, runId, opts) {
571
- const spans = await store.spans({ runId });
572
- const ordered = [...spans].sort((a, b) => a.startedAt - b.startedAt);
573
- const out = [];
574
- let idx = 0;
575
- for (const span of ordered) {
576
- if (opts.preFilter && !opts.preFilter(span)) continue;
577
- let scored = null;
578
- for (const s of opts.scorers) {
579
- if (!s.appliesTo.includes(span.kind)) continue;
580
- const r = await s.score(span);
581
- if (r) {
582
- scored = r;
583
- break;
584
- }
585
- }
586
- if (!scored) continue;
587
- out.push({
588
- spanId: span.spanId,
589
- runId,
590
- stepIndex: idx++,
591
- kind: span.kind,
592
- name: span.name,
593
- reward: scored.reward,
594
- determinism: scored.determinism,
595
- rationale: scored.rationale,
596
- weight: scored.weight
597
- });
598
- }
599
- return out;
600
- }
601
- function runwiseStepRewardSummary(stepRewards) {
602
- if (stepRewards.length === 0) {
603
- return {
604
- runId: "",
605
- totalSteps: 0,
606
- meanReward: 0,
607
- sumWeightedReward: 0,
608
- failureFraction: 0,
609
- worstStepDelta: 0,
610
- worstStepIndex: null
611
- };
612
- }
613
- const runId = stepRewards[0].runId;
614
- let sumW = 0;
615
- let sumWR = 0;
616
- let failures = 0;
617
- let worstDelta = 0;
618
- let worstIdx = null;
619
- let prev = stepRewards[0].reward;
620
- for (let i = 0; i < stepRewards.length; i++) {
621
- const s = stepRewards[i];
622
- const w = s.weight ?? 1;
623
- sumW += w;
624
- sumWR += w * s.reward;
625
- if (s.reward < 0.5) failures++;
626
- if (i > 0) {
627
- const delta = s.reward - prev;
628
- if (delta < worstDelta) {
629
- worstDelta = delta;
630
- worstIdx = i;
631
- }
632
- prev = s.reward;
633
- } else {
634
- prev = s.reward;
635
- }
636
- }
637
- return {
638
- runId,
639
- totalSteps: stepRewards.length,
640
- meanReward: sumW === 0 ? 0 : sumWR / sumW,
641
- sumWeightedReward: sumWR,
642
- failureFraction: failures / stepRewards.length,
643
- worstStepDelta: worstDelta,
644
- worstStepIndex: worstIdx
645
- };
646
- }
647
/**
 * Compares every unordered pair of runs step by step and emits one
 * preference triple per pair at the first step where they diverge.
 *
 * Two steps diverge when their `kind` or `name` differ, or when their
 * rewards differ by at least `opts.minMargin` (default 0.2). A pair is
 * skipped when it shares fewer than `minPrefixLength + 1` steps, never
 * diverges, diverges before `opts.minPrefixLength` (default 1), or diverges
 * with a reward gap below `minMargin`.
 *
 * @param {Map<string, Array<object>>} stepRewardsByRun run id -> ordered steps
 *   (each step is read for `kind`, `name`, `reward`, `spanId`).
 * @param {{minMargin?: number, minPrefixLength?: number}} [opts]
 * @returns {Array<object>} triples naming the chosen and rejected step.
 */
function prmTrainingPairs(stepRewardsByRun, opts = {}) {
  const minMargin = opts.minMargin ?? 0.2;
  const minPrefix = opts.minPrefixLength ?? 1;
  const runs = Array.from(stepRewardsByRun.entries(), ([runId, steps]) => ({ runId, steps }));

  // Index of the first step where the two trajectories part ways, or -1.
  const firstDivergence = (left, right, limit) => {
    for (let k = 0; k < limit; k++) {
      const l = left[k];
      const r = right[k];
      const sameShape = l.kind === r.kind && l.name === r.name;
      if (!sameShape || Math.abs(l.reward - r.reward) >= minMargin) return k;
    }
    return -1;
  };

  const triples = [];
  runs.forEach((a, position) => {
    for (const b of runs.slice(position + 1)) {
      const shared = Math.min(a.steps.length, b.steps.length);
      if (shared < minPrefix + 1) continue;

      const at = firstDivergence(a.steps, b.steps, shared);
      if (at < 0 || at < minPrefix) continue;

      const aStep = a.steps[at];
      const bStep = b.steps[at];
      // A purely structural divergence may still have too small a reward gap.
      if (Math.abs(aStep.reward - bStep.reward) < minMargin) continue;

      const aWins = aStep.reward > bStep.reward;
      const winner = aWins ? aStep : bStep;
      const loser = aWins ? bStep : aStep;
      triples.push({
        prefixRunId: aWins ? a.runId : b.runId,
        prefixStepIndex: at - 1,
        chosenSpanId: winner.spanId,
        chosenReward: winner.reward,
        rejectedSpanId: loser.spanId,
        rejectedReward: loser.reward,
        rejectedRunId: aWins ? b.runId : a.runId,
        marginScore: winner.reward - loser.reward
      });
    }
  });
  return triples;
}
693
-
694
- // src/rl/contamination.ts
695
/**
 * Scores each scenario before and after a perturbation and reports whether
 * the score drop looks like benchmark contamination.
 *
 * Contamination is suspected when the paired test p-value is below
 * `opts.fdr` (default 0.05) AND the median delta is at most
 * `-opts.minMedianDrop` (default 0.05). Scenarios whose original or
 * perturbed score is below `opts.scoreFloor` (default 0) are excluded from
 * the statistics; fewer than 4 remaining scenarios yields an
 * "insufficient" result.
 *
 * @param {object} input `originals`, `scoreFn`, `scenarioId`, plus either
 *   `perturbed` (same length as `originals`) or `perturbation` (with `apply`).
 * @param {{fdr?: number, minMedianDrop?: number, scoreFloor?: number}} [opts]
 * @returns {Promise<object>} per-scenario rows, paired test, median/mean
 *   delta, verdict flag, reason text and the valid sample count `n`.
 * @throws {Error} when neither `perturbed` nor `perturbation` is supplied, or
 *   when the perturbed list length differs from the originals.
 */
async function runContaminationProbe(input, opts = {}) {
  const fdr = opts.fdr ?? 0.05;
  const minMedianDrop = opts.minMedianDrop ?? 0.05;
  const floor = opts.scoreFloor ?? 0;
  if (!input.perturbed && !input.perturbation) {
    throw new Error("runContaminationProbe: must supply either `perturbed` or `perturbation`.");
  }
  const perturbed = input.perturbed ?? await Promise.all(
    input.originals.map((s) => input.perturbation.apply(s))
  );
  if (perturbed.length !== input.originals.length) {
    throw new Error(`runContaminationProbe: perturbed length ${perturbed.length} \u2260 originals ${input.originals.length}`);
  }
  const origScores = await Promise.all(input.originals.map((s) => input.scoreFn(s)));
  const pertScores = await Promise.all(perturbed.map((s) => input.scoreFn(s)));
  const perScenario = input.originals.map((s, i) => ({
    scenarioId: input.scenarioId(s),
    originalScore: origScores[i],
    perturbedScore: pertScores[i],
    delta: pertScores[i] - origScores[i],
    qValue: NaN
  }));
  // `filter` keeps the same row objects, so writes through `valid` below are
  // visible in `perScenario`.
  const valid = perScenario.filter((p) => p.originalScore >= floor && p.perturbedScore >= floor);
  if (valid.length < 4) {
    return {
      perScenario,
      pairedTest: { w: 0, p: 1 },
      medianDelta: 0,
      meanDelta: 0,
      contaminationSuspected: false,
      reason: `insufficient valid scenarios (n=${valid.length}, need \u2265 4)`,
      n: valid.length
    };
  }
  const origValid = valid.map((p) => p.originalScore);
  const pertValid = valid.map((p) => p.perturbedScore);
  const pairedTest = wilcoxonSignedRank(origValid, pertValid);
  const deltas = valid.map((p) => p.delta);
  const sortedDeltas = [...deltas].sort((a, b) => a - b);
  // True median: average the two middle values when the count is even.
  const mid = Math.floor(sortedDeltas.length / 2);
  const median = sortedDeltas.length % 2 === 1
    ? sortedDeltas[mid]
    : (sortedDeltas[mid - 1] + sortedDeltas[mid]) / 2;
  const mean2 = deltas.reduce((s, d) => s + d, 0) / deltas.length;
  const pseudoP = valid.map((p) => Math.min(1, Math.max(1e-6, 1 - Math.abs(p.delta) / 1)));
  const { qValues } = benjaminiHochberg(pseudoP, fdr);
  // Assign by position rather than by scenarioId lookup so duplicate ids
  // cannot overwrite each other (and the pass stays linear).
  valid.forEach((row, i) => {
    row.qValue = qValues[i];
  });
  const contaminationSuspected = pairedTest.p < fdr && median <= -minMedianDrop;
  const reason = contaminationSuspected ? `paired p=${pairedTest.p.toFixed(4)} < ${fdr} and median drop ${median.toFixed(4)} \u2265 ${minMedianDrop}` : pairedTest.p >= fdr ? `no significant difference (paired p=${pairedTest.p.toFixed(4)})` : `significant but small effect (median delta ${median.toFixed(4)})`;
  return {
    perScenario,
    pairedTest,
    medianDelta: median,
    meanDelta: mean2,
    contaminationSuspected,
    reason,
    n: valid.length
  };
}
755
/**
 * Builds a perturbation that renames whole-word occurrences of the given
 * identifiers inside `scenario.prompt`.
 *
 * Identifiers are processed in order, each against the prompt produced by
 * the previous rename.
 *
 * @param {string[]} identifiers names to look for (matched between `\b`
 *   word boundaries, after regex-escaping).
 * @param {(name: string, index: number) => string} [rename] produces the new
 *   name; the default appends `_a`, `_b`, ... cycling every 26 identifiers.
 * @returns {{kind: string, apply: (scenario: object) => object}} perturbation
 *   whose `apply` returns a shallow copy of the scenario with the new prompt.
 */
function renameVariables(identifiers, rename = (n, i) => `${n}_${(i % 26 + 10).toString(36)}`) {
  return {
    kind: "rename_variables",
    apply(scenario) {
      let prompt = scenario.prompt;
      identifiers.forEach((id, i) => {
        const replacement = rename(id, i);
        const re = new RegExp(`\\b${escapeRegex(id)}\\b`, "g");
        // Use a replacer function so `$&`, `$1`, `$$` etc. in the new name
        // are inserted literally instead of being expanded as patterns.
        prompt = prompt.replace(re, () => replacement);
      });
      return { ...scenario, prompt };
    }
  };
}
769
/**
 * Builds a perturbation that rewrites `scenario.prompt` through a
 * caller-supplied shuffling function driven by a seeded generator.
 *
 * The generator state lives in the returned perturbation, so successive
 * `apply` calls continue the same random stream.
 *
 * @param {(prompt: string, rng: () => number) => string} shuffleSection
 * @param {number} seed coerced to an unsigned 32-bit integer.
 * @returns {{kind: string, apply: (scenario: object) => object}}
 */
function shuffleOrder(shuffleSection, seed) {
  let state = seed >>> 0;
  function nextUniform() {
    state = (state + 1831565813) >>> 0;
    let mixed = state;
    mixed = Math.imul(mixed ^ (mixed >>> 15), mixed | 1);
    mixed ^= mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61);
    const unsigned = (mixed ^ (mixed >>> 14)) >>> 0;
    return unsigned / 4294967296;
  }
  const apply = (scenario) => ({
    ...scenario,
    prompt: shuffleSection(scenario.prompt, nextUniform)
  });
  return { kind: "shuffle_order", apply };
}
786
/**
 * Builds a perturbation that adds a distractor clause to `scenario.prompt`,
 * separated from it by a single space.
 *
 * @param {string} clause text to add.
 * @param {string} [position="prefix"] `"prefix"` puts the clause first; any
 *   other value appends it after the prompt.
 * @returns {{kind: string, apply: (scenario: object) => object}}
 */
function injectIrrelevantClause(clause, position = "prefix") {
  const apply = (scenario) => {
    let prompt;
    if (position === "prefix") {
      prompt = `${clause} ${scenario.prompt}`;
    } else {
      prompt = `${scenario.prompt} ${clause}`;
    }
    return { ...scenario, prompt };
  };
  return { kind: "inject_irrelevant_clause", apply };
}
795
/**
 * Backslash-escapes every regular-expression metacharacter in `s` so the
 * result can be embedded in a `RegExp` source and match `s` literally.
 *
 * @param {string} s
 * @returns {string}
 */
function escapeRegex(s) {
  const metacharacters = /[.*+?^${}()|[\]\\]/g;
  return s.replace(metacharacters, (ch) => `\\${ch}`);
}
798
-
799
- // src/rl/tournament.ts
800
/**
 * Fits Bradley-Terry strengths to pairwise outcomes with an iterative
 * fixed-point update, renormalising the strengths to geometric mean 1 after
 * every pass.
 *
 * A draw credits half the outcome weight to each side. `opts.smoothing`
 * (default 0.1) is added to every candidate's win total before fitting.
 *
 * @param {Array<{winner: string, loser: string, draw?: boolean, weight?: number}>} outcomes
 * @param {{tolerance?: number, maxIterations?: number, smoothing?: number}} [opts]
 *   defaults: tolerance 1e-6, maxIterations 256, smoothing 0.1.
 * @returns {{ratings: Array<object>, iterations: number, finalDelta: number, converged: boolean}}
 *   `ratings` is sorted by descending strength; each entry carries
 *   `candidateId`, `strength`, `logStrength` (relative to the weakest),
 *   `n` (total comparison weight) and `wins` (unsmoothed win weight).
 *   `iterations` is the number of update passes actually executed.
 */
function fitBradleyTerry(outcomes, opts = {}) {
  const tol = opts.tolerance ?? 1e-6;
  const maxIter = opts.maxIterations ?? 256;
  const smoothing = opts.smoothing ?? 0.1;
  const candidates = new Set();
  for (const o of outcomes) {
    candidates.add(o.winner);
    candidates.add(o.loser);
  }
  const ids = [...candidates].sort();
  const idx = new Map(ids.map((id, i) => [id, i]));
  const n = ids.length;
  if (n === 0) return { ratings: [], iterations: 0, finalDelta: 0, converged: true };
  if (n === 1) {
    return {
      ratings: [{ candidateId: ids[0], strength: 1, logStrength: 0, n: 0, wins: 0 }],
      iterations: 0,
      finalDelta: 0,
      converged: true
    };
  }
  // W[i][j]: weight of i's wins over j; N[i][j]: weight of all i-vs-j games.
  const W = Array.from({ length: n }, () => new Array(n).fill(0));
  const N = Array.from({ length: n }, () => new Array(n).fill(0));
  for (const o of outcomes) {
    const i = idx.get(o.winner);
    const j = idx.get(o.loser);
    const w = o.weight ?? 1;
    if (o.draw) {
      W[i][j] += 0.5 * w;
      W[j][i] += 0.5 * w;
    } else {
      W[i][j] += w;
    }
    N[i][j] += w;
    N[j][i] += w;
  }
  // Keep the raw totals so the reported `wins` is not the result of adding
  // and then subtracting the smoothing constant in floating point.
  const rawWins = W.map((row) => row.reduce((sum, v) => sum + v, 0));
  const winsTotal = rawWins.map((wins) => wins + smoothing);
  const compsTotal = N.map((row) => row.reduce((sum, v) => sum + v, 0));

  let theta = new Array(n).fill(1);
  let iterations = 0;
  let delta = Infinity;
  while (iterations < maxIter) {
    const newTheta = new Array(n);
    for (let i = 0; i < n; i++) {
      let denom = 0;
      for (let j = 0; j < n; j++) {
        if (j === i) continue;
        if (N[i][j] === 0) continue;
        denom += N[i][j] / (theta[i] + theta[j]);
      }
      // A candidate with no opponents keeps its current strength.
      newTheta[i] = denom === 0 ? theta[i] : winsTotal[i] / denom;
    }
    let logSum = 0;
    for (let i = 0; i < n; i++) logSum += Math.log(Math.max(1e-300, newTheta[i]));
    const norm = Math.exp(logSum / n);
    for (let i = 0; i < n; i++) newTheta[i] = newTheta[i] / norm;
    // Largest relative change across candidates drives the stopping rule.
    delta = 0;
    for (let i = 0; i < n; i++) {
      const d = Math.abs(newTheta[i] - theta[i]) / Math.max(1e-12, theta[i]);
      if (d > delta) delta = d;
    }
    theta = newTheta;
    // Count the pass before testing convergence so a converging pass is
    // included in the reported total.
    iterations++;
    if (delta < tol) break;
  }
  const logs = theta.map((t) => Math.log(Math.max(1e-300, t)));
  const minLog = logs.reduce((lowest, value) => Math.min(lowest, value), Infinity);
  const ratings = ids.map((id, i) => ({
    candidateId: id,
    strength: theta[i],
    logStrength: logs[i] - minLog,
    n: compsTotal[i],
    wins: rawWins[i]
  }));
  return {
    ratings: ratings.sort((a, b) => b.strength - a.strength),
    iterations,
    finalDelta: delta,
    converged: delta < tol
  };
}
886
/**
 * Applies one Elo update to `ratings` in place for a single outcome.
 *
 * @param {Map<string, number>} ratings mutated: both participants' entries
 *   are written (missing entries start from `opts.defaultRating`).
 * @param {{winner: string, loser: string, draw?: boolean, weight?: number}} outcome
 *   a draw scores 0.5 for both sides; `weight` (default 1) scales the step.
 * @param {{defaultRating?: number, kFactor?: number}} [opts] defaults 1500 / 32.
 * @returns {{winnerDelta: number, loserDelta: number}} the applied changes.
 */
function applyEloUpdate(ratings, outcome, opts = {}) {
  const { winner, loser } = outcome;
  const fallback = opts.defaultRating ?? 1500;
  const kFactor = opts.kFactor ?? 32;
  const winnerRating = ratings.get(winner) ?? fallback;
  const loserRating = ratings.get(loser) ?? fallback;

  // Logistic expectation that the nominal winner beats the nominal loser.
  const expectedWinner = 1 / (1 + Math.pow(10, (loserRating - winnerRating) / 400));

  let winnerScore = 1;
  let loserScore = 0;
  if (outcome.draw) {
    winnerScore = 0.5;
    loserScore = 0.5;
  }
  const weight = outcome.weight ?? 1;
  const winnerDelta = kFactor * weight * (winnerScore - expectedWinner);
  const loserDelta = kFactor * weight * (loserScore - (1 - expectedWinner));

  ratings.set(winner, winnerRating + winnerDelta);
  ratings.set(loser, loserRating + loserDelta);
  return { winnerDelta, loserDelta };
}
901
/**
 * Turns per-run scores into pairwise outcomes by comparing every two runs
 * that share a `matchKey`.
 *
 * Pairs of the same candidate are ignored. A score gap of at most
 * `input.drawMargin` (default 0) is recorded as a draw with weight 1;
 * otherwise the higher score wins and the gap becomes the outcome weight.
 *
 * @param {{runs: Array<{matchKey: string, candidateId: string, score: number}>, drawMargin?: number}} input
 * @returns {Array<{winner: string, loser: string, draw?: boolean, weight: number}>}
 */
function buildPairwiseFromCampaign(input) {
  const drawMargin = input.drawMargin ?? 0;

  // Bucket the runs by match key, preserving arrival order inside a bucket.
  const buckets = new Map();
  for (const { matchKey, candidateId, score } of input.runs) {
    if (!buckets.has(matchKey)) buckets.set(matchKey, []);
    buckets.get(matchKey).push({ candidateId, score });
  }

  const outcomes = [];
  for (const entries of buckets.values()) {
    entries.forEach((first, i) => {
      for (const second of entries.slice(i + 1)) {
        if (first.candidateId === second.candidateId) continue;
        const gap = Math.abs(first.score - second.score);
        if (gap <= drawMargin) {
          outcomes.push({ winner: first.candidateId, loser: second.candidateId, draw: true, weight: 1 });
          continue;
        }
        const firstWins = first.score > second.score;
        outcomes.push({
          winner: (firstWins ? first : second).candidateId,
          loser: (firstWins ? second : first).candidateId,
          weight: gap
        });
      }
    });
  }
  return outcomes;
}
928
-
929
- // src/rl/adversarial.ts
930
/**
 * Scores the seed scenarios, then repeatedly mutates the previous
 * generation and scores the children, stopping after `opts.rounds`
 * generations (default 3) or once `opts.budget` scoring calls have been
 * made (default unlimited).
 *
 * Scenarios are deduplicated by `opts.mutateScenarioId`. Each mutation may
 * return one scenario or an array; at most `opts.childrenPerParent`
 * (default 4) entries of that result are considered.
 *
 * @param {object} opts `seeds`, `mutations` (each with `id` and `mutate`),
 *   `scoreFn`, `mutateScenarioId`, plus optional `failureThreshold`
 *   (default 0.5), `rounds`, `childrenPerParent`, `budget`, `seed` (default 1).
 * @returns {Promise<{scenarios: object[], failures: object[], byGeneration: object[], scoreCalls: number}>}
 *   `failures` holds scenarios scoring below the threshold, lowest first.
 */
async function adversarialScenarioSearch(opts) {
  const failureThreshold = opts.failureThreshold ?? 0.5;
  const rounds = opts.rounds ?? 3;
  const children = opts.childrenPerParent ?? 4;
  const budget = opts.budget ?? Number.POSITIVE_INFINITY;
  const rng = mulberry32(opts.seed ?? 1);

  const scenarios = [];
  const seen = new Set();
  let scoreCalls = 0;
  const budgetSpent = () => scoreCalls >= budget;
  const isFailure = (entry) => entry.score !== null && entry.score < failureThreshold;

  // Scores one scenario and appends its record.
  const scoreAndRecord = async (scenario, id, generation, parentId, mutationStrategy) => {
    const score = await opts.scoreFn(scenario);
    scoreCalls++;
    scenarios.push({ id, generation, parentId, scenario, score, mutationStrategy });
  };

  for (const seedScenario of opts.seeds) {
    const id = opts.mutateScenarioId(seedScenario);
    if (seen.has(id)) continue;
    seen.add(id);
    if (budgetSpent()) break;
    await scoreAndRecord(seedScenario, id, 0, null, null);
  }

  for (let generation = 1; generation <= rounds && !budgetSpent(); generation++) {
    const parents = scenarios.filter((entry) => entry.generation === generation - 1);
    for (const parent of parents) {
      for (const mutation of opts.mutations) {
        if (budgetSpent()) break;
        const produced = await mutation.mutate(parent.scenario, rng);
        const batch = Array.isArray(produced) ? produced : [produced];
        for (let k = 0; k < Math.min(children, batch.length); k++) {
          if (budgetSpent()) break;
          const child = batch[k];
          const childId = opts.mutateScenarioId(child);
          if (seen.has(childId)) continue;
          seen.add(childId);
          await scoreAndRecord(child, childId, generation, parent.id, mutation.id);
        }
      }
    }
  }

  const failures = scenarios
    .filter(isFailure)
    .sort((a, b) => (a.score ?? 0) - (b.score ?? 0));

  const lastGeneration = scenarios.reduce((highest, entry) => Math.max(highest, entry.generation), 0);
  const byGeneration = [];
  for (let generation = 0; generation <= lastGeneration; generation++) {
    const members = scenarios.filter((entry) => entry.generation === generation);
    if (members.length === 0) continue;
    const scoreSum = members.reduce((sum, entry) => sum + (entry.score ?? 0), 0);
    byGeneration.push({
      generation,
      total: members.length,
      failures: members.filter(isFailure).length,
      meanScore: scoreSum / members.length
    });
  }
  return { scenarios, failures, byGeneration, scoreCalls };
}
996
/**
 * Creates a deterministic pseudo-random generator from a 32-bit seed.
 *
 * @param {number} seed coerced to an unsigned 32-bit integer.
 * @returns {() => number} generator yielding values in [0, 1).
 */
function mulberry32(seed) {
  let state = seed >>> 0;
  return function next() {
    state = (state + 1831565813) >>> 0;
    let mixed = state;
    mixed = Math.imul(mixed ^ (mixed >>> 15), mixed | 1);
    mixed ^= mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61);
    const unsigned = (mixed ^ (mixed >>> 14)) >>> 0;
    return unsigned / 4294967296;
  };
}
1006
-
1007
- // src/rl/compute-curves.ts
1008
/**
 * Evaluates a candidate at each compute budget, one budget at a time, and
 * summarises the resulting score-versus-cost curve.
 *
 * @param {object} opts `candidateId`, `budgets` (each with `id` and `cost`)
 *   and `runAtBudget(budget)` resolving to `{score, samples, std, metrics}`.
 * @returns {Promise<{candidateId: *, points: object[], logSlope: number|null, best: object|null}>}
 *   `points` is sorted by ascending cost; `logSlope` is null with fewer than
 *   two points; `best` is the highest-scoring point (earliest budget wins
 *   ties) or null when `budgets` is empty.
 */
async function runComputeCurve(opts) {
  const points = [];
  for (const budget of opts.budgets) {
    const r = await opts.runAtBudget(budget);
    points.push({
      budgetId: budget.id,
      cost: budget.cost,
      score: r.score,
      samples: r.samples,
      std: r.std,
      metrics: r.metrics
    });
  }
  const sorted = [...points].sort((a, b) => a.cost - b.cost);
  const logSlope = sorted.length >= 2 ? fitLogSlope(sorted) : null;
  // `reduce` without an initial value throws on an empty array, so report
  // "no best point" explicitly instead of failing with a TypeError.
  const best = points.length === 0
    ? null
    : points.reduce((a, b) => b.score > a.score ? b : a);
  return { candidateId: opts.candidateId, points: sorted, logSlope, best };
}
1026
/**
 * Draws `opts.n` rollouts one after another, scores each, and returns the
 * highest-scoring one (the earliest rollout wins ties).
 *
 * @param {{n: number, sample: (i: number) => *, scoreFn: (rollout: *) => *}} opts
 * @returns {Promise<{best: *, bestScore: number, scores: number[], meanScore: number, bestIndex: number}>}
 * @throws {Error} when `opts.n <= 0`.
 */
async function bestOfN(opts) {
  if (opts.n <= 0) throw new Error("bestOfN: n must be > 0");

  const rollouts = [];
  const scores = [];
  for (let attempt = 0; attempt < opts.n; attempt++) {
    const rollout = await opts.sample(attempt);
    rollouts.push(rollout);
    const score = await opts.scoreFn(rollout);
    scores.push(score);
  }

  let bestIndex = 0;
  let total = 0;
  scores.forEach((score, i) => {
    total += score;
    if (i > 0 && score > scores[bestIndex]) bestIndex = i;
  });

  return {
    best: rollouts[bestIndex],
    bestScore: scores[bestIndex],
    scores,
    meanScore: total / scores.length,
    bestIndex
  };
}
1046
/**
 * Draws `opts.n` rollouts one after another and returns the answer key that
 * occurs most often, with its share of the votes.
 *
 * Keys are compared as strings (the same coercion a plain-object histogram
 * applies), and `opts.answerKey` is called exactly once per rollout.
 *
 * @param {{n: number, sample: (i: number) => *, answerKey: (rollout: *) => *}} opts
 * @returns {Promise<{answer: string, agreement: number, histogram: Object<string, number>, representative: *, rollouts: Array}>}
 *   `representative` is the first rollout whose key equals `answer`.
 * @throws {Error} when `opts.n <= 0`.
 */
async function selfConsistency(opts) {
  if (opts.n <= 0) throw new Error("selfConsistency: n must be > 0");
  const rollouts = [];
  const keys = [];
  // Count in a Map so keys such as "constructor" or "__proto__" cannot
  // collide with properties inherited from Object.prototype.
  const counts = new Map();
  for (let i = 0; i < opts.n; i++) {
    const r = await opts.sample(i);
    rollouts.push(r);
    const key = String(opts.answerKey(r));
    keys.push(key);
    counts.set(key, (counts.get(key) ?? 0) + 1);
  }
  // Object.fromEntries defines own data properties, so every counted key
  // (including "__proto__") appears in the returned histogram.
  const histogram = Object.fromEntries(counts);
  let answer = "";
  let max = -1;
  for (const [k, v] of Object.entries(histogram)) {
    if (v > max) {
      max = v;
      answer = k;
    }
  }
  // Match on the stringified keys recorded above; comparing the raw key to
  // the string answer would miss numeric (or other non-string) keys.
  const representativeIndex = keys.indexOf(answer);
  const representative = rollouts[representativeIndex >= 0 ? representativeIndex : 0];
  return {
    answer,
    agreement: max / opts.n,
    histogram,
    representative,
    rollouts
  };
}
1073
/**
 * Returns the points that no other point dominates, sorted by ascending
 * cost. A point is dominated when another point costs no more, scores no
 * less, and is strictly better on at least one of the two.
 *
 * @param {Array<{cost: number, score: number}>} points left unmodified.
 * @returns {Array<{cost: number, score: number}>} a new array holding the
 *   same point objects.
 */
function paretoFrontier(points) {
  const dominates = (other, point) => {
    if (other === point) return false;
    const noWorse = other.cost <= point.cost && other.score >= point.score;
    const strictlyBetter = other.cost < point.cost || other.score > point.score;
    return noWorse && strictlyBetter;
  };
  return points
    .filter((point) => !points.some((other) => dominates(other, point)))
    .sort((a, b) => a.cost - b.cost);
}
1083
/**
 * Least-squares slope of score against the natural log of cost.
 *
 * Costs are clamped to at least 1e-12 before taking the logarithm.
 *
 * @param {Array<{cost: number, score: number}>} points
 * @returns {number} the slope, or 0 when all log-costs are identical.
 */
function fitLogSlope(points) {
  const count = points.length;
  const logCosts = [];
  const scores = [];
  let logCostSum = 0;
  let scoreSum = 0;
  for (const point of points) {
    const logCost = Math.log(Math.max(1e-12, point.cost));
    logCosts.push(logCost);
    scores.push(point.score);
    logCostSum += logCost;
    scoreSum += point.score;
  }
  const meanLogCost = logCostSum / count;
  const meanScore = scoreSum / count;

  let covariance = 0;
  let spread = 0;
  logCosts.forEach((logCost, i) => {
    const dx = logCost - meanLogCost;
    covariance += dx * (scores[i] - meanScore);
    spread += dx ** 2;
  });
  if (spread === 0) return 0;
  return covariance / spread;
}
1097
-
1098
- // src/rl/active-curriculum.ts
1099
/**
 * Splits a sampling budget across candidate (variant, scenario) cells,
 * giving more samples to cells whose observed scores vary more or that have
 * been sampled less.
 *
 * Every cell first receives `opts.floorPerCell` (default 1). The rest of the
 * budget is shared in proportion to `sqrt(variance) + 1/sqrt(max(1, n))`,
 * where the variance is the sample variance plus `opts.variancePrior`
 * (default 0.05), or the prior alone with fewer than two observations.
 * Fractional shares are resolved with the largest-remainder method so the
 * counts add up to the budget.
 *
 * When the floors alone meet or exceed the budget, every cell gets the same
 * count, `max(1, floor(budget / cells))` — at least one per cell even if
 * that goes over the budget.
 *
 * @param {Array<{variantId: string, scenarioId: string, score: number}>} observations
 * @param {Array<{variantId: string, scenarioId: string}>} candidateCells
 * @param {{budget: number, variancePrior?: number, floorPerCell?: number}} opts
 * @returns {Array<{variantId: string, scenarioId: string, count: number, reason: string}>}
 */
function varianceBasedCurriculum(observations, candidateCells, opts) {
  const variancePrior = opts.variancePrior ?? 0.05;
  const floor = opts.floorPerCell ?? 1;
  const budget = opts.budget;
  const grouped = new Map();
  for (const o of observations) {
    const k = `${o.variantId}::${o.scenarioId}`;
    const arr = grouped.get(k) ?? [];
    arr.push(o.score);
    grouped.set(k, arr);
  }
  const cellStats = candidateCells.map((c) => {
    const k = `${c.variantId}::${c.scenarioId}`;
    const samples = grouped.get(k) ?? [];
    const n = samples.length;
    const mean2 = n === 0 ? 0.5 : samples.reduce((s, v) => s + v, 0) / n;
    const variance = n < 2 ? variancePrior : samples.reduce((s, v) => s + (v - mean2) ** 2, 0) / (n - 1) + variancePrior;
    const weight = Math.sqrt(variance) + 1 / Math.sqrt(Math.max(1, n));
    return { variantId: c.variantId, scenarioId: c.scenarioId, n, mean: mean2, variance, weight };
  });
  const floorTotal = floor * cellStats.length;
  if (floorTotal >= budget) {
    const each = Math.max(1, Math.floor(budget / Math.max(1, cellStats.length)));
    return cellStats.map((c) => ({
      variantId: c.variantId,
      scenarioId: c.scenarioId,
      count: each,
      reason: `floor allocation (budget tight; n=${c.n})`
    }));
  }
  const remaining = budget - floorTotal;
  const totalWeight = cellStats.reduce((s, c) => s + c.weight, 0);
  // Largest-remainder apportionment: rounding each share independently can
  // hand out more (or fewer) samples than the budget allows.
  const shares = cellStats.map((c) => (totalWeight === 0 ? 0 : c.weight / totalWeight * remaining));
  const extras = shares.map((share) => Math.floor(share));
  let leftover = Math.round(remaining - extras.reduce((s, v) => s + v, 0));
  const byRemainder = shares
    .map((share, i) => ({ i, fraction: share - Math.floor(share) }))
    .sort((a, b) => b.fraction - a.fraction || a.i - b.i);
  for (const { i } of byRemainder) {
    if (leftover < 1) break;
    extras[i] += 1;
    leftover -= 1;
  }
  return cellStats.map((c, i) => ({
    variantId: c.variantId,
    scenarioId: c.scenarioId,
    count: floor + extras[i],
    reason: `variance ${c.variance.toFixed(3)} (n=${c.n}, mean=${c.mean.toFixed(3)})`
  }));
}
1141
/**
 * Allocates a sampling budget across candidate cells by Thompson sampling:
 * each cell draws a pass rate from its Beta posterior, and cells whose draw
 * lies close to the decision threshold receive more of the budget.
 *
 * An observation counts as a pass when `pass` is set, otherwise when its
 * score reaches `opts.decisionThreshold` (default 0.5). The prior is
 * Beta(`opts.priorAlpha`, `opts.priorBeta`), both defaulting to 1.
 *
 * @param {Array<{variantId: string, scenarioId: string, score: number, pass?: boolean}>} observations
 * @param {Array<{variantId: string, scenarioId: string}>} candidateCells
 * @param {{budget: number, decisionThreshold?: number, priorAlpha?: number, priorBeta?: number, seed?: number}} opts
 * @returns {Array<{variantId: string, scenarioId: string, count: number, reason: string}>}
 */
function thompsonCurriculum(observations, candidateCells, opts) {
  const threshold = opts.decisionThreshold ?? 0.5;
  const priorAlpha = opts.priorAlpha ?? 1;
  const priorBeta = opts.priorBeta ?? 1;
  const rng = makeRng(opts.seed);
  const cellKey = (cell) => `${cell.variantId}::${cell.scenarioId}`;

  // Tally passes and failures per cell.
  const tallies = new Map();
  for (const observation of observations) {
    const key = cellKey(observation);
    const tally = tallies.get(key) ?? { passes: 0, failures: 0 };
    const passed = observation.pass ?? observation.score >= threshold;
    if (passed) {
      tally.passes += 1;
    } else {
      tally.failures += 1;
    }
    tallies.set(key, tally);
  }

  const stats = candidateCells.map((cell) => {
    const tally = tallies.get(cellKey(cell)) ?? { passes: 0, failures: 0 };
    const a = priorAlpha + tally.passes;
    const b = priorBeta + tally.failures;
    const sampled = sampleBeta(a, b, rng);
    const posteriorVariance = a * b / ((a + b) ** 2 * (a + b + 1));
    const sigma = Math.max(0.05, Math.sqrt(posteriorVariance));
    const distance = Math.abs(sampled - threshold);
    // Gaussian-shaped weight: draws near the threshold score highest.
    const weight = Math.exp(-((distance / sigma) ** 2));
    return {
      variantId: cell.variantId,
      scenarioId: cell.scenarioId,
      n: tally.passes + tally.failures,
      sampled,
      sigma,
      weight,
      a,
      b
    };
  });

  const totalWeight = stats.reduce((sum, cell) => sum + cell.weight, 0);
  return stats.map((cell) => {
    let share = 0;
    if (totalWeight !== 0) share = Math.round(cell.weight / totalWeight * opts.budget);
    return {
      variantId: cell.variantId,
      scenarioId: cell.scenarioId,
      count: Math.max(0, share),
      reason: `Beta(${cell.a.toFixed(1)},${cell.b.toFixed(1)}) sample=${cell.sampled.toFixed(3)} (target ${threshold})`
    };
  });
}
1187
/**
 * Converts run records into curriculum observations.
 *
 * Runs without a `scenarioId`, or without a finite numeric score, are
 * dropped. With `opts.useHoldout` (default true) the holdout score is
 * preferred and the search score is the fallback; otherwise the preference
 * is reversed.
 *
 * @param {Array<{candidateId: string, scenarioId?: string, outcome: {holdoutScore?: number, searchScore?: number}}>} runs
 * @param {{passThreshold?: number, useHoldout?: boolean}} [opts] threshold
 *   defaults to 0.5.
 * @returns {Array<{variantId: string, scenarioId: string, score: number, pass: boolean}>}
 */
function observationsFromRunRecords(runs, opts = {}) {
  const threshold = opts.passThreshold ?? 0.5;
  const preferHoldout = opts.useHoldout ?? true;

  const pickScore = ({ holdoutScore, searchScore }) =>
    preferHoldout ? holdoutScore ?? searchScore : searchScore ?? holdoutScore;

  const observations = [];
  for (const run of runs) {
    if (!run.scenarioId) continue;
    const score = pickScore(run.outcome);
    const usable = typeof score === "number" && Number.isFinite(score);
    if (!usable) continue;
    observations.push({
      variantId: run.candidateId,
      scenarioId: run.scenarioId,
      score,
      pass: score >= threshold
    });
  }
  return observations;
}
1204
/**
 * Returns a uniform [0, 1) generator: `Math.random` when no seed is given,
 * otherwise a deterministic generator derived from the seed.
 *
 * @param {number} [seed] coerced to an unsigned 32-bit integer.
 * @returns {() => number}
 */
function makeRng(seed) {
  if (seed === undefined) return Math.random;
  let state = seed >>> 0;
  return function next() {
    state = (state + 1831565813) >>> 0;
    let mixed = state;
    mixed = Math.imul(mixed ^ (mixed >>> 15), mixed | 1);
    mixed ^= mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61);
    const unsigned = (mixed ^ (mixed >>> 14)) >>> 0;
    return unsigned / 4294967296;
  };
}
1215
/**
 * Draws one Beta(alpha, beta) sample as the ratio of two gamma draws.
 *
 * Both shape parameters are raised to at least 1 before sampling.
 *
 * @param {number} alpha
 * @param {number} beta
 * @param {() => number} rng uniform generator passed to `sampleGamma`.
 * @returns {number}
 */
function sampleBeta(alpha, beta, rng) {
  const [shapeA, shapeB] = [alpha, beta].map((shape) => Math.max(1, shape));
  const fromAlpha = sampleGamma(shapeA, rng);
  const fromBeta = sampleGamma(shapeB, rng);
  return fromAlpha / (fromAlpha + fromBeta);
}
1222
/**
 * Draws one Gamma(shape, 1) sample by squeeze-and-reject on a transformed
 * normal variate (the normal is produced from two uniforms by Box-Muller).
 *
 * The rejection scheme needs `shape >= 1`. Smaller shapes are handled by
 * sampling at `shape + 1` and scaling by `U^(1/shape)`; without this a
 * shape of 1/3 or less makes `c` NaN and the rejection loop never exits.
 *
 * @param {number} shape positive, finite shape parameter.
 * @param {() => number} rng uniform generator on [0, 1).
 * @returns {number} a non-negative sample.
 * @throws {RangeError} when `shape` is not a positive finite number.
 */
function sampleGamma(shape, rng) {
  if (!Number.isFinite(shape) || shape <= 0) {
    throw new RangeError(`sampleGamma: shape must be a positive finite number (got ${shape})`);
  }
  if (shape < 1) {
    // Boost: Gamma(a) is distributed as Gamma(a + 1) * U^(1/a).
    const boost = rng() || 1e-12;
    return sampleGamma(shape + 1, rng) * Math.pow(boost, 1 / shape);
  }
  const d = shape - 1 / 3;
  const c = 1 / Math.sqrt(9 * d);
  while (true) {
    let x;
    let v;
    do {
      // Guard against a zero uniform, whose logarithm is -Infinity.
      const u1 = rng() || 1e-12;
      const u2 = rng() || 1e-12;
      x = Math.sqrt(-2 * Math.log(u1)) * Math.cos(2 * Math.PI * u2);
      v = 1 + c * x;
    } while (v <= 0);
    v = v * v * v;
    const u = rng();
    // Cheap squeeze test first, exact log test second.
    if (u < 1 - 0.0331 * x ** 4) return d * v;
    if (Math.log(u) < 0.5 * x * x + d * (1 - v + Math.log(v))) return d * v;
  }
}
1240
-
1241
- // src/rl/reward-hacking.ts
1242
/**
 * Default proxy-reward accessor: the run's holdout score, falling back to
 * its search score, or null when that value is not a finite number.
 *
 * @param {{outcome: {holdoutScore?: number, searchScore?: number}}} r
 * @returns {number|null}
 */
var DEFAULT_PROXY = (r) => {
  const { holdoutScore, searchScore } = r.outcome;
  const candidate = holdoutScore ?? searchScore;
  if (typeof candidate !== "number") return null;
  return Number.isFinite(candidate) ? candidate : null;
};
1246
/**
 * Looks for signs that a proxy reward is being gamed by comparing an early
 * ("before") and a recent ("after") window of runs.
 *
 * Runs whose proxy reward is null are ignored; with fewer than 4 remaining
 * runs the verdict is "clean" with an "insufficient evidence" rationale.
 * The "after" window is the last `input.windowSize` runs (default
 * `min(50, floor(n / 2))`, never less than 1, and never more than the runs
 * available); "before" is everything earlier.
 *
 * Signals, each with a severity in [0, 1]:
 * - `reward_divergence`: proxy mean rose more than the truth mean (only
 *   when `input.truthOf` is given and each window has at least 2 values).
 * - `distribution_shift`: KS statistic between the proxy windows (each
 *   window needs at least 4 values).
 * - `reward_disagreement`: low correlation between the proxy and a
 *   secondary reward (at least 4 aligned pairs).
 * - `judge_drift`: proxy mean rose more than the deterministic reward mean
 *   (at least 4 deterministically rewarded runs and non-empty windows).
 *
 * @param {object} input `runs`, plus optional `proxyOf`, `truthOf`,
 *   `secondaryRewardOf`, `verifiableRewardOptions`, `windowSize` and
 *   `thresholds` (`suspect` default 0.3, `gaming` default 0.6).
 * @returns {{findings: object[], verdict: string, rationale: string[], n: number}}
 *   verdict is "gaming", "suspect" or "clean" by the highest severity.
 */
function detectRewardHacking(input) {
  const proxyOf = input.proxyOf ?? DEFAULT_PROXY;
  const truthOf = input.truthOf;
  const sus = input.thresholds?.suspect ?? 0.3;
  const gam = input.thresholds?.gaming ?? 0.6;
  const runs = input.runs.filter((r) => proxyOf(r) !== null);
  const n = runs.length;
  if (n < 4) {
    return {
      findings: [],
      verdict: "clean",
      n,
      rationale: [`fewer than 4 runs with proxy reward (n=${n}); insufficient evidence`]
    };
  }
  const isNumber = (v) => typeof v === "number";
  const windowSize = Math.max(1, input.windowSize ?? Math.min(50, Math.floor(n / 2)));
  // Clamp the split at 0: a negative slice index counts from the END of the
  // array, so a window larger than n would otherwise silently shrink.
  const split = Math.max(0, n - windowSize);
  const before = runs.slice(0, split);
  const after = runs.slice(split);
  const beforeProxy = before.map(proxyOf).filter(isNumber);
  const afterProxy = after.map(proxyOf).filter(isNumber);
  const findings = [];
  if (truthOf) {
    const beforeTruth = before.map(truthOf).filter(isNumber);
    const afterTruth = after.map(truthOf).filter(isNumber);
    if (beforeProxy.length >= 2 && afterProxy.length >= 2 && beforeTruth.length >= 2 && afterTruth.length >= 2) {
      const proxyDelta = mean(afterProxy) - mean(beforeProxy);
      const truthDelta = mean(afterTruth) - mean(beforeTruth);
      // Only a proxy gain that outpaces the truth gain counts.
      const gap = Math.max(0, proxyDelta - truthDelta);
      const severity = clamp012(gap * 5);
      findings.push({
        signal: "reward_divergence",
        severity,
        message: severity >= sus ? `proxy reward rose by ${proxyDelta.toFixed(3)} while truth changed by ${truthDelta.toFixed(3)} \u2014 potential Goodhart` : `proxy and truth moved together (proxy ${proxyDelta.toFixed(3)}, truth ${truthDelta.toFixed(3)})`,
        detail: { proxyDelta, truthDelta, gap, beforeN: beforeProxy.length, afterN: afterProxy.length }
      });
    }
  }
  if (beforeProxy.length >= 4 && afterProxy.length >= 4) {
    const ks = ksStatistic(beforeProxy, afterProxy);
    const severity = clamp012(ks - 0.2);
    findings.push({
      signal: "distribution_shift",
      severity,
      message: severity >= sus ? `KS=${ks.toFixed(3)} between before/after windows \u2014 distributional shift large` : `KS=${ks.toFixed(3)} between before/after windows \u2014 within-distribution drift`,
      detail: { ks, beforeN: beforeProxy.length, afterN: afterProxy.length }
    });
  }
  {
    const secondaryOf = input.secondaryRewardOf ?? defaultSecondary(input.verifiableRewardOptions);
    const aligned = runs.map((r) => ({ p: proxyOf(r), s: secondaryOf(r) })).filter((x) => typeof x.p === "number" && typeof x.s === "number");
    if (aligned.length >= 4) {
      const ps = aligned.map((x) => x.p);
      const ss = aligned.map((x) => x.s);
      const r = pearsonR(ps, ss);
      // Negative correlation is treated like zero correlation.
      const severity = clamp012(0.5 - Math.max(0, r));
      findings.push({
        signal: "reward_disagreement",
        severity,
        message: severity >= sus ? `proxy and independent secondary reward correlate \u03C1=${r.toFixed(3)} \u2014 possibly hacking proxy` : `proxy and secondary reward correlate \u03C1=${r.toFixed(3)}`,
        detail: { pearson: r, n: aligned.length }
      });
    }
  }
  {
    const detRuns = filterDeterministicallyRewarded(runs, input.verifiableRewardOptions ?? {});
    // Require both proxy windows to be non-empty: `mean([])` is 0, which
    // would otherwise be reported as a genuine baseline.
    if (detRuns.length >= 4 && beforeProxy.length > 0 && afterProxy.length > 0) {
      const half = Math.floor(detRuns.length / 2);
      const detBefore = detRuns.slice(0, half);
      const detAfter = detRuns.slice(half);
      const detDelta = mean(detAfter.map((r) => r.reward.value)) - mean(detBefore.map((r) => r.reward.value));
      const proxyDelta = mean(afterProxy) - mean(beforeProxy);
      const driftGap = Math.max(0, proxyDelta - detDelta);
      const severity = clamp012(driftGap * 5);
      findings.push({
        signal: "judge_drift",
        severity,
        message: severity >= sus ? `judge proxy +${proxyDelta.toFixed(3)} while deterministic reward +${detDelta.toFixed(3)} \u2014 judge drifting up without verifiable backing` : `judge and deterministic rewards move in step (judge ${proxyDelta.toFixed(3)}, det ${detDelta.toFixed(3)})`,
        detail: { proxyDelta, detDelta, driftGap, n: detRuns.length }
      });
    }
  }
  const maxSev = findings.reduce((m, f) => Math.max(m, f.severity), 0);
  const verdict = maxSev >= gam ? "gaming" : maxSev >= sus ? "suspect" : "clean";
  const rationale = findings.filter((f) => f.severity >= sus).map((f) => `${f.signal}: severity ${f.severity.toFixed(2)} \u2014 ${f.message}`);
  if (rationale.length === 0) rationale.push("no signals fired above suspect threshold");
  return { findings, verdict, rationale, n };
}
1336
/**
 * Arithmetic mean of a list of numbers; an empty list yields 0.
 *
 * @param {number[]} xs
 * @returns {number}
 */
function mean(xs) {
  const count = xs.length;
  return count > 0 ? xs.reduce((total, value) => total + value, 0) / count : 0;
}
1340
/**
 * Clamps a number into [0, 1]; non-finite input (NaN, ±Infinity) yields 0.
 *
 * @param {number} x
 * @returns {number}
 */
function clamp012(x) {
  return Number.isFinite(x) ? Math.min(1, Math.max(0, x)) : 0;
}
1344
/**
 * Pearson correlation coefficient of two equally long samples.
 *
 * @param {number[]} a
 * @param {number[]} b
 * @returns {number} the coefficient, or 0 when the lengths differ, fewer
 *   than two pairs are given, or either sample has zero variance.
 */
function pearsonR(a, b) {
  const count = a.length;
  if (count !== b.length || count < 2) return 0;

  const average = (values) => values.reduce((total, value) => total + value, 0) / values.length;
  const meanA = average(a);
  const meanB = average(b);

  let cross = 0;
  let squaresA = 0;
  let squaresB = 0;
  a.forEach((valueA, i) => {
    const devA = valueA - meanA;
    const devB = b[i] - meanB;
    cross += devA * devB;
    squaresA += devA * devA;
    squaresB += devB * devB;
  });

  const degenerate = squaresA === 0 || squaresB === 0;
  return degenerate ? 0 : cross / Math.sqrt(squaresA * squaresB);
}
1359
/**
 * Two-sample Kolmogorov-Smirnov statistic: the largest absolute gap between
 * the two empirical CDFs, evaluated at every distinct observed value.
 *
 * @param {number[]} a left unmodified.
 * @param {number[]} b left unmodified.
 * @returns {number} the statistic; 0 when both samples are empty.
 */
function ksStatistic(a, b) {
  const ascending = (x, y) => x - y;
  const left = [...a].sort(ascending);
  const right = [...b].sort(ascending);
  const cutPoints = [...new Set([...left, ...right])].sort(ascending);

  // Fraction of `sample` that is <= `value`.
  const cdfAt = (sample, value) => sample.filter((x) => x <= value).length / sample.length;

  return cutPoints.reduce(
    (widest, value) => Math.max(widest, Math.abs(cdfAt(left, value) - cdfAt(right, value))),
    0
  );
}
1371
/**
 * Builds the default secondary-reward accessor: a run's `reward.value` when
 * the run passes `filterDeterministicallyRewarded`, otherwise null.
 *
 * @param {object} [verifiableOpts] forwarded to the filter (`{}` if nullish).
 * @returns {(run: object) => (number|null)}
 */
function defaultSecondary(verifiableOpts) {
  return function secondaryOf(run) {
    const kept = filterDeterministicallyRewarded([run], verifiableOpts ?? {});
    if (kept.length !== 1) return null;
    const [only] = kept;
    return only.reward.value;
  };
}
1377
-
1378
- // src/rl/adaptation-eval.ts
1379
/**
 * Measures how scores change with the adaptation budget `k` by running every
 * scenario `reps` times at each `k`, one run at a time.
 *
 * @param {object} opts `scenarios`, `runner` (with `run({scenario, k, rep})`
 *   resolving to a score), plus optional `ks` (default [0, 1, 2, 4, 8, 16],
 *   evaluated in ascending order), `reps` (default 3) and `passThreshold`
 *   (default 0.5). Scenarios without a `scenarioId` are labelled
 *   `scenario-<position>`.
 * @returns {Promise<{points: object[], firstPassK: number|null, adaptationArea: number}>}
 *   one point per `k` (mean score, pass rate, sample std, count and
 *   per-scenario breakdown); `firstPassK` is the smallest `k` whose pass
 *   rate reaches `passThreshold`; `adaptationArea` is the trapezoidal area
 *   under mean score versus `k`, divided by the largest `k`.
 */
async function runAdaptationCurve(opts) {
  const ks = opts.ks ?? [0, 1, 2, 4, 8, 16];
  const reps = opts.reps ?? 3;
  const passThreshold = opts.passThreshold ?? 0.5;
  const sortedKs = [...ks].sort((a, b) => a - b);
  const points = [];
  for (const k of sortedKs) {
    const perScenario = [];
    const allScores = [];
    let totalPasses = 0;
    let totalAttempts = 0;
    // Use the loop position for the fallback id: `indexOf` rescans the list
    // each time and returns the first match, so a scenario object listed
    // twice would get the same id twice.
    for (const [position, scenario] of opts.scenarios.entries()) {
      const sid = scenario.scenarioId ?? `scenario-${position}`;
      const scores = [];
      let passes = 0;
      for (let r = 0; r < reps; r++) {
        const score = await opts.runner.run({ scenario, k, rep: r });
        scores.push(score);
        allScores.push(score);
        if (score >= passThreshold) {
          passes++;
          totalPasses++;
        }
        totalAttempts++;
      }
      const meanS = scores.reduce((s, v) => s + v, 0) / scores.length;
      perScenario.push({ scenarioId: sid, meanScore: meanS, passes, total: scores.length });
    }
    const meanScore = allScores.reduce((s, v) => s + v, 0) / Math.max(1, allScores.length);
    const variance = allScores.length < 2 ? 0 : allScores.reduce((s, v) => s + (v - meanScore) ** 2, 0) / (allScores.length - 1);
    points.push({
      k,
      meanScore,
      passRate: totalPasses / Math.max(1, totalAttempts),
      std: Math.sqrt(variance),
      n: allScores.length,
      perScenario
    });
  }
  const firstPassK2 = points.find((p) => p.passRate >= passThreshold)?.k ?? null;
  const maxK = sortedKs[sortedKs.length - 1] ?? 1;
  // Trapezoidal rule over consecutive k values.
  let area = 0;
  for (let i = 1; i < points.length; i++) {
    const x1 = points[i - 1].k;
    const x2 = points[i].k;
    const y1 = points[i - 1].meanScore;
    const y2 = points[i].meanScore;
    area += (y1 + y2) / 2 * (x2 - x1);
  }
  const adaptationArea = maxK === 0 ? 0 : area / maxK;
  return { points, firstPassK: firstPassK2, adaptationArea };
}
1429
/**
 * Compares two adaptation curves at every `k` they have in common.
 *
 * For each shared `k` the mean-score difference (a minus b) is reported with
 * a bootstrap confidence interval of the per-scenario means of each curve.
 * Curve `a` is called better when both the average per-k difference and the
 * area difference are positive, `b` when both are negative; everything else
 * (including both differences under 0.02 in magnitude) is "similar".
 *
 * @param {{points: object[], adaptationArea: number, firstPassK: number|null}} a
 * @param {{points: object[], adaptationArea: number, firstPassK: number|null}} b
 * @param {{confidence?: number, bootstrapResamples?: number, seed?: number}} [opts]
 *   defaults: confidence 0.95, 500 resamples.
 * @returns {{perK: object[], areaDelta: number, firstPassKDelta: number|null, verdict: string, rationale: string}}
 *   `firstPassKDelta` is `b.firstPassK - a.firstPassK`, or null if either is null.
 */
function compareAdaptationCurves(a, b, opts = {}) {
  const confidence = opts.confidence ?? 0.95;
  const resamples = opts.bootstrapResamples ?? 500;
  const rng = makeRng2(opts.seed);
  const scenarioMeans = (point) => point.perScenario.map((s) => s.meanScore);

  const perK = [];
  for (const pointA of a.points) {
    const pointB = b.points.find((candidate) => candidate.k === pointA.k);
    if (!pointB) continue;
    const intervalA = bootstrapMeanCi(scenarioMeans(pointA), resamples, confidence, rng);
    const intervalB = bootstrapMeanCi(scenarioMeans(pointB), resamples, confidence, rng);
    perK.push({
      k: pointA.k,
      deltaMean: pointA.meanScore - pointB.meanScore,
      aLow: intervalA.low,
      aHigh: intervalA.high,
      bLow: intervalB.low,
      bHigh: intervalB.high
    });
  }

  const areaDelta = a.adaptationArea - b.adaptationArea;
  let firstPassKDelta = null;
  if (a.firstPassK !== null && b.firstPassK !== null) {
    firstPassKDelta = b.firstPassK - a.firstPassK;
  }
  const meanDelta = perK.reduce((sum, entry) => sum + entry.deltaMean, 0) / Math.max(1, perK.length);

  let verdict = "similar";
  const negligible = Math.abs(meanDelta) < 0.02 && Math.abs(areaDelta) < 0.02;
  if (!negligible) {
    if (meanDelta > 0 && areaDelta > 0) verdict = "a_better";
    else if (meanDelta < 0 && areaDelta < 0) verdict = "b_better";
  }

  let rationale = `mean per-k delta=${meanDelta.toFixed(3)}, area delta=${areaDelta.toFixed(3)}`;
  if (firstPassKDelta !== null) rationale += `, first-pass-k delta=${firstPassKDelta}`;
  return { perK, areaDelta, firstPassKDelta, verdict, rationale };
}
1461
/**
 * Returns the `k` of the first curve point whose pass rate reaches the
 * threshold, or null when no point does.
 *
 * @param {{points: Array<{k: number, passRate: number}>}} curve
 * @param {number} [threshold=0.5]
 * @returns {number|null}
 */
function firstPassK(curve, threshold = 0.5) {
  for (const point of curve.points) {
    if (point.passRate >= threshold) return point.k ?? null;
  }
  return null;
}
1464
/**
 * Returns a uniform [0, 1) generator: `Math.random` when no seed is given,
 * otherwise a deterministic generator derived from the seed.
 *
 * @param {number} [seed] coerced to an unsigned 32-bit integer.
 * @returns {() => number}
 */
function makeRng2(seed) {
  if (seed === undefined) return Math.random;
  let state = seed >>> 0;
  return function next() {
    state = (state + 1831565813) >>> 0;
    let mixed = state;
    mixed = Math.imul(mixed ^ (mixed >>> 15), mixed | 1);
    mixed ^= mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61);
    const unsigned = (mixed ^ (mixed >>> 14)) >>> 0;
    return unsigned / 4294967296;
  };
}
1475
/**
 * Percentile-bootstrap confidence interval for the mean of `xs`.
 *
 * Degenerate inputs produce a zero-width interval instead of `undefined`
 * bounds or a thrown error:
 *  - fewer than two observations: both bounds are the sole value (or 0);
 *  - a draw count below 1 or non-finite: both bounds are the sample mean.
 * A fractional draw count is floored, and percentile indices are clamped so
 * a confidence level outside [0, 1] cannot index outside the sample array.
 *
 * @param {number[]} xs - Observations to resample.
 * @param {number} resamples - Number of bootstrap draws.
 * @param {number} confidence - Confidence level, e.g. 0.95.
 * @param {() => number} rng - Source of floats in [0, 1).
 * @returns {{low: number, high: number}}
 */
function bootstrapMeanCi(xs, resamples, confidence, rng) {
  if (xs.length < 2) return { low: xs[0] ?? 0, high: xs[0] ?? 0 };

  const size = xs.length;
  const draws = Math.floor(resamples);
  if (!Number.isFinite(draws) || draws < 1) {
    // Nothing to resample: fall back to the point estimate.
    const mean = xs.reduce((acc, x) => acc + x, 0) / size;
    return { low: mean, high: mean };
  }

  const means = new Array(draws);
  for (let d = 0; d < draws; d++) {
    let total = 0;
    for (let i = 0; i < size; i++) total += xs[Math.floor(rng() * size)];
    means[d] = total / size;
  }
  means.sort((x, y) => x - y);

  const tail = (1 - confidence) / 2;
  const clampIndex = (index) => Math.min(draws - 1, Math.max(0, index));
  return {
    low: means[clampIndex(Math.floor(tail * draws))],
    high: means[clampIndex(Math.ceil((1 - tail) * draws) - 1)]
  };
}
1490
-
1491
- // src/rl/exporters.ts
1492
/**
 * Turn preference triples into DPO training rows.
 *
 * The prompt and chosen completion are looked up by `chosenRunId`, the
 * rejected completion by `rejectedRunId`. Lookups may be sync or async; the
 * three lookups of one triple run concurrently, triples run one at a time.
 *
 * @param {Iterable<object>} triples - Items with run/variant ids, `marginScore` and `meta`.
 * @param {{promptOf: Function, completionOf: Function}} lookups
 * @returns {Promise<object[]>} Rows of `{ prompt, chosen, rejected, margin, meta }`.
 */
async function toDpoRows(triples, lookups) {
  const rows = [];
  for (const triple of triples) {
    const { chosenRunId, rejectedRunId } = triple;
    const pending = [
      lookups.promptOf(chosenRunId),
      lookups.completionOf(chosenRunId),
      lookups.completionOf(rejectedRunId)
    ].map((value) => Promise.resolve(value));
    const [prompt, chosen, rejected] = await Promise.all(pending);
    const meta = {
      scenarioId: triple.scenarioId,
      chosenVariantId: triple.chosenVariantId,
      rejectedVariantId: triple.rejectedVariantId,
      chosenRunId,
      rejectedRunId,
      chosenModel: triple.meta.chosenModel,
      rejectedModel: triple.meta.rejectedModel
    };
    rows.push({ prompt, chosen, rejected, margin: triple.marginScore, meta });
  }
  return rows;
}
1518
/**
 * Serialize DPO rows as JSON Lines: one JSON document per line, each line
 * newline-terminated. An empty input produces an empty string.
 *
 * @param {object[]} rows
 * @returns {string}
 */
function toDpoJsonl(rows) {
  if (rows.length === 0) return "";
  const lines = rows.map((row) => JSON.stringify(row));
  return `${lines.join("\n")}\n`;
}
1521
/**
 * Group runs by scenario and emit one GRPO row per group.
 *
 * Runs are keyed by `scenarioId`, falling back to `experimentId`. Each
 * group's prompt comes from its first run. Runs whose reward is `null` are
 * dropped, and a group left with no completions emits no row.
 *
 * @param {Iterable<object>} runs - Run records carrying `runId` and a scenario/experiment id.
 * @param {{promptOf: Function, completionOf: Function, rewardOf?: Function}} lookups
 *   `rewardOf` defaults to `defaultReward`; the other lookups may be sync or async.
 * @returns {Promise<object[]>} Rows of `{ prompt, completions, rewards, runIds, meta }`.
 */
async function toGrpoRows(runs, lookups) {
  const scoreRun = lookups.rewardOf ?? defaultReward;

  const byScenario = new Map();
  for (const run of runs) {
    const key = run.scenarioId ?? run.experimentId;
    const members = byScenario.get(key);
    if (members === undefined) {
      byScenario.set(key, [run]);
    } else {
      members.push(run);
    }
  }

  const rows = [];
  for (const [scenarioId, members] of byScenario) {
    if (members.length === 0) continue;
    const prompt = await lookups.promptOf(members[0].runId);

    const completions = [];
    const rewards = [];
    const runIds = [];
    let rewardTotal = 0;
    for (const run of members) {
      const reward = scoreRun(run);
      if (reward === null) continue;
      completions.push(await lookups.completionOf(run.runId));
      rewards.push(reward);
      runIds.push(run.runId);
      rewardTotal += reward;
    }
    if (completions.length === 0) continue;

    rows.push({
      prompt,
      completions,
      rewards,
      runIds,
      meta: {
        scenarioId,
        n: completions.length,
        meanReward: rewardTotal / rewards.length
      }
    });
  }
  return rows;
}
1560
/**
 * Serialize GRPO rows as JSON Lines: one JSON document per line, each line
 * newline-terminated. An empty input produces an empty string.
 *
 * @param {object[]} rows
 * @returns {string}
 */
function toGrpoJsonl(rows) {
  if (rows.length === 0) return "";
  const lines = rows.map((row) => JSON.stringify(row));
  return `${lines.join("\n")}\n`;
}
1563
/**
 * Turn run records into chat-style SFT rows.
 *
 * Each included run becomes `{ messages, meta }`, where `messages` is an
 * optional system message followed by the user prompt and the assistant
 * completion. All three lookups may return a value or a promise: `systemOf`
 * is awaited like `promptOf` and `completionOf`, so an async `systemOf`
 * cannot leak a Promise object into the message content.
 *
 * @param {Iterable<object>} runs - Run records with `runId`, `outcome`, etc.
 * @param {object} lookups
 * @param {Function} lookups.promptOf - `(runId) => prompt`, sync or async.
 * @param {Function} lookups.completionOf - `(runId) => completion`, sync or async.
 * @param {Function} [lookups.systemOf] - `(run) => system prompt`; a falsy result omits the system message.
 * @param {Function} [lookups.include] - Predicate selecting runs; defaults to keeping all.
 * @returns {Promise<object[]>}
 */
async function toSftRows(runs, lookups) {
  const include = lookups.include ?? (() => true);
  const rows = [];
  for (const run of runs) {
    if (!include(run)) continue;
    // Lookups start in the original order: system, prompt, completion.
    const [system, prompt, completion] = await Promise.all([
      Promise.resolve(lookups.systemOf?.(run)),
      Promise.resolve(lookups.promptOf(run.runId)),
      Promise.resolve(lookups.completionOf(run.runId))
    ]);
    const messages = [];
    if (system) messages.push({ role: "system", content: system });
    messages.push({ role: "user", content: prompt });
    messages.push({ role: "assistant", content: completion });
    rows.push({
      messages,
      meta: {
        runId: run.runId,
        candidateId: run.candidateId,
        scenarioId: run.scenarioId,
        // Prefer the holdout score; fall back to the search score.
        score: run.outcome.holdoutScore ?? run.outcome.searchScore,
        model: run.model
      }
    });
  }
  return rows;
}
1590
/**
 * Serialize SFT rows as JSON Lines: one JSON document per line, each line
 * newline-terminated. An empty input produces an empty string.
 *
 * @param {object[]} rows
 * @returns {string}
 */
function toSftJsonl(rows) {
  if (rows.length === 0) return "";
  const lines = rows.map((row) => JSON.stringify(row));
  return `${lines.join("\n")}\n`;
}
1593
/**
 * Turn step-level preference triples into process-reward-model rows.
 *
 * For each triple the prompt, the prefix span ids and their step texts are
 * resolved against `prefixRunId`. The chosen step text is also read from
 * `prefixRunId`, the rejected step text from `rejectedRunId`. Lookups are
 * awaited one after another and may be sync or async.
 *
 * @param {Iterable<object>} triples
 * @param {object} lookups
 * @param {Function} lookups.promptOf - `(runId) => prompt`.
 * @param {Function} lookups.stepTextOf - `(runId, spanId) => text`.
 * @param {Function} [lookups.prefixOf] - `(runId, stepIndex) => spanIds`; when absent the prefix is empty.
 * @returns {Promise<object[]>}
 */
async function toPrmRows(triples, lookups) {
  const rows = [];
  for (const triple of triples) {
    const { prefixRunId, rejectedRunId, prefixStepIndex } = triple;

    const prompt = await lookups.promptOf(prefixRunId);
    let prefixSpanIds = [];
    if (lookups.prefixOf) {
      prefixSpanIds = await lookups.prefixOf(prefixRunId, prefixStepIndex);
    }

    const prefixStepText = [];
    for (const spanId of prefixSpanIds) {
      const text = await lookups.stepTextOf(prefixRunId, spanId);
      prefixStepText.push(text);
    }

    const chosenStep = await lookups.stepTextOf(prefixRunId, triple.chosenSpanId);
    const rejectedStep = await lookups.stepTextOf(rejectedRunId, triple.rejectedSpanId);

    rows.push({
      prompt,
      prefixSpanIds,
      prefixStepText,
      chosenStep,
      rejectedStep,
      chosenReward: triple.chosenReward,
      rejectedReward: triple.rejectedReward,
      marginScore: triple.marginScore,
      meta: { prefixRunId, rejectedRunId, prefixStepIndex }
    });
  }
  return rows;
}
1622
/**
 * Serialize PRM rows as JSON Lines: one JSON document per line, each line
 * newline-terminated. An empty input produces an empty string.
 *
 * @param {object[]} rows
 * @returns {string}
 */
function toPrmJsonl(rows) {
  if (rows.length === 0) return "";
  const lines = rows.map((row) => JSON.stringify(row));
  return `${lines.join("\n")}\n`;
}
1625
/**
 * Serialize step rewards as JSON Lines.
 *
 * Only `runId`, `spanId`, `stepIndex`, `reward`, `determinism` and `weight`
 * are written, in that order; a missing `weight` is written as 1. An empty
 * input produces an empty string.
 *
 * @param {object[]} stepRewards
 * @returns {string}
 */
function stepRewardsToJsonl(stepRewards) {
  if (stepRewards.length === 0) return "";
  const lines = stepRewards.map((step) => {
    const { runId, spanId, stepIndex, reward, determinism } = step;
    return JSON.stringify({
      runId,
      spanId,
      stepIndex,
      reward,
      determinism,
      weight: step.weight ?? 1
    });
  });
  return `${lines.join("\n")}\n`;
}
1636
/**
 * Default reward for a run: its holdout score, else its search score.
 *
 * @param {{outcome: {holdoutScore?: number, searchScore?: number}}} run
 * @returns {number|null} The score when it is a finite number, otherwise `null`.
 */
function defaultReward(run) {
  const { holdoutScore, searchScore } = run.outcome;
  const score = holdoutScore ?? searchScore;
  // Number.isFinite is false for every non-number, so no typeof check is needed.
  return Number.isFinite(score) ? score : null;
}
1640
-
1641
- // src/rl/rl-campaign.ts
1642
/**
 * Run an eval campaign and derive the RL artifacts from its runs.
 *
 * Steps, in order: run the campaign, extract verifiable rewards, extract
 * preference pairs, optionally compute interim sequential confidence against
 * `opts.report.comparator`, run reward-hacking detection, optionally compute
 * rubric predictive validity (needs `opts.outcomeStore` and a non-empty
 * `opts.outcomeMetrics`), and build any requested trainer exports.
 *
 * @param {object} opts - Campaign options, passed through to `runEvalCampaign`
 *   and also read for `verifiableReward`, `preferences`, `splitTag`, `report`,
 *   `sequential`, `outcomeStore`, `outcomeMetrics` and `trainerExport`.
 * @returns {Promise<object>} Result tagged `kind: "agent-eval-rl-campaign"`.
 */
async function runRLCampaign(opts) {
  const campaign = await runEvalCampaign(opts);
  const { runs } = campaign;

  const rewardSignals = extractVerifiableRewardsFromRecords(runs, opts.verifiableReward ?? {});

  const prefOpts = opts.preferences;
  const preferences = extractPreferences(runs, {
    strategy: prefOpts?.strategy ?? "paired-by-scenario-and-seed",
    minMargin: prefOpts?.minMargin ?? 0.05,
    splitTag: prefOpts?.splitTag ?? opts.splitTag ?? "holdout",
    rewardOf: prefOpts?.rewardOf
  });

  // The sequential verdict needs a comparator and at least one paired delta.
  let interimConfidence = null;
  const comparator = opts.report?.comparator;
  if (comparator) {
    const deltaSeries = collectPairedDeltaSeries(runs, comparator);
    const hasDeltas = deltaSeries.some((series) => series.deltas.length > 0);
    if (hasDeltas) {
      interimConfidence = evaluateInterimReleaseConfidence({
        deltaSeries,
        alpha: opts.sequential?.alpha,
        bound: opts.sequential?.bound,
        rope: opts.sequential?.rope ?? opts.report?.rope
      });
    }
  }

  const rewardHacking = detectRewardHacking({
    runs,
    verifiableRewardOptions: opts.verifiableReward
  });

  let predictiveValidity = null;
  const wantsValidity = opts.outcomeStore && opts.outcomeMetrics && opts.outcomeMetrics.length > 0;
  if (wantsValidity) {
    predictiveValidity = await rubricPredictiveValidity({
      runs,
      outcomes: opts.outcomeStore,
      outcomeMetrics: opts.outcomeMetrics
    });
  }

  const exportOpts = opts.trainerExport;
  const trainerRows = {};
  if (exportOpts?.dpo) {
    trainerRows.dpo = await toDpoRows(preferences.pairs, exportOpts.dpo);
  }
  if (exportOpts?.grpo) {
    trainerRows.grpo = await toGrpoRows(runs, exportOpts.grpo);
  }
  if (exportOpts?.sft) {
    trainerRows.sft = await toSftRows(runs, exportOpts.sft);
  }

  const summary = buildSummary({
    campaign,
    preferences,
    interimConfidence,
    rewardHacking,
    predictiveValidity
  });

  return {
    campaign,
    rewardSignals,
    preferences,
    interimConfidence,
    rewardHacking,
    predictiveValidity,
    trainerRows,
    summary,
    kind: "agent-eval-rl-campaign"
  };
}
1702
/**
 * Pair every non-comparator run with the comparator run sharing its
 * scenario (or experiment) id and seed, and collect score differences.
 *
 * A run's score is its holdout score, else its search score; runs without a
 * finite numeric score are ignored. When several comparator runs share a
 * key, the last one supplies the baseline.
 *
 * @param {Iterable<object>} runs
 * @param {string} comparator - `candidateId` of the baseline candidate.
 * @returns {Array<{candidateId: string, deltas: number[]}>} One entry per
 *   candidate with at least one pairing, in first-seen order.
 */
function collectPairedDeltaSeries(runs, comparator) {
  const finiteScoreOf = (run) => {
    const value = run.outcome.holdoutScore ?? run.outcome.searchScore;
    return Number.isFinite(value) ? value : null;
  };
  const pairKeyOf = (run) => `${run.scenarioId ?? run.experimentId}::${run.seed}`;

  const baselineScores = new Map();
  for (const run of runs) {
    if (run.candidateId !== comparator) continue;
    const score = finiteScoreOf(run);
    if (score !== null) baselineScores.set(pairKeyOf(run), score);
  }

  const deltasByCandidate = new Map();
  for (const run of runs) {
    if (run.candidateId === comparator) continue;
    const score = finiteScoreOf(run);
    if (score === null) continue;
    const baseScore = baselineScores.get(pairKeyOf(run));
    if (baseScore === undefined) continue;
    const deltas = deltasByCandidate.get(run.candidateId) ?? [];
    deltas.push(score - baseScore);
    deltasByCandidate.set(run.candidateId, deltas);
  }

  return Array.from(deltasByCandidate, ([candidateId, deltas]) => ({ candidateId, deltas }));
}
1725
/**
 * Build the one-line, " | "-separated summary of an RL campaign.
 *
 * The campaign, preference and reward-hacking segments are always present;
 * the sequential-verdict and top-rubric segments appear only when
 * `interimConfidence` / `predictiveValidity` are set.
 *
 * @param {{campaign: object, preferences: object, interimConfidence: (object|null), rewardHacking: object, predictiveValidity: (object|null)}} args
 * @returns {string}
 */
function buildSummary(args) {
  const { campaign, preferences, interimConfidence, rewardHacking, predictiveValidity } = args;
  const fingerprint = campaign.campaignFingerprint.slice(0, 12);

  const segments = [];
  segments.push(
    `${campaign.campaignId}: ${campaign.runs.length} successful runs / ${campaign.failedRuns.length} failed (fingerprint ${fingerprint}\u2026)`
  );
  segments.push(
    `preferences: ${preferences.pairs.length} (${preferences.strategy}, ${preferences.pairsBelowMargin} below margin)`
  );

  if (interimConfidence) {
    const { decision, candidateId } = interimConfidence.recommendation;
    const suffix = candidateId ? ` ${candidateId}` : "";
    segments.push(`sequential verdict: ${decision}` + suffix);
  }

  segments.push(`reward-hacking: ${rewardHacking.verdict} (${rewardHacking.findings.length} signals checked)`);

  if (predictiveValidity) {
    // The ranking may be empty, so every field of the top entry has a fallback.
    const best = predictiveValidity.ranked[0];
    const rubric = best?.rubric ?? "none";
    const rho = (best?.spearman ?? 0).toFixed(2);
    const verdict = best?.verdict ?? "no data";
    segments.push(`top-rubric: ${rubric} \u03C1=${rho} (${verdict})`);
  }

  return segments.join(" | ");
}
1741
-
1742
- // src/rl/predictive-validity-researcher.ts
1743
/**
 * Researcher that proposes rubric re-weighting from a cached
 * predictive-validity report.
 *
 * It never executes plans itself: `evaluateChange` always returns a
 * non-promoting gate decision, and the report is supplied either through
 * `runValidityCheck` or `setReport`.
 */
var PredictiveValidityResearcher = class {
  opts;
  lastReport = null;

  /**
   * @param {object} opts - Reads `failureThreshold`, `decorativeThreshold`,
   *   `outcomes`, `outcomeMetrics`, `rubrics` and `onReport`.
   */
  constructor(opts) {
    this.opts = opts;
  }

  /**
   * Group low-scoring runs by candidate and describe each group.
   *
   * A run is failing when its score (holdout, else search) is a number below
   * `opts.failureThreshold` (default 0.5).
   *
   * @param {object[]} runs
   * @returns {Promise<object[]>} One failure per candidate with failing runs;
   *   evidence lists at most the first 8 run ids.
   */
  async inspectFailures(runs) {
    const threshold = this.opts.failureThreshold ?? 0.5;

    const failingByCandidate = new Map();
    for (const run of runs) {
      const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
      if (typeof score !== "number" || !(score < threshold)) continue;
      const bucket = failingByCandidate.get(run.candidateId) ?? [];
      bucket.push(run);
      failingByCandidate.set(run.candidateId, bucket);
    }

    const failures = [];
    for (const [candidateId, bucket] of failingByCandidate) {
      let total = 0;
      for (const run of bucket) {
        total += run.outcome.holdoutScore ?? run.outcome.searchScore ?? 0;
      }
      const meanScore = total / bucket.length;
      failures.push({
        code: `low-score-${candidateId}`,
        description: `${candidateId} scored < ${threshold} on ${bucket.length} run(s) (mean ${meanScore.toFixed(3)})`,
        evidence: {
          runIds: bucket.slice(0, 8).map((run) => run.runId),
          samples: bucket.length
        }
      });
    }
    return failures;
  }

  /**
   * Propose rubric weight changes from the cached report.
   *
   * No failures gives no proposals. Without a cached report the only
   * proposal is a directive to collect more outcomes. Otherwise every ranked
   * rubric that is not load-bearing and whose |Spearman| is below
   * `opts.decorativeThreshold` (default 0.4) gets a down-weight proposal,
   * and the top-ranked rubric gets an up-weight proposal when load-bearing.
   *
   * @param {object[]} failures
   * @returns {Promise<object[]>}
   */
  async proposeChange(failures) {
    if (failures.length === 0) return [];

    const report = this.lastReport;
    if (report === null) {
      return [{
        kind: "threshold",
        payload: { directive: "researcher.collect-more-outcomes" },
        rationale: "predictive-validity researcher has no prior report; cannot recommend rubric reweighting until at least one report exists"
      }];
    }

    const decorativeThreshold = this.opts.decorativeThreshold ?? 0.4;
    const proposals = [];

    for (const entry of report.ranked) {
      const isDecorative = entry.verdict !== "load_bearing" && Math.abs(entry.spearman) < decorativeThreshold;
      if (!isDecorative) continue;
      proposals.push({
        kind: "reviewer_prompt",
        payload: { rubric: entry.rubric, action: "down-weight", spearman: entry.spearman, bestOutcome: entry.bestOutcome },
        rationale: `predictive-validity Spearman=${entry.spearman.toFixed(3)} vs ${entry.bestOutcome} (decorative); recommend down-weighting`,
        expectedDelta: -Math.max(0, 0.05 - Math.abs(entry.spearman))
      });
    }

    // Only the top-ranked rubric is considered for up-weighting.
    if (report.ranked.length > 0) {
      const top = report.ranked[0];
      if (top.verdict === "load_bearing") {
        proposals.push({
          kind: "reviewer_prompt",
          payload: { rubric: top.rubric, action: "up-weight", spearman: top.spearman, bestOutcome: top.bestOutcome },
          rationale: `predictive-validity Spearman=${top.spearman.toFixed(3)} vs ${top.bestOutcome} (load-bearing); recommend up-weighting`,
          expectedDelta: Math.max(0, Math.abs(top.spearman) - 0.5) * 0.1
        });
      }
    }
    return proposals;
  }

  /**
   * Return a copy of `baseline` with `changes` appended to its change list.
   *
   * @param {object[]} changes
   * @param {{changes: object[]}} baseline
   * @returns {Promise<object>}
   */
  async applyChange(changes, baseline) {
    const combined = [...baseline.changes, ...changes];
    return { ...baseline, changes: combined };
  }

  /**
   * Return a placeholder evaluation: no runs and a non-promoting gate
   * decision explaining that the caller must run the sweep itself.
   *
   * @param {{proposedCandidateId: string, baselineCandidateId: string}} plan
   * @returns {Promise<{plan: object, runs: Array, gateDecision: object}>}
   */
  async evaluateChange(plan) {
    const evidence = {
      productiveRuns: 0,
      medianPairedDelta: 0,
      pairedCI: { low: 0, high: 0 },
      pairedPValue: 1,
      searchScore: 0,
      holdoutScore: 0,
      overfitGap: 0,
      baselineOverfitGap: 0
    };
    const gateDecision = {
      promote: false,
      candidateId: plan.proposedCandidateId,
      baselineId: plan.baselineCandidateId,
      evidence,
      reason: "predictive-validity researcher does not execute plans; the caller is expected to run the sweep and call rubricPredictiveValidity directly with the resulting RunRecord[].",
      rejectionCode: "few_runs"
    };
    return { plan, runs: [], gateDecision };
  }

  /**
   * Run the predictive-validity check against a fresh set of runs, notify
   * `opts.onReport` if provided, then cache the report for later
   * `proposeChange` calls.
   *
   * @param {object[]} runs
   * @returns {Promise<object>} The new report.
   */
  async runValidityCheck(runs) {
    const { outcomes, outcomeMetrics, rubrics } = this.opts;
    const report = await rubricPredictiveValidity({ runs, outcomes, outcomeMetrics, rubrics });
    // The callback runs before caching, so a throwing callback leaves the cache untouched.
    if (this.opts.onReport) await this.opts.onReport(report);
    this.lastReport = report;
    return report;
  }

  /**
   * Store a report that was produced elsewhere so later proposals use it.
   *
   * @param {object} report
   */
  setReport(report) {
    this.lastReport = report;
  }

  /** @returns {object|null} The cached report, or `null` if none was set. */
  getLastReport() {
    return this.lastReport;
  }
};
1869
-
1870
- // src/rl/auto-research.ts
1871
/**
 * Derive RL artifacts from a finished optimization result.
 *
 * Trials are pulled from the result and converted to run records, then
 * verifiable rewards, preference pairs, optional interim sequential
 * confidence (needs `opts.comparator`), reward-hacking detection, optional
 * rubric predictive validity (needs `opts.outcomes`) and requested DPO/GRPO
 * trainer rows are computed from those records.
 *
 * @param {object} opts - Reads `result`, `ctx`, `verifiableReward`,
 *   `preferences`, `comparator`, `sequential`, `outcomes` and `trainerExport`.
 * @returns {Promise<object>} `{ runs, rewardSignals, preferences,
 *   interimConfidence, rewardHacking, predictiveValidity, trainerRows, summary }`.
 */
async function analyzeOptimizationResult(opts) {
  const runs = trialsToRunRecords(extractTrials(opts.result), opts.ctx);

  const rewardSignals = extractVerifiableRewardsFromRecords(runs, opts.verifiableReward ?? {});

  const prefOpts = opts.preferences;
  const preferences = extractPreferences(runs, {
    strategy: prefOpts?.strategy ?? "paired-by-scenario-and-seed",
    minMargin: prefOpts?.minMargin ?? 0.05,
    splitTag: prefOpts?.splitTag ?? opts.ctx.splitTag ?? "search",
    rewardOf: prefOpts?.rewardOf
  });

  // The sequential verdict needs a comparator and at least one paired delta.
  let interimConfidence = null;
  if (opts.comparator) {
    const deltaSeries = collectPairedDeltaSeries2(runs, opts.comparator);
    const hasDeltas = deltaSeries.some((series) => series.deltas.length > 0);
    if (hasDeltas) {
      interimConfidence = evaluateInterimReleaseConfidence({
        deltaSeries,
        alpha: opts.sequential?.alpha,
        bound: opts.sequential?.bound,
        rope: opts.sequential?.rope
      });
    }
  }

  const rewardHacking = detectRewardHacking({
    runs,
    verifiableRewardOptions: opts.verifiableReward
  });

  let predictiveValidity = null;
  if (opts.outcomes) {
    predictiveValidity = await rubricPredictiveValidity({
      runs,
      outcomes: opts.outcomes.store,
      outcomeMetrics: opts.outcomes.metrics
    });
  }

  const exportOpts = opts.trainerExport;
  const trainerRows = {};
  if (exportOpts?.dpo) {
    trainerRows.dpo = await toDpoRows(preferences.pairs, exportOpts.dpo);
  }
  if (exportOpts?.grpo) {
    trainerRows.grpo = await toGrpoRows(runs, exportOpts.grpo);
  }

  const summary = buildSummary2({
    runs,
    preferences,
    interimConfidence,
    rewardHacking,
    predictiveValidity
  });

  return {
    runs,
    rewardSignals,
    preferences,
    interimConfidence,
    rewardHacking,
    predictiveValidity,
    trainerRows,
    summary
  };
}
1924
/**
 * Collect all trials from an optimization result.
 *
 * The result is either an object wrapping the evolution under an `evolution`
 * key or the evolution object itself.
 *
 * @param {object} result
 * @returns {Array} Trials from every generation, in generation order.
 */
function extractTrials(result) {
  const evolution = "evolution" in result ? result.evolution : result;
  return collectFromEvolution(evolution);
}
1930
/**
 * Flatten the trials of every generation into one list.
 *
 * Generations whose `trials` is `null` or `undefined` contribute nothing.
 *
 * @param {{generations: Iterable<{trials?: Iterable}>}} evolution
 * @returns {Array} Trials in generation order.
 */
function collectFromEvolution(evolution) {
  const collected = [];
  for (const generation of evolution.generations) {
    const generationTrials = generation.trials;
    if (generationTrials == null) continue;
    for (const trial of generationTrials) {
      collected.push(trial);
    }
  }
  return collected;
}
1937
/**
 * Build per-candidate series of score deltas against a comparator.
 *
 * Runs are matched on scenario id (falling back to experiment id) plus seed.
 * A score is the holdout score, else the search score; non-finite or
 * non-numeric scores are skipped on both sides. A later comparator run with
 * the same key replaces an earlier one.
 *
 * @param {Iterable<object>} runs
 * @param {string} comparator - `candidateId` treated as the baseline.
 * @returns {Array<{candidateId: string, deltas: number[]}>} Candidates in
 *   first-paired order; candidates with no pairing are omitted.
 */
function collectPairedDeltaSeries2(runs, comparator) {
  const readScore = ({ outcome }) => outcome.holdoutScore ?? outcome.searchScore;
  const isUsable = (score) => typeof score === "number" && Number.isFinite(score);
  const keyFor = (run) => {
    const scope = run.scenarioId ?? run.experimentId;
    return `${scope}::${run.seed}`;
  };

  const comparatorScores = new Map();
  const seriesByCandidate = new Map();

  // Pass 1: index the comparator's scores by pairing key.
  for (const run of runs) {
    if (run.candidateId !== comparator) continue;
    const score = readScore(run);
    if (!isUsable(score)) continue;
    comparatorScores.set(keyFor(run), score);
  }

  // Pass 2: subtract the matching comparator score from every other run.
  for (const run of runs) {
    if (run.candidateId === comparator) continue;
    const score = readScore(run);
    if (!isUsable(score)) continue;
    const reference = comparatorScores.get(keyFor(run));
    if (typeof reference !== "number") continue;
    if (!seriesByCandidate.has(run.candidateId)) {
      seriesByCandidate.set(run.candidateId, []);
    }
    seriesByCandidate.get(run.candidateId).push(score - reference);
  }

  const series = [];
  for (const [candidateId, deltas] of seriesByCandidate) {
    series.push({ candidateId, deltas });
  }
  return series;
}
1960
/**
 * Build the one-line, " | "-separated summary of an optimization analysis.
 *
 * The run count, preference-pair count and reward-hacking verdict are always
 * present; the sequential and top-rubric segments are added only when
 * `interimConfidence` is set / the validity ranking has a first entry.
 *
 * @param {{runs: Array, preferences: object, interimConfidence: (object|null), rewardHacking: object, predictiveValidity: (object|null)}} args
 * @returns {string}
 */
function buildSummary2(args) {
  const { runs, preferences, interimConfidence, rewardHacking, predictiveValidity } = args;

  const segments = [];
  segments.push(`${runs.length} runs analysed`);
  segments.push(`${preferences.pairs.length} preference pairs (${preferences.strategy})`);
  segments.push(`reward-hacking verdict: ${rewardHacking.verdict}`);

  if (interimConfidence) {
    const { decision, candidateId } = interimConfidence.recommendation;
    const suffix = candidateId ? ` ${candidateId}` : "";
    segments.push(`sequential: ${decision}` + suffix);
  }

  const best = predictiveValidity?.ranked[0];
  if (best) {
    segments.push(`top-rubric: ${best.rubric} \u03C1=${best.spearman.toFixed(2)}`);
  }

  return segments.join(" | ");
}
1975
-
1976
- export {
1977
- trialToRunRecord,
1978
- trialsToRunRecords,
1979
- verificationReportToRunRecord,
1980
- variantAggregateToRunRecord,
1981
- extractVerifiableReward,
1982
- extractVerifiableRewardsFromRecords,
1983
- filterDeterministicallyRewarded,
1984
- extractPreferences,
1985
- toTRLFormat,
1986
- toAnthropicFormat,
1987
- inverseProbabilityWeighting,
1988
- selfNormalizedImportanceWeighting,
1989
- doublyRobust,
1990
- offPolicyEstimateAll,
1991
- extractStepRewards,
1992
- runwiseStepRewardSummary,
1993
- prmTrainingPairs,
1994
- runContaminationProbe,
1995
- renameVariables,
1996
- shuffleOrder,
1997
- injectIrrelevantClause,
1998
- fitBradleyTerry,
1999
- applyEloUpdate,
2000
- buildPairwiseFromCampaign,
2001
- adversarialScenarioSearch,
2002
- runComputeCurve,
2003
- bestOfN,
2004
- selfConsistency,
2005
- paretoFrontier,
2006
- varianceBasedCurriculum,
2007
- thompsonCurriculum,
2008
- observationsFromRunRecords,
2009
- detectRewardHacking,
2010
- runAdaptationCurve,
2011
- compareAdaptationCurves,
2012
- firstPassK,
2013
- toDpoRows,
2014
- toDpoJsonl,
2015
- toGrpoRows,
2016
- toGrpoJsonl,
2017
- toSftRows,
2018
- toSftJsonl,
2019
- toPrmRows,
2020
- toPrmJsonl,
2021
- stepRewardsToJsonl,
2022
- runRLCampaign,
2023
- PredictiveValidityResearcher,
2024
- analyzeOptimizationResult
2025
- };
2026
- //# sourceMappingURL=chunk-LZKIOBG2.js.map