@tangle-network/agent-eval 0.21.0 → 0.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +236 -1
- package/README.md +17 -3
- package/dist/benchmarks/index.d.ts +2 -2
- package/dist/{chunk-WOK2RTWG.js → chunk-4W4NCYM2.js} +134 -109
- package/dist/chunk-4W4NCYM2.js.map +1 -0
- package/dist/{chunk-WOPGKVN4.js → chunk-6KQG5HAH.js} +2 -2
- package/dist/chunk-6M774GY6.js +53 -0
- package/dist/chunk-6M774GY6.js.map +1 -0
- package/dist/chunk-7EAUOUQS.js +495 -0
- package/dist/chunk-7EAUOUQS.js.map +1 -0
- package/dist/chunk-AXHNWLIX.js +246 -0
- package/dist/chunk-AXHNWLIX.js.map +1 -0
- package/dist/chunk-EXGR4XEM.js +283 -0
- package/dist/chunk-EXGR4XEM.js.map +1 -0
- package/dist/{chunk-3IX6QTB7.js → chunk-IOXMGMHQ.js} +418 -541
- package/dist/chunk-IOXMGMHQ.js.map +1 -0
- package/dist/{chunk-3GN6U53I.js → chunk-KAO3Q65R.js} +2 -2
- package/dist/chunk-LZKIOBG2.js +2026 -0
- package/dist/chunk-LZKIOBG2.js.map +1 -0
- package/dist/{chunk-YUFXO3TU.js → chunk-QBW3YBTR.js} +1 -1
- package/dist/chunk-QBW3YBTR.js.map +1 -0
- package/dist/chunk-QUKKGHTZ.js +121 -0
- package/dist/chunk-QUKKGHTZ.js.map +1 -0
- package/dist/{chunk-SNUHRBDL.js → chunk-SQQLHODJ.js} +10 -1
- package/dist/{chunk-SNUHRBDL.js.map → chunk-SQQLHODJ.js.map} +1 -1
- package/dist/{chunk-ARZ6BEV6.js → chunk-V5QSWN7L.js} +2 -2
- package/dist/{chunk-HRZELXCR.js → chunk-VQQSPGSM.js} +3 -3
- package/dist/cli.js +3 -3
- package/dist/{control-cxwMOAsy.d.ts → control-DvkH87qJ.d.ts} +2 -2
- package/dist/control.d.ts +3 -3
- package/dist/control.js +2 -2
- package/dist/eval-campaign-Ds5QljIh.d.ts +573 -0
- package/dist/{feedback-trajectory-CB0A32o3.d.ts → feedback-trajectory-c43WGtTX.d.ts} +1 -1
- package/dist/{index-c5saLbKD.d.ts → index-DDTlbHEK.d.ts} +1 -1
- package/dist/index-ekBXweiQ.d.ts +1894 -0
- package/dist/index.d.ts +20 -430
- package/dist/index.js +154 -34
- package/dist/index.js.map +1 -1
- package/dist/integrity-Cr5YodSY.d.ts +210 -0
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +7 -145
- package/dist/optimization.js +12 -3
- package/dist/reporting.d.ts +294 -4
- package/dist/reporting.js +18 -9
- package/dist/rl.d.ts +8 -0
- package/dist/rl.js +113 -0
- package/dist/rl.js.map +1 -0
- package/dist/{run-record-CX_jcAyr.d.ts → run-record-DNiOMBrZ.d.ts} +10 -1
- package/dist/sequential-DgU2mFsE.d.ts +304 -0
- package/dist/{multi-shot-optimization-Bvtz294B.d.ts → summary-report-Ce1r4EYo.d.ts} +382 -2
- package/dist/traces.d.ts +101 -181
- package/dist/traces.js +19 -8
- package/dist/wire/index.js +3 -3
- package/docs/auto-research-loop-end-to-end.md +186 -0
- package/docs/research-report-methodology.md +19 -4
- package/docs/three-package-architecture.md +180 -0
- package/docs/wire-protocol.md +1 -1
- package/package.json +7 -2
- package/dist/chunk-3IX6QTB7.js.map +0 -1
- package/dist/chunk-KRR4VMH7.js +0 -423
- package/dist/chunk-KRR4VMH7.js.map +0 -1
- package/dist/chunk-WOK2RTWG.js.map +0 -1
- package/dist/chunk-YUFXO3TU.js.map +0 -1
- package/dist/reporting-Da2ihlcM.d.ts +0 -672
- /package/dist/{chunk-WOPGKVN4.js.map → chunk-6KQG5HAH.js.map} +0 -0
- /package/dist/{chunk-3GN6U53I.js.map → chunk-KAO3Q65R.js.map} +0 -0
- /package/dist/{chunk-ARZ6BEV6.js.map → chunk-V5QSWN7L.js.map} +0 -0
- /package/dist/{chunk-HRZELXCR.js.map → chunk-VQQSPGSM.js.map} +0 -0
|
@@ -0,0 +1,2026 @@
|
|
|
1
|
+
import {
|
|
2
|
+
runEvalCampaign
|
|
3
|
+
} from "./chunk-EXGR4XEM.js";
|
|
4
|
+
import {
|
|
5
|
+
evaluateInterimReleaseConfidence,
|
|
6
|
+
rubricPredictiveValidity
|
|
7
|
+
} from "./chunk-AXHNWLIX.js";
|
|
8
|
+
import {
|
|
9
|
+
benjaminiHochberg,
|
|
10
|
+
wilcoxonSignedRank
|
|
11
|
+
} from "./chunk-IOXMGMHQ.js";
|
|
12
|
+
|
|
13
|
+
// src/rl/run-record-adapters.ts
|
|
14
|
+
/**
 * Convert a single optimizer trial into a run record.
 *
 * @param {object} trial - Trial result. Reads `variantId`, `scenarioId`, `rep`, `score`,
 *   `cost`, `durationMs`, `metrics`, `ok` and `error`.
 * @param {object} ctx - Shared context. Reads `experimentId`, `model`, `commitSha`,
 *   `promptHash`, `configHash` and the optional `splitTag` / `defaultCostUsd`.
 *   Each hash may be a plain value or a function of the trial.
 * @param {object} [opts] - Optional `runId` override and `experimentIdPerTrial(trial)` callback.
 * @returns {object} Run record; `tokenUsage` is always `{ input: 0, output: 0 }`.
 */
function trialToRunRecord(trial, ctx, opts = {}) {
  const splitTag = ctx.splitTag ?? "search";

  // Hashes are either computed per trial (function) or shared by the whole batch (value).
  let promptHash = ctx.promptHash;
  if (typeof promptHash === "function") {
    promptHash = ctx.promptHash(trial);
  }
  let configHash = ctx.configHash;
  if (typeof configHash === "function") {
    configHash = ctx.configHash(trial);
  }

  const runId = opts.runId ?? defaultRunId(ctx, trial);
  const experimentId = opts.experimentIdPerTrial?.(trial) ?? ctx.experimentId;

  const raw = { ...(trial.metrics ?? {}) };
  let costUsd;
  if (Number.isFinite(trial.cost)) {
    costUsd = trial.cost;
  } else {
    // No usable cost on the trial: fall back to the context default and flag it in `raw`.
    costUsd = ctx.defaultCostUsd ?? 0;
    raw.cost_unknown = 1;
  }
  if (typeof trial.durationMs === "number") {
    raw.duration_ms = trial.durationMs;
  }
  raw.rep = trial.rep;

  // A non-finite score is recorded as 0 under whichever split this record belongs to.
  const score = Number.isFinite(trial.score) ? trial.score : 0;
  const scoreKey = splitTag === "holdout" ? "holdoutScore" : "searchScore";
  const outcome = { raw, [scoreKey]: score };

  let failureMode;
  if (!trial.ok) {
    failureMode = trial.error ? "optimizer_trial_error" : "optimizer_trial_failed";
  }

  return {
    runId,
    experimentId,
    candidateId: trial.variantId,
    seed: trial.rep,
    model: ctx.model,
    promptHash,
    configHash,
    commitSha: ctx.commitSha,
    wallMs: trial.durationMs ?? 0,
    costUsd,
    tokenUsage: { input: 0, output: 0 },
    outcome,
    failureMode,
    splitTag,
    scenarioId: trial.scenarioId,
  };
}
|
|
48
|
+
/**
 * Convert a list of trials to run records, all sharing the same context.
 *
 * @param {object[]} trials - Trials to convert.
 * @param {object} ctx - Context forwarded unchanged to `trialToRunRecord`.
 * @returns {object[]} One run record per trial, in input order.
 */
function trialsToRunRecords(trials, ctx) {
  // Wrap the call so only (trial, ctx) is forwarded; no per-trial options are passed.
  const toRecord = (trial) => trialToRunRecord(trial, ctx);
  return trials.map(toRecord);
}
|
|
51
|
+
/**
 * Convert a multi-layer verification report into a run record.
 *
 * Counters, duration and the blended score are copied into `outcome.raw`, together with
 * per-layer entries: `layer.<name>` (numeric score), `layer_<name>_pass` (1/0) and
 * `layer.<name>.<key>` for every finite numeric diagnostic.
 *
 * @param {object} report - Reads `startedAt`, `passCount`, `failCount`, `errorCount`,
 *   `skippedCount`, `durationMs`, `blendedScore` and `layers`.
 * @param {object} ctx - Reads `candidateId`, `experimentId`, `model`, `commitSha`,
 *   `scenarioId`, `promptHash`, `configHash`, optional `splitTag` / `defaultCostUsd`.
 * @param {object} [opts] - Optional `runId` override.
 * @returns {object} Run record with `seed` fixed at 0.
 */
function verificationReportToRunRecord(report, ctx, opts = {}) {
  const splitTag = ctx.splitTag ?? "search";
  const runId = opts.runId ?? `run-${ctx.candidateId}-${ctx.experimentId}-${report.startedAt}`;

  // Function-valued hashes are not invoked here; a fixed 64-character placeholder is used instead.
  const promptHash = typeof ctx.promptHash === "function" ? "p".repeat(64) : ctx.promptHash;
  const configHash = typeof ctx.configHash === "function" ? "c".repeat(64) : ctx.configHash;

  const raw = {
    pass_count: report.passCount,
    fail_count: report.failCount,
    error_count: report.errorCount,
    skipped_count: report.skippedCount,
    duration_ms: report.durationMs,
    blended_score: report.blendedScore,
  };

  for (const entry of report.layers) {
    const layerName = entry.layer;
    if (typeof entry.score === "number") {
      raw[`layer.${layerName}`] = entry.score;
    }
    raw[`layer_${layerName}_pass`] = entry.status === "pass" ? 1 : 0;
    if (!entry.diagnostics) continue;
    for (const [diagKey, diagValue] of Object.entries(entry.diagnostics)) {
      if (Number.isFinite(diagValue)) {
        raw[`layer.${layerName}.${diagKey}`] = diagValue;
      }
    }
  }

  // The first layer (in report order) that failed or errored determines the failure mode.
  const firstFail = report.layers.find(({ status }) => status === "fail" || status === "error");

  const scoreKey = splitTag === "holdout" ? "holdoutScore" : "searchScore";
  const outcome = { raw, [scoreKey]: report.blendedScore };

  return {
    runId,
    experimentId: ctx.experimentId,
    candidateId: ctx.candidateId,
    seed: 0,
    model: ctx.model,
    promptHash,
    configHash,
    commitSha: ctx.commitSha,
    wallMs: report.durationMs,
    costUsd: ctx.defaultCostUsd ?? 0,
    tokenUsage: { input: 0, output: 0 },
    outcome,
    failureMode: firstFail ? failureModeFromLayer(firstFail) : void 0,
    splitTag,
    scenarioId: ctx.scenarioId,
  };
}
|
|
95
|
+
/**
 * Convert a per-variant aggregate into a single summary run record.
 *
 * @param {object} agg - Reads `variantId`, `metrics`, `okRate`, `meanDurationMs`,
 *   `scenarios`, `meanScore` and `meanCost`.
 * @param {object} ctx - Reads `experimentId`, `model`, `commitSha`, `promptHash`,
 *   `configHash` and the optional `splitTag`.
 * @param {object} [opts] - Optional `runId` override.
 * @returns {object} Run record with `seed` fixed at 0 and no `scenarioId` / `failureMode`.
 */
function variantAggregateToRunRecord(agg, ctx, opts = {}) {
  const splitTag = ctx.splitTag ?? "search";
  const runId = opts.runId ?? `agg-${agg.variantId}-${ctx.experimentId}`;

  // Function-valued hashes are not invoked here; a fixed 64-character placeholder is used instead.
  const promptHash = typeof ctx.promptHash === "function" ? "p".repeat(64) : ctx.promptHash;
  const configHash = typeof ctx.configHash === "function" ? "c".repeat(64) : ctx.configHash;

  // Aggregate-level fields are written after the metrics, so they win on key collisions.
  const raw = {
    ...agg.metrics,
    ok_rate: agg.okRate,
    duration_ms: agg.meanDurationMs,
    n_scenarios: agg.scenarios.length,
  };

  const scoreKey = splitTag === "holdout" ? "holdoutScore" : "searchScore";
  const outcome = { raw, [scoreKey]: agg.meanScore };

  return {
    runId,
    experimentId: ctx.experimentId,
    candidateId: agg.variantId,
    seed: 0,
    model: ctx.model,
    promptHash,
    configHash,
    commitSha: ctx.commitSha,
    wallMs: agg.meanDurationMs,
    costUsd: agg.meanCost,
    tokenUsage: { input: 0, output: 0 },
    outcome,
    splitTag,
  };
}
|
|
125
|
+
/**
 * Build the default run id for a trial from the experiment, variant, scenario and repetition.
 *
 * @param {object} ctx - Reads `experimentId`.
 * @param {object} t - Trial; reads `variantId`, `scenarioId` and `rep`.
 * @returns {string} `run-<experimentId>-<variantId>-<scenarioId>-<rep>`.
 */
function defaultRunId(ctx, t) {
  const { experimentId } = ctx;
  const { variantId, scenarioId, rep } = t;
  return `run-${experimentId}-${variantId}-${scenarioId}-${rep}`;
}
|
|
128
|
+
/**
 * Derive a failure-mode label from a verification layer.
 *
 * @param {object} layer - Reads `layer` (the layer name) and `status`.
 * @returns {string} `layer_<name>_<status>`, e.g. `layer_build_error` or `layer_test_timeout`.
 */
function failureModeFromLayer(layer) {
  // "error", "fail", "timeout" and any other status all map to the same pattern.
  const { layer: layerName, status } = layer;
  return `layer_${layerName}_${status}`;
}
|
|
134
|
+
|
|
135
|
+
// src/rl/verifiable-reward.ts
|
|
136
|
+
// Layer names treated as deterministic when the caller does not pass `deterministicLayers`.
var DEFAULT_DETERMINISTIC_LAYERS = /* @__PURE__ */ new Set([
  "install", "typecheck", "build", "lint",
  "test", "compile", "schema", "sandbox",
  "unit_tests", "integration_tests",
]);
|
|
148
|
+
/**
 * Default mapping from a layer name to a reward source label.
 *
 * Matching is by case-insensitive substring and the first matching rule wins, so a name
 * containing "test" is always classified as "test", even if it also contains "build".
 *
 * @param {string} name - Layer name.
 * @returns {string} "test", "compile", "schema", "sandbox", "judge" or "composite".
 */
var DEFAULT_SOURCE_FOR = (name) => {
  const lowered = name.toLowerCase();
  const mentions = (...needles) => needles.some((needle) => lowered.includes(needle));

  if (mentions("test")) return "test";
  if (mentions("compile", "build", "typecheck", "lint")) return "compile";
  if (mentions("schema")) return "schema";
  if (mentions("sandbox")) return "sandbox";
  if (mentions("judge", "semantic")) return "judge";
  return "composite";
};
|
|
157
|
+
/**
 * Derive a single reward from a verification report.
 *
 * Resolution order:
 *  1. exactly one scored deterministic layer: that layer's clamped score;
 *  2. several scored deterministic layers: their weighted mean (`detail.weight`, default 1);
 *  3. otherwise, when `fallbackToJudge` is not disabled: the first scored layer whose source
 *     is "judge", or failing that the first scored layer of any kind, reported as a
 *     probabilistic judge reward.
 *
 * @param {object} report - Reads `layers` (each with `layer`, `score`, optional `detail` / `diagnostics`).
 * @param {object} [opts] - `deterministicLayers`, `sourceFor`, `fallbackToJudge` (default true),
 *   `judgeConfidenceFloor` (default 0.7; used when the layer carries no numeric `detail.confidence`).
 * @returns {object|null} Reward descriptor, or null when no usable layer exists.
 */
function extractVerifiableReward(report, opts = {}) {
  const deterministicNames = new Set(opts.deterministicLayers ?? [...DEFAULT_DETERMINISTIC_LAYERS]);
  const sourceFor = opts.sourceFor ?? DEFAULT_SOURCE_FOR;
  const allowJudge = opts.fallbackToJudge ?? true;
  const fallbackConfidence = opts.judgeConfidenceFloor ?? 0.7;
  const hasFiniteScore = (entry) => Number.isFinite(entry.score);

  const deterministic = report.layers.filter(
    (entry) => deterministicNames.has(entry.layer) && hasFiniteScore(entry)
  );

  if (deterministic.length === 1) {
    const [only] = deterministic;
    return {
      value: clamp01(only.score),
      source: sourceFor(only.layer),
      determinism: "deterministic",
      confidence: 1,
      origin: only.layer,
      breakdown: layerBreakdown(only),
    };
  }

  if (deterministic.length > 1) {
    const breakdown = {};
    let weightedSum = 0;
    let totalWeight = 0;
    for (const entry of deterministic) {
      const weight = entry.detail?.weight ?? 1;
      weightedSum += weight * (entry.score ?? 0);
      totalWeight += weight;
      breakdown[entry.layer] = entry.score;
    }
    return {
      value: totalWeight === 0 ? 0 : clamp01(weightedSum / totalWeight),
      source: "composite",
      determinism: "deterministic",
      confidence: 1,
      origin: deterministic.map((entry) => entry.layer).join("+"),
      breakdown,
    };
  }

  if (!allowJudge) return null;

  // Single pass: remember the first scored layer, stop at the first judge-sourced one.
  let judge;
  let firstScored;
  for (const entry of report.layers) {
    if (!hasFiniteScore(entry)) continue;
    if (firstScored === undefined) firstScored = entry;
    if (sourceFor(entry.layer) === "judge") {
      judge = entry;
      break;
    }
  }
  judge = judge ?? firstScored;
  if (!judge) return null;

  const reportedConfidence = judge.detail?.confidence;
  return {
    value: clamp01(judge.score),
    source: "judge",
    determinism: "probabilistic",
    confidence: typeof reportedConfidence === "number" ? reportedConfidence : fallbackConfidence,
    origin: judge.layer,
    breakdown: layerBreakdown(judge),
  };
}
|
|
210
|
+
/**
 * Derive one reward per run record from the layer scores stored in `outcome.raw`.
 *
 * Only keys of the exact shape `layer.<name>` (no further dot) with a finite numeric value
 * count as layer scores. Deterministic layers are averaged unweighted; without any, the
 * run's holdout/search score is used as a probabilistic judge reward, unless
 * `fallbackToJudge` is disabled.
 *
 * @param {object[]} runs - Run records; reads `runId` and `outcome`.
 * @param {object} [opts] - `deterministicLayers`, `sourceFor`, `fallbackToJudge` (default true),
 *   `judgeConfidenceFloor` (default 0.7).
 * @returns {{runId: string, reward: object|null}[]} One entry per run, in input order.
 */
function extractVerifiableRewardsFromRecords(runs, opts = {}) {
  const sourceFor = opts.sourceFor ?? DEFAULT_SOURCE_FOR;
  const deterministicNames = new Set(opts.deterministicLayers ?? [...DEFAULT_DETERMINISTIC_LAYERS]);
  const allowJudge = opts.fallbackToJudge ?? true;
  const judgeConfidence = opts.judgeConfidenceFloor ?? 0.7;
  const LAYER_PREFIX = "layer.";

  const rewardFor = (run) => {
    // Collect deterministic layer scores; nested keys such as `layer.build.warnings` are diagnostics.
    const det = [];
    for (const [key, value] of Object.entries(run.outcome.raw)) {
      const isLayerScoreKey = key.startsWith(LAYER_PREFIX) && !key.includes(".", LAYER_PREFIX.length);
      if (!isLayerScoreKey || !Number.isFinite(value)) continue;
      const name = key.slice(LAYER_PREFIX.length);
      if (deterministicNames.has(name)) det.push({ name, score: value });
    }

    if (det.length === 1) {
      const [only] = det;
      return {
        value: clamp01(only.score),
        source: sourceFor(only.name),
        determinism: "deterministic",
        confidence: 1,
        origin: only.name,
      };
    }

    if (det.length > 1) {
      const total = det.reduce((sum, entry) => sum + entry.score, 0);
      const breakdown = Object.fromEntries(det.map((entry) => [entry.name, entry.score]));
      return {
        value: clamp01(total / det.length),
        source: "composite",
        determinism: "deterministic",
        confidence: 1,
        origin: det.map((entry) => entry.name).join("+"),
        breakdown,
      };
    }

    if (!allowJudge) return null;

    const primary = run.outcome.holdoutScore ?? run.outcome.searchScore;
    if (!Number.isFinite(primary)) return null;
    return {
      value: clamp01(primary),
      source: "judge",
      determinism: "probabilistic",
      confidence: judgeConfidence,
      origin: "run.outcome.score",
    };
  };

  return runs.map((run) => ({ runId: run.runId, reward: rewardFor(run) }));
}
|
|
268
|
+
/**
 * Keep only the runs that receive a deterministic reward.
 *
 * @param {object[]} runs - Run records.
 * @param {object} [opts] - Forwarded to `extractVerifiableRewardsFromRecords`;
 *   `fallbackToJudge` is always forced to false.
 * @returns {{run: object, reward: object}[]} Surviving runs paired with their reward, in input order.
 */
function filterDeterministicallyRewarded(runs, opts = {}) {
  const rewarded = extractVerifiableRewardsFromRecords(runs, { ...opts, fallbackToJudge: false });
  const kept = [];
  for (const [index, run] of runs.entries()) {
    const { reward } = rewarded[index];
    if (reward && reward.determinism === "deterministic") {
      kept.push({ run, reward });
    }
  }
  return kept;
}
|
|
279
|
+
/**
 * Clamp a value into [0, 1].
 *
 * @param {*} x - Candidate value.
 * @returns {number} 0 for anything that is not a finite number, otherwise `x` limited to [0, 1].
 */
function clamp01(x) {
  // `<=` (not `<`) so that -0 is normalised to +0, matching Math.max(0, -0).
  if (!Number.isFinite(x) || x <= 0) return 0;
  return x >= 1 ? 1 : x;
}
|
|
283
|
+
/**
 * Extract the finite numeric diagnostics of a layer.
 *
 * @param {object} l - Layer; reads the optional `diagnostics` object.
 * @returns {object} A new object holding only the diagnostics whose value is a finite number.
 */
function layerBreakdown(l) {
  const numeric = {};
  const diagnostics = l.diagnostics;
  if (!diagnostics) return numeric;
  for (const key of Object.keys(diagnostics)) {
    const value = diagnostics[key];
    if (Number.isFinite(value)) numeric[key] = value;
  }
  return numeric;
}
|
|
292
|
+
|
|
293
|
+
// src/rl/preferences.ts
|
|
294
|
+
// Split used by preference extraction when the caller does not name one.
var SPLIT_TAG_DEFAULT = "holdout";

/**
 * Default reward accessor for a run record.
 *
 * @param {object} run - Reads `outcome.holdoutScore`, falling back to `outcome.searchScore`
 *   only when the holdout score is null or undefined.
 * @returns {number|null} The selected score when it is a finite number, otherwise null.
 */
var DEFAULT_REWARD = (run) => {
  const candidate = run.outcome.holdoutScore ?? run.outcome.searchScore;
  if (Number.isFinite(candidate)) {
    return candidate;
  }
  return null;
};
|
|
299
|
+
/**
 * Build chosen/rejected preference pairs from scored run records.
 *
 * Only runs whose `splitTag` matches are considered, and runs for which `rewardOf`
 * returns null are dropped. Three pairing strategies exist:
 *  - "paired-by-scenario-and-seed" (default): within each (scenario, seed) cell, every
 *    pair of runs from different candidates is compared;
 *  - "paired-by-scenario": within each scenario, per-candidate mean scores are compared
 *    pairwise (the pair references the first run seen for each candidate);
 *  - any other value: within each scenario, only the lowest- and highest-scoring runs
 *    are compared, and the cell counts as a singleton if both belong to one candidate.
 *
 * @param {object[]} runs - Run records.
 * @param {object} [opts] - `strategy`, `minMargin` (default 0.05), `splitTag`
 *   (default `SPLIT_TAG_DEFAULT`), `rewardOf` (default `DEFAULT_REWARD`).
 * @returns {{pairs: object[], cellsInspected: number, pairsBelowMargin: number,
 *   cellsSingleton: number, strategy: string}} Admitted pairs plus bookkeeping counters.
 */
function extractPreferences(runs, opts = {}) {
  const strategy = opts.strategy ?? "paired-by-scenario-and-seed";
  const minMargin = opts.minMargin ?? 0.05;
  const splitTag = opts.splitTag ?? SPLIT_TAG_DEFAULT;
  const rewardOf = opts.rewardOf ?? DEFAULT_REWARD;

  const scoredEntries = [];
  for (const run of runs.filter((r) => r.splitTag === splitTag)) {
    const score = rewardOf(run);
    if (score === null) continue;
    scoredEntries.push({ run, score });
  }

  const pairs = [];
  let pairsBelowMargin = 0;
  let cellsSingleton = 0;
  let cellsInspected = 0;

  // Shared admit/reject bookkeeping for every strategy.
  const consider = (a, b, scenarioId) => {
    const result = makePair(a, b, scenarioId, minMargin);
    if (result.kind === "admit") pairs.push(result.pair);
    else pairsBelowMargin++;
  };

  if (strategy === "paired-by-scenario-and-seed") {
    // The scenario id is stored next to the members instead of being parsed back out of
    // the composite key: splitting the key on "::" would truncate ids that contain "::".
    const groups = /* @__PURE__ */ new Map();
    for (const entry of scoredEntries) {
      const scenarioId = scenarioOf(entry.run);
      const key = `${scenarioId}::${entry.run.seed}`;
      let group = groups.get(key);
      if (!group) {
        group = { scenarioId, members: [] };
        groups.set(key, group);
      }
      group.members.push(entry);
    }
    for (const { scenarioId, members } of groups.values()) {
      cellsInspected++;
      if (members.length < 2) {
        cellsSingleton++;
        continue;
      }
      for (let i = 0; i < members.length; i++) {
        for (let j = i + 1; j < members.length; j++) {
          const a = members[i];
          const b = members[j];
          // Two runs of the same candidate carry no preference signal.
          if (a.run.candidateId === b.run.candidateId) continue;
          consider(a, b, scenarioId);
        }
      }
    }
  } else if (strategy === "paired-by-scenario") {
    const byScenarioVariant = /* @__PURE__ */ new Map();
    for (const entry of scoredEntries) {
      const scenarioId = scenarioOf(entry.run);
      let perScenario = byScenarioVariant.get(scenarioId);
      if (!perScenario) {
        perScenario = /* @__PURE__ */ new Map();
        byScenarioVariant.set(scenarioId, perScenario);
      }
      const current = perScenario.get(entry.run.candidateId);
      if (current) {
        current.sum += entry.score;
        current.n++;
      } else {
        perScenario.set(entry.run.candidateId, { run: entry.run, sum: entry.score, n: 1 });
      }
    }
    for (const [scenarioId, perVariant] of byScenarioVariant.entries()) {
      cellsInspected++;
      const means = [...perVariant.entries()].map(([variantId, agg]) => ({
        run: agg.run,
        score: agg.sum / agg.n,
        variantId,
      }));
      if (means.length < 2) {
        cellsSingleton++;
        continue;
      }
      for (let i = 0; i < means.length; i++) {
        for (let j = i + 1; j < means.length; j++) {
          consider(means[i], means[j], scenarioId);
        }
      }
    }
  } else {
    const byScenario = /* @__PURE__ */ new Map();
    for (const entry of scoredEntries) {
      const scenarioId = scenarioOf(entry.run);
      const members = byScenario.get(scenarioId) ?? [];
      members.push(entry);
      byScenario.set(scenarioId, members);
    }
    for (const [scenarioId, members] of byScenario.entries()) {
      cellsInspected++;
      if (members.length < 2) {
        cellsSingleton++;
        continue;
      }
      const sorted = [...members].sort((a, b) => a.score - b.score);
      const top = sorted[sorted.length - 1];
      const bottom = sorted[0];
      if (top.run.candidateId === bottom.run.candidateId) {
        cellsSingleton++;
        continue;
      }
      consider(bottom, top, scenarioId);
    }
  }

  return { pairs, cellsInspected, pairsBelowMargin, cellsSingleton, strategy };
}
|
|
403
|
+
/**
 * Project preference triples into `{ prompt, chosen, rejected }` rows.
 *
 * `chosen` and `rejected` hold the prompt hashes from `meta`, not model outputs; the caller
 * is expected to substitute the actual outputs (e.g. via a runId map).
 *
 * @param {object[]} triples - Preference pairs; reads `meta.chosenPromptHash` and `meta.rejectedPromptHash`.
 * @param {Function} promptOf - Maps the chosen prompt hash to the prompt value.
 * @returns {object[]} One row per triple, in input order.
 */
function toTRLFormat(triples, promptOf) {
  return triples.map((triple) => {
    const chosenHash = triple.meta.chosenPromptHash;
    const prompt = promptOf(chosenHash);
    return {
      prompt,
      chosen: chosenHash,
      rejected: triple.meta.rejectedPromptHash,
    };
  });
}
|
|
411
|
+
/**
 * Project preference triples into compact `{ scenarioId, chosenRunId, rejectedRunId, margin }` rows.
 *
 * @param {object[]} triples - Preference pairs; reads `scenarioId`, `chosenRunId`,
 *   `rejectedRunId` and `marginScore`.
 * @returns {object[]} One row per triple, in input order.
 */
function toAnthropicFormat(triples) {
  const toRow = ({ scenarioId, chosenRunId, rejectedRunId, marginScore }) => ({
    scenarioId,
    chosenRunId,
    rejectedRunId,
    margin: marginScore,
  });
  return triples.map((triple) => toRow(triple));
}
|
|
419
|
+
/**
 * Compare two scored entries and, if they differ by at least `minMargin`, build a preference pair.
 *
 * @param {{run: object, score: number}} a - First entry.
 * @param {{run: object, score: number}} b - Second entry.
 * @param {string} scenarioId - Scenario recorded on the pair.
 * @param {number} minMargin - Minimum absolute score gap required to admit the pair.
 * @returns {{kind: "reject"}|{kind: "admit", pair: object}} Rejection marker or the admitted pair.
 */
function makePair(a, b, scenarioId, minMargin) {
  const gap = Math.abs(a.score - b.score);
  if (gap < minMargin) {
    return { kind: "reject" };
  }

  // Strictly higher score wins; on an exact tie `b` is treated as the chosen side.
  const aWins = a.score > b.score;
  const chosen = aWins ? a : b;
  const rejected = aWins ? b : a;
  const winner = chosen.run;
  const loser = rejected.run;

  const pair = {
    scenarioId,
    chosenRunId: winner.runId,
    rejectedRunId: loser.runId,
    chosenVariantId: winner.candidateId,
    rejectedVariantId: loser.candidateId,
    marginScore: chosen.score - rejected.score,
    scores: { chosen: chosen.score, rejected: rejected.score },
    // The seed is only reported when both sides share it.
    seed: winner.seed === loser.seed ? winner.seed : void 0,
    meta: {
      chosenPromptHash: winner.promptHash,
      rejectedPromptHash: loser.promptHash,
      chosenConfigHash: winner.configHash,
      rejectedConfigHash: loser.configHash,
      chosenModel: winner.model,
      rejectedModel: loser.model,
    },
  };
  return { kind: "admit", pair };
}
|
|
445
|
+
/**
 * Resolve the scenario identifier of a run record.
 *
 * @param {object} run - Reads `scenarioId`, `outcome.raw.scenario_id` and `experimentId`.
 * @returns {*} The non-empty string `scenarioId`; else `outcome.raw.scenario_id` (a finite
 *   number is stringified, a string is returned as-is); else `experimentId`.
 */
function scenarioOf(run) {
  const { scenarioId } = run;
  if (typeof scenarioId === "string" && scenarioId !== "") {
    return scenarioId;
  }

  const rawId = run.outcome.raw.scenario_id;
  switch (typeof rawId) {
    case "string":
      return rawId;
    case "number":
      if (Number.isFinite(rawId)) return String(rawId);
      break;
    default:
      break;
  }
  return run.experimentId;
}
|
|
452
|
+
|
|
453
|
+
// src/rl/off-policy.ts
|
|
454
|
+
/**
 * Inverse-probability-weighted (IPS) off-policy value estimate.
 *
 * Each trajectory contributes `w * r`, where `w = min(weightCap, targetProb / behaviorProb)`
 * and `r` is the reward clamped to `rewardClip`. The estimate is the plain mean of the
 * contributions; the standard error uses their sample variance (n - 1 denominator).
 *
 * @param {object[]} trajectories - Reads `behaviorProb`, `targetProb`, `reward` and `runId`.
 * @param {object} [opts] - `weightCap` (default Infinity), `rewardClip` (default `{ low: 0, high: 1 }`).
 * @returns {{value: number, standardError: number, effectiveSampleSize: number, n: number,
 *   maxImportanceWeight: number}} Estimate; all zeros for an empty input.
 * @throws {Error} When a trajectory's `behaviorProb` is not a number greater than 0.
 */
function inverseProbabilityWeighting(trajectories, opts = {}) {
  const cap = opts.weightCap ?? Infinity;
  const clip = opts.rewardClip ?? { low: 0, high: 1 };
  if (trajectories.length === 0) {
    return zeroEstimate();
  }

  const weights = [];
  const weightedRewards = [];
  let maxW = 0;
  for (const t of trajectories) {
    // `!(p > 0)` instead of `p <= 0`: NaN and undefined fail every comparison, so the
    // negated form rejects them too instead of letting a NaN weight poison the estimate.
    if (!(t.behaviorProb > 0)) {
      throw new Error(`inverseProbabilityWeighting: behaviorProb must be > 0 (runId=${t.runId})`);
    }
    const w = Math.min(cap, t.targetProb / t.behaviorProb);
    const r = clamp(t.reward, clip.low, clip.high);
    weights.push(w);
    weightedRewards.push(w * r);
    if (w > maxW) maxW = w;
  }

  const n = weights.length;
  const value = weightedRewards.reduce((s, x) => s + x, 0) / n;
  const variance = weightedRewards.reduce((s, x) => s + (x - value) ** 2, 0) / Math.max(1, n - 1);
  const sumW = weights.reduce((s, w) => s + w, 0);
  const sumW2 = weights.reduce((s, w) => s + w * w, 0);
  // Effective sample size (sum w)^2 / sum w^2; reported as 0 when every weight is 0.
  const effN = sumW === 0 ? 0 : (sumW * sumW) / sumW2;

  return {
    value,
    standardError: Math.sqrt(variance / n),
    effectiveSampleSize: effN,
    n,
    maxImportanceWeight: maxW,
  };
}
|
|
487
|
+
/**
 * Self-normalized importance-weighted (SNIPS) off-policy value estimate.
 *
 * Weights are `min(weightCap, targetProb / behaviorProb)` and rewards are clamped to
 * `rewardClip`. The estimate is `sum(w * r) / sum(w)`; the variance estimate is
 * `sum(w^2 * (r - value)^2) / (sum w)^2`.
 *
 * @param {object[]} trajectories - Reads `behaviorProb`, `targetProb`, `reward` and `runId`.
 * @param {object} [opts] - `weightCap` (default Infinity), `rewardClip` (default `{ low: 0, high: 1 }`).
 * @returns {{value: number, standardError: number, effectiveSampleSize: number, n: number,
 *   maxImportanceWeight: number}} Estimate; all zeros for an empty input.
 * @throws {Error} When a trajectory's `behaviorProb` is not a number greater than 0.
 */
function selfNormalizedImportanceWeighting(trajectories, opts = {}) {
  const cap = opts.weightCap ?? Infinity;
  const clip = opts.rewardClip ?? { low: 0, high: 1 };
  if (trajectories.length === 0) return zeroEstimate();

  const weights = [];
  const rewards = [];
  let maxW = 0;
  for (const t of trajectories) {
    // `!(p > 0)` instead of `p <= 0`: NaN and undefined fail every comparison, so the
    // negated form rejects them too instead of letting a NaN weight poison the estimate.
    if (!(t.behaviorProb > 0)) {
      throw new Error(`selfNormalizedImportanceWeighting: behaviorProb must be > 0 (runId=${t.runId})`);
    }
    const w = Math.min(cap, t.targetProb / t.behaviorProb);
    weights.push(w);
    rewards.push(clamp(t.reward, clip.low, clip.high));
    if (w > maxW) maxW = w;
  }

  const sumW = weights.reduce((s, w) => s + w, 0);
  const sumWR = weights.reduce((s, w, i) => s + w * rewards[i], 0);
  const value = sumW === 0 ? 0 : sumWR / sumW;
  const sumW2 = weights.reduce((s, w) => s + w * w, 0);
  const effN = sumW === 0 ? 0 : (sumW * sumW) / sumW2;

  // Normalise by (sum w)^2 itself. Flooring the denominator at 1 would understate the
  // standard error whenever the weights sum to less than 1 and would make the result
  // depend on the overall scale of the weights, which a self-normalized estimator must not.
  const sumSquaredPhi = weights.reduce((s, w, i) => s + (w * (rewards[i] - value)) ** 2, 0);
  const variance = sumW === 0 ? 0 : sumSquaredPhi / (sumW * sumW);

  return {
    value,
    standardError: Math.sqrt(variance),
    effectiveSampleSize: effN,
    n: trajectories.length,
    maxImportanceWeight: maxW,
  };
}
|
|
518
|
+
/**
 * Doubly-robust off-policy value estimate.
 *
 * With `w = min(weightCap, targetProb / behaviorProb)`, clamped reward `r` and clamped
 * model prediction `q` (from `qHat`), each trajectory contributes `q + w * (r - q)`.
 * Trajectories without a finite numeric `qHat` contribute `w * r` instead.
 * The estimate is the mean contribution; the standard error uses the sample variance.
 *
 * @param {object[]} trajectories - Reads `behaviorProb`, `targetProb`, `reward`, `qHat` and `runId`.
 * @param {object} [opts] - `weightCap` (default Infinity), `rewardClip` (default `{ low: 0, high: 1 }`,
 *   applied to both rewards and `qHat`).
 * @returns {{value: number, standardError: number, effectiveSampleSize: number, n: number,
 *   maxImportanceWeight: number}} Estimate; all zeros for an empty input.
 * @throws {Error} When a trajectory's `behaviorProb` is not a number greater than 0.
 */
function doublyRobust(trajectories, opts = {}) {
  const cap = opts.weightCap ?? Infinity;
  const clip = opts.rewardClip ?? { low: 0, high: 1 };
  if (trajectories.length === 0) return zeroEstimate();

  const contributions = [];
  let maxW = 0;
  let sumW = 0;
  let sumW2 = 0;
  for (const t of trajectories) {
    // `!(p > 0)` instead of `p <= 0`: NaN and undefined fail every comparison, so the
    // negated form rejects them too instead of letting a NaN weight poison the estimate.
    if (!(t.behaviorProb > 0)) {
      throw new Error(`doublyRobust: behaviorProb must be > 0 (runId=${t.runId})`);
    }
    const w = Math.min(cap, t.targetProb / t.behaviorProb);
    const r = clamp(t.reward, clip.low, clip.high);
    const q = Number.isFinite(t.qHat) ? clamp(t.qHat, clip.low, clip.high) : null;
    contributions.push(q === null ? w * r : q + w * (r - q));
    if (w > maxW) maxW = w;
    sumW += w;
    sumW2 += w * w;
  }

  const n = contributions.length;
  const value = contributions.reduce((s, x) => s + x, 0) / n;
  const variance = contributions.reduce((s, x) => s + (x - value) ** 2, 0) / Math.max(1, n - 1);
  // Effective sample size (sum w)^2 / sum w^2; reported as 0 when every weight is 0.
  const effN = sumW === 0 ? 0 : (sumW * sumW) / sumW2;

  return {
    value,
    standardError: Math.sqrt(variance / n),
    effectiveSampleSize: effN,
    n,
    maxImportanceWeight: maxW,
  };
}
|
|
554
|
+
/**
 * Run all three off-policy estimators on the same trajectories with the same options.
 *
 * @param {object[]} trajectories - Logged trajectories.
 * @param {object} [opts] - Options passed unchanged to each estimator.
 * @returns {{ips: object, snips: object, dr: object}} The IPS, SNIPS and doubly-robust estimates.
 */
function offPolicyEstimateAll(trajectories, opts = {}) {
  const ips = inverseProbabilityWeighting(trajectories, opts);
  const snips = selfNormalizedImportanceWeighting(trajectories, opts);
  const dr = doublyRobust(trajectories, opts);
  return { ips, snips, dr };
}
|
|
561
|
+
/**
 * Estimate returned by the off-policy estimators for an empty trajectory list.
 *
 * @returns {object} A fresh object with every field set to 0.
 */
function zeroEstimate() {
  return {
    value: 0,
    standardError: 0,
    effectiveSampleSize: 0,
    n: 0,
    maxImportanceWeight: 0,
  };
}
|
|
564
|
+
/**
 * Clamp a value into [lo, hi].
 *
 * @param {*} x - Candidate value.
 * @param {number} lo - Lower bound; also returned when `x` is not a finite number.
 * @param {number} hi - Upper bound.
 * @returns {number} `lo` for non-finite input (including ±Infinity), otherwise `x` limited to [lo, hi].
 */
function clamp(x, lo, hi) {
  if (Number.isFinite(x)) {
    const cappedAbove = Math.min(hi, x);
    return Math.max(lo, cappedAbove);
  }
  return lo;
}
|
|
568
|
+
|
|
569
|
+
// src/rl/process-reward.ts
|
|
570
|
+
/**
 * Score the spans of one run, producing an ordered list of step rewards.
 *
 * Spans are processed in ascending `startedAt` order. For each span that passes the
 * optional `preFilter`, the scorers whose `appliesTo` includes the span kind are tried in
 * order (awaited one at a time) and the first truthy result is used. Spans with no result
 * are skipped and do not consume a `stepIndex`.
 *
 * @param {object} store - Provides `spans({ runId })`, awaited for the run's spans.
 * @param {string} runId - Run whose spans are scored.
 * @param {object} opts - `scorers` (each with `appliesTo` and `score(span)`) and optional `preFilter(span)`.
 * @returns {Promise<object[]>} Step rewards with consecutive `stepIndex` values starting at 0.
 */
async function extractStepRewards(store, runId, opts) {
  const spans = await store.spans({ runId });
  const chronological = [...spans].sort((left, right) => left.startedAt - right.startedAt);

  // First truthy score among the applicable scorers, or null when none produces one.
  const firstScore = async (span) => {
    for (const scorer of opts.scorers) {
      if (!scorer.appliesTo.includes(span.kind)) continue;
      const result = await scorer.score(span);
      if (result) return result;
    }
    return null;
  };

  const steps = [];
  for (const span of chronological) {
    if (opts.preFilter && !opts.preFilter(span)) continue;
    const scored = await firstScore(span);
    if (!scored) continue;
    steps.push({
      spanId: span.spanId,
      runId,
      stepIndex: steps.length,
      kind: span.kind,
      name: span.name,
      reward: scored.reward,
      determinism: scored.determinism,
      rationale: scored.rationale,
      weight: scored.weight,
    });
  }
  return steps;
}
|
|
601
|
+
/**
 * Summarise the step rewards of one run.
 *
 * @param {object[]} stepRewards - Step rewards; reads `runId` (from the first step only),
 *   `reward` and the optional `weight` (default 1).
 * @returns {object} `runId`, `totalSteps`, weighted `meanReward`, `sumWeightedReward`,
 *   `failureFraction` (share of steps with reward below 0.5), and the most negative
 *   step-to-step reward change (`worstStepDelta`) with the index of the step where it
 *   landed (`worstStepIndex`, null when no step decreased). An empty input yields a
 *   zeroed summary with `runId: ""`.
 */
function runwiseStepRewardSummary(stepRewards) {
  const totalSteps = stepRewards.length;
  if (totalSteps === 0) {
    return {
      runId: "",
      totalSteps: 0,
      meanReward: 0,
      sumWeightedReward: 0,
      failureFraction: 0,
      worstStepDelta: 0,
      worstStepIndex: null,
    };
  }

  let totalWeight = 0;
  let weightedSum = 0;
  let lowRewardSteps = 0;
  let steepestDrop = 0;
  let steepestDropAt = null;

  for (const [index, step] of stepRewards.entries()) {
    const weight = step.weight ?? 1;
    totalWeight += weight;
    weightedSum += weight * step.reward;
    if (step.reward < 0.5) lowRewardSteps++;

    // Change relative to the immediately preceding step; the first step has none.
    if (index === 0) continue;
    const delta = step.reward - stepRewards[index - 1].reward;
    if (delta < steepestDrop) {
      steepestDrop = delta;
      steepestDropAt = index;
    }
  }

  return {
    runId: stepRewards[0].runId,
    totalSteps,
    meanReward: totalWeight === 0 ? 0 : weightedSum / totalWeight,
    sumWeightedReward: weightedSum,
    failureFraction: lowRewardSteps / totalSteps,
    worstStepDelta: steepestDrop,
    worstStepIndex: steepestDropAt,
  };
}
|
|
647
|
+
/**
 * Build step-level preference triples from pairs of runs.
 *
 * For every unordered pair of runs, the first step index (within their shared length) where
 * the steps differ in `kind`/`name` or where their rewards differ by at least `minMargin`
 * is taken as the divergence point. A triple is emitted when that point lies at or after
 * `minPrefixLength` and the rewards at that point differ by at least `minMargin`.
 *
 * @param {Map<string, object[]>} stepRewardsByRun - Step rewards keyed by run id.
 * @param {object} [opts] - `minMargin` (default 0.2), `minPrefixLength` (default 1).
 * @returns {object[]} Triples naming the chosen/rejected span and run. `prefixRunId` is the
 *   chosen run and `prefixStepIndex` is the index just before the divergence.
 */
function prmTrainingPairs(stepRewardsByRun, opts = {}) {
  const minMargin = opts.minMargin ?? 0.2;
  const minPrefix = opts.minPrefixLength ?? 1;
  const runs = Array.from(stepRewardsByRun.entries(), ([runId, steps]) => ({ runId, steps }));

  // Index of the first diverging step among the first `length` steps, or -1.
  const firstDivergence = (stepsA, stepsB, length) => {
    for (let k = 0; k < length; k++) {
      const stepA = stepsA[k];
      const stepB = stepsB[k];
      const sameShape = stepA.kind === stepB.kind && stepA.name === stepB.name;
      if (!sameShape || Math.abs(stepA.reward - stepB.reward) >= minMargin) return k;
    }
    return -1;
  };

  const triples = [];
  runs.forEach((first, i) => {
    for (const second of runs.slice(i + 1)) {
      const sharedLength = Math.min(first.steps.length, second.steps.length);
      if (sharedLength < minPrefix + 1) continue;

      const at = firstDivergence(first.steps, second.steps, sharedLength);
      if (at < 0 || at < minPrefix) continue;

      const stepA = first.steps[at];
      const stepB = second.steps[at];
      // A purely structural divergence with a small reward gap yields no preference.
      if (Math.abs(stepA.reward - stepB.reward) < minMargin) continue;

      const firstWins = stepA.reward > stepB.reward;
      const winner = firstWins ? { runId: first.runId, step: stepA } : { runId: second.runId, step: stepB };
      const loser = firstWins ? { runId: second.runId, step: stepB } : { runId: first.runId, step: stepA };

      triples.push({
        prefixRunId: winner.runId,
        prefixStepIndex: at - 1,
        chosenSpanId: winner.step.spanId,
        chosenReward: winner.step.reward,
        rejectedSpanId: loser.step.spanId,
        rejectedReward: loser.step.reward,
        rejectedRunId: loser.runId,
        marginScore: winner.step.reward - loser.step.reward,
      });
    }
  });
  return triples;
}
|
|
693
|
+
|
|
694
|
+
// src/rl/contamination.ts
|
|
695
|
+
/**
 * Compare scores on original scenarios against scores on perturbed versions
 * of the same scenarios and flag a significant score drop.
 *
 * Scenarios are scored through `input.scoreFn`; pairs where either score is
 * below `scoreFloor` are excluded from the statistics. With fewer than four
 * valid pairs the probe returns a neutral result without running any test.
 *
 * @param {object} input
 * @param {Array} input.originals Original scenarios.
 * @param {Array} [input.perturbed] Pre-built perturbed scenarios (same length as `originals`).
 * @param {{apply: Function}} [input.perturbation] Used to build `perturbed` when it is not supplied.
 * @param {Function} input.scoreFn Scores one scenario (may be async).
 * @param {Function} input.scenarioId Maps a scenario to its id.
 * @param {{fdr?: number, minMedianDrop?: number, scoreFloor?: number}} [opts]
 *   Defaults: `fdr` 0.05, `minMedianDrop` 0.05, `scoreFloor` 0.
 * @returns {Promise<object>} `{ perScenario, pairedTest, medianDelta, meanDelta,
 *   contaminationSuspected, reason, n }`.
 * @throws {Error} When neither `perturbed` nor `perturbation` is given, or when
 *   the perturbed list length differs from the originals.
 */
async function runContaminationProbe(input, opts = {}) {
  const fdr = opts.fdr ?? 0.05;
  const minMedianDrop = opts.minMedianDrop ?? 0.05;
  const floor = opts.scoreFloor ?? 0;
  if (!input.perturbed && !input.perturbation) {
    throw new Error("runContaminationProbe: must supply either `perturbed` or `perturbation`.");
  }
  const perturbed = input.perturbed ?? await Promise.all(
    input.originals.map((s) => input.perturbation.apply(s))
  );
  if (perturbed.length !== input.originals.length) {
    throw new Error(`runContaminationProbe: perturbed length ${perturbed.length} \u2260 originals ${input.originals.length}`);
  }
  const origScores = await Promise.all(input.originals.map((s) => input.scoreFn(s)));
  const pertScores = await Promise.all(perturbed.map((s) => input.scoreFn(s)));
  const perScenario = input.originals.map((s, i) => ({
    scenarioId: input.scenarioId(s),
    originalScore: origScores[i],
    perturbedScore: pertScores[i],
    delta: pertScores[i] - origScores[i],
    qValue: NaN
  }));
  // Remember row positions so q-values can be written back by index rather
  // than by scenarioId lookup (ids are not guaranteed to be unique).
  const validIndices = [];
  perScenario.forEach((p, i) => {
    if (p.originalScore >= floor && p.perturbedScore >= floor) validIndices.push(i);
  });
  const valid = validIndices.map((i) => perScenario[i]);
  if (valid.length < 4) {
    return {
      perScenario,
      pairedTest: { w: 0, p: 1 },
      medianDelta: 0,
      meanDelta: 0,
      contaminationSuspected: false,
      reason: `insufficient valid scenarios (n=${valid.length}, need \u2265 4)`,
      n: valid.length
    };
  }
  const origValid = valid.map((p) => p.originalScore);
  const pertValid = valid.map((p) => p.perturbedScore);
  const pairedTest = wilcoxonSignedRank(origValid, pertValid);
  const deltas = valid.map((p) => p.delta);
  const sortedDeltas = [...deltas].sort((a, b) => a - b);
  // True median: average the two middle values when the count is even.
  const mid = Math.floor(sortedDeltas.length / 2);
  const median = sortedDeltas.length % 2 === 1
    ? sortedDeltas[mid]
    : (sortedDeltas[mid - 1] + sortedDeltas[mid]) / 2;
  const mean2 = deltas.reduce((s, d) => s + d, 0) / deltas.length;
  const pseudoP = valid.map((p) => Math.min(1, Math.max(1e-6, 1 - Math.abs(p.delta) / 1)));
  const { qValues } = benjaminiHochberg(pseudoP, fdr);
  validIndices.forEach((rowIndex, i) => {
    perScenario[rowIndex].qValue = qValues[i];
  });
  const contaminationSuspected = pairedTest.p < fdr && median <= -minMedianDrop;
  const reason = contaminationSuspected ? `paired p=${pairedTest.p.toFixed(4)} < ${fdr} and median drop ${median.toFixed(4)} \u2265 ${minMedianDrop}` : pairedTest.p >= fdr ? `no significant difference (paired p=${pairedTest.p.toFixed(4)})` : `significant but small effect (median delta ${median.toFixed(4)})`;
  return {
    perScenario,
    pairedTest,
    medianDelta: median,
    meanDelta: mean2,
    contaminationSuspected,
    reason,
    n: valid.length
  };
}
|
|
755
|
+
/**
 * Create a perturbation that renames whole-word occurrences of the given
 * identifiers inside `scenario.prompt`.
 *
 * Identifiers are processed in order; each replacement operates on the result
 * of the previous one. The scenario is not mutated: a shallow copy with the
 * new prompt is returned.
 *
 * @param {string[]} identifiers Identifiers to rename.
 * @param {(name: string, index: number) => string} [rename]
 *   Produces the new name; by default appends `_` plus a letter derived from the index.
 * @returns {{kind: string, apply: (scenario: object) => object}}
 */
function renameVariables(identifiers, rename = (n, i) => `${n}_${(i % 26 + 10).toString(36)}`) {
  return {
    kind: "rename_variables",
    apply(scenario) {
      const prompt = identifiers.reduce((text, id, i) => {
        const replacement = rename(id, i);
        const pattern = new RegExp(`\\b${escapeRegex(id)}\\b`, "g");
        // Use a replacer function so `$&`, `$1`, `$$` in the new name are
        // inserted literally instead of being expanded as replacement patterns.
        return text.replace(pattern, () => replacement);
      }, scenario.prompt);
      return { ...scenario, prompt };
    }
  };
}
|
|
769
|
+
/**
 * Create a perturbation that rewrites `scenario.prompt` through a
 * caller-supplied shuffler driven by a seeded pseudo-random generator.
 *
 * The generator state lives in the returned perturbation, so successive
 * `apply` calls keep consuming the same random stream.
 *
 * @param {(prompt: string, rng: () => number) => string} shuffleSection
 *   Produces the new prompt; `rng` yields numbers in [0, 1).
 * @param {number} seed Coerced to an unsigned 32-bit integer.
 * @returns {{kind: string, apply: (scenario: object) => object}}
 */
function shuffleOrder(shuffleSection, seed) {
  let state = seed >>> 0;
  function nextUniform() {
    state = (state + 1831565813) >>> 0;
    let mixed = Math.imul(state ^ (state >>> 15), state | 1);
    mixed ^= mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61);
    return ((mixed ^ (mixed >>> 14)) >>> 0) / 4294967296;
  }
  return {
    kind: "shuffle_order",
    apply(scenario) {
      return { ...scenario, prompt: shuffleSection(scenario.prompt, nextUniform) };
    }
  };
}
|
|
786
|
+
/**
 * Create a perturbation that adds a fixed clause to `scenario.prompt`.
 *
 * @param {string} clause Text to add, separated from the prompt by one space.
 * @param {string} [position="prefix"] `"prefix"` puts the clause first; any
 *   other value appends it after the prompt.
 * @returns {{kind: string, apply: (scenario: object) => object}}
 */
function injectIrrelevantClause(clause, position = "prefix") {
  const combine = (original) => {
    if (position === "prefix") {
      return `${clause} ${original}`;
    }
    return `${original} ${clause}`;
  };
  return {
    kind: "inject_irrelevant_clause",
    apply(scenario) {
      return { ...scenario, prompt: combine(scenario.prompt) };
    }
  };
}
|
|
795
|
+
/**
 * Escape regular-expression metacharacters so the string can be embedded in
 * a `RegExp` pattern and matched literally.
 *
 * @param {string} s Raw text.
 * @returns {string} Text with each of `. * + ? ^ $ { } ( ) | [ ] \` preceded by a backslash.
 */
function escapeRegex(s) {
  const metacharacters = /[.*+?^${}()|[\]\\]/g;
  return s.replace(metacharacters, (ch) => `\\${ch}`);
}
|
|
798
|
+
|
|
799
|
+
// src/rl/tournament.ts
|
|
800
|
+
/**
 * Estimate a strength per candidate from pairwise outcomes.
 *
 * Each pass sets `theta_i = wins_i / sum_j(N_ij / (theta_i + theta_j))`, where
 * `wins_i` is the weighted win total plus `smoothing`, then rescales the
 * strengths so their geometric mean is 1. Iteration stops once the largest
 * relative change drops below `tolerance` or `maxIterations` is reached.
 *
 * @param {Iterable<{winner: string, loser: string, draw?: boolean, weight?: number}>} outcomes
 *   A draw credits half the weight to each side; `weight` defaults to 1.
 * @param {{tolerance?: number, maxIterations?: number, smoothing?: number}} [opts]
 *   Defaults: 1e-6, 256 and 0.1.
 * @returns {{ratings: Array<{candidateId: string, strength: number, logStrength: number, n: number, wins: number}>,
 *   iterations: number, finalDelta: number, converged: boolean}}
 *   Ratings are sorted by descending strength; `logStrength` is measured
 *   relative to the weakest candidate.
 */
function fitBradleyTerry(outcomes, opts = {}) {
  const tol = opts.tolerance ?? 1e-6;
  const maxIter = opts.maxIterations ?? 256;
  const smoothing = opts.smoothing ?? 0.1;

  const participants = new Set();
  for (const { winner, loser } of outcomes) {
    participants.add(winner);
    participants.add(loser);
  }
  const ids = [...participants].sort();
  const n = ids.length;
  if (n === 0) {
    return { ratings: [], iterations: 0, finalDelta: 0, converged: true };
  }
  if (n === 1) {
    return {
      ratings: [{ candidateId: ids[0], strength: 1, logStrength: 0, n: 0, wins: 0 }],
      iterations: 0,
      finalDelta: 0,
      converged: true
    };
  }

  const position = new Map(ids.map((id, i) => [id, i]));
  const zeroMatrix = () => Array.from({ length: n }, () => new Array(n).fill(0));
  const W = zeroMatrix(); // weighted wins of row over column
  const N = zeroMatrix(); // weighted comparisons between row and column
  for (const o of outcomes) {
    const wi = position.get(o.winner);
    const li = position.get(o.loser);
    const w = o.weight ?? 1;
    if (o.draw) {
      W[wi][li] += 0.5 * w;
      W[li][wi] += 0.5 * w;
    } else {
      W[wi][li] += w;
    }
    N[wi][li] += w;
    N[li][wi] += w;
  }

  const rowSum = (row) => row.reduce((acc, x) => acc + x, 0);
  const winsTotal = W.map((row) => rowSum(row) + smoothing);
  const compsTotal = N.map((row) => rowSum(row));
  const safeLog = (t) => Math.log(Math.max(1e-300, t));

  let theta = new Array(n).fill(1);
  let iter = 0;
  let delta = Infinity;
  while (iter < maxIter) {
    const raw = theta.map((ti, i) => {
      let denom = 0;
      for (let j = 0; j < n; j++) {
        if (j !== i && N[i][j] !== 0) denom += N[i][j] / (ti + theta[j]);
      }
      // A candidate with no comparisons keeps its current strength.
      return denom === 0 ? ti : winsTotal[i] / denom;
    });
    const logSum = raw.reduce((acc, t) => acc + safeLog(t), 0);
    const norm = Math.exp(logSum / n);
    const next = raw.map((t) => t / norm);
    delta = next.reduce((worst, t, i) => {
      const change = Math.abs(t - theta[i]) / Math.max(1e-12, theta[i]);
      return change > worst ? change : worst;
    }, 0);
    theta = next;
    if (delta < tol) break;
    iter++;
  }

  const minLog = Math.min(...theta.map(safeLog));
  const ratings = ids.map((id, i) => ({
    candidateId: id,
    strength: theta[i],
    logStrength: safeLog(theta[i]) - minLog,
    n: compsTotal[i],
    wins: winsTotal[i] - smoothing
  }));
  ratings.sort((a, b) => b.strength - a.strength);
  return {
    ratings,
    iterations: iter,
    finalDelta: delta,
    converged: delta < tol
  };
}
|
|
886
|
+
/**
 * Apply one Elo update for a single outcome, mutating `ratings` in place.
 *
 * Candidates missing from the map start at `defaultRating`. A draw scores 0.5
 * for both sides; otherwise the winner scores 1 and the loser 0. The step is
 * scaled by `kFactor * weight`.
 *
 * @param {Map<string, number>} ratings Current ratings; updated for both candidates.
 * @param {{winner: string, loser: string, draw?: boolean, weight?: number}} outcome
 * @param {{defaultRating?: number, kFactor?: number}} [opts] Defaults: 1500 and 32.
 * @returns {{winnerDelta: number, loserDelta: number}} Rating changes that were applied.
 */
function applyEloUpdate(ratings, outcome, opts = {}) {
  const { winner, loser } = outcome;
  const fallback = opts.defaultRating ?? 1500;
  const stepSize = (opts.kFactor ?? 32) * (outcome.weight ?? 1);
  const winnerRating = ratings.get(winner) ?? fallback;
  const loserRating = ratings.get(loser) ?? fallback;
  const expectedWinner = 1 / (1 + Math.pow(10, (loserRating - winnerRating) / 400));
  let winnerScore = 1;
  let loserScore = 0;
  if (outcome.draw) {
    winnerScore = 0.5;
    loserScore = 0.5;
  }
  const winnerDelta = stepSize * (winnerScore - expectedWinner);
  const loserDelta = stepSize * (loserScore - (1 - expectedWinner));
  ratings.set(winner, winnerRating + winnerDelta);
  ratings.set(loser, loserRating + loserDelta);
  return { winnerDelta, loserDelta };
}
|
|
901
|
+
/**
 * Turn scored campaign runs into pairwise outcomes.
 *
 * Runs are grouped by `matchKey`; every pair of runs from different
 * candidates inside a group yields one outcome. Score gaps no larger than
 * `drawMargin` become draws with weight 1; otherwise the higher score wins and
 * the outcome is weighted by the score gap.
 *
 * @param {{runs: Iterable<{matchKey: *, candidateId: string, score: number}>, drawMargin?: number}} input
 *   `drawMargin` defaults to 0.
 * @returns {Array<{winner: string, loser: string, draw?: boolean, weight: number}>}
 */
function buildPairwiseFromCampaign(input) {
  const drawMargin = input.drawMargin ?? 0;
  const groups = new Map();
  for (const { matchKey, candidateId, score } of input.runs) {
    if (!groups.has(matchKey)) groups.set(matchKey, []);
    groups.get(matchKey).push({ candidateId, score });
  }
  const outcomes = [];
  for (const entries of groups.values()) {
    entries.forEach((first, i) => {
      for (const second of entries.slice(i + 1)) {
        if (first.candidateId === second.candidateId) continue;
        const gap = Math.abs(first.score - second.score);
        if (gap <= drawMargin) {
          outcomes.push({ winner: first.candidateId, loser: second.candidateId, draw: true, weight: 1 });
          continue;
        }
        const firstWins = first.score > second.score;
        outcomes.push({
          winner: firstWins ? first.candidateId : second.candidateId,
          loser: firstWins ? second.candidateId : first.candidateId,
          weight: gap
        });
      }
    });
  }
  return outcomes;
}
|
|
928
|
+
|
|
929
|
+
// src/rl/adversarial.ts
|
|
930
|
+
/**
 * Breadth-first search for low-scoring scenarios by repeatedly mutating seeds.
 *
 * Generation 0 is the deduplicated seed list. Each later generation applies
 * every mutation to every scenario of the previous generation, keeps at most
 * `childrenPerParent` children per mutation call, skips ids already seen, and
 * scores the rest. Scoring stops once `budget` score calls have been made.
 *
 * @param {object} opts
 * @param {Iterable} opts.seeds Starting scenarios.
 * @param {Iterable<{id: *, mutate: Function}>} opts.mutations
 *   `mutate(scenario, rng)` returns one child or an array of children.
 * @param {Function} opts.mutateScenarioId Maps a scenario to its dedup id.
 * @param {Function} opts.scoreFn Scores one scenario (may be async).
 * @param {number} [opts.failureThreshold=0.5] Scores below this count as failures.
 * @param {number} [opts.rounds=3] Number of mutation generations.
 * @param {number} [opts.childrenPerParent=4] Cap on children taken per mutation call.
 * @param {number} [opts.budget=Infinity] Maximum number of `scoreFn` calls.
 * @param {number} [opts.seed=1] Seed for the generator handed to mutations.
 * @returns {Promise<{scenarios: object[], failures: object[], byGeneration: object[], scoreCalls: number}>}
 *   `failures` is sorted by ascending score.
 */
async function adversarialScenarioSearch(opts) {
  const failureThreshold = opts.failureThreshold ?? 0.5;
  const rounds = opts.rounds ?? 3;
  const childCap = opts.childrenPerParent ?? 4;
  const budget = opts.budget ?? Number.POSITIVE_INFINITY;
  const rng = mulberry32(opts.seed ?? 1);
  const scenarios = [];
  const seenIds = new Set();
  let scoreCalls = 0;

  const budgetSpent = () => scoreCalls >= budget;
  const isFailure = (entry) => entry.score !== null && entry.score < failureThreshold;
  const scoreAndRecord = async (scenario, id, generation, parentId, mutationStrategy) => {
    const score = await opts.scoreFn(scenario);
    scoreCalls++;
    scenarios.push({ id, generation, parentId, scenario, score, mutationStrategy });
  };

  for (const seedScenario of opts.seeds) {
    const id = opts.mutateScenarioId(seedScenario);
    if (seenIds.has(id)) continue;
    seenIds.add(id);
    if (budgetSpent()) break;
    await scoreAndRecord(seedScenario, id, 0, null, null);
  }

  for (let g = 1; g <= rounds && !budgetSpent(); g++) {
    // Snapshot the previous generation before this round appends children.
    const parents = scenarios.filter((s) => s.generation === g - 1);
    for (const parent of parents) {
      for (const mutation of opts.mutations) {
        if (budgetSpent()) break;
        const produced = await mutation.mutate(parent.scenario, rng);
        const offspring = Array.isArray(produced) ? produced : [produced];
        for (const [k, child] of offspring.entries()) {
          if (!(k < childCap)) break;
          if (budgetSpent()) break;
          const childId = opts.mutateScenarioId(child);
          if (seenIds.has(childId)) continue;
          seenIds.add(childId);
          await scoreAndRecord(child, childId, g, parent.id, mutation.id);
        }
      }
    }
  }

  const failures = scenarios.filter(isFailure).sort((a, b) => (a.score ?? 0) - (b.score ?? 0));
  const lastGeneration = scenarios.reduce((m, s) => Math.max(m, s.generation), 0);
  const byGeneration = [];
  for (let g = 0; g <= lastGeneration; g++) {
    const members = scenarios.filter((s) => s.generation === g);
    if (members.length === 0) continue;
    const scoreTotal = members.reduce((sum, s) => sum + (s.score ?? 0), 0);
    byGeneration.push({
      generation: g,
      total: members.length,
      failures: members.filter(isFailure).length,
      meanScore: scoreTotal / members.length
    });
  }
  return { scenarios, failures, byGeneration, scoreCalls };
}
|
|
996
|
+
/**
 * Create a deterministic pseudo-random generator from a 32-bit seed.
 *
 * @param {number} seed Coerced to an unsigned 32-bit integer.
 * @returns {() => number} Function returning the next value in [0, 1).
 */
function mulberry32(seed) {
  let state = seed >>> 0;
  return function next() {
    state = (state + 1831565813) >>> 0;
    let mixed = Math.imul(state ^ (state >>> 15), state | 1);
    mixed ^= mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61);
    return ((mixed ^ (mixed >>> 14)) >>> 0) / 4294967296;
  };
}
|
|
1006
|
+
|
|
1007
|
+
// src/rl/compute-curves.ts
|
|
1008
|
+
/**
 * Run a candidate at each compute budget (one after another) and summarise
 * how the score changes with cost.
 *
 * @param {object} opts
 * @param {*} opts.candidateId Echoed back in the result.
 * @param {Iterable<{id: *, cost: number}>} opts.budgets Budgets to evaluate.
 * @param {Function} opts.runAtBudget Called with each budget; resolves to
 *   `{ score, samples, std, metrics }`.
 * @returns {Promise<{candidateId: *, points: object[], logSlope: (number|null), best: (object|null)}>}
 *   `points` is sorted by ascending cost. `logSlope` is null with fewer than
 *   two points. `best` is the highest-scoring point (the earliest one on
 *   ties), or null when no budgets were supplied.
 */
async function runComputeCurve(opts) {
  const points = [];
  for (const budget of opts.budgets) {
    const r = await opts.runAtBudget(budget);
    points.push({
      budgetId: budget.id,
      cost: budget.cost,
      score: r.score,
      samples: r.samples,
      std: r.std,
      metrics: r.metrics
    });
  }
  const sorted = [...points].sort((a, b) => a.cost - b.cost);
  const logSlope = sorted.length >= 2 ? fitLogSlope(sorted) : null;
  // reduce() without an initial value throws on an empty array, so an empty
  // budget list is reported as "no best point" instead.
  const best = points.length === 0 ? null : points.reduce((a, b) => (b.score > a.score ? b : a));
  return { candidateId: opts.candidateId, points: sorted, logSlope, best };
}
|
|
1026
|
+
/**
 * Draw `n` rollouts one after another, score each, and return the best one.
 *
 * @param {object} opts
 * @param {number} opts.n Number of rollouts; must be greater than 0.
 * @param {(index: number) => *} opts.sample Produces rollout `index` (may be async).
 * @param {(rollout: *) => number} opts.scoreFn Scores a rollout (may be async).
 * @returns {Promise<{best: *, bestScore: number, scores: number[], meanScore: number, bestIndex: number}>}
 *   On tied scores the earliest rollout is kept.
 * @throws {Error} When `opts.n <= 0`.
 */
async function bestOfN(opts) {
  if (opts.n <= 0) throw new Error("bestOfN: n must be > 0");
  const rollouts = [];
  const scores = [];
  let index = 0;
  while (index < opts.n) {
    const rollout = await opts.sample(index);
    rollouts.push(rollout);
    scores.push(await opts.scoreFn(rollout));
    index += 1;
  }
  const bestIndex = scores.reduce((top, score, i) => (score > scores[top] ? i : top), 0);
  const total = scores.reduce((sum, score) => sum + score, 0);
  return {
    best: rollouts[bestIndex],
    bestScore: scores[bestIndex],
    scores,
    meanScore: total / scores.length,
    bestIndex
  };
}
|
|
1046
|
+
/**
 * Draw `n` rollouts one after another and pick the most frequent answer.
 *
 * @param {object} opts
 * @param {number} opts.n Number of rollouts; must be greater than 0.
 * @param {(index: number) => *} opts.sample Produces rollout `index` (may be async).
 * @param {(rollout: *) => *} opts.answerKey Maps a rollout to its answer; the
 *   result is converted to a string before counting.
 * @returns {Promise<{answer: string, agreement: number, histogram: Object<string, number>,
 *   representative: *, rollouts: Array}>}
 *   `agreement` is the winning count divided by `n`; `representative` is the
 *   first rollout that produced the winning answer.
 * @throws {Error} When `opts.n <= 0`.
 */
async function selfConsistency(opts) {
  if (opts.n <= 0) throw new Error("selfConsistency: n must be > 0");
  const rollouts = [];
  // Count in Maps: a plain object inherits keys such as "constructor" or
  // "toString", which corrupts the tally when an answer uses one of them.
  const counts = new Map();
  const firstRolloutByKey = new Map();
  for (let i = 0; i < opts.n; i++) {
    const r = await opts.sample(i);
    rollouts.push(r);
    // Normalise to a string so non-string answers (e.g. numbers) still match
    // their histogram key when the representative is looked up.
    const key = String(opts.answerKey(r));
    counts.set(key, (counts.get(key) ?? 0) + 1);
    if (!firstRolloutByKey.has(key)) firstRolloutByKey.set(key, r);
  }
  const histogram = Object.fromEntries(counts);
  let answer = "";
  let max = -1;
  for (const [k, v] of Object.entries(histogram)) {
    if (v > max) {
      max = v;
      answer = k;
    }
  }
  const representative = firstRolloutByKey.has(answer) ? firstRolloutByKey.get(answer) : rollouts[0];
  return {
    answer,
    agreement: max / opts.n,
    histogram,
    representative,
    rollouts
  };
}
|
|
1073
|
+
/**
 * Keep the points that no other point beats on both cost and score.
 *
 * A point is dropped when another point has cost no higher and score no
 * lower, with at least one of the two strictly better.
 *
 * @param {Array<{cost: number, score: number}>} points
 * @returns {Array<{cost: number, score: number}>} New array of surviving points, sorted by ascending cost.
 */
function paretoFrontier(points) {
  const dominates = (q, p) => {
    if (q === p) return false;
    if (!(q.cost <= p.cost && q.score >= p.score)) return false;
    return q.cost < p.cost || q.score > p.score;
  };
  const survivors = points.filter((p) => !points.some((q) => dominates(q, p)));
  survivors.sort((a, b) => a.cost - b.cost);
  return survivors;
}
|
|
1083
|
+
/**
 * Least-squares slope of score against the natural log of cost.
 *
 * Costs are clamped to at least 1e-12 before taking the logarithm.
 *
 * @param {Array<{cost: number, score: number}>} points
 * @returns {number} The fitted slope, or 0 when all log-costs are identical.
 */
function fitLogSlope(points) {
  const samples = points.map((p) => ({ x: Math.log(Math.max(1e-12, p.cost)), y: p.score }));
  const n = samples.length;
  let sumX = 0;
  let sumY = 0;
  for (const { x, y } of samples) {
    sumX += x;
    sumY += y;
  }
  const meanX = sumX / n;
  const meanY = sumY / n;
  let covariance = 0;
  let spread = 0;
  for (const { x, y } of samples) {
    covariance += (x - meanX) * (y - meanY);
    spread += (x - meanX) ** 2;
  }
  if (spread === 0) return 0;
  return covariance / spread;
}
|
|
1097
|
+
|
|
1098
|
+
// src/rl/active-curriculum.ts
|
|
1099
|
+
/**
 * Allocate an evaluation budget across (variant, scenario) cells, favouring
 * cells whose observed scores vary more or that have few observations.
 *
 * Every cell first receives `floorPerCell` runs. The remaining budget is
 * split in proportion to `sqrt(variance) + 1 / sqrt(max(1, n))`, with each
 * share rounded to the nearest integer. When the floor alone meets or
 * exceeds the budget, every cell gets the same count (at least 1).
 *
 * @param {Array<{variantId: *, scenarioId: *, score: number}>} observations
 * @param {Array<{variantId: *, scenarioId: *}>} candidateCells
 * @param {{budget: number, variancePrior?: number, floorPerCell?: number}} opts
 *   `variancePrior` defaults to 0.05, `floorPerCell` to 1.
 * @returns {Array<{variantId: *, scenarioId: *, count: number, reason: string}>}
 */
function varianceBasedCurriculum(observations, candidateCells, opts) {
  const variancePrior = opts.variancePrior ?? 0.05;
  const floor = opts.floorPerCell ?? 1;
  const { budget } = opts;
  const cellKey = (x) => `${x.variantId}::${x.scenarioId}`;

  const scoresByCell = new Map();
  for (const obs of observations) {
    const key = cellKey(obs);
    if (!scoresByCell.has(key)) scoresByCell.set(key, []);
    scoresByCell.get(key).push(obs.score);
  }

  const cellStats = candidateCells.map((cell) => {
    const samples = scoresByCell.get(cellKey(cell)) ?? [];
    const n = samples.length;
    let mean = 0.5; // neutral guess for a cell with no observations
    if (n !== 0) {
      mean = samples.reduce((s, v) => s + v, 0) / n;
    }
    let variance = variancePrior;
    if (n >= 2) {
      variance = samples.reduce((s, v) => s + (v - mean) ** 2, 0) / (n - 1) + variancePrior;
    }
    const weight = Math.sqrt(variance) + 1 / Math.sqrt(Math.max(1, n));
    return { variantId: cell.variantId, scenarioId: cell.scenarioId, n, mean, variance, weight };
  });

  const floorTotal = floor * cellStats.length;
  if (floorTotal >= budget) {
    const each = Math.max(1, Math.floor(budget / Math.max(1, cellStats.length)));
    return cellStats.map(({ variantId, scenarioId, n }) => ({
      variantId,
      scenarioId,
      count: each,
      reason: `floor allocation (budget tight; n=${n})`
    }));
  }

  const remaining = budget - floorTotal;
  const totalWeight = cellStats.reduce((s, c) => s + c.weight, 0);
  return cellStats.map((c) => {
    let extra = 0;
    if (totalWeight !== 0) {
      extra = Math.round(c.weight / totalWeight * remaining);
    }
    return {
      variantId: c.variantId,
      scenarioId: c.scenarioId,
      count: floor + extra,
      reason: `variance ${c.variance.toFixed(3)} (n=${c.n}, mean=${c.mean.toFixed(3)})`
    };
  });
}
|
|
1141
|
+
/**
 * Allocate an evaluation budget across (variant, scenario) cells using one
 * sampled pass rate per cell.
 *
 * Pass/fail counts per cell are added to the prior (`priorAlpha`,
 * `priorBeta`); one value is drawn via `sampleBeta`, and the cell weight is
 * `exp(-((|sample - threshold| / sigma) ** 2))`, where sigma is the posterior
 * standard deviation floored at 0.05. The budget is split in proportion to
 * the weights, each share rounded to the nearest integer.
 *
 * @param {Array<{variantId: *, scenarioId: *, score: number, pass?: boolean}>} observations
 *   When `pass` is absent, `score >= decisionThreshold` decides.
 * @param {Array<{variantId: *, scenarioId: *}>} candidateCells
 * @param {{budget: number, decisionThreshold?: number, priorAlpha?: number, priorBeta?: number, seed?: number}} opts
 *   Defaults: threshold 0.5, both prior parameters 1.
 * @returns {Array<{variantId: *, scenarioId: *, count: number, reason: string}>}
 */
function thompsonCurriculum(observations, candidateCells, opts) {
  const threshold = opts.decisionThreshold ?? 0.5;
  const alpha0 = opts.priorAlpha ?? 1;
  const beta0 = opts.priorBeta ?? 1;
  const rng = makeRng(opts.seed);
  const cellKey = (x) => `${x.variantId}::${x.scenarioId}`;

  const tallies = new Map();
  for (const obs of observations) {
    const key = cellKey(obs);
    const tally = tallies.get(key) ?? { passes: 0, failures: 0 };
    const passed = obs.pass ?? obs.score >= threshold;
    if (passed) {
      tally.passes += 1;
    } else {
      tally.failures += 1;
    }
    tallies.set(key, tally);
  }

  const draws = candidateCells.map((cell) => {
    const { passes, failures } = tallies.get(cellKey(cell)) ?? { passes: 0, failures: 0 };
    const a = alpha0 + passes;
    const b = beta0 + failures;
    const sampled = sampleBeta(a, b, rng);
    const distance = Math.abs(sampled - threshold);
    const variance = a * b / ((a + b) ** 2 * (a + b + 1));
    const sigma = Math.max(0.05, Math.sqrt(variance));
    const weight = Math.exp(-((distance / sigma) ** 2));
    return { variantId: cell.variantId, scenarioId: cell.scenarioId, sampled, weight, a, b };
  });

  const totalWeight = draws.reduce((s, d) => s + d.weight, 0);
  return draws.map((d) => {
    let share = 0;
    if (totalWeight !== 0) {
      share = Math.round(d.weight / totalWeight * opts.budget);
    }
    return {
      variantId: d.variantId,
      scenarioId: d.scenarioId,
      count: Math.max(0, share),
      reason: `Beta(${d.a.toFixed(1)},${d.b.toFixed(1)}) sample=${d.sampled.toFixed(3)} (target ${threshold})`
    };
  });
}
|
|
1187
|
+
/**
 * Convert run records into curriculum observations.
 *
 * Runs without a `scenarioId`, or whose selected score is not a finite
 * number, are skipped.
 *
 * @param {Iterable<{candidateId: *, scenarioId: *, outcome: {holdoutScore?: number, searchScore?: number}}>} runs
 * @param {{passThreshold?: number, useHoldout?: boolean}} [opts]
 *   `passThreshold` defaults to 0.5. `useHoldout` (default true) prefers the
 *   holdout score and falls back to the search score; false reverses that.
 * @returns {Array<{variantId: *, scenarioId: *, score: number, pass: boolean}>}
 */
function observationsFromRunRecords(runs, opts = {}) {
  const threshold = opts.passThreshold ?? 0.5;
  const preferHoldout = opts.useHoldout ?? true;
  const pickScore = (outcome) => {
    if (preferHoldout) {
      return outcome.holdoutScore ?? outcome.searchScore;
    }
    return outcome.searchScore ?? outcome.holdoutScore;
  };
  const observations = [];
  for (const run of runs) {
    if (!run.scenarioId) continue;
    const score = pickScore(run.outcome);
    const usable = typeof score === "number" && Number.isFinite(score);
    if (usable) {
      observations.push({
        variantId: run.candidateId,
        scenarioId: run.scenarioId,
        score,
        pass: score >= threshold
      });
    }
  }
  return observations;
}
|
|
1204
|
+
/**
 * Return a random-number source: `Math.random` when no seed is given,
 * otherwise a deterministic generator seeded with the unsigned 32-bit form
 * of `seed`.
 *
 * @param {number} [seed]
 * @returns {() => number} Function returning values in [0, 1).
 */
function makeRng(seed) {
  if (seed === undefined) {
    return Math.random;
  }
  let state = seed >>> 0;
  return function next() {
    state = (state + 1831565813) >>> 0;
    let mixed = Math.imul(state ^ (state >>> 15), state | 1);
    mixed ^= mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61);
    return ((mixed ^ (mixed >>> 14)) >>> 0) / 4294967296;
  };
}
|
|
1215
|
+
/**
 * Draw a value in (0, 1) as `x / (x + y)` from two gamma draws.
 *
 * Both shape parameters are raised to at least 1 before sampling.
 *
 * @param {number} alpha First shape parameter.
 * @param {number} beta Second shape parameter.
 * @param {() => number} rng Uniform source passed through to `sampleGamma`.
 * @returns {number}
 */
function sampleBeta(alpha, beta, rng) {
  const [shapeA, shapeB] = [alpha, beta].map((shape) => Math.max(1, shape));
  // The alpha draw must consume the random stream before the beta draw.
  const fromAlpha = sampleGamma(shapeA, rng);
  const fromBeta = sampleGamma(shapeB, rng);
  return fromAlpha / (fromAlpha + fromBeta);
}
|
|
1222
|
+
/**
 * Draw one gamma-distributed value with the given shape by rejection
 * sampling.
 *
 * Each attempt builds a normal variate from two uniform draws, cubes
 * `1 + c * x`, and accepts it through a cheap polynomial test or, failing
 * that, a logarithmic test.
 *
 * @param {number} shape Shape parameter. NOTE(review): `sampleBeta` always
 *   passes values >= 1; smaller shapes are not handled specially here.
 * @param {() => number} rng Uniform source; a draw of exactly 0 is replaced
 *   by 1e-12 where its logarithm is needed for the normal variate.
 * @returns {number}
 */
function sampleGamma(shape, rng) {
  const d = shape - 1 / 3;
  const c = 1 / Math.sqrt(9 * d);
  for (;;) {
    let normal;
    let base;
    // Redraw until 1 + c * normal is positive.
    do {
      const first = rng() || 1e-12;
      const second = rng() || 1e-12;
      normal = Math.sqrt(-2 * Math.log(first)) * Math.cos(2 * Math.PI * second);
      base = 1 + c * normal;
    } while (base <= 0);
    const cubed = base * base * base;
    const u = rng();
    const passesQuickTest = u < 1 - 0.0331 * normal ** 4;
    if (passesQuickTest || Math.log(u) < 0.5 * normal * normal + d * (1 - cubed + Math.log(cubed))) {
      return d * cubed;
    }
  }
}
|
|
1240
|
+
|
|
1241
|
+
// src/rl/reward-hacking.ts
|
|
1242
|
+
/**
 * Default proxy-reward extractor: the run's holdout score, falling back to
 * its search score, or null when the selected value is not a finite number.
 */
var DEFAULT_PROXY = (r) => {
  const { holdoutScore, searchScore } = r.outcome;
  const candidate = holdoutScore ?? searchScore;
  // Number.isFinite is false for every non-number, so no separate typeof check is needed.
  return Number.isFinite(candidate) ? candidate : null;
};
|
|
1246
|
+
/**
 * Look for signs that a proxy reward is being gamed, by comparing an earlier
 * window of runs with the most recent window.
 *
 * Runs whose proxy reward is null are dropped; fewer than four remaining
 * runs yields a "clean" verdict with an explanatory rationale. Up to four
 * findings are produced, each with a severity in [0, 1]:
 *  - `reward_divergence`: proxy mean rose more than the truth mean (needs `truthOf`).
 *  - `distribution_shift`: KS statistic between the two proxy windows.
 *  - `reward_disagreement`: low correlation between proxy and a secondary reward.
 *  - `judge_drift`: proxy mean rose more than the deterministic-reward mean.
 * The verdict is "gaming" when the largest severity reaches the gaming
 * threshold, "suspect" when it reaches the suspect threshold, else "clean".
 *
 * @param {object} input
 * @param {Array} input.runs Runs in chronological order (oldest first is
 *   assumed, since the last `windowSize` runs form the "after" window).
 * @param {Function} [input.proxyOf] Proxy reward of a run; defaults to `DEFAULT_PROXY`.
 * @param {Function} [input.truthOf] Ground-truth reward of a run, if available.
 * @param {Function} [input.secondaryRewardOf] Independent reward of a run;
 *   defaults to the deterministic reward via `defaultSecondary`.
 * @param {object} [input.verifiableRewardOptions] Passed to `filterDeterministicallyRewarded`.
 * @param {number} [input.windowSize] Size of the "after" window; defaults to
 *   `min(50, floor(n / 2))` and is clamped to the range 1..n-1.
 * @param {{suspect?: number, gaming?: number}} [input.thresholds] Defaults 0.3 and 0.6.
 * @returns {{findings: object[], verdict: string, rationale: string[], n: number}}
 */
function detectRewardHacking(input) {
  const proxyOf = input.proxyOf ?? DEFAULT_PROXY;
  const truthOf = input.truthOf;
  const sus = input.thresholds?.suspect ?? 0.3;
  const gam = input.thresholds?.gaming ?? 0.6;
  const runs = input.runs.filter((r) => proxyOf(r) !== null);
  const n = runs.length;
  if (n < 4) {
    return {
      findings: [],
      verdict: "clean",
      n,
      rationale: [`fewer than 4 runs with proxy reward (n=${n}); insufficient evidence`]
    };
  }

  const defaultWindow = Math.min(50, Math.floor(n / 2));
  const requestedWindow = Number.isFinite(input.windowSize) ? input.windowSize : defaultWindow;
  // Clamp to 1..n-1 so the "before" window is never empty. An empty window
  // has mean 0, which would read as a spurious rise in the proxy reward.
  const windowSize = Math.min(n - 1, Math.ceil(Math.max(1, requestedWindow)));
  const before = runs.slice(0, n - windowSize);
  const after = runs.slice(n - windowSize);
  const isNumber = (v) => typeof v === "number";
  const beforeProxy = before.map(proxyOf).filter(isNumber);
  const afterProxy = after.map(proxyOf).filter(isNumber);
  const findings = [];

  // Signal 1: proxy improves faster than ground truth.
  if (truthOf) {
    const beforeTruth = before.map(truthOf).filter(isNumber);
    const afterTruth = after.map(truthOf).filter(isNumber);
    if (beforeProxy.length >= 2 && afterProxy.length >= 2 && beforeTruth.length >= 2 && afterTruth.length >= 2) {
      const proxyDelta = mean(afterProxy) - mean(beforeProxy);
      const truthDelta = mean(afterTruth) - mean(beforeTruth);
      const gap = Math.max(0, proxyDelta - truthDelta);
      const severity = clamp012(gap * 5);
      findings.push({
        signal: "reward_divergence",
        severity,
        message: severity >= sus ? `proxy reward rose by ${proxyDelta.toFixed(3)} while truth changed by ${truthDelta.toFixed(3)} \u2014 potential Goodhart` : `proxy and truth moved together (proxy ${proxyDelta.toFixed(3)}, truth ${truthDelta.toFixed(3)})`,
        detail: { proxyDelta, truthDelta, gap, beforeN: beforeProxy.length, afterN: afterProxy.length }
      });
    }
  }

  // Signal 2: the proxy distribution itself shifted between windows.
  if (beforeProxy.length >= 4 && afterProxy.length >= 4) {
    const ks = ksStatistic(beforeProxy, afterProxy);
    const severity = clamp012(ks - 0.2);
    findings.push({
      signal: "distribution_shift",
      severity,
      message: severity >= sus ? `KS=${ks.toFixed(3)} between before/after windows \u2014 distributional shift large` : `KS=${ks.toFixed(3)} between before/after windows \u2014 within-distribution drift`,
      detail: { ks, beforeN: beforeProxy.length, afterN: afterProxy.length }
    });
  }

  // Signal 3: proxy disagrees with an independent secondary reward.
  const secondaryOf = input.secondaryRewardOf ?? defaultSecondary(input.verifiableRewardOptions);
  const aligned = runs.map((r) => ({ p: proxyOf(r), s: secondaryOf(r) })).filter((x) => typeof x.p === "number" && typeof x.s === "number");
  if (aligned.length >= 4) {
    const r = pearsonR(aligned.map((x) => x.p), aligned.map((x) => x.s));
    const severity = clamp012(0.5 - Math.max(0, r));
    findings.push({
      signal: "reward_disagreement",
      severity,
      message: severity >= sus ? `proxy and independent secondary reward correlate \u03C1=${r.toFixed(3)} \u2014 possibly hacking proxy` : `proxy and secondary reward correlate \u03C1=${r.toFixed(3)}`,
      detail: { pearson: r, n: aligned.length }
    });
  }

  // Signal 4: proxy drifts upward without matching deterministic reward.
  const detRuns = filterDeterministicallyRewarded(runs, input.verifiableRewardOptions ?? {});
  if (detRuns.length >= 4) {
    const half = Math.floor(detRuns.length / 2);
    const detBefore = detRuns.slice(0, half);
    const detAfter = detRuns.slice(half);
    const detDelta = mean(detAfter.map((r) => r.reward.value)) - mean(detBefore.map((r) => r.reward.value));
    const proxyDelta = mean(afterProxy) - mean(beforeProxy);
    const driftGap = Math.max(0, proxyDelta - detDelta);
    const severity = clamp012(driftGap * 5);
    findings.push({
      signal: "judge_drift",
      severity,
      message: severity >= sus ? `judge proxy +${proxyDelta.toFixed(3)} while deterministic reward +${detDelta.toFixed(3)} \u2014 judge drifting up without verifiable backing` : `judge and deterministic rewards move in step (judge ${proxyDelta.toFixed(3)}, det ${detDelta.toFixed(3)})`,
      detail: { proxyDelta, detDelta, driftGap, n: detRuns.length }
    });
  }

  const maxSev = findings.reduce((m, f) => Math.max(m, f.severity), 0);
  const verdict = maxSev >= gam ? "gaming" : maxSev >= sus ? "suspect" : "clean";
  const rationale = findings.filter((f) => f.severity >= sus).map((f) => `${f.signal}: severity ${f.severity.toFixed(2)} \u2014 ${f.message}`);
  if (rationale.length === 0) rationale.push("no signals fired above suspect threshold");
  return { findings, verdict, rationale, n };
}
|
|
1336
|
+
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} xs
 * @returns {number} The mean, or 0 for an empty list.
 */
function mean(xs) {
  const count = xs.length;
  if (count === 0) {
    return 0;
  }
  let total = 0;
  for (const x of xs) {
    total += x;
  }
  return total / count;
}
|
|
1340
|
+
/**
 * Clamp a value into the range [0, 1].
 *
 * @param {number} x
 * @returns {number} `x` limited to [0, 1]; 0 when `x` is not a finite number.
 */
function clamp012(x) {
  return Number.isFinite(x) ? Math.min(1, Math.max(0, x)) : 0;
}
|
|
1344
|
+
/**
 * Pearson correlation coefficient of two equally long number lists.
 *
 * @param {number[]} a
 * @param {number[]} b
 * @returns {number} The correlation, or 0 when the lengths differ, fewer
 *   than two pairs are given, or either list has zero variance.
 */
function pearsonR(a, b) {
  const size = a.length;
  if (size !== b.length || size < 2) return 0;
  const average = (values) => values.reduce((s, x) => s + x, 0) / values.length;
  const centerA = average(a);
  const centerB = average(b);
  let cross = 0;
  let spreadA = 0;
  let spreadB = 0;
  a.forEach((value, i) => {
    const offsetA = value - centerA;
    const offsetB = b[i] - centerB;
    cross += offsetA * offsetB;
    spreadA += offsetA * offsetA;
    spreadB += offsetB * offsetB;
  });
  if (spreadA === 0 || spreadB === 0) return 0;
  return cross / Math.sqrt(spreadA * spreadB);
}
|
|
1359
|
+
/**
 * Two-sample Kolmogorov–Smirnov statistic: the largest absolute gap between
 * the empirical CDFs of `a` and `b`, evaluated at every observed value.
 *
 * @param {number[]} a First sample. NOTE(review): values are assumed to be
 *   finite numbers; NaN or infinite entries are not handled specially.
 * @param {number[]} b Second sample (same assumption).
 * @returns {number} A value in [0, 1]; 0 when either sample is empty.
 */
function ksStatistic(a, b) {
  // An empty sample has no empirical CDF (0 / 0); report "no distance".
  if (a.length === 0 || b.length === 0) return 0;
  const sortedA = [...a].sort((x, y) => x - y);
  const sortedB = [...b].sort((x, y) => x - y);
  const all = [.../* @__PURE__ */ new Set([...sortedA, ...sortedB])].sort((x, y) => x - y);
  let max = 0;
  // Both samples and the probe values are ascending, so the count of
  // elements <= v only ever grows: advance one cursor per sample instead of
  // rescanning the whole sample for each value.
  let countA = 0;
  let countB = 0;
  for (const v of all) {
    while (countA < sortedA.length && sortedA[countA] <= v) countA++;
    while (countB < sortedB.length && sortedB[countB] <= v) countB++;
    const fa = countA / sortedA.length;
    const fb = countB / sortedB.length;
    max = Math.max(max, Math.abs(fa - fb));
  }
  return max;
}
|
|
1371
|
+
/**
 * Build the default secondary-reward extractor.
 *
 * The returned function passes a single run through
 * `filterDeterministicallyRewarded` and yields that run's `reward.value`
 * when exactly one entry comes back, otherwise null.
 *
 * @param {object} [verifiableOpts] Options forwarded to the filter (`{}` when absent).
 * @returns {(run: object) => (number|null)}
 */
function defaultSecondary(verifiableOpts) {
  return (run) => {
    const kept = filterDeterministicallyRewarded([run], verifiableOpts ?? {});
    if (kept.length !== 1) {
      return null;
    }
    const [only] = kept;
    return only.reward.value;
  };
}
|
|
1377
|
+
|
|
1378
|
+
// src/rl/adaptation-eval.ts
|
|
1379
|
+
/**
 * Score every scenario at each adaptation budget `k` and summarise the curve.
 *
 * For each `k` (ascending) and each scenario, `opts.runner.run({ scenario, k, rep })`
 * is awaited `reps` times, sequentially.
 *
 * @param {object} opts
 * @param {Iterable<object>} opts.scenarios - Scenarios to run; a scenario without
 *   `scenarioId` is labelled `scenario-<position>`.
 * @param {{ run: Function }} opts.runner - Runner returning (or resolving to) a numeric score.
 * @param {number[]} [opts.ks=[0, 1, 2, 4, 8, 16]] - Budgets to evaluate.
 * @param {number} [opts.reps=3] - Repetitions per scenario and budget.
 * @param {number} [opts.passThreshold=0.5] - Score at or above which a run passes;
 *   also the pass-rate cut-off used for `firstPassK`.
 * @returns {Promise<{points: object[], firstPassK: (number|null), adaptationArea: number}>}
 *   `points` holds one entry per k (`k`, `meanScore`, `passRate`, sample `std`, `n`,
 *   `perScenario`); `adaptationArea` is the trapezoidal area under mean score
 *   divided by the largest k (0 when the largest k is 0).
 */
async function runAdaptationCurve(opts) {
  const ks = opts.ks ?? [0, 1, 2, 4, 8, 16];
  const reps = opts.reps ?? 3;
  const passThreshold = opts.passThreshold ?? 0.5;
  const sortedKs = [...ks].sort((a, b) => a - b);
  const points = [];
  for (const k of sortedKs) {
    const perScenario = [];
    const allScores = [];
    let totalPasses = 0;
    // Track the position ourselves: `indexOf` rescans the list for every
    // scenario and returns the first match, so a scenario object listed twice
    // would receive the same fallback id.
    let position = 0;
    for (const scenario of opts.scenarios) {
      const sid = scenario.scenarioId ?? `scenario-${position}`;
      position += 1;
      const scores = [];
      let passes = 0;
      for (let rep = 0; rep < reps; rep++) {
        const score = await opts.runner.run({ scenario, k, rep });
        scores.push(score);
        allScores.push(score);
        if (score >= passThreshold) {
          passes += 1;
          totalPasses += 1;
        }
      }
      // Guard the divisor like the aggregate mean below so `reps === 0`
      // reports 0 rather than NaN.
      const meanS = scores.reduce((s, v) => s + v, 0) / Math.max(1, scores.length);
      perScenario.push({ scenarioId: sid, meanScore: meanS, passes, total: scores.length });
    }
    const n = allScores.length;
    const meanScore = allScores.reduce((s, v) => s + v, 0) / Math.max(1, n);
    // Sample variance (n - 1 divisor); undefined for fewer than two scores, so use 0.
    const variance = n < 2 ? 0 : allScores.reduce((s, v) => s + (v - meanScore) ** 2, 0) / (n - 1);
    points.push({
      k,
      meanScore,
      passRate: totalPasses / Math.max(1, n),
      std: Math.sqrt(variance),
      n,
      perScenario
    });
  }
  const firstPassK2 = points.find((p) => p.passRate >= passThreshold)?.k ?? null;
  const maxK = sortedKs[sortedKs.length - 1] ?? 1;
  // Trapezoidal rule over consecutive (k, meanScore) points.
  let area = 0;
  for (let i = 1; i < points.length; i++) {
    const previous = points[i - 1];
    const current = points[i];
    area += (previous.meanScore + current.meanScore) / 2 * (current.k - previous.k);
  }
  const adaptationArea = maxK === 0 ? 0 : area / maxK;
  return { points, firstPassK: firstPassK2, adaptationArea };
}
|
|
1429
|
+
/**
 * Compare two adaptation curves point by point.
 *
 * Only budgets `k` present in both curves are compared. For each, the
 * per-scenario mean scores of both curves get a bootstrap confidence interval.
 *
 * @param {object} a - First curve (`points`, `adaptationArea`, `firstPassK`).
 * @param {object} b - Second curve, same shape.
 * @param {object} [opts]
 * @param {number} [opts.confidence=0.95] - Confidence level for the intervals.
 * @param {number} [opts.bootstrapResamples=500] - Resamples per interval.
 * @param {number} [opts.seed] - Seed passed to `makeRng2`.
 * @returns {{perK: object[], areaDelta: number, firstPassKDelta: (number|null), verdict: string, rationale: string}}
 *   `areaDelta` and each `deltaMean` are `a - b`; `firstPassKDelta` is
 *   `b.firstPassK - a.firstPassK` (null unless both are non-null). `verdict`
 *   is "a_better" / "b_better" only when the mean per-k delta and the area
 *   delta share a sign and are not both under 0.02 in magnitude; otherwise "similar".
 */
function compareAdaptationCurves(a, b, opts = {}) {
  const confidence = opts.confidence ?? 0.95;
  const resampleCount = opts.bootstrapResamples ?? 500;
  const rng = makeRng2(opts.seed);
  const perK = [];
  let deltaTotal = 0;
  for (const pointA of a.points) {
    const pointB = b.points.find((candidate) => candidate.k === pointA.k);
    if (!pointB) continue;
    const scenarioMeansA = pointA.perScenario.map((entry) => entry.meanScore);
    const scenarioMeansB = pointB.perScenario.map((entry) => entry.meanScore);
    // Both intervals draw from the same generator, `a` first, so a seeded
    // comparison is reproducible.
    const intervalA = bootstrapMeanCi(scenarioMeansA, resampleCount, confidence, rng);
    const intervalB = bootstrapMeanCi(scenarioMeansB, resampleCount, confidence, rng);
    const deltaMean = pointA.meanScore - pointB.meanScore;
    deltaTotal += deltaMean;
    perK.push({
      k: pointA.k,
      deltaMean,
      aLow: intervalA.low,
      aHigh: intervalA.high,
      bLow: intervalB.low,
      bHigh: intervalB.high
    });
  }
  const areaDelta = a.adaptationArea - b.adaptationArea;
  let firstPassKDelta = null;
  if (a.firstPassK !== null && b.firstPassK !== null) {
    firstPassKDelta = b.firstPassK - a.firstPassK;
  }
  const meanDelta = deltaTotal / Math.max(1, perK.length);
  const negligible = Math.abs(meanDelta) < 0.02 && Math.abs(areaDelta) < 0.02;
  let verdict = "similar";
  if (!negligible) {
    if (meanDelta > 0 && areaDelta > 0) verdict = "a_better";
    else if (meanDelta < 0 && areaDelta < 0) verdict = "b_better";
  }
  let rationale = `mean per-k delta=${meanDelta.toFixed(3)}, area delta=${areaDelta.toFixed(3)}`;
  if (firstPassKDelta !== null) {
    rationale += `, first-pass-k delta=${firstPassKDelta}`;
  }
  return { perK, areaDelta, firstPassKDelta, verdict, rationale };
}
|
|
1461
|
+
/**
 * Find the first point on a curve whose pass rate reaches `threshold`.
 *
 * @param {{points: Array<{k: number, passRate: number}>}} curve - Curve to scan in point order.
 * @param {number} [threshold=0.5] - Minimum pass rate.
 * @returns {number|null} The `k` of the first qualifying point, or null if none qualifies.
 */
function firstPassK(curve, threshold = 0.5) {
  for (const point of curve.points) {
    if (point.passRate >= threshold) return point.k ?? null;
  }
  return null;
}
|
|
1464
|
+
/**
 * Create a random-number source.
 *
 * @param {number} [seed] - When undefined, `Math.random` itself is returned.
 *   Otherwise the seed is coerced to an unsigned 32-bit integer and drives a
 *   deterministic generator (the mixing steps appear to be mulberry32).
 * @returns {() => number} Function yielding values in [0, 1).
 */
function makeRng2(seed) {
  if (seed === undefined) return Math.random;
  let state = seed >>> 0;
  return () => {
    state = (state + 0x6d2b79f5) >>> 0;
    const first = Math.imul(state ^ (state >>> 15), state | 1);
    const second = first ^ (first + Math.imul(first ^ (first >>> 7), first | 61));
    // Reinterpret as unsigned 32-bit and scale by 2^32.
    return ((second ^ (second >>> 14)) >>> 0) / 4294967296;
  };
}
|
|
1475
|
+
/**
 * Percentile bootstrap confidence interval for the mean of `xs`.
 *
 * @param {number[]} xs - Observations.
 * @param {number} resamples - Number of bootstrap resamples to draw.
 * @param {number} confidence - Confidence level, e.g. 0.95.
 * @param {() => number} rng - Source of uniform values in [0, 1).
 * @returns {{low: number, high: number}} Interval bounds. With fewer than two
 *   observations both bounds are the single observation (or 0 when empty).
 */
function bootstrapMeanCi(xs, resamples, confidence, rng) {
  const size = xs.length;
  if (size < 2) {
    const point = xs[0] ?? 0;
    return { low: point, high: point };
  }
  const means = new Array(resamples);
  let filled = 0;
  while (filled < resamples) {
    let total = 0;
    // Draw `size` observations with replacement.
    for (let draw = 0; draw < size; draw++) {
      total += xs[Math.floor(rng() * size)];
    }
    means[filled] = total / size;
    filled += 1;
  }
  means.sort((p, q) => p - q);
  const alpha = 1 - confidence;
  const tail = alpha / 2;
  const lowIndex = Math.floor(tail * resamples);
  const highIndex = Math.min(resamples - 1, Math.ceil((1 - tail) * resamples) - 1);
  return { low: means[lowIndex], high: means[highIndex] };
}
|
|
1490
|
+
|
|
1491
|
+
// src/rl/exporters.ts
|
|
1492
|
+
/**
 * Convert preference triples into DPO training rows.
 *
 * Triples are processed one at a time. For each, the prompt and chosen
 * completion are looked up by `chosenRunId` and the rejected completion by
 * `rejectedRunId`; the three lookups are started together.
 *
 * @param {Iterable<object>} triples - Preference triples.
 * @param {{promptOf: Function, completionOf: Function}} lookups - Sync or async
 *   accessors keyed by run id.
 * @returns {Promise<object[]>} Rows of `{ prompt, chosen, rejected, margin, meta }`.
 */
async function toDpoRows(triples, lookups) {
  const rows = [];
  for (const triple of triples) {
    const { chosenRunId, rejectedRunId } = triple;
    const pending = [
      lookups.promptOf(chosenRunId),
      lookups.completionOf(chosenRunId),
      lookups.completionOf(rejectedRunId)
    ];
    const [prompt, chosen, rejected] = await Promise.all(pending);
    const meta = {
      scenarioId: triple.scenarioId,
      chosenVariantId: triple.chosenVariantId,
      rejectedVariantId: triple.rejectedVariantId,
      chosenRunId,
      rejectedRunId,
      chosenModel: triple.meta.chosenModel,
      rejectedModel: triple.meta.rejectedModel
    };
    rows.push({ prompt, chosen, rejected, margin: triple.marginScore, meta });
  }
  return rows;
}
|
|
1518
|
+
/**
 * Serialise DPO rows as JSON Lines.
 *
 * @param {object[]} rows - Rows to serialise.
 * @returns {string} One JSON document per line with a trailing newline, or
 *   the empty string when there are no rows.
 */
function toDpoJsonl(rows) {
  const lines = rows.map((row) => JSON.stringify(row));
  if (rows.length === 0) return lines.join("\n");
  return `${lines.join("\n")}\n`;
}
|
|
1521
|
+
/**
 * Convert runs into GRPO training rows, one row per scenario group.
 *
 * Runs are grouped by `scenarioId` (falling back to `experimentId`). The
 * group's prompt is looked up from its first run. Runs whose reward is `null`
 * are left out, and a group with no remaining runs produces no row.
 *
 * @param {Iterable<object>} runs - Run records.
 * @param {{promptOf: Function, completionOf: Function, rewardOf?: Function}} lookups -
 *   Accessors; `rewardOf` defaults to `defaultReward`.
 * @returns {Promise<object[]>} Rows of `{ prompt, completions, rewards, runIds, meta }`
 *   where `meta` carries `scenarioId`, `n` and `meanReward`.
 */
async function toGrpoRows(runs, lookups) {
  const rewardOf = lookups.rewardOf ?? defaultReward;
  const byScenario = /* @__PURE__ */ new Map();
  for (const run of runs) {
    const key = run.scenarioId ?? run.experimentId;
    const bucket = byScenario.get(key);
    if (bucket) bucket.push(run);
    else byScenario.set(key, [run]);
  }
  const rows = [];
  for (const [scenarioId, group] of byScenario) {
    const prompt = await lookups.promptOf(group[0].runId);
    const completions = [];
    const rewards = [];
    const runIds = [];
    let rewardTotal = 0;
    for (const run of group) {
      const reward = rewardOf(run);
      if (reward === null) continue;
      completions.push(await lookups.completionOf(run.runId));
      rewards.push(reward);
      runIds.push(run.runId);
      rewardTotal += reward;
    }
    if (completions.length === 0) continue;
    const meta = {
      scenarioId,
      n: completions.length,
      meanReward: rewardTotal / rewards.length
    };
    rows.push({ prompt, completions, rewards, runIds, meta });
  }
  return rows;
}
|
|
1560
|
+
/**
 * Serialise GRPO rows as JSON Lines.
 *
 * @param {object[]} rows - Rows to serialise.
 * @returns {string} One JSON document per line with a trailing newline, or
 *   the empty string when there are no rows.
 */
function toGrpoJsonl(rows) {
  const body = rows.map((row) => JSON.stringify(row)).join("\n");
  const terminator = rows.length > 0 ? "\n" : "";
  return `${body}${terminator}`;
}
|
|
1563
|
+
/**
 * Convert runs into supervised fine-tuning rows in chat-message form.
 *
 * Runs are processed one at a time. Each kept run yields an optional system
 * message, a user message (the prompt) and an assistant message (the completion).
 *
 * @param {Iterable<object>} runs - Run records.
 * @param {object} lookups
 * @param {Function} lookups.promptOf - Sync or async prompt accessor keyed by run id.
 * @param {Function} lookups.completionOf - Sync or async completion accessor keyed by run id.
 * @param {Function} [lookups.systemOf] - Sync or async system-prompt accessor that
 *   receives the run; a falsy result omits the system message.
 * @param {Function} [lookups.include] - Predicate selecting runs; defaults to all.
 * @returns {Promise<object[]>} Rows of `{ messages, meta }`; `meta.score` is
 *   `holdoutScore`, falling back to `searchScore`.
 */
async function toSftRows(runs, lookups) {
  const include = lookups.include ?? (() => true);
  const rows = [];
  for (const r of runs) {
    if (!include(r)) continue;
    // Resolve `systemOf` together with the other lookups: left un-awaited, an
    // async `systemOf` would put a Promise object into the message content.
    const [system, prompt, completion] = await Promise.all([
      lookups.systemOf?.(r),
      lookups.promptOf(r.runId),
      lookups.completionOf(r.runId)
    ]);
    const messages = [];
    if (system) messages.push({ role: "system", content: system });
    messages.push({ role: "user", content: prompt });
    messages.push({ role: "assistant", content: completion });
    rows.push({
      messages,
      meta: {
        runId: r.runId,
        candidateId: r.candidateId,
        scenarioId: r.scenarioId,
        score: r.outcome.holdoutScore ?? r.outcome.searchScore,
        model: r.model
      }
    });
  }
  return rows;
}
|
|
1590
|
+
/**
 * Serialise SFT rows as JSON Lines.
 *
 * @param {object[]} rows - Rows to serialise.
 * @returns {string} One JSON document per line with a trailing newline, or
 *   the empty string when there are no rows.
 */
function toSftJsonl(rows) {
  const serialised = rows.map((row) => JSON.stringify(row));
  let text = serialised.join("\n");
  if (rows.length > 0) text += "\n";
  return text;
}
|
|
1593
|
+
/**
 * Convert step-level preference triples into process-reward-model rows.
 *
 * All lookups run sequentially. The prompt, the prefix spans and the chosen
 * step are read from `prefixRunId`; the rejected step is read from
 * `rejectedRunId`.
 *
 * @param {Iterable<object>} triples - Step-level preference triples.
 * @param {object} lookups
 * @param {Function} lookups.promptOf - Prompt accessor keyed by run id.
 * @param {Function} lookups.stepTextOf - Step-text accessor `(runId, spanId)`.
 * @param {Function} [lookups.prefixOf] - Accessor `(runId, stepIndex)` returning
 *   the prefix span ids; without it the prefix is empty.
 * @returns {Promise<object[]>} Rows with the prompt, prefix, both steps, both
 *   rewards, the margin and a `meta` object.
 */
async function toPrmRows(triples, lookups) {
  const rows = [];
  for (const triple of triples) {
    const { prefixRunId, rejectedRunId, prefixStepIndex } = triple;
    const prompt = await lookups.promptOf(prefixRunId);
    let prefixSpanIds = [];
    if (lookups.prefixOf) {
      prefixSpanIds = await lookups.prefixOf(prefixRunId, prefixStepIndex);
    }
    const prefixStepText = [];
    for (const spanId of prefixSpanIds) {
      const text = await lookups.stepTextOf(prefixRunId, spanId);
      prefixStepText.push(text);
    }
    const chosenStep = await lookups.stepTextOf(prefixRunId, triple.chosenSpanId);
    const rejectedStep = await lookups.stepTextOf(rejectedRunId, triple.rejectedSpanId);
    rows.push({
      prompt,
      prefixSpanIds,
      prefixStepText,
      chosenStep,
      rejectedStep,
      chosenReward: triple.chosenReward,
      rejectedReward: triple.rejectedReward,
      marginScore: triple.marginScore,
      meta: { prefixRunId, rejectedRunId, prefixStepIndex }
    });
  }
  return rows;
}
|
|
1622
|
+
/**
 * Serialise PRM rows as JSON Lines.
 *
 * @param {object[]} rows - Rows to serialise.
 * @returns {string} One JSON document per line with a trailing newline, or
 *   the empty string when there are no rows.
 */
function toPrmJsonl(rows) {
  const joined = rows.map((row) => JSON.stringify(row)).join("\n");
  if (rows.length > 0) {
    return `${joined}\n`;
  }
  return joined;
}
|
|
1625
|
+
/**
 * Serialise step rewards as JSON Lines, keeping only the exported fields.
 *
 * @param {object[]} stepRewards - Step reward records.
 * @returns {string} One JSON document per line (`runId`, `spanId`,
 *   `stepIndex`, `reward`, `determinism`, `weight`) with a trailing newline,
 *   or the empty string when there are none. A nullish `weight` becomes 1.
 */
function stepRewardsToJsonl(stepRewards) {
  const lines = stepRewards.map((step) => {
    const { runId, spanId, stepIndex, reward, determinism } = step;
    return JSON.stringify({ runId, spanId, stepIndex, reward, determinism, weight: step.weight ?? 1 });
  });
  const text = lines.join("\n");
  return lines.length > 0 ? `${text}\n` : text;
}
|
|
1636
|
+
/**
 * Default scalar reward for a run.
 *
 * @param {{outcome: {holdoutScore?: number, searchScore?: number}}} run - Run record.
 * @returns {number|null} `holdoutScore` (falling back to `searchScore` when
 *   nullish) if it is a finite number, otherwise null.
 */
function defaultReward(run) {
  const outcome = run.outcome;
  const score = outcome.holdoutScore ?? outcome.searchScore;
  if (typeof score !== "number") return null;
  return Number.isFinite(score) ? score : null;
}
|
|
1640
|
+
|
|
1641
|
+
// src/rl/rl-campaign.ts
|
|
1642
|
+
/**
 * Run an eval campaign and derive the RL-oriented artefacts from its runs.
 *
 * Steps, in order: run the campaign; extract verifiable reward signals;
 * extract preference pairs; if `opts.report.comparator` is set and at least
 * one paired delta exists, compute an interim release-confidence result; run
 * reward-hacking detection; if an outcome store and at least one outcome
 * metric are given, compute rubric predictive validity; finally build any
 * requested trainer exports (dpo, grpo, sft) and a one-line summary.
 *
 * @param {object} opts - Campaign options, passed through to `runEvalCampaign`,
 *   plus the optional `verifiableReward`, `preferences`, `splitTag`, `report`,
 *   `sequential`, `outcomeStore`, `outcomeMetrics` and `trainerExport` read here.
 * @returns {Promise<object>} Result bundle tagged `kind: "agent-eval-rl-campaign"`.
 */
async function runRLCampaign(opts) {
  const campaign = await runEvalCampaign(opts);
  const { runs } = campaign;
  const rewardSignals = extractVerifiableRewardsFromRecords(runs, opts.verifiableReward ?? {});
  const preferenceOpts = opts.preferences;
  const preferences = extractPreferences(runs, {
    strategy: preferenceOpts?.strategy ?? "paired-by-scenario-and-seed",
    minMargin: preferenceOpts?.minMargin ?? 0.05,
    splitTag: preferenceOpts?.splitTag ?? opts.splitTag ?? "holdout",
    rewardOf: preferenceOpts?.rewardOf
  });

  let interimConfidence = null;
  const comparator = opts.report?.comparator;
  if (comparator) {
    const deltaSeries = collectPairedDeltaSeries(runs, comparator);
    const hasPairedDeltas = deltaSeries.some((series) => series.deltas.length > 0);
    if (hasPairedDeltas) {
      const sequential = opts.sequential;
      interimConfidence = evaluateInterimReleaseConfidence({
        deltaSeries,
        alpha: sequential?.alpha,
        bound: sequential?.bound,
        rope: sequential?.rope ?? opts.report?.rope
      });
    }
  }

  const rewardHacking = detectRewardHacking({
    runs,
    verifiableRewardOptions: opts.verifiableReward
  });

  let predictiveValidity = null;
  const { outcomeStore, outcomeMetrics } = opts;
  if (outcomeStore && outcomeMetrics && outcomeMetrics.length > 0) {
    predictiveValidity = await rubricPredictiveValidity({
      runs,
      outcomes: outcomeStore,
      outcomeMetrics
    });
  }

  // Exports are built one after another, each only when its lookups are supplied.
  const trainerRows = {};
  const exportSpec = opts.trainerExport;
  if (exportSpec?.dpo) {
    trainerRows.dpo = await toDpoRows(preferences.pairs, exportSpec.dpo);
  }
  if (exportSpec?.grpo) {
    trainerRows.grpo = await toGrpoRows(runs, exportSpec.grpo);
  }
  if (exportSpec?.sft) {
    trainerRows.sft = await toSftRows(runs, exportSpec.sft);
  }

  const summary = buildSummary({ campaign, preferences, interimConfidence, rewardHacking, predictiveValidity });
  return {
    campaign,
    rewardSignals,
    preferences,
    interimConfidence,
    rewardHacking,
    predictiveValidity,
    trainerRows,
    summary,
    kind: "agent-eval-rl-campaign"
  };
}
|
|
1702
|
+
/**
 * Pair every non-comparator run with the comparator run that shares its
 * scenario and seed, and collect the score differences per candidate.
 *
 * A run's scenario key is `scenarioId`, falling back to `experimentId`; its
 * score is `holdoutScore`, falling back to `searchScore`. Runs without a
 * finite numeric score, or without a matching comparator run, are skipped.
 * If several comparator runs share a key, the last one wins.
 *
 * @param {Iterable<object>} runs - Run records.
 * @param {string} comparator - `candidateId` of the baseline.
 * @returns {Array<{candidateId: string, deltas: number[]}>} Candidate-minus-baseline
 *   deltas, in first-seen candidate order.
 */
function collectPairedDeltaSeries(runs, comparator) {
  const finiteScoreOf = (run) => {
    const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
    return typeof score === "number" && Number.isFinite(score) ? score : null;
  };
  const pairKeyOf = (run) => `${run.scenarioId ?? run.experimentId}::${run.seed}`;

  const baselineByKey = /* @__PURE__ */ new Map();
  for (const run of runs) {
    if (run.candidateId !== comparator) continue;
    const score = finiteScoreOf(run);
    if (score !== null) baselineByKey.set(pairKeyOf(run), score);
  }

  const deltasByCandidate = /* @__PURE__ */ new Map();
  for (const run of runs) {
    if (run.candidateId === comparator) continue;
    const score = finiteScoreOf(run);
    if (score === null) continue;
    const baseScore = baselineByKey.get(pairKeyOf(run));
    if (typeof baseScore !== "number") continue;
    const deltas = deltasByCandidate.get(run.candidateId);
    if (deltas) deltas.push(score - baseScore);
    else deltasByCandidate.set(run.candidateId, [score - baseScore]);
  }
  return Array.from(deltasByCandidate, ([candidateId, deltas]) => ({ candidateId, deltas }));
}
|
|
1725
|
+
/**
 * Build the one-line, " | "-separated summary of an RL campaign.
 *
 * @param {object} args
 * @param {object} args.campaign - Campaign result (`campaignId`, `runs`,
 *   `failedRuns`, `campaignFingerprint`).
 * @param {object} args.preferences - Preference extraction result.
 * @param {object|null} args.interimConfidence - Sequential result, if any.
 * @param {object} args.rewardHacking - Reward-hacking report.
 * @param {object|null} args.predictiveValidity - Predictive-validity report, if any.
 * @returns {string} Summary line.
 */
function buildSummary(args) {
  const { campaign, preferences, interimConfidence, rewardHacking, predictiveValidity } = args;
  const parts = [];
  // Only the first 12 characters of the fingerprint are shown.
  parts.push(
    `${campaign.campaignId}: ${campaign.runs.length} successful runs / ${campaign.failedRuns.length} failed (fingerprint ${campaign.campaignFingerprint.slice(0, 12)}\u2026)`
  );
  parts.push(
    `preferences: ${preferences.pairs.length} (${preferences.strategy}, ${preferences.pairsBelowMargin} below margin)`
  );
  if (interimConfidence) {
    const recommendation = interimConfidence.recommendation;
    const candidateSuffix = recommendation.candidateId ? ` ${recommendation.candidateId}` : "";
    parts.push(`sequential verdict: ${recommendation.decision}${candidateSuffix}`);
  }
  parts.push(`reward-hacking: ${rewardHacking.verdict} (${rewardHacking.findings.length} signals checked)`);
  if (predictiveValidity) {
    const top = predictiveValidity.ranked[0];
    const rubric = top?.rubric ?? "none";
    const rho = (top?.spearman ?? 0).toFixed(2);
    const verdict = top?.verdict ?? "no data";
    parts.push(`top-rubric: ${rubric} \u03C1=${rho} (${verdict})`);
  }
  return parts.join(" | ");
}
|
|
1741
|
+
|
|
1742
|
+
// src/rl/predictive-validity-researcher.ts
|
|
1743
|
+
/**
 * Researcher that turns rubric predictive-validity reports into proposed
 * rubric re-weightings. It never executes plans itself.
 */
var PredictiveValidityResearcher = class {
  /** Options given to the constructor. */
  opts;
  /** Most recent predictive-validity report; null until one is run or injected. */
  lastReport = null;
  /**
   * @param {object} opts - Reads `failureThreshold`, `decorativeThreshold`,
   *   `outcomes`, `outcomeMetrics`, `rubrics` and `onReport`.
   */
  constructor(opts) {
    this.opts = opts;
  }
  /**
   * Group low-scoring runs by candidate.
   *
   * A run fails when its score (`holdoutScore`, falling back to `searchScore`)
   * is a number below `opts.failureThreshold` (default 0.5).
   *
   * @param {object[]} runs - Run records.
   * @returns {Promise<object[]>} One failure per candidate with a code, a
   *   description and evidence (up to 8 run ids plus the sample count).
   */
  async inspectFailures(runs) {
    const threshold = this.opts.failureThreshold ?? 0.5;
    const failingByCandidate = /* @__PURE__ */ new Map();
    for (const run of runs) {
      const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
      const isFailing = typeof score === "number" && score < threshold;
      if (!isFailing) continue;
      const bucket = failingByCandidate.get(run.candidateId);
      if (bucket) bucket.push(run);
      else failingByCandidate.set(run.candidateId, [run]);
    }
    return Array.from(failingByCandidate, ([candidateId, group]) => {
      let total = 0;
      for (const run of group) {
        total += run.outcome.holdoutScore ?? run.outcome.searchScore ?? 0;
      }
      const meanScore = total / group.length;
      return {
        code: `low-score-${candidateId}`,
        description: `${candidateId} scored < ${threshold} on ${group.length} run(s) (mean ${meanScore.toFixed(3)})`,
        evidence: {
          runIds: group.slice(0, 8).map((run) => run.runId),
          samples: group.length
        }
      };
    });
  }
  /**
   * Propose rubric re-weightings from the cached report.
   *
   * With no failures nothing is proposed. With no cached report a single
   * "collect more outcomes" directive is returned. Otherwise every rubric that
   * is not "load_bearing" and whose |Spearman| is below
   * `opts.decorativeThreshold` (default 0.4) gets a down-weight proposal, and
   * the top-ranked rubric gets an up-weight proposal when it is "load_bearing".
   *
   * @param {object[]} failures - Failures from `inspectFailures`.
   * @returns {Promise<object[]>} Proposed changes.
   */
  async proposeChange(failures) {
    if (failures.length === 0) return [];
    const report = this.lastReport;
    if (report === null) {
      const directive = {
        kind: "threshold",
        payload: { directive: "researcher.collect-more-outcomes" },
        rationale: "predictive-validity researcher has no prior report; cannot recommend rubric reweighting until at least one report exists"
      };
      return [directive];
    }
    const decorativeThreshold = this.opts.decorativeThreshold ?? 0.4;
    const changes = [];
    for (const ranking of report.ranked) {
      const { rubric, spearman, bestOutcome } = ranking;
      const isDecorative = ranking.verdict !== "load_bearing" && !(Math.abs(spearman) >= decorativeThreshold);
      if (!isDecorative) continue;
      changes.push({
        kind: "reviewer_prompt",
        payload: { rubric, action: "down-weight", spearman, bestOutcome },
        rationale: `predictive-validity Spearman=${spearman.toFixed(3)} vs ${bestOutcome} (decorative); recommend down-weighting`,
        expectedDelta: -Math.max(0, 0.05 - Math.abs(spearman))
      });
    }
    // Only the first-ranked rubric is considered for up-weighting.
    if (report.ranked.length > 0 && report.ranked[0].verdict === "load_bearing") {
      const { rubric, spearman, bestOutcome } = report.ranked[0];
      changes.push({
        kind: "reviewer_prompt",
        payload: { rubric, action: "up-weight", spearman, bestOutcome },
        rationale: `predictive-validity Spearman=${spearman.toFixed(3)} vs ${bestOutcome} (load-bearing); recommend up-weighting`,
        expectedDelta: Math.max(0, Math.abs(spearman) - 0.5) * 0.1
      });
    }
    return changes;
  }
  /**
   * Append changes to a baseline plan without mutating it.
   *
   * @param {object[]} changes - Changes to append.
   * @param {{changes: object[]}} baseline - Existing plan.
   * @returns {Promise<object>} Copy of `baseline` with the extended change list.
   */
  async applyChange(changes, baseline) {
    const combined = [...baseline.changes, ...changes];
    return { ...baseline, changes: combined };
  }
  /**
   * Placeholder evaluation: no runs are executed, and the returned gate
   * decision always declines promotion with rejection code "few_runs".
   *
   * @param {object} plan - Plan carrying `proposedCandidateId` and `baselineCandidateId`.
   * @returns {Promise<{plan: object, runs: Array, gateDecision: object}>}
   */
  async evaluateChange(plan) {
    const zeroEvidence = {
      productiveRuns: 0,
      medianPairedDelta: 0,
      pairedCI: { low: 0, high: 0 },
      pairedPValue: 1,
      searchScore: 0,
      holdoutScore: 0,
      overfitGap: 0,
      baselineOverfitGap: 0
    };
    return {
      plan,
      runs: [],
      gateDecision: {
        promote: false,
        candidateId: plan.proposedCandidateId,
        baselineId: plan.baselineCandidateId,
        evidence: zeroEvidence,
        reason: "predictive-validity researcher does not execute plans; the caller is expected to run the sweep and call rubricPredictiveValidity directly with the resulting RunRecord[].",
        rejectionCode: "few_runs"
      }
    };
  }
  /**
   * Run the predictive-validity check explicitly against a fresh RunRecord
   * set. Updates the researcher's cached report so subsequent
   * `proposeChange` calls have evidence to draw from. `opts.onReport`, when
   * set, is awaited before the report is cached.
   */
  async runValidityCheck(runs) {
    const { outcomes, outcomeMetrics, rubrics, onReport } = this.opts;
    const report = await rubricPredictiveValidity({ runs, outcomes, outcomeMetrics, rubrics });
    if (onReport) await this.opts.onReport(report);
    this.lastReport = report;
    return report;
  }
  /**
   * Force-feed a predictive-validity report into the researcher state —
   * useful when the consumer ran the report out-of-band and wants the
   * researcher's later proposals informed by it.
   */
  setReport(report) {
    this.lastReport = report;
  }
  /** @returns {object|null} The cached report, or null when none is set. */
  getLastReport() {
    return this.lastReport;
  }
};
|
|
1869
|
+
|
|
1870
|
+
// src/rl/auto-research.ts
|
|
1871
|
+
/**
 * Derive RL-oriented artefacts from an already finished optimization result.
 *
 * Steps, in order: flatten the result's trials into run records; extract
 * verifiable reward signals and preference pairs; if `opts.comparator` is set
 * and at least one paired delta exists, compute an interim release-confidence
 * result; run reward-hacking detection; if `opts.outcomes` is given, compute
 * rubric predictive validity; finally build any requested dpo / grpo exports
 * and a one-line summary.
 *
 * @param {object} opts - Reads `result`, `ctx`, and the optional
 *   `verifiableReward`, `preferences`, `comparator`, `sequential`, `outcomes`
 *   (`store`, `metrics`) and `trainerExport`.
 * @returns {Promise<object>} Bundle of runs, reward signals, preferences,
 *   interim confidence, reward-hacking report, predictive validity, trainer
 *   rows and summary.
 */
async function analyzeOptimizationResult(opts) {
  const runs = trialsToRunRecords(extractTrials(opts.result), opts.ctx);
  const rewardSignals = extractVerifiableRewardsFromRecords(runs, opts.verifiableReward ?? {});
  const preferenceOpts = opts.preferences;
  const preferences = extractPreferences(runs, {
    strategy: preferenceOpts?.strategy ?? "paired-by-scenario-and-seed",
    minMargin: preferenceOpts?.minMargin ?? 0.05,
    splitTag: preferenceOpts?.splitTag ?? opts.ctx.splitTag ?? "search",
    rewardOf: preferenceOpts?.rewardOf
  });

  let interimConfidence = null;
  if (opts.comparator) {
    const deltaSeries = collectPairedDeltaSeries2(runs, opts.comparator);
    const hasPairedDeltas = deltaSeries.some((series) => series.deltas.length > 0);
    if (hasPairedDeltas) {
      const sequential = opts.sequential;
      interimConfidence = evaluateInterimReleaseConfidence({
        deltaSeries,
        alpha: sequential?.alpha,
        bound: sequential?.bound,
        rope: sequential?.rope
      });
    }
  }

  const rewardHacking = detectRewardHacking({
    runs,
    verifiableRewardOptions: opts.verifiableReward
  });

  let predictiveValidity = null;
  if (opts.outcomes) {
    const { store, metrics } = opts.outcomes;
    predictiveValidity = await rubricPredictiveValidity({
      runs,
      outcomes: store,
      outcomeMetrics: metrics
    });
  }

  // Exports are built one after another, each only when its lookups are supplied.
  const trainerRows = {};
  const exportSpec = opts.trainerExport;
  if (exportSpec?.dpo) {
    trainerRows.dpo = await toDpoRows(preferences.pairs, exportSpec.dpo);
  }
  if (exportSpec?.grpo) {
    trainerRows.grpo = await toGrpoRows(runs, exportSpec.grpo);
  }

  const summary = buildSummary2({ runs, preferences, interimConfidence, rewardHacking, predictiveValidity });
  return {
    runs,
    rewardSignals,
    preferences,
    interimConfidence,
    rewardHacking,
    predictiveValidity,
    trainerRows,
    summary
  };
}
|
|
1924
|
+
/**
 * Collect the trials from an optimization result.
 *
 * @param {object} result - Either an object with an `evolution` property or
 *   the evolution object itself.
 * @returns {object[]} Trials from every generation, in order.
 */
function extractTrials(result) {
  const evolution = "evolution" in result ? result.evolution : result;
  return collectFromEvolution(evolution);
}
|
|
1930
|
+
/**
 * Flatten the trials of every generation into one list.
 *
 * @param {{generations: Iterable<{trials?: Iterable<object>}>}} evolution - Evolution record.
 * @returns {object[]} Trials in generation order; a generation with nullish
 *   `trials` contributes nothing.
 */
function collectFromEvolution(evolution) {
  return Array.from(evolution.generations).flatMap((generation) => Array.from(generation.trials ?? []));
}
|
|
1937
|
+
/**
 * Pair every non-comparator run with the comparator run that shares its
 * scenario and seed, and collect the score differences per candidate.
 *
 * A run's scenario key is `scenarioId`, falling back to `experimentId`; its
 * score is `holdoutScore`, falling back to `searchScore`. Runs without a
 * finite numeric score, or without a matching comparator run, are skipped.
 * If several comparator runs share a key, the last one wins.
 *
 * @param {Iterable<object>} runs - Run records.
 * @param {string} comparator - `candidateId` of the baseline.
 * @returns {Array<{candidateId: string, deltas: number[]}>} Candidate-minus-baseline
 *   deltas, in first-seen candidate order.
 */
function collectPairedDeltaSeries2(runs, comparator) {
  // Returns [pairing key, score] for a run with a usable score, otherwise null.
  const keyedScore = (run) => {
    const scenarioKey = run.scenarioId ?? run.experimentId;
    const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
    if (typeof score !== "number" || !Number.isFinite(score)) return null;
    return [`${scenarioKey}::${run.seed}`, score];
  };

  const baseline = /* @__PURE__ */ new Map();
  for (const run of runs) {
    if (run.candidateId === comparator) {
      const keyed = keyedScore(run);
      if (keyed !== null) baseline.set(keyed[0], keyed[1]);
    }
  }

  const byCandidate = /* @__PURE__ */ new Map();
  for (const run of runs) {
    if (run.candidateId === comparator) continue;
    const keyed = keyedScore(run);
    if (keyed === null) continue;
    const [key, score] = keyed;
    const baseScore = baseline.get(key);
    if (typeof baseScore !== "number") continue;
    if (!byCandidate.has(run.candidateId)) byCandidate.set(run.candidateId, []);
    byCandidate.get(run.candidateId).push(score - baseScore);
  }

  const series = [];
  for (const [candidateId, deltas] of byCandidate) {
    series.push({ candidateId, deltas });
  }
  return series;
}
|
|
1960
|
+
/**
 * Build the one-line, " | "-separated summary of an optimization analysis.
 *
 * @param {object} args
 * @param {object[]} args.runs - Analysed run records.
 * @param {object} args.preferences - Preference extraction result.
 * @param {object} args.rewardHacking - Reward-hacking report.
 * @param {object|null} args.interimConfidence - Sequential result, if any.
 * @param {object|null} args.predictiveValidity - Predictive-validity report,
 *   if any; mentioned only when it ranks at least one rubric.
 * @returns {string} Summary line.
 */
function buildSummary2(args) {
  const { runs, preferences, rewardHacking, interimConfidence } = args;
  const parts = [];
  parts.push(`${runs.length} runs analysed`);
  parts.push(`${preferences.pairs.length} preference pairs (${preferences.strategy})`);
  parts.push(`reward-hacking verdict: ${rewardHacking.verdict}`);
  if (interimConfidence) {
    const recommendation = interimConfidence.recommendation;
    const candidateSuffix = recommendation.candidateId ? ` ${recommendation.candidateId}` : "";
    parts.push(`sequential: ${recommendation.decision}${candidateSuffix}`);
  }
  const top = args.predictiveValidity?.ranked[0];
  if (top) {
    parts.push(`top-rubric: ${top.rubric} \u03C1=${top.spearman.toFixed(2)}`);
  }
  return parts.join(" | ");
}
|
|
1975
|
+
|
|
1976
|
+
export {
|
|
1977
|
+
trialToRunRecord,
|
|
1978
|
+
trialsToRunRecords,
|
|
1979
|
+
verificationReportToRunRecord,
|
|
1980
|
+
variantAggregateToRunRecord,
|
|
1981
|
+
extractVerifiableReward,
|
|
1982
|
+
extractVerifiableRewardsFromRecords,
|
|
1983
|
+
filterDeterministicallyRewarded,
|
|
1984
|
+
extractPreferences,
|
|
1985
|
+
toTRLFormat,
|
|
1986
|
+
toAnthropicFormat,
|
|
1987
|
+
inverseProbabilityWeighting,
|
|
1988
|
+
selfNormalizedImportanceWeighting,
|
|
1989
|
+
doublyRobust,
|
|
1990
|
+
offPolicyEstimateAll,
|
|
1991
|
+
extractStepRewards,
|
|
1992
|
+
runwiseStepRewardSummary,
|
|
1993
|
+
prmTrainingPairs,
|
|
1994
|
+
runContaminationProbe,
|
|
1995
|
+
renameVariables,
|
|
1996
|
+
shuffleOrder,
|
|
1997
|
+
injectIrrelevantClause,
|
|
1998
|
+
fitBradleyTerry,
|
|
1999
|
+
applyEloUpdate,
|
|
2000
|
+
buildPairwiseFromCampaign,
|
|
2001
|
+
adversarialScenarioSearch,
|
|
2002
|
+
runComputeCurve,
|
|
2003
|
+
bestOfN,
|
|
2004
|
+
selfConsistency,
|
|
2005
|
+
paretoFrontier,
|
|
2006
|
+
varianceBasedCurriculum,
|
|
2007
|
+
thompsonCurriculum,
|
|
2008
|
+
observationsFromRunRecords,
|
|
2009
|
+
detectRewardHacking,
|
|
2010
|
+
runAdaptationCurve,
|
|
2011
|
+
compareAdaptationCurves,
|
|
2012
|
+
firstPassK,
|
|
2013
|
+
toDpoRows,
|
|
2014
|
+
toDpoJsonl,
|
|
2015
|
+
toGrpoRows,
|
|
2016
|
+
toGrpoJsonl,
|
|
2017
|
+
toSftRows,
|
|
2018
|
+
toSftJsonl,
|
|
2019
|
+
toPrmRows,
|
|
2020
|
+
toPrmJsonl,
|
|
2021
|
+
stepRewardsToJsonl,
|
|
2022
|
+
runRLCampaign,
|
|
2023
|
+
PredictiveValidityResearcher,
|
|
2024
|
+
analyzeOptimizationResult
|
|
2025
|
+
};
|
|
2026
|
+
//# sourceMappingURL=chunk-LZKIOBG2.js.map
|